blob: 5b3fd9e61d3e62bf57e6df4e7bd63c42f72b3e4e [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero when C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128 (presumably a byte of a non-ASCII character; this
   macro does not decide whether that character is really allowed).
   The argument is fully parenthesized so expressions such as `x & 0x7f`
   are tested as a whole.  C is evaluated several times, so it must not
   have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero when C may continue an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128 (presumably a byte of a non-ASCII
   character; not validated further here).
   The argument is fully parenthesized so compound expressions are tested
   as a whole.  C is evaluated several times, so it must not have side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Turn a char that may be signed into a nonnegative int. */
/* XXX Relies on chars being exactly 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		((c) & 0xff)
#else
#define Py_CHARMASK(c)		(c)
#endif
49
Guido van Rossum3f5da241990-12-20 15:06:42 +000050/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000051static struct tok_state *tok_new(void);
52static int tok_nextc(struct tok_state *tok);
53static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000054
Brett Cannond5ec98c2007-10-20 02:54:14 +000055
/* Printable names of the token types, indexed by token number. */

char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* The order of this table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
116
117
118/* Create and initialize a new tok_state structure */
119
120static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000121tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000123 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
124 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 if (tok == NULL)
126 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000127 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000128 tok->done = E_OK;
129 tok->fp = NULL;
130 tok->tabsize = TABSIZE;
131 tok->indent = 0;
132 tok->indstack[0] = 0;
133 tok->atbol = 1;
134 tok->pendin = 0;
135 tok->prompt = tok->nextprompt = NULL;
136 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000137 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000139 tok->altwarning = 1;
140 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000141 tok->alttabsize = 1;
142 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000143 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000144 tok->decoding_erred = 0;
145 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000147 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000149 tok->decoding_readline = NULL;
150 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000151#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000152 return tok;
153}
154
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000155#ifdef PGEN
156
157static char *
158decoding_fgets(char *s, int size, struct tok_state *tok)
159{
160 return fgets(s, size, tok->fp);
161}
162
163static int
164decoding_feof(struct tok_state *tok)
165{
166 return feof(tok->fp);
167}
168
/* PGEN build: the input string is used as is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
174
175#else /* PGEN */
176
177static char *
178error_ret(struct tok_state *tok) /* XXX */
179{
180 tok->decoding_erred = 1;
181 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000182 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183 tok->buf = NULL;
184 return NULL; /* as if it were EOF */
185}
186
187static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000189{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000190 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000191 if (result != NULL) {
192 memcpy(result, s, len);
193 result[len] = '\0';
194 }
195 return result;
196}
197
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, lower-cased and with
   '_' turned into '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   the prefix is one of the recognised aliases (optionally followed by a
   '-' suffix); otherwise returns S itself, so callers can detect "no
   change" by pointer comparison.  The returned literals must not be
   freed. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for unsigned char values and
               EOF; a plain char may be negative for non-ASCII bytes. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
220
221/* Return the coding spec in S, or NULL if none is found. */
222
223static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000224get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000225{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000226 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000227 /* Coding spec must be in a comment, and that comment must be
228 * the only statement on the source code line. */
229 for (i = 0; i < size - 6; i++) {
230 if (s[i] == '#')
231 break;
232 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
233 return NULL;
234 }
235 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236 const char* t = s + i;
237 if (strncmp(t, "coding", 6) == 0) {
238 const char* begin = NULL;
239 t += 6;
240 if (t[0] != ':' && t[0] != '=')
241 continue;
242 do {
243 t++;
244 } while (t[0] == '\x20' || t[0] == '\t');
245
246 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000247 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000248 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249 t++;
250
251 if (begin < t) {
252 char* r = new_string(begin, t - begin);
253 char* q = get_normal_name(r);
254 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000256 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000257 }
258 return r;
259 }
260 }
261 }
262 return NULL;
263}
264
265/* Check whether the line contains a coding spec. If it does,
266 invoke the set_readline function for the new encoding.
267 This function receives the tok_state and the new encoding.
268 Return 1 on success, 0 on failure. */
269
270static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000271check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int set_readline(struct tok_state *, const char *))
273{
Tim Peters17db21f2002-09-03 15:39:58 +0000274 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000275 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000276
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000277 if (tok->cont_line)
278 /* It's a continuation line, so it can't be a coding spec. */
279 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000280 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281 if (cs != NULL) {
282 tok->read_coding_spec = 1;
283 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 if (strcmp(cs, "utf-8") == 0 ||
286 strcmp(cs, "iso-8859-1") == 0) {
287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
321 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000322 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 if (ch == EOF) {
324 return 1;
325 } else if (ch == 0xEF) {
326 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
327 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
328#if 0
329 /* Disable support for UTF-16 BOMs until a decision
330 is made whether this needs to be supported. */
331 } else if (ch == 0xFE) {
332 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
333 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000334 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000335 } else if (ch == 0xFF) {
336 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
337 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000338 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000339#endif
340 } else {
341 unget_char(ch, tok);
342 return 1;
343 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000344 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000345 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000346 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
347 return 1;
348 NON_BOM:
349 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
350 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
351 return 1;
352}
353
354/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000355 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000357 On entry, tok->decoding_buffer will be one of:
358 1) NULL: need to call tok->decoding_readline to get a new line
359 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
360 stored the result in tok->decoding_buffer
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000361 3) PyBytesObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000362 (in the s buffer) to copy entire contents of the line read
363 by tok->decoding_readline. tok->decoding_buffer has the overflow.
364 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000366 reached): see tok_nextc and its calls to decoding_fgets.
367*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000368
369static char *
370fp_readl(char *s, int size, struct tok_state *tok)
371{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000372 PyObject* bufobj = tok->decoding_buffer;
373 const char *buf;
374 Py_ssize_t buflen;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000375 int allocated = 0;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376
377 /* Ask for one less byte so we can terminate it */
378 assert(size > 0);
379 size--;
380
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000381 if (bufobj == NULL) {
382 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
383 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000384 goto error;
385 allocated = 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000386 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +0000387 buf = PyUnicode_AsStringAndSize(bufobj, &buflen);
388 if (buf == NULL) {
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000389 goto error;
390 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000391 if (buflen > size) {
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000392 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000393 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
394 buflen-size);
395 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000396 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000397 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000398 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000399 memcpy(s, buf, buflen);
400 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000401 if (buflen == 0) /* EOF */
402 s = NULL;
403 if (allocated) {
404 Py_DECREF(bufobj);
405 }
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000406 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000407
408error:
409 if (allocated) {
410 Py_XDECREF(bufobj);
411 }
412 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000413}
414
415/* Set the readline function for TOK to a StreamReader's
416 readline function. The StreamReader is named ENC.
417
418 This function is called from check_bom and check_coding_spec.
419
420 ENC is usually identical to the future value of tok->encoding,
421 except for the (currently unsupported) case of UTF-16.
422
423 Return 1 on success, 0 on failure. */
424
425static int
426fp_setreadl(struct tok_state *tok, const char* enc)
427{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000428 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000429
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000430 io = PyImport_ImportModule("io");
431 if (io == NULL)
432 goto cleanup;
433
434 stream = PyObject_CallMethod(io, "open", "ssis",
435 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000436 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000437 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000438
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000439 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000440 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000441 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000442
443 cleanup:
444 Py_XDECREF(stream);
445 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000446 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000447}
448
449/* Fetch the next byte from TOK. */
450
451static int fp_getc(struct tok_state *tok) {
452 return getc(tok->fp);
453}
454
455/* Unfetch the last byte back into TOK. */
456
457static void fp_ungetc(int c, struct tok_state *tok) {
458 ungetc(c, tok->fp);
459}
460
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence if yes,
   0 if not.

   Only the shape is checked: a lead byte followed by the right number of
   continuation bytes (0x80..0xBF).  Overlong encodings and the range of
   the decoded code point are not examined.

   Continuation bytes are inspected front to back, so in a NUL-terminated
   string the terminator stops the scan and nothing beyond it is read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
488
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000489/* Read a line of input from TOK. Determine encoding
490 if necessary. */
491
492static char *
493decoding_fgets(char *s, int size, struct tok_state *tok)
494{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000495 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000496 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000497 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000498 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000499 /* We already have a codec associated with
500 this input. */
501 line = fp_readl(s, size, tok);
502 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000503 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000505 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000506 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000507 break;
508 } else {
509 /* We have not yet determined the encoding.
510 If an encoding is found, use the file-pointer
511 reader functions from now on. */
512 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
513 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000514 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000516 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
518 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
519 return error_ret(tok);
520 }
521 }
522#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000523 /* The default encoding is UTF-8, so make sure we don't have any
524 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000525 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000527 int length;
528 for (c = (unsigned char *)line; *c; c += length)
529 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000530 badchar = *c;
531 break;
532 }
533 }
534 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000535 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000536 /* Need to add 1 to the line number, since this line
537 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000538 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000539 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000540 "in file %.200s on line %i, "
541 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000542 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000543 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000544 PyErr_SetString(PyExc_SyntaxError, buf);
545 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546 }
547#endif
548 return line;
549}
550
551static int
552decoding_feof(struct tok_state *tok)
553{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000554 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 return feof(tok->fp);
556 } else {
557 PyObject* buf = tok->decoding_buffer;
558 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000559 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 if (buf == NULL) {
561 error_ret(tok);
562 return 1;
563 } else {
564 tok->decoding_buffer = buf;
565 }
566 }
567 return PyObject_Length(buf) == 0;
568 }
569}
570
571/* Fetch a byte from TOK, using the string buffer. */
572
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000573static int
574buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000575 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000576}
577
578/* Unfetch a byte from TOK, using the string buffer. */
579
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000580static void
581buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000583 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000584}
585
586/* Set the readline function for TOK to ENC. For the string-based
587 tokenizer, this means to just record the encoding. */
588
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000589static int
590buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591 tok->enc = enc;
592 return 1;
593}
594
595/* Return a UTF-8 encoding Python string object from the
596 C byte string STR, which is encoded with ENC. */
597
598static PyObject *
599translate_into_utf8(const char* str, const char* enc) {
600 PyObject *utf8;
601 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
602 if (buf == NULL)
603 return NULL;
604 utf8 = PyUnicode_AsUTF8String(buf);
605 Py_DECREF(buf);
606 return utf8;
607}
608
609/* Decode a byte string STR for use as the buffer of TOK.
610 Look for encoding declarations inside STR, and record them
611 inside TOK. */
612
613static const char *
614decode_str(const char *str, struct tok_state *tok)
615{
616 PyObject* utf8 = NULL;
617 const char *s;
618 int lineno = 0;
619 tok->enc = NULL;
620 tok->str = str;
621 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000622 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000624 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 if (tok->enc != NULL) {
626 utf8 = translate_into_utf8(str, tok->enc);
627 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000628 return error_ret(tok);
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000629 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630 }
631 for (s = str;; s++) {
632 if (*s == '\0') break;
633 else if (*s == '\n') {
634 lineno++;
635 if (lineno == 2) break;
636 }
637 }
638 tok->enc = NULL;
639 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000640 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641 if (tok->enc != NULL) {
642 assert(utf8 == NULL);
643 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000644 if (utf8 == NULL) {
645 PyErr_Format(PyExc_SyntaxError,
646 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000647 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000648 }
Neal Norwitzf7f28fc2007-08-11 21:31:25 +0000649 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000650 }
651 assert(tok->decoding_buffer == NULL);
652 tok->decoding_buffer = utf8; /* CAUTION */
653 return str;
654}
655
656#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000657
658/* Set up tokenizer for string */
659
660struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000661PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000662{
663 struct tok_state *tok = tok_new();
664 if (tok == NULL)
665 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000667 if (str == NULL) {
668 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000669 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000670 }
671
Martin v. Löwis95292d62002-12-11 14:04:59 +0000672 /* XXX: constify members. */
673 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000674 return tok;
675}
676
677
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000678/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000679
680struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000681PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000682{
683 struct tok_state *tok = tok_new();
684 if (tok == NULL)
685 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000686 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000687 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000688 return NULL;
689 }
690 tok->cur = tok->inp = tok->buf;
691 tok->end = tok->buf + BUFSIZ;
692 tok->fp = fp;
693 tok->prompt = ps1;
694 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000695 if (enc != NULL) {
696 /* Must copy encoding declaration since it
697 gets copied into the parse tree. */
698 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
699 if (!tok->encoding) {
700 PyTokenizer_Free(tok);
701 return NULL;
702 }
703 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000704 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000705 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000706 return tok;
707}
708
709
710/* Free a tok_state structure */
711
712void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000713PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000714{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000715 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000716 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000717#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000718 Py_XDECREF(tok->decoding_readline);
719 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000720#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000721 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000722 PyMem_FREE(tok->buf);
723 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000724}
725
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000726/* Get next char, updating state; error code goes into tok->done */
727
728static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000729tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000730{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000732 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000733 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000734 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000735 if (tok->done != E_OK)
736 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000738 char *end = strchr(tok->inp, '\n');
739 if (end != NULL)
740 end++;
741 else {
742 end = strchr(tok->inp, '\0');
743 if (end == tok->inp) {
744 tok->done = E_EOF;
745 return EOF;
746 }
747 }
748 if (tok->start == NULL)
749 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000750 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000751 tok->lineno++;
752 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000753 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000756 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000757#ifndef PGEN
758 if (tok->encoding && newtok && *newtok) {
759 /* Recode to UTF-8 */
760 Py_ssize_t buflen;
761 const char* buf;
762 PyObject *u = translate_into_utf8(newtok, tok->encoding);
763 PyMem_FREE(newtok);
764 if (!u) {
765 tok->done = E_DECODE;
766 return EOF;
767 }
768 buflen = PyBytes_Size(u);
769 buf = PyBytes_AsString(u);
770 if (!buf) {
771 Py_DECREF(u);
772 tok->done = E_DECODE;
773 return EOF;
774 }
775 newtok = PyMem_MALLOC(buflen+1);
776 strcpy(newtok, buf);
777 Py_DECREF(u);
778 }
779#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000780 if (tok->nextprompt != NULL)
781 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000782 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000784 else if (*newtok == '\0') {
785 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000786 tok->done = E_EOF;
787 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000788 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000789 size_t start = tok->start - tok->buf;
790 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000791 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000792 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000793 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000794 tok->lineno++;
795 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000796 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000797 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000798 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000799 tok->done = E_NOMEM;
800 return EOF;
801 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000802 tok->buf = buf;
803 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000804 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000805 strcpy(tok->buf + oldlen, newtok);
806 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000807 tok->inp = tok->buf + newlen;
808 tok->end = tok->inp + 1;
809 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000810 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 else {
812 tok->lineno++;
813 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000814 PyMem_FREE(tok->buf);
815 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000816 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000818 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 tok->inp = strchr(tok->buf, '\0');
820 tok->end = tok->inp + 1;
821 }
822 }
823 else {
824 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000825 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000826 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 if (tok->start == NULL) {
828 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000829 tok->buf = (char *)
830 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000831 if (tok->buf == NULL) {
832 tok->done = E_NOMEM;
833 return EOF;
834 }
835 tok->end = tok->buf + BUFSIZ;
836 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000837 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
838 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000839 tok->done = E_EOF;
840 done = 1;
841 }
842 else {
843 tok->done = E_OK;
844 tok->inp = strchr(tok->buf, '\0');
845 done = tok->inp[-1] == '\n';
846 }
847 }
848 else {
849 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000850 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000851 tok->done = E_EOF;
852 done = 1;
853 }
854 else
855 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000856 }
857 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000858 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000859 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000860 Py_ssize_t curstart = tok->start == NULL ? -1 :
861 tok->start - tok->buf;
862 Py_ssize_t curvalid = tok->inp - tok->buf;
863 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000864 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000865 newbuf = (char *)PyMem_REALLOC(newbuf,
866 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000867 if (newbuf == NULL) {
868 tok->done = E_NOMEM;
869 tok->cur = tok->inp;
870 return EOF;
871 }
872 tok->buf = newbuf;
873 tok->inp = tok->buf + curvalid;
874 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000875 tok->start = curstart < 0 ? NULL :
876 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000877 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000878 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000879 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000880 /* Break out early on decoding
881 errors, as tok->buf will be NULL
882 */
883 if (tok->decoding_erred)
884 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000885 /* Last line does not end in \n,
886 fake one */
887 strcpy(tok->inp, "\n");
888 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000889 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000891 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000892 if (tok->buf != NULL) {
893 tok->cur = tok->buf + cur;
894 tok->line_start = tok->cur;
895 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000896 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000897 pt = tok->inp - 2;
898 if (pt >= tok->buf && *pt == '\r') {
899 *pt++ = '\n';
900 *pt = '\0';
901 tok->inp = pt;
902 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000903 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000904 }
905 if (tok->done != E_OK) {
906 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000907 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000908 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000909 return EOF;
910 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000911 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000912 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000913}
914
915
916/* Back-up one character */
917
918static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000919tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000920{
921 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000922 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000923 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000924 if (*tok->cur != c)
925 *tok->cur = c;
926 }
927}
928
929
930/* Return the token corresponding to a single character */
931
932int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000933PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000934{
935 switch (c) {
936 case '(': return LPAR;
937 case ')': return RPAR;
938 case '[': return LSQB;
939 case ']': return RSQB;
940 case ':': return COLON;
941 case ',': return COMMA;
942 case ';': return SEMI;
943 case '+': return PLUS;
944 case '-': return MINUS;
945 case '*': return STAR;
946 case '/': return SLASH;
947 case '|': return VBAR;
948 case '&': return AMPER;
949 case '<': return LESS;
950 case '>': return GREATER;
951 case '=': return EQUAL;
952 case '.': return DOT;
953 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000954 case '{': return LBRACE;
955 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000956 case '^': return CIRCUMFLEX;
957 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000958 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000959 default: return OP;
960 }
961}
962
963
Guido van Rossumfbab9051991-10-20 20:25:03 +0000964int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000965PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000966{
967 switch (c1) {
968 case '=':
969 switch (c2) {
970 case '=': return EQEQUAL;
971 }
972 break;
973 case '!':
974 switch (c2) {
975 case '=': return NOTEQUAL;
976 }
977 break;
978 case '<':
979 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000980 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000981 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000982 }
983 break;
984 case '>':
985 switch (c2) {
986 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000987 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000988 }
989 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000990 case '+':
991 switch (c2) {
992 case '=': return PLUSEQUAL;
993 }
994 break;
995 case '-':
996 switch (c2) {
997 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000998 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000999 }
1000 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001001 case '*':
1002 switch (c2) {
1003 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001004 case '=': return STAREQUAL;
1005 }
1006 break;
1007 case '/':
1008 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001009 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001010 case '=': return SLASHEQUAL;
1011 }
1012 break;
1013 case '|':
1014 switch (c2) {
1015 case '=': return VBAREQUAL;
1016 }
1017 break;
1018 case '%':
1019 switch (c2) {
1020 case '=': return PERCENTEQUAL;
1021 }
1022 break;
1023 case '&':
1024 switch (c2) {
1025 case '=': return AMPEREQUAL;
1026 }
1027 break;
1028 case '^':
1029 switch (c2) {
1030 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001031 }
1032 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001033 }
1034 return OP;
1035}
1036
Thomas Wouters434d0822000-08-24 20:11:32 +00001037int
1038PyToken_ThreeChars(int c1, int c2, int c3)
1039{
1040 switch (c1) {
1041 case '<':
1042 switch (c2) {
1043 case '<':
1044 switch (c3) {
1045 case '=':
1046 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001047 }
1048 break;
1049 }
1050 break;
1051 case '>':
1052 switch (c2) {
1053 case '>':
1054 switch (c3) {
1055 case '=':
1056 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001057 }
1058 break;
1059 }
1060 break;
1061 case '*':
1062 switch (c2) {
1063 case '*':
1064 switch (c3) {
1065 case '=':
1066 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001067 }
1068 break;
1069 }
1070 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001071 case '/':
1072 switch (c2) {
1073 case '/':
1074 switch (c3) {
1075 case '=':
1076 return DOUBLESLASHEQUAL;
1077 }
1078 break;
1079 }
1080 break;
Georg Brandldde00282007-03-18 19:01:53 +00001081 case '.':
1082 switch (c2) {
1083 case '.':
1084 switch (c3) {
1085 case '.':
1086 return ELLIPSIS;
1087 }
1088 break;
1089 }
1090 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001091 }
1092 return OP;
1093}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001094
Guido van Rossum926f13a1998-04-09 21:38:06 +00001095static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001096indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001097{
1098 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001099 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001100 tok->cur = tok->inp;
1101 return 1;
1102 }
1103 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001104 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1105 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001106 tok->altwarning = 0;
1107 }
1108 return 0;
1109}
1110
Martin v. Löwis47383402007-08-15 07:32:56 +00001111#ifdef PGEN
1112#define verify_identifier(s,e) 1
1113#else
1114/* Verify that the identifier follows PEP 3131. */
1115static int
1116verify_identifier(char *start, char *end)
1117{
Guido van Rossume3e37012007-08-29 18:54:41 +00001118 PyObject *s;
1119 int result;
1120 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1121 if (s == NULL) {
1122 PyErr_Clear();
1123 return 0;
1124 }
1125 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001126 Py_DECREF(s);
1127 return result;
1128}
1129#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001130
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001131/* Get next token, after space stripping etc. */
1132
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001133static int
1134tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135{
1136 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001137 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001138
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001139 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001140 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001141 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001142 blankline = 0;
1143
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001144 /* Get indentation level */
1145 if (tok->atbol) {
1146 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001147 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001148 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 for (;;) {
1150 c = tok_nextc(tok);
1151 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001152 col++, altcol++;
1153 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001154 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001155 altcol = (altcol/tok->alttabsize + 1)
1156 * tok->alttabsize;
1157 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001158 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001159 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001160 else
1161 break;
1162 }
1163 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001164 if (c == '#' || c == '\n') {
1165 /* Lines with only whitespace and/or comments
1166 shouldn't affect the indentation and are
1167 not passed to the parser as NEWLINE tokens,
1168 except *totally* empty lines in interactive
1169 mode, which signal the end of a command group. */
1170 if (col == 0 && c == '\n' && tok->prompt != NULL)
1171 blankline = 0; /* Let it through */
1172 else
1173 blankline = 1; /* Ignore completely */
1174 /* We can't jump back right here since we still
1175 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001176 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001177 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001178 if (col == tok->indstack[tok->indent]) {
1179 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001180 if (altcol != tok->altindstack[tok->indent]) {
1181 if (indenterror(tok))
1182 return ERRORTOKEN;
1183 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001184 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001185 else if (col > tok->indstack[tok->indent]) {
1186 /* Indent -- always one */
1187 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001188 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001189 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001190 return ERRORTOKEN;
1191 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001192 if (altcol <= tok->altindstack[tok->indent]) {
1193 if (indenterror(tok))
1194 return ERRORTOKEN;
1195 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001196 tok->pendin++;
1197 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001198 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001199 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001200 else /* col < tok->indstack[tok->indent] */ {
1201 /* Dedent -- any number, must be consistent */
1202 while (tok->indent > 0 &&
1203 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001204 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001205 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001206 }
1207 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001208 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001209 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001210 return ERRORTOKEN;
1211 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001212 if (altcol != tok->altindstack[tok->indent]) {
1213 if (indenterror(tok))
1214 return ERRORTOKEN;
1215 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001216 }
1217 }
1218 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001219
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001220 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001221
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 /* Return pending indents/dedents */
1223 if (tok->pendin != 0) {
1224 if (tok->pendin < 0) {
1225 tok->pendin++;
1226 return DEDENT;
1227 }
1228 else {
1229 tok->pendin--;
1230 return INDENT;
1231 }
1232 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001233
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001234 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001235 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001236 /* Skip spaces */
1237 do {
1238 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001239 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001240
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001241 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001242 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001243
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001244 /* Skip comment */
1245 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001246 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001247 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001248
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001249 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001250 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001251 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001252 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001253
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001254 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001255 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001256 if (is_potential_identifier_start(c)) {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001257 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001258 switch (c) {
1259 case 'r':
1260 case 'R':
1261 c = tok_nextc(tok);
1262 if (c == '"' || c == '\'')
1263 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001264 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001265 case 'b':
1266 case 'B':
1267 c = tok_nextc(tok);
1268 if (c == 'r' || c == 'R')
1269 c = tok_nextc(tok);
1270 if (c == '"' || c == '\'')
1271 goto letter_quote;
1272 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001273 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001274 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001275 if (c >= 128)
1276 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001278 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001279 tok_backup(tok, c);
Martin v. Löwis47383402007-08-15 07:32:56 +00001280 if (nonascii &&
1281 !verify_identifier(tok->start, tok->cur)) {
1282 tok->done = E_IDENTIFIER;
1283 return ERRORTOKEN;
1284 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001285 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 *p_end = tok->cur;
1287 return NAME;
1288 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001289
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 /* Newline */
1291 if (c == '\n') {
1292 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001293 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001294 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001295 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001296 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001297 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 return NEWLINE;
1299 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001300
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001301 /* Period or number starting with period? */
1302 if (c == '.') {
1303 c = tok_nextc(tok);
1304 if (isdigit(c)) {
1305 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001306 } else if (c == '.') {
1307 c = tok_nextc(tok);
1308 if (c == '.') {
1309 *p_start = tok->start;
1310 *p_end = tok->cur;
1311 return ELLIPSIS;
1312 } else {
1313 tok_backup(tok, c);
1314 }
1315 tok_backup(tok, '.');
1316 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001317 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001318 }
Georg Brandldde00282007-03-18 19:01:53 +00001319 *p_start = tok->start;
1320 *p_end = tok->cur;
1321 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001322 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001323
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 /* Number */
1325 if (isdigit(c)) {
1326 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001327 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328 c = tok_nextc(tok);
1329 if (c == '.')
1330 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001331#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001332 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001333 goto imaginary;
1334#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001335 if (c == 'x' || c == 'X') {
1336 /* Hex */
1337 do {
1338 c = tok_nextc(tok);
1339 } while (isxdigit(c));
1340 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001341 else if (c == 'o' || c == 'O') {
1342 /* Octal */
1343 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001345 } while ('0' <= c && c < '8');
1346 }
1347 else if (c == 'b' || c == 'B') {
1348 /* Binary */
1349 do {
1350 c = tok_nextc(tok);
1351 } while (c == '0' || c == '1');
1352 }
1353 else {
1354 int nonzero = 0;
1355 /* maybe old-style octal; c is first char of it */
1356 /* in any case, allow '0' as a literal */
1357 while (c == '0')
1358 c = tok_nextc(tok);
1359 while (isdigit(c)) {
1360 nonzero = 1;
1361 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001362 }
1363 if (c == '.')
1364 goto fraction;
1365 else if (c == 'e' || c == 'E')
1366 goto exponent;
1367#ifndef WITHOUT_COMPLEX
1368 else if (c == 'j' || c == 'J')
1369 goto imaginary;
1370#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001371 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001372 tok->done = E_TOKEN;
1373 tok_backup(tok, c);
1374 return ERRORTOKEN;
1375 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001376 }
1377 }
1378 else {
1379 /* Decimal */
1380 do {
1381 c = tok_nextc(tok);
1382 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001383 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001384 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001385 if (c == '.') {
1386 fraction:
1387 /* Fraction */
1388 do {
1389 c = tok_nextc(tok);
1390 } while (isdigit(c));
1391 }
1392 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001393 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001394 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001396 if (c == '+' || c == '-')
1397 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001398 if (!isdigit(c)) {
1399 tok->done = E_TOKEN;
1400 tok_backup(tok, c);
1401 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001402 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001403 do {
1404 c = tok_nextc(tok);
1405 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001406 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001407#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001408 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001409 /* Imaginary part */
1410 imaginary:
1411 c = tok_nextc(tok);
1412#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 }
1414 }
1415 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001416 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001417 *p_end = tok->cur;
1418 return NUMBER;
1419 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001420
1421 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001422 /* String */
1423 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001424 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001425 int quote = c;
1426 int triple = 0;
1427 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001428 for (;;) {
1429 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001430 if (c == '\n') {
1431 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001432 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001433 tok_backup(tok, c);
1434 return ERRORTOKEN;
1435 }
1436 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001437 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001438 }
1439 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001440 if (triple)
1441 tok->done = E_EOFS;
1442 else
1443 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001444 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001445 return ERRORTOKEN;
1446 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001447 else if (c == quote) {
1448 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001449 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001450 c = tok_nextc(tok);
1451 if (c == quote) {
1452 triple = 1;
1453 tripcount = 0;
1454 continue;
1455 }
1456 tok_backup(tok, c);
1457 }
1458 if (!triple || tripcount == 3)
1459 break;
1460 }
1461 else if (c == '\\') {
1462 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001464 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001465 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001466 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001467 return ERRORTOKEN;
1468 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001469 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001470 else
1471 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001472 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001473 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001474 *p_end = tok->cur;
1475 return STRING;
1476 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001477
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478 /* Line continuation */
1479 if (c == '\\') {
1480 c = tok_nextc(tok);
1481 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001482 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001483 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001484 return ERRORTOKEN;
1485 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001486 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001487 goto again; /* Read next line */
1488 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001489
Guido van Rossumfbab9051991-10-20 20:25:03 +00001490 /* Check for two-character token */
1491 {
1492 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001493 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001494 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001495 int c3 = tok_nextc(tok);
1496 int token3 = PyToken_ThreeChars(c, c2, c3);
1497 if (token3 != OP) {
1498 token = token3;
1499 } else {
1500 tok_backup(tok, c3);
1501 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001502 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001503 *p_end = tok->cur;
1504 return token;
1505 }
1506 tok_backup(tok, c2);
1507 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001508
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001509 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001510 switch (c) {
1511 case '(':
1512 case '[':
1513 case '{':
1514 tok->level++;
1515 break;
1516 case ')':
1517 case ']':
1518 case '}':
1519 tok->level--;
1520 break;
1521 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001522
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001523 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001524 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001525 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001526 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001527}
1528
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001529int
1530PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1531{
1532 int result = tok_get(tok, p_start, p_end);
1533 if (tok->decoding_erred) {
1534 result = ERRORTOKEN;
1535 tok->done = E_DECODE;
1536 }
1537 return result;
1538}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001539
Thomas Wouters89d996e2007-09-08 17:39:28 +00001540/* This function is only called from parsetok. However, it cannot live
1541 there, as it must be empty for PGEN, and we can check for PGEN only
1542 in this file. */
1543
1544#ifdef PGEN
1545char*
1546PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1547{
1548 return NULL;
1549}
1550#else
1551static PyObject *
1552dec_utf8(const char *enc, const char *text, size_t len) {
1553 PyObject *ret = NULL;
1554 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1555 if (unicode_text) {
1556 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1557 Py_DECREF(unicode_text);
1558 }
1559 if (!ret) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001560 PyErr_Clear();
1561 }
1562 else {
1563 assert(PyBytes_Check(ret));
Thomas Wouters89d996e2007-09-08 17:39:28 +00001564 }
1565 return ret;
1566}
1567
1568char *
1569PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1570{
1571 char *text = NULL;
1572 if (tok->encoding) {
1573 /* convert source to original encondig */
1574 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1575 if (lineobj != NULL) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001576 int linelen = PyBytes_GET_SIZE(lineobj);
1577 const char *line = PyBytes_AS_STRING(lineobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001578 text = PyObject_MALLOC(linelen + 1);
1579 if (text != NULL && line != NULL) {
1580 if (linelen)
1581 strncpy(text, line, linelen);
1582 text[linelen] = '\0';
1583 }
1584 Py_DECREF(lineobj);
1585
1586 /* adjust error offset */
1587 if (*offset > 1) {
1588 PyObject *offsetobj = dec_utf8(tok->encoding,
Guido van Rossum641591c2007-10-10 18:44:39 +00001589 tok->buf,
1590 *offset-1);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001591 if (offsetobj) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001592 *offset = 1 +
1593 PyBytes_GET_SIZE(offsetobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001594 Py_DECREF(offsetobj);
1595 }
1596 }
1597
1598 }
1599 }
1600 return text;
1601
1602}
1603#endif
1604
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001605/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001606
1607 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Brett Cannone4539892007-10-20 03:46:49 +00001608 the first or second line of the file (in which case the encoding
1609 should be assumed to be PyUnicode_GetDefaultEncoding()).
1610
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001611 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1612 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001613*/
1614char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001615PyTokenizer_FindEncoding(int fd)
1616{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001617 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001618 FILE *fp;
1619 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001620
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001621 fd = dup(fd);
1622 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001623 return NULL;
1624 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001625 fp = fdopen(fd, "r");
1626 if (fp == NULL) {
1627 return NULL;
1628 }
1629 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1630 if (tok == NULL) {
1631 fclose(fp);
1632 return NULL;
1633 }
1634 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001635 PyTokenizer_Get(tok, &p_start, &p_end);
1636 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001637 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001638 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001639 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001640 strcpy(encoding, tok->encoding);
1641 }
1642 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001643 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001644}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001645
#ifdef Py_DEBUG

/* Debug helper: print the name of token TYPE to stdout.  For NAME, NUMBER,
   STRING and OP tokens the text in [start, end) follows in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif