blob: 0ccd02b58d12c4ac637e5eecfdb0864ddda8816e [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Coarse identifier classification used while scanning.

   Both macros accept an int character value.  ASCII letters and '_' may
   start an identifier; digits are additionally allowed in later positions.
   Every value >= 128 is accepted as a *potential* identifier character.

   The argument is fully parenthesized so that expression arguments
   (e.g. a conditional expression) are classified as a whole.  It is still
   evaluated several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
/* Don't ever change this -- it would break the portability of Python code */
/* Tab stop width; tok_new() stores it as the initial tok->tabsize. */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
/* When the compiler reports that plain char is unsigned
   (__CHAR_UNSIGNED__), the value is already nonnegative and is passed
   through; otherwise the low 8 bits are kept. */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)          (c)
#else
#define Py_CHARMASK(c)          ((c) & 0xff)
#endif
49
Guido van Rossum3f5da241990-12-20 15:06:42 +000050/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000051static struct tok_state *tok_new(void);
52static int tok_nextc(struct tok_state *tok);
53static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000054
/* Token names */

/* Printable name of each token type, indexed by the token's numeric
   value.  The order of the entries is significant. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    /* The last two entries are placeholders, not real token names. */
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
115
116
117/* Create and initialize a new tok_state structure */
118
119static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000120tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000121{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000122 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
123 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000124 if (tok == NULL)
125 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000126 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000127 tok->done = E_OK;
128 tok->fp = NULL;
129 tok->tabsize = TABSIZE;
130 tok->indent = 0;
131 tok->indstack[0] = 0;
132 tok->atbol = 1;
133 tok->pendin = 0;
134 tok->prompt = tok->nextprompt = NULL;
135 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000136 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000137 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000138 tok->altwarning = 1;
139 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000140 tok->alttabsize = 1;
141 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000142 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_erred = 0;
144 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000145 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000146 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000147#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000148 tok->decoding_readline = NULL;
149 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000150#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000151 return tok;
152}
153
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000154#ifdef PGEN
155
/* PGEN build: no decoding is performed.  Read the next line from
   tok->fp with fgets() and return fgets()'s result unchanged. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}
161
/* PGEN build: report end-of-file on tok->fp exactly as feof() does. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}
167
/* PGEN build: the string is used as-is; STR is returned unchanged and
   TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}
173
174#else /* PGEN */
175
176static char *
177error_ret(struct tok_state *tok) /* XXX */
178{
179 tok->decoding_erred = 1;
180 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000181 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000182 tok->buf = NULL;
183 return NULL; /* as if it were EOF */
184}
185
186static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000187new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000189 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190 if (result != NULL) {
191 memcpy(result, s, len);
192 result[len] = '\0';
193 }
194 return result;
195}
196
/* Map common spellings of the UTF-8 and Latin-1 codec names to one
   canonical name.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' replaced by '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when the normalized prefix matches one of the known
   spellings (optionally followed by a "-suffix"); otherwise returns S
   itself.  The caller can therefore detect "no change" by comparing the
   result with S, and must not free or modify a returned literal. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
219
220/* Return the coding spec in S, or NULL if none is found. */
221
222static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000224{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000226 /* Coding spec must be in a comment, and that comment must be
227 * the only statement on the source code line. */
228 for (i = 0; i < size - 6; i++) {
229 if (s[i] == '#')
230 break;
231 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
232 return NULL;
233 }
234 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 const char* t = s + i;
236 if (strncmp(t, "coding", 6) == 0) {
237 const char* begin = NULL;
238 t += 6;
239 if (t[0] != ':' && t[0] != '=')
240 continue;
241 do {
242 t++;
243 } while (t[0] == '\x20' || t[0] == '\t');
244
245 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000246 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000247 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000248 t++;
249
250 if (begin < t) {
251 char* r = new_string(begin, t - begin);
252 char* q = get_normal_name(r);
253 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000254 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000255 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 }
257 return r;
258 }
259 }
260 }
261 return NULL;
262}
263
264/* Check whether the line contains a coding spec. If it does,
265 invoke the set_readline function for the new encoding.
266 This function receives the tok_state and the new encoding.
267 Return 1 on success, 0 on failure. */
268
269static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000270check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000271 int set_readline(struct tok_state *, const char *))
272{
Tim Peters17db21f2002-09-03 15:39:58 +0000273 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000275
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000276 if (tok->cont_line)
277 /* It's a continuation line, so it can't be a coding spec. */
278 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000279 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000280 if (cs != NULL) {
281 tok->read_coding_spec = 1;
282 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000283 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000284 if (strcmp(cs, "utf-8") == 0 ||
285 strcmp(cs, "iso-8859-1") == 0) {
286 tok->encoding = cs;
287 } else {
288 r = set_readline(tok, cs);
289 if (r) {
290 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000291 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000293 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000294 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000295 }
296 } else { /* then, compare cs with BOM */
297 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000298 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000299 }
300 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000301 if (!r) {
302 cs = tok->encoding;
303 if (!cs)
304 cs = "with BOM";
305 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
306 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000307 return r;
308}
309
310/* See whether the file starts with a BOM. If it does,
311 invoke the set_readline function with the new encoding.
312 Return 1 on success, 0 on failure. */
313
314static int
315check_bom(int get_char(struct tok_state *),
316 void unget_char(int, struct tok_state *),
317 int set_readline(struct tok_state *, const char *),
318 struct tok_state *tok)
319{
320 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000321 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000322 if (ch == EOF) {
323 return 1;
324 } else if (ch == 0xEF) {
325 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
326 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
327#if 0
328 /* Disable support for UTF-16 BOMs until a decision
329 is made whether this needs to be supported. */
330 } else if (ch == 0xFE) {
331 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
332 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000333 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000334 } else if (ch == 0xFF) {
335 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
336 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000337 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000338#endif
339 } else {
340 unget_char(ch, tok);
341 return 1;
342 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000343 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000344 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000345 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
346 return 1;
347 NON_BOM:
348 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
349 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
350 return 1;
351}
352
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are copied into S, which is always
   NUL-terminated.  NULL is also returned when the line obtained is
   empty (end of file).  On failure the result of error_ret(tok) is
   returned.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       stored the result in tok->decoding_buffer
     3) PyBytesObject *: previous call to fp_readl did not have enough room
       (in the s buffer) to copy entire contents of the line read
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       In this case, fp_readl is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
    PyObject* bufobj = tok->decoding_buffer;
    const char *buf;
    Py_ssize_t buflen;
    /* Set when bufobj was obtained by this call and must be released
       here; otherwise bufobj is the object held in
       tok->decoding_buffer. */
    int allocated = 0;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (bufobj == NULL) {
        bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
        if (bufobj == NULL)
            goto error;
        allocated = 1;
    }
    buf = PyUnicode_AsStringAndSize(bufobj, &buflen);
    if (buf == NULL) {
        goto error;
    }
    if (buflen > size) {
        /* The line does not fit: keep the part beyond SIZE bytes in
           tok->decoding_buffer for the next call. */
        /* NOTE(review): when bufobj came from tok->decoding_buffer, this
           DECREF may release the object that buf still points into
           before buf+size is copied -- confirm the object is kept alive
           elsewhere. */
        Py_XDECREF(tok->decoding_buffer);
        tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
                                                         buflen-size);
        /* NOTE(review): if this allocation fails, tok->decoding_buffer
           is NULL here, which looks safe for PyTokenizer_Free -- confirm. */
        if (tok->decoding_buffer == NULL)
            goto error;
        buflen = size;
    }
    /* NOTE(review): when the whole line fits, tok->decoding_buffer is
       left untouched; if it held this line, it appears it would be
       returned again by the next call -- confirm against callers. */
    memcpy(s, buf, buflen);
    s[buflen] = '\0';
    if (buflen == 0) /* EOF */
        s = NULL;
    if (allocated) {
        Py_DECREF(bufobj);
    }
    return s;

error:
    if (allocated) {
        Py_XDECREF(bufobj);
    }
    return error_ret(tok);
}
413
414/* Set the readline function for TOK to a StreamReader's
415 readline function. The StreamReader is named ENC.
416
417 This function is called from check_bom and check_coding_spec.
418
419 ENC is usually identical to the future value of tok->encoding,
420 except for the (currently unsupported) case of UTF-16.
421
422 Return 1 on success, 0 on failure. */
423
424static int
425fp_setreadl(struct tok_state *tok, const char* enc)
426{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000427 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000428
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000429 io = PyImport_ImportModule("io");
430 if (io == NULL)
431 goto cleanup;
432
433 stream = PyObject_CallMethod(io, "open", "ssis",
434 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000435 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000436 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000437
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000438 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000439 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000440 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000441
442 cleanup:
443 Py_XDECREF(stream);
444 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000446}
447
/* Fetch the next byte from TOK. */
/* Reads from tok->fp with getc() and returns its result (EOF included);
   passed to check_bom as its get_char callback. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
453
/* Unfetch the last byte back into TOK. */
/* Pushes C back onto tok->fp with ungetc(); the result of ungetc() is
   not checked.  Passed to check_bom as its unget_char callback. */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
459
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence (1 to 4)
   if yes, 0 if not.

   Only the lead-byte class and the 10xxxxxx shape of the continuation
   bytes are checked.  Continuation bytes are examined front to back, so
   on a NUL-terminated buffer the scan stops at the terminator instead of
   reading beyond it when a sequence is cut short. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;
    for (i = 1; i <= trailing; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return trailing + 1;
}
487
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488/* Read a line of input from TOK. Determine encoding
489 if necessary. */
490
491static char *
492decoding_fgets(char *s, int size, struct tok_state *tok)
493{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000494 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000495 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000496 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000497 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498 /* We already have a codec associated with
499 this input. */
500 line = fp_readl(s, size, tok);
501 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000502 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000503 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000504 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000506 break;
507 } else {
508 /* We have not yet determined the encoding.
509 If an encoding is found, use the file-pointer
510 reader functions from now on. */
511 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
512 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000513 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000514 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000515 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
517 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
518 return error_ret(tok);
519 }
520 }
521#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000522 /* The default encoding is UTF-8, so make sure we don't have any
523 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000524 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000525 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000526 int length;
527 for (c = (unsigned char *)line; *c; c += length)
528 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529 badchar = *c;
530 break;
531 }
532 }
533 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000534 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000535 /* Need to add 1 to the line number, since this line
536 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000537 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000538 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000539 "in file %.200s on line %i, "
540 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000541 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000542 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000543 PyErr_SetString(PyExc_SyntaxError, buf);
544 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545 }
546#endif
547 return line;
548}
549
550static int
551decoding_feof(struct tok_state *tok)
552{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000553 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000554 return feof(tok->fp);
555 } else {
556 PyObject* buf = tok->decoding_buffer;
557 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000558 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000559 if (buf == NULL) {
560 error_ret(tok);
561 return 1;
562 } else {
563 tok->decoding_buffer = buf;
564 }
565 }
566 return PyObject_Length(buf) == 0;
567 }
568}
569
/* Fetch a byte from TOK, using the string buffer. */
/* Returns the byte at tok->str as a nonnegative int and advances
   tok->str by one.  No end-of-string check is made here. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
576
/* Unfetch a byte from TOK, using the string buffer. */
/* Only steps tok->str back by one; C is never written into the string.
   The assert checks (in debug builds) that C is the byte already stored
   at that position. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}
584
/* Set the readline function for TOK to ENC.  For the string-based
   tokenizer, this means to just record the encoding. */
/* The ENC pointer itself is stored in tok->enc (no copy is made).
   Always returns 1. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
593
594/* Return a UTF-8 encoding Python string object from the
595 C byte string STR, which is encoded with ENC. */
596
597static PyObject *
598translate_into_utf8(const char* str, const char* enc) {
599 PyObject *utf8;
600 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
601 if (buf == NULL)
602 return NULL;
603 utf8 = PyUnicode_AsUTF8String(buf);
604 Py_DECREF(buf);
605 return utf8;
606}
607
608/* Decode a byte string STR for use as the buffer of TOK.
609 Look for encoding declarations inside STR, and record them
610 inside TOK. */
611
612static const char *
613decode_str(const char *str, struct tok_state *tok)
614{
615 PyObject* utf8 = NULL;
616 const char *s;
617 int lineno = 0;
618 tok->enc = NULL;
619 tok->str = str;
620 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000621 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000623 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000624 if (tok->enc != NULL) {
625 utf8 = translate_into_utf8(str, tok->enc);
626 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000627 return error_ret(tok);
Guido van Rossumdf4ce102007-10-10 18:49:50 +0000628 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000629 }
630 for (s = str;; s++) {
631 if (*s == '\0') break;
632 else if (*s == '\n') {
633 lineno++;
634 if (lineno == 2) break;
635 }
636 }
637 tok->enc = NULL;
638 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000639 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640 if (tok->enc != NULL) {
641 assert(utf8 == NULL);
642 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000643 if (utf8 == NULL) {
644 PyErr_Format(PyExc_SyntaxError,
645 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000646 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000647 }
Neal Norwitzf7f28fc2007-08-11 21:31:25 +0000648 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000649 }
650 assert(tok->decoding_buffer == NULL);
651 tok->decoding_buffer = utf8; /* CAUTION */
652 return str;
653}
654
655#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656
657/* Set up tokenizer for string */
658
659struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000660PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661{
662 struct tok_state *tok = tok_new();
663 if (tok == NULL)
664 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000666 if (str == NULL) {
667 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000668 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000669 }
670
Martin v. Löwis95292d62002-12-11 14:04:59 +0000671 /* XXX: constify members. */
672 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673 return tok;
674}
675
676
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000677/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678
679struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000680PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000681{
682 struct tok_state *tok = tok_new();
683 if (tok == NULL)
684 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000685 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000686 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000687 return NULL;
688 }
689 tok->cur = tok->inp = tok->buf;
690 tok->end = tok->buf + BUFSIZ;
691 tok->fp = fp;
692 tok->prompt = ps1;
693 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000694 if (enc != NULL) {
695 /* Must copy encoding declaration since it
696 gets copied into the parse tree. */
697 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
698 if (!tok->encoding) {
699 PyTokenizer_Free(tok);
700 return NULL;
701 }
702 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000703 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000704 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000705 return tok;
706}
707
708
/* Free a tok_state structure */
/* Releases the encoding string, the decoding objects (non-PGEN builds)
   and TOK itself.  TOK must not be NULL. */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    /* The line buffer is freed only when a file is attached; for the
       string tokenizer tok->buf points at text this structure did not
       allocate (see PyTokenizer_FromString). */
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    PyMem_FREE(tok);
}
724
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000725/* Get next char, updating state; error code goes into tok->done */
726
727static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000728tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000730 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000731 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000732 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000733 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000734 if (tok->done != E_OK)
735 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000736 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000737 char *end = strchr(tok->inp, '\n');
738 if (end != NULL)
739 end++;
740 else {
741 end = strchr(tok->inp, '\0');
742 if (end == tok->inp) {
743 tok->done = E_EOF;
744 return EOF;
745 }
746 }
747 if (tok->start == NULL)
748 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000749 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000750 tok->lineno++;
751 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000752 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000755 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000756#ifndef PGEN
757 if (tok->encoding && newtok && *newtok) {
758 /* Recode to UTF-8 */
759 Py_ssize_t buflen;
760 const char* buf;
761 PyObject *u = translate_into_utf8(newtok, tok->encoding);
762 PyMem_FREE(newtok);
763 if (!u) {
764 tok->done = E_DECODE;
765 return EOF;
766 }
767 buflen = PyBytes_Size(u);
768 buf = PyBytes_AsString(u);
769 if (!buf) {
770 Py_DECREF(u);
771 tok->done = E_DECODE;
772 return EOF;
773 }
774 newtok = PyMem_MALLOC(buflen+1);
775 strcpy(newtok, buf);
776 Py_DECREF(u);
777 }
778#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 if (tok->nextprompt != NULL)
780 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000781 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000782 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000783 else if (*newtok == '\0') {
784 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785 tok->done = E_EOF;
786 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000787 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000788 size_t start = tok->start - tok->buf;
789 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000790 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000791 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000792 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000793 tok->lineno++;
794 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000795 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000796 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000797 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000798 tok->done = E_NOMEM;
799 return EOF;
800 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000801 tok->buf = buf;
802 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000803 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000804 strcpy(tok->buf + oldlen, newtok);
805 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000806 tok->inp = tok->buf + newlen;
807 tok->end = tok->inp + 1;
808 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000809 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000810 else {
811 tok->lineno++;
812 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000813 PyMem_FREE(tok->buf);
814 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000815 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000817 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000818 tok->inp = strchr(tok->buf, '\0');
819 tok->end = tok->inp + 1;
820 }
821 }
822 else {
823 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000824 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000825 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000826 if (tok->start == NULL) {
827 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000828 tok->buf = (char *)
829 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 if (tok->buf == NULL) {
831 tok->done = E_NOMEM;
832 return EOF;
833 }
834 tok->end = tok->buf + BUFSIZ;
835 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000836 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
837 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000838 tok->done = E_EOF;
839 done = 1;
840 }
841 else {
842 tok->done = E_OK;
843 tok->inp = strchr(tok->buf, '\0');
844 done = tok->inp[-1] == '\n';
845 }
846 }
847 else {
848 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000849 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000850 tok->done = E_EOF;
851 done = 1;
852 }
853 else
854 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000855 }
856 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000857 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000858 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 Py_ssize_t curstart = tok->start == NULL ? -1 :
860 tok->start - tok->buf;
861 Py_ssize_t curvalid = tok->inp - tok->buf;
862 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000864 newbuf = (char *)PyMem_REALLOC(newbuf,
865 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000866 if (newbuf == NULL) {
867 tok->done = E_NOMEM;
868 tok->cur = tok->inp;
869 return EOF;
870 }
871 tok->buf = newbuf;
872 tok->inp = tok->buf + curvalid;
873 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 tok->start = curstart < 0 ? NULL :
875 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000876 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000877 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000878 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000879 /* Break out early on decoding
880 errors, as tok->buf will be NULL
881 */
882 if (tok->decoding_erred)
883 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000884 /* Last line does not end in \n,
885 fake one */
886 strcpy(tok->inp, "\n");
887 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000888 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000890 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000891 if (tok->buf != NULL) {
892 tok->cur = tok->buf + cur;
893 tok->line_start = tok->cur;
894 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000895 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000896 pt = tok->inp - 2;
897 if (pt >= tok->buf && *pt == '\r') {
898 *pt++ = '\n';
899 *pt = '\0';
900 tok->inp = pt;
901 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000902 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000903 }
904 if (tok->done != E_OK) {
905 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000906 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000907 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000908 return EOF;
909 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000910 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000911 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000912}
913
914
915/* Back-up one character */
916
917static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000918tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000919{
920 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000921 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000922 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000923 if (*tok->cur != c)
924 *tok->cur = c;
925 }
926}
927
928
929/* Return the token corresponding to a single character */
930
931int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000932PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000933{
934 switch (c) {
935 case '(': return LPAR;
936 case ')': return RPAR;
937 case '[': return LSQB;
938 case ']': return RSQB;
939 case ':': return COLON;
940 case ',': return COMMA;
941 case ';': return SEMI;
942 case '+': return PLUS;
943 case '-': return MINUS;
944 case '*': return STAR;
945 case '/': return SLASH;
946 case '|': return VBAR;
947 case '&': return AMPER;
948 case '<': return LESS;
949 case '>': return GREATER;
950 case '=': return EQUAL;
951 case '.': return DOT;
952 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953 case '{': return LBRACE;
954 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000955 case '^': return CIRCUMFLEX;
956 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000957 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000958 default: return OP;
959 }
960}
961
962
Guido van Rossumfbab9051991-10-20 20:25:03 +0000963int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000964PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000965{
966 switch (c1) {
967 case '=':
968 switch (c2) {
969 case '=': return EQEQUAL;
970 }
971 break;
972 case '!':
973 switch (c2) {
974 case '=': return NOTEQUAL;
975 }
976 break;
977 case '<':
978 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000979 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000980 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000981 }
982 break;
983 case '>':
984 switch (c2) {
985 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000986 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000987 }
988 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000989 case '+':
990 switch (c2) {
991 case '=': return PLUSEQUAL;
992 }
993 break;
994 case '-':
995 switch (c2) {
996 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000997 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000998 }
999 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001000 case '*':
1001 switch (c2) {
1002 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001003 case '=': return STAREQUAL;
1004 }
1005 break;
1006 case '/':
1007 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001008 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001009 case '=': return SLASHEQUAL;
1010 }
1011 break;
1012 case '|':
1013 switch (c2) {
1014 case '=': return VBAREQUAL;
1015 }
1016 break;
1017 case '%':
1018 switch (c2) {
1019 case '=': return PERCENTEQUAL;
1020 }
1021 break;
1022 case '&':
1023 switch (c2) {
1024 case '=': return AMPEREQUAL;
1025 }
1026 break;
1027 case '^':
1028 switch (c2) {
1029 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001030 }
1031 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001032 }
1033 return OP;
1034}
1035
Thomas Wouters434d0822000-08-24 20:11:32 +00001036int
1037PyToken_ThreeChars(int c1, int c2, int c3)
1038{
1039 switch (c1) {
1040 case '<':
1041 switch (c2) {
1042 case '<':
1043 switch (c3) {
1044 case '=':
1045 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001046 }
1047 break;
1048 }
1049 break;
1050 case '>':
1051 switch (c2) {
1052 case '>':
1053 switch (c3) {
1054 case '=':
1055 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001056 }
1057 break;
1058 }
1059 break;
1060 case '*':
1061 switch (c2) {
1062 case '*':
1063 switch (c3) {
1064 case '=':
1065 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001066 }
1067 break;
1068 }
1069 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001070 case '/':
1071 switch (c2) {
1072 case '/':
1073 switch (c3) {
1074 case '=':
1075 return DOUBLESLASHEQUAL;
1076 }
1077 break;
1078 }
1079 break;
Georg Brandldde00282007-03-18 19:01:53 +00001080 case '.':
1081 switch (c2) {
1082 case '.':
1083 switch (c3) {
1084 case '.':
1085 return ELLIPSIS;
1086 }
1087 break;
1088 }
1089 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001090 }
1091 return OP;
1092}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001093
Guido van Rossum926f13a1998-04-09 21:38:06 +00001094static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001095indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001096{
1097 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001098 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001099 tok->cur = tok->inp;
1100 return 1;
1101 }
1102 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001103 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1104 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001105 tok->altwarning = 0;
1106 }
1107 return 0;
1108}
1109
Martin v. Löwis47383402007-08-15 07:32:56 +00001110#ifdef PGEN
1111#define verify_identifier(s,e) 1
1112#else
1113/* Verify that the identifier follows PEP 3131. */
1114static int
1115verify_identifier(char *start, char *end)
1116{
Guido van Rossume3e37012007-08-29 18:54:41 +00001117 PyObject *s;
1118 int result;
1119 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1120 if (s == NULL) {
1121 PyErr_Clear();
1122 return 0;
1123 }
1124 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001125 Py_DECREF(s);
1126 return result;
1127}
1128#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001129
/* Get next token, after space stripping etc.

   Returns the token type.  For NAME, NEWLINE, DOT, ELLIPSIS, NUMBER,
   STRING and operator/punctuation tokens, *p_start and *p_end are set to
   the bounds of the token text inside the tokenizer buffer.  For INDENT,
   DEDENT, ENDMARKER and ERRORTOKEN they are left as NULL.

   On ERRORTOKEN, tok->done holds the E_* code describing the problem
   (either set here or left by tok_nextc()). */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col is measured with tok->tabsize, altcol with
           tok->alttabsize; comparing both against their stacks
           detects inconsistent mixing of tabs and spaces. */
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                /* Advance to the next tab stop */
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        /* Un-read the first non-whitespace character */
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only examined outside of brackets
           (tok->level counts open '(', '[' and '{'). */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                /* The new column must match an enclosing level */
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents; one token per call.
       pendin > 0 means INDENTs are owed, pendin < 0 means DEDENTs. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment */
    if (c == '#')
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process r"", b"" and br"" string prefixes */
        switch (c) {
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        /* Not a string prefix: consume the rest of the name */
        while (is_potential_identifier_char(c)) {
            if (c >= 128)
                nonascii = 1;
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Pure-ASCII names need no further validation */
        if (nonascii &&
            !verify_identifier(tok->start, tok->cur)) {
            tok->done = E_IDENTIFIER;
            return ERRORTOKEN;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* No NEWLINE token for ignored lines or inside brackets */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            } else {
                tok_backup(tok, c);
            }
            /* Only two dots: give the second one back as well */
            tok_backup(tok, '.');
        } else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {
                /* Hex */
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (c == '0')
                    c = tok_nextc(tok);
                while (isdigit(c)) {
                    nonzero = 1;
                    c = tok_nextc(tok);
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (nonzero) {
                    /* Leading zeros followed by other digits
                       are only accepted as part of a float or
                       imaginary literal */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
        exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    /* At least one exponent digit is required */
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2 is the offset from tok->start at which the cursor
           sits right after a second quote that immediately follows
           the opening one (i.e. "" or '' at the very start). */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;     /* set once a triple-quote opener is seen */
        int tripcount = 0;  /* consecutive quote chars seen so far */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Second quote directly after the opener:
                       a third one starts a triple-quoted
                       string, otherwise this is an empty
                       string. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Skip the escaped character, whatever it is */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* A two-character operator may extend to three */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1527
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001528int
1529PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1530{
1531 int result = tok_get(tok, p_start, p_end);
1532 if (tok->decoding_erred) {
1533 result = ERRORTOKEN;
1534 tok->done = E_DECODE;
1535 }
1536 return result;
1537}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001538
Thomas Wouters89d996e2007-09-08 17:39:28 +00001539/* This function is only called from parsetok. However, it cannot live
1540 there, as it must be empty for PGEN, and we can check for PGEN only
1541 in this file. */
1542
1543#ifdef PGEN
/* PGEN variant: deliberately empty.  Always returns NULL and leaves
   *offset untouched; tok and len are ignored. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
1549#else
1550static PyObject *
1551dec_utf8(const char *enc, const char *text, size_t len) {
1552 PyObject *ret = NULL;
1553 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1554 if (unicode_text) {
1555 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1556 Py_DECREF(unicode_text);
1557 }
1558 if (!ret) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001559 PyErr_Clear();
1560 }
1561 else {
1562 assert(PyBytes_Check(ret));
Thomas Wouters89d996e2007-09-08 17:39:28 +00001563 }
1564 return ret;
1565}
1566
1567char *
1568PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1569{
1570 char *text = NULL;
1571 if (tok->encoding) {
1572 /* convert source to original encondig */
1573 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1574 if (lineobj != NULL) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001575 int linelen = PyBytes_GET_SIZE(lineobj);
1576 const char *line = PyBytes_AS_STRING(lineobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001577 text = PyObject_MALLOC(linelen + 1);
1578 if (text != NULL && line != NULL) {
1579 if (linelen)
1580 strncpy(text, line, linelen);
1581 text[linelen] = '\0';
1582 }
1583 Py_DECREF(lineobj);
1584
1585 /* adjust error offset */
1586 if (*offset > 1) {
1587 PyObject *offsetobj = dec_utf8(tok->encoding,
Guido van Rossum641591c2007-10-10 18:44:39 +00001588 tok->buf,
1589 *offset-1);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001590 if (offsetobj) {
Guido van Rossum641591c2007-10-10 18:44:39 +00001591 *offset = 1 +
1592 PyBytes_GET_SIZE(offsetobj);
Thomas Wouters89d996e2007-09-08 17:39:28 +00001593 Py_DECREF(offsetobj);
1594 }
1595 }
1596
1597 }
1598 }
1599 return text;
1600
1601}
1602#endif
1603
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001604/* Get -*- encoding -*- from a Python file
1605
1606 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1607 the first or second line of the file. In this case the encoding is
1608 PyUnicode_GetDefaultEncoding().
1609*/
1610char *
1611PyTokenizer_FindEncoding(FILE *fp) {
1612 struct tok_state *tok;
1613 char *p_start=NULL, *p_end=NULL;
1614
1615 if ((tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL)) == NULL) {
1616 rewind(fp);
1617 return NULL;
1618 }
1619 while(((tok->lineno <= 2) && (tok->done == E_OK))) {
1620 PyTokenizer_Get(tok, &p_start, &p_end);
1621 }
1622
1623 rewind(fp);
1624 return tok->encoding;
1625}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001626
#ifdef Py_DEBUG

/* Debugging aid: print the name of a token type to stdout, followed by
   the token text in parentheses for NAME, NUMBER, STRING and OP tokens. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif