blob: ec3c5db4b985630b57903cea6b2b26b1a15058ed [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
/* Py_CHARMASK(c): turn a character that may be signed into a
   nonnegative int. */
/* XXX Relies on characters being 8 bits wide. */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		((c) & 0xff)
#else
#define Py_CHARMASK(c)		(c)
#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names, listed in the order of the token #defines in token.h. */

char *_PyParser_TokenNames[] = {
    /* End marker, atoms and layout tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Remaining unary/binary operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Later additions */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",

    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
102
103
104/* Create and initialize a new tok_state structure */
105
106static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000107tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 if (tok == NULL)
112 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000123 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000124 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000125 tok->altwarning = 1;
126 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000133 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000137#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000138 return tok;
139}
140
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141#ifdef PGEN
142
143static char *
144decoding_fgets(char *s, int size, struct tok_state *tok)
145{
146 return fgets(s, size, tok->fp);
147}
148
149static int
150decoding_feof(struct tok_state *tok)
151{
152 return feof(tok->fp);
153}
154
/* PGEN build: strings are used as-is; STR is returned unchanged and
   TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;
    return unchanged;
}
160
161#else /* PGEN */
162
163static char *
164error_ret(struct tok_state *tok) /* XXX */
165{
166 tok->decoding_erred = 1;
167 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000168 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 tok->buf = NULL;
170 return NULL; /* as if it were EOF */
171}
172
173static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000174new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000176 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
180 }
181 return result;
182}
183
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, after lowercasing
   them and replacing '_' with '-'.  Returns the string constant "utf-8"
   or "iso-8859-1" when S matches one of the known spellings (optionally
   followed by a "-suffix"); otherwise returns S itself, so callers can
   detect "no change" by comparing pointers. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        if (c == '_')
            buf[i] = '-';
        else
            /* <ctype.h> functions require a value representable as
               unsigned char; a negative plain char would be undefined. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000275#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 r = set_readline(tok, cs);
277 if (r) {
278 tok->encoding = cs;
279 tok->decoding_state = -1;
280 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000281 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000283#else
284 /* Without Unicode support, we cannot
285 process the coding spec. Since there
286 won't be any Unicode literals, that
287 won't matter. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000289#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290 }
291 } else { /* then, compare cs with BOM */
292 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000296 if (!r) {
297 cs = tok->encoding;
298 if (!cs)
299 cs = "with BOM";
300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302 return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306 invoke the set_readline function with the new encoding.
307 Return 1 on success, 0 on failure. */
308
309static int
310check_bom(int get_char(struct tok_state *),
311 void unget_char(int, struct tok_state *),
312 int set_readline(struct tok_state *, const char *),
313 struct tok_state *tok)
314{
315 int ch = get_char(tok);
316 tok->decoding_state = 1;
317 if (ch == EOF) {
318 return 1;
319 } else if (ch == 0xEF) {
320 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
321 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
322#if 0
323 /* Disable support for UTF-16 BOMs until a decision
324 is made whether this needs to be supported. */
325 } else if (ch == 0xFE) {
326 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
327 if (!set_readline(tok, "utf-16-be")) return 0;
328 tok->decoding_state = -1;
329 } else if (ch == 0xFF) {
330 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
331 if (!set_readline(tok, "utf-16-le")) return 0;
332 tok->decoding_state = -1;
333#endif
334 } else {
335 unget_char(ch, tok);
336 return 1;
337 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000338 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000339 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
341 return 1;
342 NON_BOM:
343 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
344 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
345 return 1;
346}
347
/* Read a line of text from TOK into S (room for SIZE bytes including
   the terminating NUL), using the decoding reader stored in TOK.
   Return S, or NULL on failure or when the line read is empty (EOF).

   tok->decoding_buffer on entry is one of:
     1) NULL: tok->decoding_readline is called to get a new line.
     2) a PyUnicodeObject: decoding_feof already called
        tok->decoding_readline and parked the result there.
     3) a PyStringObject: an earlier fp_readl call could not fit the
        whole line into S; this is the part that did not fit.  The
        caller keeps calling fp_readl (with an expanded buffer) until
        the buffer ends with '\n' or end of file is reached: see
        tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size -= 1;

    if (pending != NULL) {
        /* Take over the parked object; an exact string is already
           UTF-8 bytes and needs no further encoding. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Too long for S: park the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return (nbytes == 0) ? NULL : s; /* empty line means EOF */
#endif
}
412
413/* Set the readline function for TOK to a StreamReader's
414 readline function. The StreamReader is named ENC.
415
416 This function is called from check_bom and check_coding_spec.
417
418 ENC is usually identical to the future value of tok->encoding,
419 except for the (currently unsupported) case of UTF-16.
420
421 Return 1 on success, 0 on failure. */
422
423static int
424fp_setreadl(struct tok_state *tok, const char* enc)
425{
426 PyObject *reader, *stream, *readline;
427
Martin v. Löwis95292d62002-12-11 14:04:59 +0000428 /* XXX: constify filename argument. */
429 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000430 if (stream == NULL)
431 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000432
433 reader = PyCodec_StreamReader(enc, stream, NULL);
434 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000435 if (reader == NULL)
436 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000437
438 readline = PyObject_GetAttrString(reader, "readline");
439 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000440 if (readline == NULL)
441 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000442
443 tok->decoding_readline = readline;
444 return 1;
445}
446
447/* Fetch the next byte from TOK. */
448
449static int fp_getc(struct tok_state *tok) {
450 return getc(tok->fp);
451}
452
453/* Unfetch the last byte back into TOK. */
454
455static void fp_ungetc(int c, struct tok_state *tok) {
456 ungetc(c, tok->fp);
457}
458
459/* Read a line of input from TOK. Determine encoding
460 if necessary. */
461
462static char *
463decoding_fgets(char *s, int size, struct tok_state *tok)
464{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000465 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000466 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000467 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468 if (tok->decoding_state < 0) {
469 /* We already have a codec associated with
470 this input. */
471 line = fp_readl(s, size, tok);
472 break;
473 } else if (tok->decoding_state > 0) {
474 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000475 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000477 break;
478 } else {
479 /* We have not yet determined the encoding.
480 If an encoding is found, use the file-pointer
481 reader functions from now on. */
482 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
483 return error_ret(tok);
484 assert(tok->decoding_state != 0);
485 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000486 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
488 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
489 return error_ret(tok);
490 }
491 }
492#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000493 /* The default encoding is ASCII, so make sure we don't have any
494 non-ASCII bytes in it. */
495 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000497 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000498 if (*c > 127) {
499 badchar = *c;
500 break;
501 }
502 }
503 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000504 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000505 /* Need to add 1 to the line number, since this line
506 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000508 "Non-ASCII character '\\x%.2x' "
509 "in file %.200s on line %i, "
510 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000511 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000512 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000513 PyErr_SetString(PyExc_SyntaxError, buf);
514 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515 }
516#endif
517 return line;
518}
519
520static int
521decoding_feof(struct tok_state *tok)
522{
523 if (tok->decoding_state >= 0) {
524 return feof(tok->fp);
525 } else {
526 PyObject* buf = tok->decoding_buffer;
527 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000528 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000529 if (buf == NULL) {
530 error_ret(tok);
531 return 1;
532 } else {
533 tok->decoding_buffer = buf;
534 }
535 }
536 return PyObject_Length(buf) == 0;
537 }
538}
539
540/* Fetch a byte from TOK, using the string buffer. */
541
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000542static int
543buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000544 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545}
546
547/* Unfetch a byte from TOK, using the string buffer. */
548
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000549static void
550buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000552 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553}
554
555/* Set the readline function for TOK to ENC. For the string-based
556 tokenizer, this means to just record the encoding. */
557
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000558static int
559buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 tok->enc = enc;
561 return 1;
562}
563
/* Decode the NUL-terminated C string STR from encoding ENC and
   re-encode it as UTF-8.  Returns a new Python string object, or NULL
   if either step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *encoded = NULL;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);

    if (decoded != NULL) {
        encoded = PyUnicode_AsUTF8String(decoded);
        Py_DECREF(decoded);
    }
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000579
580/* Decode a byte string STR for use as the buffer of TOK.
581 Look for encoding declarations inside STR, and record them
582 inside TOK. */
583
584static const char *
585decode_str(const char *str, struct tok_state *tok)
586{
587 PyObject* utf8 = NULL;
588 const char *s;
589 int lineno = 0;
590 tok->enc = NULL;
591 tok->str = str;
592 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000593 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000595 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000596#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597 if (tok->enc != NULL) {
598 utf8 = translate_into_utf8(str, tok->enc);
599 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000600 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601 str = PyString_AsString(utf8);
602 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000603#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604 for (s = str;; s++) {
605 if (*s == '\0') break;
606 else if (*s == '\n') {
607 lineno++;
608 if (lineno == 2) break;
609 }
610 }
611 tok->enc = NULL;
612 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000613 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000614#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615 if (tok->enc != NULL) {
616 assert(utf8 == NULL);
617 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000618 if (utf8 == NULL) {
619 PyErr_Format(PyExc_SyntaxError,
620 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000621 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000622 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623 str = PyString_AsString(utf8);
624 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000625#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000626 assert(tok->decoding_buffer == NULL);
627 tok->decoding_buffer = utf8; /* CAUTION */
628 return str;
629}
630
631#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000632
633/* Set up tokenizer for string */
634
635struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000636PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000637{
638 struct tok_state *tok = tok_new();
639 if (tok == NULL)
640 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000642 if (str == NULL) {
643 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000644 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000645 }
646
Martin v. Löwis95292d62002-12-11 14:04:59 +0000647 /* XXX: constify members. */
648 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649 return tok;
650}
651
652
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000653/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000654
655struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000656PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000657{
658 struct tok_state *tok = tok_new();
659 if (tok == NULL)
660 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000661 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000662 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000663 return NULL;
664 }
665 tok->cur = tok->inp = tok->buf;
666 tok->end = tok->buf + BUFSIZ;
667 tok->fp = fp;
668 tok->prompt = ps1;
669 tok->nextprompt = ps2;
670 return tok;
671}
672
673
674/* Free a tok_state structure */
675
676void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000677PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000679 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000680 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000681#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000682 Py_XDECREF(tok->decoding_readline);
683 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000684#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000685 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000686 PyMem_FREE(tok->buf);
687 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000688}
689
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the line *INP from the encoding of sys.stdin into UTF-8.

   Nothing is done (return 0) unless sys.stdin is a file object wrapping
   the C stdin and carrying a string f_encoding.  On a successful
   conversion *INP is freed and replaced by a new UTF-8 string, and
   tok->encoding is replaced by a copy of the encoding name.  If
   decoding or encoding fails the error is cleared and *INP is left
   alone (return 0).  On memory exhaustion tok->done is set to E_NOMEM
   and -1 is returned. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *text, *encoded;
    const char *enc_name;
    char *recoded;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold the encoding object while its character data is in use. */
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    text = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (text == NULL)
        goto keep_raw;

    encoded = PyUnicode_AsEncodedString(text, "utf-8", NULL);
    Py_DECREF(text);
    if (encoded == NULL)
        goto keep_raw;

    assert(PyString_Check(encoded));
    recoded = new_string(PyString_AS_STRING(encoded),
                         PyString_GET_SIZE(encoded));
    Py_DECREF(encoded);
    if (recoded == NULL)
        goto out_of_memory;

    /* Swap the caller's line for the converted one. */
    PyMem_FREE(*inp);
    *inp = recoded;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto out_of_memory;

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

keep_raw:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749
750/* Get next char, updating state; error code goes into tok->done */
751
752static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000753tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000756 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000757 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000758 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000759 if (tok->done != E_OK)
760 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000762 char *end = strchr(tok->inp, '\n');
763 if (end != NULL)
764 end++;
765 else {
766 end = strchr(tok->inp, '\0');
767 if (end == tok->inp) {
768 tok->done = E_EOF;
769 return EOF;
770 }
771 }
772 if (tok->start == NULL)
773 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000774 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000775 tok->lineno++;
776 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000777 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000780 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000781 if (tok->nextprompt != NULL)
782 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000783 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000784 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000785 else if (*newtok == '\0') {
786 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787 tok->done = E_EOF;
788 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000789#if !defined(PGEN) && defined(Py_USING_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000790 else if (tok_stdin_decode(tok, &newtok) != 0)
791 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000792#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000793 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000794 size_t start = tok->start - tok->buf;
795 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000796 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000797 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000798 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000799 tok->lineno++;
800 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000801 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000802 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000803 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000804 tok->done = E_NOMEM;
805 return EOF;
806 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000807 tok->buf = buf;
808 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000809 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000810 strcpy(tok->buf + oldlen, newtok);
811 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000812 tok->inp = tok->buf + newlen;
813 tok->end = tok->inp + 1;
814 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000815 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 else {
817 tok->lineno++;
818 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000819 PyMem_FREE(tok->buf);
820 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000821 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000822 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000823 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000824 tok->inp = strchr(tok->buf, '\0');
825 tok->end = tok->inp + 1;
826 }
827 }
828 else {
829 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000830 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000831 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000832 if (tok->start == NULL) {
833 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000834 tok->buf = (char *)
835 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 if (tok->buf == NULL) {
837 tok->done = E_NOMEM;
838 return EOF;
839 }
840 tok->end = tok->buf + BUFSIZ;
841 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000842 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
843 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 tok->done = E_EOF;
845 done = 1;
846 }
847 else {
848 tok->done = E_OK;
849 tok->inp = strchr(tok->buf, '\0');
850 done = tok->inp[-1] == '\n';
851 }
852 }
853 else {
854 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000855 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000856 tok->done = E_EOF;
857 done = 1;
858 }
859 else
860 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000861 }
862 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000865 Py_ssize_t curstart = tok->start == NULL ? -1 :
866 tok->start - tok->buf;
867 Py_ssize_t curvalid = tok->inp - tok->buf;
868 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000869 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000870 newbuf = (char *)PyMem_REALLOC(newbuf,
871 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000872 if (newbuf == NULL) {
873 tok->done = E_NOMEM;
874 tok->cur = tok->inp;
875 return EOF;
876 }
877 tok->buf = newbuf;
878 tok->inp = tok->buf + curvalid;
879 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000880 tok->start = curstart < 0 ? NULL :
881 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000882 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000883 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000884 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000885 /* Break out early on decoding
886 errors, as tok->buf will be NULL
887 */
888 if (tok->decoding_erred)
889 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 /* Last line does not end in \n,
891 fake one */
892 strcpy(tok->inp, "\n");
893 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000894 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000895 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000896 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000897 if (tok->buf != NULL) {
898 tok->cur = tok->buf + cur;
899 tok->line_start = tok->cur;
900 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000901 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000902 pt = tok->inp - 2;
903 if (pt >= tok->buf && *pt == '\r') {
904 *pt++ = '\n';
905 *pt = '\0';
906 tok->inp = pt;
907 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000908 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000909 }
910 if (tok->done != E_OK) {
911 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000912 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000913 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000914 return EOF;
915 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000917 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000918}
919
920
921/* Back-up one character */
922
923static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000924tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000925{
926 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000927 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000928 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000929 if (*tok->cur != c)
930 *tok->cur = c;
931 }
932}
933
934
935/* Return the token corresponding to a single character */
936
937int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000938PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000939{
940 switch (c) {
941 case '(': return LPAR;
942 case ')': return RPAR;
943 case '[': return LSQB;
944 case ']': return RSQB;
945 case ':': return COLON;
946 case ',': return COMMA;
947 case ';': return SEMI;
948 case '+': return PLUS;
949 case '-': return MINUS;
950 case '*': return STAR;
951 case '/': return SLASH;
952 case '|': return VBAR;
953 case '&': return AMPER;
954 case '<': return LESS;
955 case '>': return GREATER;
956 case '=': return EQUAL;
957 case '.': return DOT;
958 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000959 case '{': return LBRACE;
960 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000961 case '^': return CIRCUMFLEX;
962 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000963 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000964 default: return OP;
965 }
966}
967
968
Guido van Rossumfbab9051991-10-20 20:25:03 +0000969int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000970PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000971{
972 switch (c1) {
973 case '=':
974 switch (c2) {
975 case '=': return EQEQUAL;
976 }
977 break;
978 case '!':
979 switch (c2) {
980 case '=': return NOTEQUAL;
981 }
982 break;
983 case '<':
984 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000985 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000986 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000987 }
988 break;
989 case '>':
990 switch (c2) {
991 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000992 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000993 }
994 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000995 case '+':
996 switch (c2) {
997 case '=': return PLUSEQUAL;
998 }
999 break;
1000 case '-':
1001 switch (c2) {
1002 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001003 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001004 }
1005 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001006 case '*':
1007 switch (c2) {
1008 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001009 case '=': return STAREQUAL;
1010 }
1011 break;
1012 case '/':
1013 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001014 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001015 case '=': return SLASHEQUAL;
1016 }
1017 break;
1018 case '|':
1019 switch (c2) {
1020 case '=': return VBAREQUAL;
1021 }
1022 break;
1023 case '%':
1024 switch (c2) {
1025 case '=': return PERCENTEQUAL;
1026 }
1027 break;
1028 case '&':
1029 switch (c2) {
1030 case '=': return AMPEREQUAL;
1031 }
1032 break;
1033 case '^':
1034 switch (c2) {
1035 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001036 }
1037 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001038 }
1039 return OP;
1040}
1041
Thomas Wouters434d0822000-08-24 20:11:32 +00001042int
1043PyToken_ThreeChars(int c1, int c2, int c3)
1044{
1045 switch (c1) {
1046 case '<':
1047 switch (c2) {
1048 case '<':
1049 switch (c3) {
1050 case '=':
1051 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001052 }
1053 break;
1054 }
1055 break;
1056 case '>':
1057 switch (c2) {
1058 case '>':
1059 switch (c3) {
1060 case '=':
1061 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001062 }
1063 break;
1064 }
1065 break;
1066 case '*':
1067 switch (c2) {
1068 case '*':
1069 switch (c3) {
1070 case '=':
1071 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001072 }
1073 break;
1074 }
1075 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001076 case '/':
1077 switch (c2) {
1078 case '/':
1079 switch (c3) {
1080 case '=':
1081 return DOUBLESLASHEQUAL;
1082 }
1083 break;
1084 }
1085 break;
Georg Brandldde00282007-03-18 19:01:53 +00001086 case '.':
1087 switch (c2) {
1088 case '.':
1089 switch (c3) {
1090 case '.':
1091 return ELLIPSIS;
1092 }
1093 break;
1094 }
1095 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001096 }
1097 return OP;
1098}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001099
Guido van Rossum926f13a1998-04-09 21:38:06 +00001100static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001101indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001102{
1103 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001104 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001105 tok->cur = tok->inp;
1106 return 1;
1107 }
1108 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001109 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1110 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001111 tok->altwarning = 0;
1112 }
1113 return 0;
1114}
1115
1116
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117/* Get next token, after space stripping etc. */
1118
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001119static int
1120tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001121{
1122 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001123 int blankline;
1124
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001125 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001126 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001127 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001128 blankline = 0;
1129
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001130 /* Get indentation level */
1131 if (tok->atbol) {
1132 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001133 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001134 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001135 for (;;) {
1136 c = tok_nextc(tok);
1137 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001138 col++, altcol++;
1139 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001140 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001141 altcol = (altcol/tok->alttabsize + 1)
1142 * tok->alttabsize;
1143 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001144 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001145 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001146 else
1147 break;
1148 }
1149 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001150 if (c == '#' || c == '\n') {
1151 /* Lines with only whitespace and/or comments
1152 shouldn't affect the indentation and are
1153 not passed to the parser as NEWLINE tokens,
1154 except *totally* empty lines in interactive
1155 mode, which signal the end of a command group. */
1156 if (col == 0 && c == '\n' && tok->prompt != NULL)
1157 blankline = 0; /* Let it through */
1158 else
1159 blankline = 1; /* Ignore completely */
1160 /* We can't jump back right here since we still
1161 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001162 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001163 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001164 if (col == tok->indstack[tok->indent]) {
1165 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001166 if (altcol != tok->altindstack[tok->indent]) {
1167 if (indenterror(tok))
1168 return ERRORTOKEN;
1169 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001170 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001171 else if (col > tok->indstack[tok->indent]) {
1172 /* Indent -- always one */
1173 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001174 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001175 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001176 return ERRORTOKEN;
1177 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001178 if (altcol <= tok->altindstack[tok->indent]) {
1179 if (indenterror(tok))
1180 return ERRORTOKEN;
1181 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001182 tok->pendin++;
1183 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001184 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001185 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001186 else /* col < tok->indstack[tok->indent] */ {
1187 /* Dedent -- any number, must be consistent */
1188 while (tok->indent > 0 &&
1189 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001190 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001191 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001192 }
1193 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001194 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001195 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001196 return ERRORTOKEN;
1197 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001198 if (altcol != tok->altindstack[tok->indent]) {
1199 if (indenterror(tok))
1200 return ERRORTOKEN;
1201 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001202 }
1203 }
1204 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001205
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001206 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001207
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001208 /* Return pending indents/dedents */
1209 if (tok->pendin != 0) {
1210 if (tok->pendin < 0) {
1211 tok->pendin++;
1212 return DEDENT;
1213 }
1214 else {
1215 tok->pendin--;
1216 return INDENT;
1217 }
1218 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001219
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001221 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 /* Skip spaces */
1223 do {
1224 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001225 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001226
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001227 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001228 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001229
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001230 /* Skip comment */
1231 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001232 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001234
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001235 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001236 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001237 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001238 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001239
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001240 /* Identifier (most frequent token!) */
1241 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001242 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001243 switch (c) {
1244 case 'r':
1245 case 'R':
1246 c = tok_nextc(tok);
1247 if (c == '"' || c == '\'')
1248 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001249 break;
1250 case 'u':
1251 case 'U':
1252 c = tok_nextc(tok);
1253 if (c == 'r' || c == 'R')
1254 c = tok_nextc(tok);
1255 if (c == '"' || c == '\'')
1256 goto letter_quote;
1257 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001258 case 'b':
1259 case 'B':
1260 c = tok_nextc(tok);
1261 if (c == 'r' || c == 'R')
1262 c = tok_nextc(tok);
1263 if (c == '"' || c == '\'')
1264 goto letter_quote;
1265 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001266 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001267 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001268 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001269 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001270 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001271 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 *p_end = tok->cur;
1273 return NAME;
1274 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001275
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001276 /* Newline */
1277 if (c == '\n') {
1278 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001279 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001280 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001281 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001283 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 return NEWLINE;
1285 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001286
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001287 /* Period or number starting with period? */
1288 if (c == '.') {
1289 c = tok_nextc(tok);
1290 if (isdigit(c)) {
1291 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001292 } else if (c == '.') {
1293 c = tok_nextc(tok);
1294 if (c == '.') {
1295 *p_start = tok->start;
1296 *p_end = tok->cur;
1297 return ELLIPSIS;
1298 } else {
1299 tok_backup(tok, c);
1300 }
1301 tok_backup(tok, '.');
1302 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001303 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001304 }
Georg Brandldde00282007-03-18 19:01:53 +00001305 *p_start = tok->start;
1306 *p_end = tok->cur;
1307 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001308 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001309
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 /* Number */
1311 if (isdigit(c)) {
1312 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001313 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001314 c = tok_nextc(tok);
1315 if (c == '.')
1316 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001317#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001318 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001319 goto imaginary;
1320#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001321 if (c == 'x' || c == 'X') {
1322 /* Hex */
1323 do {
1324 c = tok_nextc(tok);
1325 } while (isxdigit(c));
1326 }
1327 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001328 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001329 /* Octal; c is first char of it */
1330 /* There's no 'isoctdigit' macro, sigh */
1331 while ('0' <= c && c < '8') {
1332 c = tok_nextc(tok);
1333 }
Tim Petersd507dab2001-08-30 20:51:59 +00001334 if (isdigit(c)) {
1335 found_decimal = 1;
1336 do {
1337 c = tok_nextc(tok);
1338 } while (isdigit(c));
1339 }
1340 if (c == '.')
1341 goto fraction;
1342 else if (c == 'e' || c == 'E')
1343 goto exponent;
1344#ifndef WITHOUT_COMPLEX
1345 else if (c == 'j' || c == 'J')
1346 goto imaginary;
1347#endif
1348 else if (found_decimal) {
1349 tok->done = E_TOKEN;
1350 tok_backup(tok, c);
1351 return ERRORTOKEN;
1352 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 }
1354 }
1355 else {
1356 /* Decimal */
1357 do {
1358 c = tok_nextc(tok);
1359 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001360 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001361 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001362 if (c == '.') {
1363 fraction:
1364 /* Fraction */
1365 do {
1366 c = tok_nextc(tok);
1367 } while (isdigit(c));
1368 }
1369 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001370 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001371 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001372 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001373 if (c == '+' || c == '-')
1374 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001375 if (!isdigit(c)) {
1376 tok->done = E_TOKEN;
1377 tok_backup(tok, c);
1378 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001379 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001380 do {
1381 c = tok_nextc(tok);
1382 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001383 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001384#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001385 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001386 /* Imaginary part */
1387 imaginary:
1388 c = tok_nextc(tok);
1389#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001390 }
1391 }
1392 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001393 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001394 *p_end = tok->cur;
1395 return NUMBER;
1396 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001397
1398 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001399 /* String */
1400 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001401 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001402 int quote = c;
1403 int triple = 0;
1404 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001405 for (;;) {
1406 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 if (c == '\n') {
1408 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001409 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001410 tok_backup(tok, c);
1411 return ERRORTOKEN;
1412 }
1413 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001414 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001415 }
1416 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001417 if (triple)
1418 tok->done = E_EOFS;
1419 else
1420 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001421 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 return ERRORTOKEN;
1423 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001424 else if (c == quote) {
1425 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001426 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001427 c = tok_nextc(tok);
1428 if (c == quote) {
1429 triple = 1;
1430 tripcount = 0;
1431 continue;
1432 }
1433 tok_backup(tok, c);
1434 }
1435 if (!triple || tripcount == 3)
1436 break;
1437 }
1438 else if (c == '\\') {
1439 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001440 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001441 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001442 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001443 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001444 return ERRORTOKEN;
1445 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001446 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001447 else
1448 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001449 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001450 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001451 *p_end = tok->cur;
1452 return STRING;
1453 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001454
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 /* Line continuation */
1456 if (c == '\\') {
1457 c = tok_nextc(tok);
1458 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001459 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001460 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001461 return ERRORTOKEN;
1462 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001463 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001464 goto again; /* Read next line */
1465 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001466
Guido van Rossumfbab9051991-10-20 20:25:03 +00001467 /* Check for two-character token */
1468 {
1469 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001470 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001471 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001472 int c3 = tok_nextc(tok);
1473 int token3 = PyToken_ThreeChars(c, c2, c3);
1474 if (token3 != OP) {
1475 token = token3;
1476 } else {
1477 tok_backup(tok, c3);
1478 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001479 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001480 *p_end = tok->cur;
1481 return token;
1482 }
1483 tok_backup(tok, c2);
1484 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001485
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001486 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001487 switch (c) {
1488 case '(':
1489 case '[':
1490 case '{':
1491 tok->level++;
1492 break;
1493 case ')':
1494 case ']':
1495 case '}':
1496 tok->level--;
1497 break;
1498 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001499
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001500 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001501 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001502 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001503 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001504}
1505
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001506int
1507PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1508{
1509 int result = tok_get(tok, p_start, p_end);
1510 if (tok->decoding_erred) {
1511 result = ERRORTOKEN;
1512 tok->done = E_DECODE;
1513 }
1514 return result;
1515}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001516
Guido van Rossum408027e1996-12-30 16:17:54 +00001517#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001518
1519void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001520tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521{
Guido van Rossum86bea461997-04-29 21:03:06 +00001522 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001523 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1524 printf("(%.*s)", (int)(end - start), start);
1525}
1526
1527#endif