blob: 947ad9c3454b911a020beb43f635dfe3d12bd98b [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names */

/* Printable names of the token types, one string per token.
   The position of each entry is significant: as the reminder inside
   the table says, the order must match the #defines in token.h. */
char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	/* The last two entries are bracketed placeholders, not token spellings. */
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
104static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000105tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000106{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000107 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
108 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 if (tok == NULL)
110 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000121 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000122 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000123 tok->altwarning = 1;
124 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000130 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000131 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000132#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000135#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144 return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150 return feof(tok->fp);
151}
152
/* PGEN build: strings are used as they are, so STR is handed back
   untouched and TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	(void)tok;	/* unused in this build */
	return str;
}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000166 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
169}
170
171static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000174 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
178 }
179 return result;
180}
181
/* Map the common spellings of UTF-8 and Latin-1 to one canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   them and turning '_' into '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S is one of the recognized spellings (optionally
   followed by a "-suffix"); otherwise returns S itself, so callers can
   compare the result with S to see whether a replacement happened.
   S is never modified. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* plain char may be signed: <ctype.h> functions need
			   a value representable as unsigned char (or EOF) */
			buf[i] = (char)tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
204
205/* Return the coding spec in S, or NULL if none is found. */
206
207static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000208get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000209{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
218 }
219 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
229
230 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000231 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000232 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 t++;
234
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000239 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000240 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241 }
242 return r;
243 }
244 }
245 }
246 return NULL;
247}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000255check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000279 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 PyMem_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000287#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000291 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 }
293 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 return r;
301}
302
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.

   GET_CHAR and UNGET_CHAR read and push back single bytes on TOK's
   input.  tok->decoding_state is set to 1 as soon as the first byte
   has been read.

   With the UTF-16 branches disabled (#if 0 below), only the UTF-8 BOM
   (EF BB BF) is recognized; SET_READLINE is then never called and the
   function always returns 1.  After a complete UTF-8 BOM, any previous
   tok->encoding is freed and replaced by a new "utf-8" string.
   NOTE(review): the result of new_string() is not checked here, so
   tok->encoding can end up NULL if that allocation fails. */

static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;
	if (ch == EOF) {
		/* empty input: nothing to detect */
		return 1;
	} else if (ch == 0xEF) {
		/* possible UTF-8 BOM: the next two bytes must be BB BF */
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.  */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		/* ordinary first byte: put it back and leave the encoding alone */
		unget_char(ch, tok);
		return 1;
	}
	/* A complete BOM was consumed: record the encoding. */
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	/* Only a single 0xFF byte is pushed back, not the bytes that were read. */
	unget_char(0xFF, tok); /* XXX this will cause a syntax error */
	return 1;
}
345
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   S has room for SIZE bytes; at most SIZE-1 bytes of UTF-8 data are
   copied and a terminating NUL is always added.  NULL is also
   returned when the line read is empty (end of file).

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode build, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8 = NULL;
	PyObject* buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		/* case 1: fetch a fresh line from the codec reader */
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* cases 2 and 3: take the pending object over from tok */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;	/* case 3: already UTF-8 bytes */
	}
	if (utf8 == NULL) {
		/* buf is not yet a byte string: encode it, then drop it */
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		/* Line does not fit: keep the tail for the next call. */
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL; /* EOF */
	return s;
#endif
}
410
411/* Set the readline function for TOK to a StreamReader's
412 readline function. The StreamReader is named ENC.
413
414 This function is called from check_bom and check_coding_spec.
415
416 ENC is usually identical to the future value of tok->encoding,
417 except for the (currently unsupported) case of UTF-16.
418
419 Return 1 on success, 0 on failure. */
420
421static int
422fp_setreadl(struct tok_state *tok, const char* enc)
423{
424 PyObject *reader, *stream, *readline;
425
Martin v. Löwis95292d62002-12-11 14:04:59 +0000426 /* XXX: constify filename argument. */
427 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000428 if (stream == NULL)
429 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430
431 reader = PyCodec_StreamReader(enc, stream, NULL);
432 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000433 if (reader == NULL)
434 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000435
436 readline = PyObject_GetAttrString(reader, "readline");
437 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000438 if (readline == NULL)
439 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000440
441 tok->decoding_readline = readline;
442 return 1;
443}
444
445/* Fetch the next byte from TOK. */
446
447static int fp_getc(struct tok_state *tok) {
448 return getc(tok->fp);
449}
450
451/* Unfetch the last byte back into TOK. */
452
453static void fp_ungetc(int c, struct tok_state *tok) {
454 ungetc(c, tok->fp);
455}
456
457/* Read a line of input from TOK. Determine encoding
458 if necessary. */
459
460static char *
461decoding_fgets(char *s, int size, struct tok_state *tok)
462{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000463 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000464 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000465 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466 if (tok->decoding_state < 0) {
467 /* We already have a codec associated with
468 this input. */
469 line = fp_readl(s, size, tok);
470 break;
471 } else if (tok->decoding_state > 0) {
472 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000473 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 break;
476 } else {
477 /* We have not yet determined the encoding.
478 If an encoding is found, use the file-pointer
479 reader functions from now on. */
480 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
481 return error_ret(tok);
482 assert(tok->decoding_state != 0);
483 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000484 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
486 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
487 return error_ret(tok);
488 }
489 }
490#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000491 /* The default encoding is ASCII, so make sure we don't have any
492 non-ASCII bytes in it. */
493 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000495 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 if (*c > 127) {
497 badchar = *c;
498 break;
499 }
500 }
501 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000502 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000503 /* Need to add 1 to the line number, since this line
504 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000505 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000506 "Non-ASCII character '\\x%.2x' "
507 "in file %.200s on line %i, "
508 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000509 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000510 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000511 PyErr_SetString(PyExc_SyntaxError, buf);
512 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000513 }
514#endif
515 return line;
516}
517
518static int
519decoding_feof(struct tok_state *tok)
520{
521 if (tok->decoding_state >= 0) {
522 return feof(tok->fp);
523 } else {
524 PyObject* buf = tok->decoding_buffer;
525 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000526 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527 if (buf == NULL) {
528 error_ret(tok);
529 return 1;
530 } else {
531 tok->decoding_buffer = buf;
532 }
533 }
534 return PyObject_Length(buf) == 0;
535 }
536}
537
538/* Fetch a byte from TOK, using the string buffer. */
539
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000540static int
541buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000542 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543}
544
545/* Unfetch a byte from TOK, using the string buffer. */
546
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000547static void
548buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000550 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551}
552
553/* Set the readline function for TOK to ENC. For the string-based
554 tokenizer, this means to just record the encoding. */
555
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000556static int
557buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558 tok->enc = enc;
559 return 1;
560}
561
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */
564
Martin v. Löwis019934b2002-08-07 12:33:18 +0000565#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000566static PyObject *
567translate_into_utf8(const char* str, const char* enc) {
568 PyObject *utf8;
569 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
570 if (buf == NULL)
571 return NULL;
572 utf8 = PyUnicode_AsUTF8String(buf);
573 Py_DECREF(buf);
574 return utf8;
575}
Martin v. Löwis019934b2002-08-07 12:33:18 +0000576#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577
578/* Decode a byte string STR for use as the buffer of TOK.
579 Look for encoding declarations inside STR, and record them
580 inside TOK. */
581
582static const char *
583decode_str(const char *str, struct tok_state *tok)
584{
585 PyObject* utf8 = NULL;
586 const char *s;
587 int lineno = 0;
588 tok->enc = NULL;
589 tok->str = str;
590 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000591 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000593 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000594#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595 if (tok->enc != NULL) {
596 utf8 = translate_into_utf8(str, tok->enc);
597 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000598 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599 str = PyString_AsString(utf8);
600 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000601#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602 for (s = str;; s++) {
603 if (*s == '\0') break;
604 else if (*s == '\n') {
605 lineno++;
606 if (lineno == 2) break;
607 }
608 }
609 tok->enc = NULL;
610 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000611 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000612#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000613 if (tok->enc != NULL) {
614 assert(utf8 == NULL);
615 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000616 if (utf8 == NULL) {
617 PyErr_Format(PyExc_SyntaxError,
618 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000619 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000620 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621 str = PyString_AsString(utf8);
622 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000623#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000624 assert(tok->decoding_buffer == NULL);
625 tok->decoding_buffer = utf8; /* CAUTION */
626 return str;
627}
628
629#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000630
631/* Set up tokenizer for string */
632
633struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000634PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000635{
636 struct tok_state *tok = tok_new();
637 if (tok == NULL)
638 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000640 if (str == NULL) {
641 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000643 }
644
Martin v. Löwis95292d62002-12-11 14:04:59 +0000645 /* XXX: constify members. */
646 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000647 return tok;
648}
649
650
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000651/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652
653struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000654PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000655{
656 struct tok_state *tok = tok_new();
657 if (tok == NULL)
658 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000659 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000660 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661 return NULL;
662 }
663 tok->cur = tok->inp = tok->buf;
664 tok->end = tok->buf + BUFSIZ;
665 tok->fp = fp;
666 tok->prompt = ps1;
667 tok->nextprompt = ps2;
668 return tok;
669}
670
671
672/* Free a tok_state structure */
673
674void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000675PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000676{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000677 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000678 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000679#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000680 Py_XDECREF(tok->decoding_readline);
681 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000682#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000683 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000684 PyMem_FREE(tok->buf);
685 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000686}
687
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000688#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Convert the line *INP from the encoding of sys.stdin to UTF-8.

   Nothing is done (and 0 is returned) unless sys.stdin is a file
   object wrapping the C stdin and its f_encoding is a string.
   On a successful conversion *INP is freed and replaced by a new
   PyMem string holding the UTF-8 text, and tok->encoding is replaced
   by a copy of the encoding name.

   Returns 0 on success or when the line is left as it is (including
   decode/encode errors, which are cleared); returns -1 with
   tok->done set to E_NOMEM when memory runs out. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc, *sysstdin, *decoded, *utf8;
	const char *encoding;
	char *converted;

	/* Only applies when sys.stdin still refers to the C-level stdin. */
	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	sysstdin = PySys_GetObject("stdin");
	if (sysstdin == NULL || !PyFile_Check(sysstdin))
		return 0;

	enc = ((PyFileObject *)sysstdin)->f_encoding;
	if (enc == NULL || !PyString_Check(enc))
		return 0;
	/* Hold our own reference to enc until one of the exits below. */
	Py_INCREF(enc);

	encoding = PyString_AsString(enc);
	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
	if (decoded == NULL)
		goto error_clear;

	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	Py_DECREF(decoded);
	if (utf8 == NULL)
		goto error_clear;

	assert(PyString_Check(utf8));
	/* Copy the bytes out so the result does not depend on utf8. */
	converted = new_string(PyString_AS_STRING(utf8),
			       PyString_GET_SIZE(utf8));
	Py_DECREF(utf8);
	if (converted == NULL)
		goto error_nomem;

	/* Swap the caller's line for the converted one. */
	PyMem_FREE(*inp);
	*inp = converted;
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string(encoding, strlen(encoding));
	if (tok->encoding == NULL)
		goto error_nomem;

	Py_DECREF(enc);
	return 0;

error_nomem:
	Py_DECREF(enc);
	tok->done = E_NOMEM;
	return -1;

error_clear:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc);
	PyErr_Clear();
	return 0;
}
746#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747
748/* Get next char, updating state; error code goes into tok->done */
749
750static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000751tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000754 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000755 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000756 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000757 if (tok->done != E_OK)
758 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000760 char *end = strchr(tok->inp, '\n');
761 if (end != NULL)
762 end++;
763 else {
764 end = strchr(tok->inp, '\0');
765 if (end == tok->inp) {
766 tok->done = E_EOF;
767 return EOF;
768 }
769 }
770 if (tok->start == NULL)
771 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000772 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000773 tok->lineno++;
774 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000775 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000778 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 if (tok->nextprompt != NULL)
780 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000781 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000782 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000783 else if (*newtok == '\0') {
784 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785 tok->done = E_EOF;
786 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000787#if !defined(PGEN) && defined(Py_USING_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000788 else if (tok_stdin_decode(tok, &newtok) != 0)
789 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000790#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000791 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000792 size_t start = tok->start - tok->buf;
793 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000794 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000795 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000796 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000797 tok->lineno++;
798 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000799 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000800 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000801 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000802 tok->done = E_NOMEM;
803 return EOF;
804 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000805 tok->buf = buf;
806 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000807 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000808 strcpy(tok->buf + oldlen, newtok);
809 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000810 tok->inp = tok->buf + newlen;
811 tok->end = tok->inp + 1;
812 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000813 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000814 else {
815 tok->lineno++;
816 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000817 PyMem_FREE(tok->buf);
818 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000819 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000820 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000821 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000822 tok->inp = strchr(tok->buf, '\0');
823 tok->end = tok->inp + 1;
824 }
825 }
826 else {
827 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000828 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000829 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 if (tok->start == NULL) {
831 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000832 tok->buf = (char *)
833 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000834 if (tok->buf == NULL) {
835 tok->done = E_NOMEM;
836 return EOF;
837 }
838 tok->end = tok->buf + BUFSIZ;
839 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000840 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
841 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000842 tok->done = E_EOF;
843 done = 1;
844 }
845 else {
846 tok->done = E_OK;
847 tok->inp = strchr(tok->buf, '\0');
848 done = tok->inp[-1] == '\n';
849 }
850 }
851 else {
852 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000853 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000854 tok->done = E_EOF;
855 done = 1;
856 }
857 else
858 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000859 }
860 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000861 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000863 Py_ssize_t curstart = tok->start == NULL ? -1 :
864 tok->start - tok->buf;
865 Py_ssize_t curvalid = tok->inp - tok->buf;
866 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000867 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000868 newbuf = (char *)PyMem_REALLOC(newbuf,
869 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000870 if (newbuf == NULL) {
871 tok->done = E_NOMEM;
872 tok->cur = tok->inp;
873 return EOF;
874 }
875 tok->buf = newbuf;
876 tok->inp = tok->buf + curvalid;
877 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000878 tok->start = curstart < 0 ? NULL :
879 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000880 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000881 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000882 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000883 /* Break out early on decoding
884 errors, as tok->buf will be NULL
885 */
886 if (tok->decoding_erred)
887 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000888 /* Last line does not end in \n,
889 fake one */
890 strcpy(tok->inp, "\n");
891 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000892 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000893 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000894 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000895 if (tok->buf != NULL) {
896 tok->cur = tok->buf + cur;
897 tok->line_start = tok->cur;
898 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000899 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000900 pt = tok->inp - 2;
901 if (pt >= tok->buf && *pt == '\r') {
902 *pt++ = '\n';
903 *pt = '\0';
904 tok->inp = pt;
905 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000906 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000907 }
908 if (tok->done != E_OK) {
909 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000910 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000911 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000912 return EOF;
913 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000914 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000915 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916}
917
918
919/* Back-up one character */
920
921static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000922tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000923{
924 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000925 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000926 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000927 if (*tok->cur != c)
928 *tok->cur = c;
929 }
930}
931
932
933/* Return the token corresponding to a single character */
934
935int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000936PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000937{
938 switch (c) {
939 case '(': return LPAR;
940 case ')': return RPAR;
941 case '[': return LSQB;
942 case ']': return RSQB;
943 case ':': return COLON;
944 case ',': return COMMA;
945 case ';': return SEMI;
946 case '+': return PLUS;
947 case '-': return MINUS;
948 case '*': return STAR;
949 case '/': return SLASH;
950 case '|': return VBAR;
951 case '&': return AMPER;
952 case '<': return LESS;
953 case '>': return GREATER;
954 case '=': return EQUAL;
955 case '.': return DOT;
956 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000957 case '{': return LBRACE;
958 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000959 case '^': return CIRCUMFLEX;
960 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000961 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000962 default: return OP;
963 }
964}
965
966
Guido van Rossumfbab9051991-10-20 20:25:03 +0000967int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000968PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000969{
970 switch (c1) {
971 case '=':
972 switch (c2) {
973 case '=': return EQEQUAL;
974 }
975 break;
976 case '!':
977 switch (c2) {
978 case '=': return NOTEQUAL;
979 }
980 break;
981 case '<':
982 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000983 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000984 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000985 }
986 break;
987 case '>':
988 switch (c2) {
989 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000990 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000991 }
992 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000993 case '+':
994 switch (c2) {
995 case '=': return PLUSEQUAL;
996 }
997 break;
998 case '-':
999 switch (c2) {
1000 case '=': return MINEQUAL;
1001 }
1002 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001003 case '*':
1004 switch (c2) {
1005 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001006 case '=': return STAREQUAL;
1007 }
1008 break;
1009 case '/':
1010 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001011 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001012 case '=': return SLASHEQUAL;
1013 }
1014 break;
1015 case '|':
1016 switch (c2) {
1017 case '=': return VBAREQUAL;
1018 }
1019 break;
1020 case '%':
1021 switch (c2) {
1022 case '=': return PERCENTEQUAL;
1023 }
1024 break;
1025 case '&':
1026 switch (c2) {
1027 case '=': return AMPEREQUAL;
1028 }
1029 break;
1030 case '^':
1031 switch (c2) {
1032 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001033 }
1034 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001035 }
1036 return OP;
1037}
1038
Thomas Wouters434d0822000-08-24 20:11:32 +00001039int
1040PyToken_ThreeChars(int c1, int c2, int c3)
1041{
1042 switch (c1) {
1043 case '<':
1044 switch (c2) {
1045 case '<':
1046 switch (c3) {
1047 case '=':
1048 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001049 }
1050 break;
1051 }
1052 break;
1053 case '>':
1054 switch (c2) {
1055 case '>':
1056 switch (c3) {
1057 case '=':
1058 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001059 }
1060 break;
1061 }
1062 break;
1063 case '*':
1064 switch (c2) {
1065 case '*':
1066 switch (c3) {
1067 case '=':
1068 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001069 }
1070 break;
1071 }
1072 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001073 case '/':
1074 switch (c2) {
1075 case '/':
1076 switch (c3) {
1077 case '=':
1078 return DOUBLESLASHEQUAL;
1079 }
1080 break;
1081 }
1082 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001083 }
1084 return OP;
1085}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001086
Guido van Rossum926f13a1998-04-09 21:38:06 +00001087static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001088indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001089{
1090 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001091 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001092 tok->cur = tok->inp;
1093 return 1;
1094 }
1095 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001096 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1097 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001098 tok->altwarning = 0;
1099 }
1100 return 0;
1101}
1102
1103
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104/* Get next token, after space stripping etc. */
1105
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001106static int
1107tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108{
1109 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001110 int blankline;
1111
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001112 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001113 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001114 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001115 blankline = 0;
1116
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117 /* Get indentation level */
1118 if (tok->atbol) {
1119 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001120 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001121 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001122 for (;;) {
1123 c = tok_nextc(tok);
1124 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001125 col++, altcol++;
1126 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001127 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001128 altcol = (altcol/tok->alttabsize + 1)
1129 * tok->alttabsize;
1130 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001131 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001132 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001133 else
1134 break;
1135 }
1136 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001137 if (c == '#' || c == '\n') {
1138 /* Lines with only whitespace and/or comments
1139 shouldn't affect the indentation and are
1140 not passed to the parser as NEWLINE tokens,
1141 except *totally* empty lines in interactive
1142 mode, which signal the end of a command group. */
1143 if (col == 0 && c == '\n' && tok->prompt != NULL)
1144 blankline = 0; /* Let it through */
1145 else
1146 blankline = 1; /* Ignore completely */
1147 /* We can't jump back right here since we still
1148 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001150 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001151 if (col == tok->indstack[tok->indent]) {
1152 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001153 if (altcol != tok->altindstack[tok->indent]) {
1154 if (indenterror(tok))
1155 return ERRORTOKEN;
1156 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001157 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001158 else if (col > tok->indstack[tok->indent]) {
1159 /* Indent -- always one */
1160 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001161 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001162 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001163 return ERRORTOKEN;
1164 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001165 if (altcol <= tok->altindstack[tok->indent]) {
1166 if (indenterror(tok))
1167 return ERRORTOKEN;
1168 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001169 tok->pendin++;
1170 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001171 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001172 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001173 else /* col < tok->indstack[tok->indent] */ {
1174 /* Dedent -- any number, must be consistent */
1175 while (tok->indent > 0 &&
1176 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001177 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001178 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001179 }
1180 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001181 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001182 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001183 return ERRORTOKEN;
1184 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001185 if (altcol != tok->altindstack[tok->indent]) {
1186 if (indenterror(tok))
1187 return ERRORTOKEN;
1188 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189 }
1190 }
1191 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001192
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001193 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001194
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001195 /* Return pending indents/dedents */
1196 if (tok->pendin != 0) {
1197 if (tok->pendin < 0) {
1198 tok->pendin++;
1199 return DEDENT;
1200 }
1201 else {
1202 tok->pendin--;
1203 return INDENT;
1204 }
1205 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001206
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001208 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 /* Skip spaces */
1210 do {
1211 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001212 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001213
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001214 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001215 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001216
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001217 /* Skip comment */
1218 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001219 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001221
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001223 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001224 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001225 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001226
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001227 /* Identifier (most frequent token!) */
1228 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001229 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001230 switch (c) {
1231 case 'r':
1232 case 'R':
1233 c = tok_nextc(tok);
1234 if (c == '"' || c == '\'')
1235 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001236 break;
1237 case 'u':
1238 case 'U':
1239 c = tok_nextc(tok);
1240 if (c == 'r' || c == 'R')
1241 c = tok_nextc(tok);
1242 if (c == '"' || c == '\'')
1243 goto letter_quote;
1244 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001245 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001246 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001247 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001248 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001249 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001250 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001251 *p_end = tok->cur;
1252 return NAME;
1253 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001254
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001255 /* Newline */
1256 if (c == '\n') {
1257 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001258 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001259 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001260 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001262 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001263 return NEWLINE;
1264 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001265
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001266 /* Period or number starting with period? */
1267 if (c == '.') {
1268 c = tok_nextc(tok);
1269 if (isdigit(c)) {
1270 goto fraction;
1271 }
1272 else {
1273 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001274 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001275 *p_end = tok->cur;
1276 return DOT;
1277 }
1278 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001279
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 /* Number */
1281 if (isdigit(c)) {
1282 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001283 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 c = tok_nextc(tok);
1285 if (c == '.')
1286 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001287#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001288 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001289 goto imaginary;
1290#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001291 if (c == 'x' || c == 'X') {
1292 /* Hex */
1293 do {
1294 c = tok_nextc(tok);
1295 } while (isxdigit(c));
1296 }
1297 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001298 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001299 /* Octal; c is first char of it */
1300 /* There's no 'isoctdigit' macro, sigh */
1301 while ('0' <= c && c < '8') {
1302 c = tok_nextc(tok);
1303 }
Tim Petersd507dab2001-08-30 20:51:59 +00001304 if (isdigit(c)) {
1305 found_decimal = 1;
1306 do {
1307 c = tok_nextc(tok);
1308 } while (isdigit(c));
1309 }
1310 if (c == '.')
1311 goto fraction;
1312 else if (c == 'e' || c == 'E')
1313 goto exponent;
1314#ifndef WITHOUT_COMPLEX
1315 else if (c == 'j' || c == 'J')
1316 goto imaginary;
1317#endif
1318 else if (found_decimal) {
1319 tok->done = E_TOKEN;
1320 tok_backup(tok, c);
1321 return ERRORTOKEN;
1322 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001323 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001324 if (c == 'l' || c == 'L')
1325 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326 }
1327 else {
1328 /* Decimal */
1329 do {
1330 c = tok_nextc(tok);
1331 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001332 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001333 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001334 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001335 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001336 if (c == '.') {
1337 fraction:
1338 /* Fraction */
1339 do {
1340 c = tok_nextc(tok);
1341 } while (isdigit(c));
1342 }
1343 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001344 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001345 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001347 if (c == '+' || c == '-')
1348 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001349 if (!isdigit(c)) {
1350 tok->done = E_TOKEN;
1351 tok_backup(tok, c);
1352 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001353 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001354 do {
1355 c = tok_nextc(tok);
1356 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001358#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001359 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001360 /* Imaginary part */
1361 imaginary:
1362 c = tok_nextc(tok);
1363#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 }
1365 }
1366 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001367 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 *p_end = tok->cur;
1369 return NUMBER;
1370 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001371
1372 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001373 /* String */
1374 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001375 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001376 int quote = c;
1377 int triple = 0;
1378 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001379 for (;;) {
1380 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001381 if (c == '\n') {
1382 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001383 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001384 tok_backup(tok, c);
1385 return ERRORTOKEN;
1386 }
1387 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001388 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001389 }
1390 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001391 if (triple)
1392 tok->done = E_EOFS;
1393 else
1394 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001395 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001396 return ERRORTOKEN;
1397 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 else if (c == quote) {
1399 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001400 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001401 c = tok_nextc(tok);
1402 if (c == quote) {
1403 triple = 1;
1404 tripcount = 0;
1405 continue;
1406 }
1407 tok_backup(tok, c);
1408 }
1409 if (!triple || tripcount == 3)
1410 break;
1411 }
1412 else if (c == '\\') {
1413 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001415 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001416 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001417 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 return ERRORTOKEN;
1419 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001420 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001421 else
1422 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001424 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001425 *p_end = tok->cur;
1426 return STRING;
1427 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001428
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001429 /* Line continuation */
1430 if (c == '\\') {
1431 c = tok_nextc(tok);
1432 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001433 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001434 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001435 return ERRORTOKEN;
1436 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001437 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001438 goto again; /* Read next line */
1439 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001440
Guido van Rossumfbab9051991-10-20 20:25:03 +00001441 /* Check for two-character token */
1442 {
1443 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001444 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001445 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001446 int c3 = tok_nextc(tok);
1447 int token3 = PyToken_ThreeChars(c, c2, c3);
1448 if (token3 != OP) {
1449 token = token3;
1450 } else {
1451 tok_backup(tok, c3);
1452 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001453 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001454 *p_end = tok->cur;
1455 return token;
1456 }
1457 tok_backup(tok, c2);
1458 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001460 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001461 switch (c) {
1462 case '(':
1463 case '[':
1464 case '{':
1465 tok->level++;
1466 break;
1467 case ')':
1468 case ']':
1469 case '}':
1470 tok->level--;
1471 break;
1472 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001473
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001475 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001476 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001477 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478}
1479
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001480int
1481PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1482{
1483 int result = tok_get(tok, p_start, p_end);
1484 if (tok->decoding_erred) {
1485 result = ERRORTOKEN;
1486 tok->done = E_DECODE;
1487 }
1488 return result;
1489}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001490
#ifdef Py_DEBUG

/* Debugging aid: print a token's name to stdout, followed by its text
 * in parentheses for the token types that carry text (NAME, NUMBER,
 * STRING and OP).  start and end delimit the token text.
 */
void
tok_dump(int type, char *start, char *end)
{
    const int has_text = (type == NAME || type == NUMBER ||
                          type == STRING || type == OP);

    printf("%s", _PyParser_TokenNames[type]);
    if (has_text) {
        const int length = (int)(end - start);
        printf("(%.*s)", length, start);
    }
}

#endif