blob: b0d9b80c3274fb0784d78cf464f396a03c573833 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Interactive line reader, defined outside this file.
   Returns a malloc'ed string including the trailing \n;
   an empty malloc'ed string for EOF;
   NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Width of a tab stop used when measuring indentation.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
/* Py_CHARMASK(c): turn a possibly signed character into a nonnegative int.
   Where plain char is signed the value is masked to its low 8 bits;
   where it is already unsigned the value is passed through untouched.
   XXX This assumes characters are 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		((c) & 0xff)
#else
#define Py_CHARMASK(c)		(c)
#endif
36
/* Forward declarations of file-local helpers that are used before
   their definitions appear. */
static struct tok_state *tok_new(void);
static void tok_backup(struct tok_state *tok, int c);
static int tok_nextc(struct tok_state *tok);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* bitwise, shifts and power */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT",

    /* pseudo tokens */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Guido van Rossum86bea461997-04-29 21:03:06 +0000108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 if (tok == NULL)
110 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000121 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000122 tok->filename = NULL;
123 tok->altwarning = 0;
124 tok->alterror = 0;
125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000130 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000131 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000132#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000135#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144 return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150 return feof(tok->fp);
151}
152
/* PGEN build: source strings are used as they are; STR is returned
   unchanged and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
166 PyMem_DEL(tok->buf);
167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
169}
170
171static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173{
174 char* result = PyMem_NEW(char, len + 1);
175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
178 }
179 return result;
180}
181
/* Map the common spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined; they are
   lower-cased and '_' is treated as '-'.  Returns the constant string
   "utf-8" or "iso-8859-1" when S names (or starts with a variant of)
   one of those encodings; otherwise returns S itself, unchanged.
   Callers compare the result with S to see whether a mapping happened. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Read the byte as unsigned: handing a negative value (a byte
           >= 0x80 where plain char is signed) to tolower() is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
204
205/* Return the coding spec in S, or NULL if none is found. */
206
207static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000208get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000209{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
218 }
219 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
229
230 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000231 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000232 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 t++;
234
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000239 PyMem_DEL(r);
240 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241 }
242 return r;
243 }
244 }
245 }
246 return NULL;
247}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000255check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000279 else
280 PyMem_DEL(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000286 PyMem_DEL(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000287#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
291 PyMem_DEL(cs);
292 }
293 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 return r;
301}
302
303/* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
306
307static int
308check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
312{
313 int ch = get_char(tok);
314 tok->decoding_state = 1;
315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
318 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
319 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
320#if 0
321 /* Disable support for UTF-16 BOMs until a decision
322 is made whether this needs to be supported. */
323 } else if (ch == 0xFE) {
324 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
325 if (!set_readline(tok, "utf-16-be")) return 0;
326 tok->decoding_state = -1;
327 } else if (ch == 0xFF) {
328 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
329 if (!set_readline(tok, "utf-16-le")) return 0;
330 tok->decoding_state = -1;
331#endif
332 } else {
333 unget_char(ch, tok);
334 return 1;
335 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000336 if (tok->encoding != NULL)
337 PyMem_DEL(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000338 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
339 return 1;
340 NON_BOM:
341 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
342 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
343 return 1;
344}
345
/* Read a line of text from TOK into S (room for SIZE bytes, including
   the terminating NUL), using the decoding reader stored in TOK.
   Return NULL on failure or at end of input, else S.

   On entry, tok->decoding_buffer will be one of:
   1) NULL: need to call tok->decoding_readline to get a new line
   2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
      stored the result in tok->decoding_buffer
   3) PyStringObject *: previous call to fp_readl did not have enough room
      (in the s buffer) to copy entire contents of the line read
      by tok->decoding_readline.  tok->decoding_buffer has the overflow.
      In this case, fp_readl is called in a loop (with an expanded buffer)
      until the buffer ends with a '\n' (or until the end of the file is
      reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *data;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over the stored object; a plain string is leftover
           UTF-8 data and needs no further conversion. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    data = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Does not fit: keep the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(data + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }

    memcpy(s, data, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    if (nbytes == 0)
        return NULL; /* EOF */
    return s;
#endif
}
410
411/* Set the readline function for TOK to a StreamReader's
412 readline function. The StreamReader is named ENC.
413
414 This function is called from check_bom and check_coding_spec.
415
416 ENC is usually identical to the future value of tok->encoding,
417 except for the (currently unsupported) case of UTF-16.
418
419 Return 1 on success, 0 on failure. */
420
421static int
422fp_setreadl(struct tok_state *tok, const char* enc)
423{
424 PyObject *reader, *stream, *readline;
425
Martin v. Löwis95292d62002-12-11 14:04:59 +0000426 /* XXX: constify filename argument. */
427 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000428 if (stream == NULL)
429 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430
431 reader = PyCodec_StreamReader(enc, stream, NULL);
432 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000433 if (reader == NULL)
434 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000435
436 readline = PyObject_GetAttrString(reader, "readline");
437 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000438 if (readline == NULL)
439 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000440
441 tok->decoding_readline = readline;
442 return 1;
443}
444
445/* Fetch the next byte from TOK. */
446
447static int fp_getc(struct tok_state *tok) {
448 return getc(tok->fp);
449}
450
451/* Unfetch the last byte back into TOK. */
452
453static void fp_ungetc(int c, struct tok_state *tok) {
454 ungetc(c, tok->fp);
455}
456
457/* Read a line of input from TOK. Determine encoding
458 if necessary. */
459
460static char *
461decoding_fgets(char *s, int size, struct tok_state *tok)
462{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000463 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000464 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000465 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466 if (tok->decoding_state < 0) {
467 /* We already have a codec associated with
468 this input. */
469 line = fp_readl(s, size, tok);
470 break;
471 } else if (tok->decoding_state > 0) {
472 /* We want a 'raw' read. */
473 line = Py_UniversalNewlineFgets(s, size,
474 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 break;
476 } else {
477 /* We have not yet determined the encoding.
478 If an encoding is found, use the file-pointer
479 reader functions from now on. */
480 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
481 return error_ret(tok);
482 assert(tok->decoding_state != 0);
483 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000484 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
486 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
487 return error_ret(tok);
488 }
489 }
490#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000491 /* The default encoding is ASCII, so make sure we don't have any
492 non-ASCII bytes in it. */
493 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000495 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 if (*c > 127) {
497 badchar = *c;
498 break;
499 }
500 }
501 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000502 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000503 /* Need to add 1 to the line number, since this line
504 has not been counted, yet. */
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000505 sprintf(buf,
506 "Non-ASCII character '\\x%.2x' "
507 "in file %.200s on line %i, "
508 "but no encoding declared; "
509 "see http://www.python.org/peps/pep-0263.html for details",
510 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000511 PyErr_SetString(PyExc_SyntaxError, buf);
512 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000513 }
514#endif
515 return line;
516}
517
518static int
519decoding_feof(struct tok_state *tok)
520{
521 if (tok->decoding_state >= 0) {
522 return feof(tok->fp);
523 } else {
524 PyObject* buf = tok->decoding_buffer;
525 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000526 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527 if (buf == NULL) {
528 error_ret(tok);
529 return 1;
530 } else {
531 tok->decoding_buffer = buf;
532 }
533 }
534 return PyObject_Length(buf) == 0;
535 }
536}
537
538/* Fetch a byte from TOK, using the string buffer. */
539
540static int buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000541 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542}
543
544/* Unfetch a byte from TOK, using the string buffer. */
545
546static void buf_ungetc(int c, struct tok_state *tok) {
547 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000548 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549}
550
551/* Set the readline function for TOK to ENC. For the string-based
552 tokenizer, this means to just record the encoding. */
553
554static int buf_setreadl(struct tok_state *tok, const char* enc) {
555 tok->enc = enc;
556 return 1;
557}
558
/* Return a UTF-8 encoded Python string object built from the
   NUL-terminated C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *as_utf8;
    PyObject *decoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;

    as_utf8 = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return as_utf8;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574
575/* Decode a byte string STR for use as the buffer of TOK.
576 Look for encoding declarations inside STR, and record them
577 inside TOK. */
578
579static const char *
580decode_str(const char *str, struct tok_state *tok)
581{
582 PyObject* utf8 = NULL;
583 const char *s;
584 int lineno = 0;
585 tok->enc = NULL;
586 tok->str = str;
587 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000588 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000590 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000591#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 if (tok->enc != NULL) {
593 utf8 = translate_into_utf8(str, tok->enc);
594 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000595 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596 str = PyString_AsString(utf8);
597 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000598#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599 for (s = str;; s++) {
600 if (*s == '\0') break;
601 else if (*s == '\n') {
602 lineno++;
603 if (lineno == 2) break;
604 }
605 }
606 tok->enc = NULL;
607 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000608 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000609#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000610 if (tok->enc != NULL) {
611 assert(utf8 == NULL);
612 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000613 if (utf8 == NULL) {
614 PyErr_Format(PyExc_SyntaxError,
615 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000616 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000617 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618 str = PyString_AsString(utf8);
619 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000620#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621 assert(tok->decoding_buffer == NULL);
622 tok->decoding_buffer = utf8; /* CAUTION */
623 return str;
624}
625
626#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000627
628/* Set up tokenizer for string */
629
630struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000631PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000632{
633 struct tok_state *tok = tok_new();
634 if (tok == NULL)
635 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000636 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000637 if (str == NULL) {
638 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000640 }
641
Martin v. Löwis95292d62002-12-11 14:04:59 +0000642 /* XXX: constify members. */
643 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000644 return tok;
645}
646
647
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000648/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649
650struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000651PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652{
653 struct tok_state *tok = tok_new();
654 if (tok == NULL)
655 return NULL;
Guido van Rossum86bea461997-04-29 21:03:06 +0000656 if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000657 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658 return NULL;
659 }
660 tok->cur = tok->inp = tok->buf;
661 tok->end = tok->buf + BUFSIZ;
662 tok->fp = fp;
663 tok->prompt = ps1;
664 tok->nextprompt = ps2;
665 return tok;
666}
667
668
669/* Free a tok_state structure */
670
671void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000672PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000673{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000674 if (tok->encoding != NULL)
675 PyMem_DEL(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000676#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000677 Py_XDECREF(tok->decoding_readline);
678 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000679#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000680 if (tok->fp != NULL && tok->buf != NULL)
Guido van Rossum86bea461997-04-29 21:03:06 +0000681 PyMem_DEL(tok->buf);
682 PyMem_DEL(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000683}
684
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode the line *INP, read from the real stdin, into UTF-8 using
   the encoding recorded on sys.stdin.

   On success *INP is freed and replaced by a newly allocated UTF-8
   copy, and tok->encoding is set to a copy of the stdin encoding name.
   Nothing is changed (and 0 is returned) when sys.stdin is not the C
   stdin, has no string encoding, or the line cannot be decoded.
   Returns 0 in all those cases; returns -1 with tok->done set to
   E_NOMEM when memory runs out. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc_obj, *stdin_obj, *unicode_line, *utf8_line;
    const char *enc_name;
    char *replacement;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    stdin_obj = PySys_GetObject("stdin");
    if (stdin_obj == NULL || !PyFile_Check(stdin_obj))
        return 0;

    enc_obj = ((PyFileObject *)stdin_obj)->f_encoding;
    if (enc_obj == NULL || !PyString_Check(enc_obj))
        return 0;
    /* Hold a reference while the name is in use. */
    Py_INCREF(enc_obj);
    enc_name = PyString_AsString(enc_obj);

    unicode_line = PyUnicode_Decode(*inp, strlen(*inp), enc_name, NULL);
    if (unicode_line == NULL)
        goto keep_original;

    utf8_line = PyUnicode_AsEncodedString(unicode_line, "utf-8", NULL);
    Py_DECREF(unicode_line);
    if (utf8_line == NULL)
        goto keep_original;

    assert(PyString_Check(utf8_line));
    replacement = new_string(PyString_AS_STRING(utf8_line),
                             PyString_GET_SIZE(utf8_line));
    Py_DECREF(utf8_line);
    if (replacement == NULL)
        goto out_of_memory;

    /* Swap in the converted line. */
    PyMem_FREE(*inp);
    *inp = replacement;

    if (tok->encoding != NULL)
        PyMem_DEL(tok->encoding);
    tok->encoding = new_string(enc_name, strlen(enc_name));
    if (tok->encoding == NULL)
        goto out_of_memory;

    Py_DECREF(enc_obj);
    return 0;

out_of_memory:
    Py_DECREF(enc_obj);
    tok->done = E_NOMEM;
    return -1;

keep_original:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc_obj);
    PyErr_Clear();
    return 0;
}
#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000744
745/* Get next char, updating state; error code goes into tok->done */
746
747static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000748tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000751 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000752 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000753 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000754 if (tok->done != E_OK)
755 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000756 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000757 char *end = strchr(tok->inp, '\n');
758 if (end != NULL)
759 end++;
760 else {
761 end = strchr(tok->inp, '\0');
762 if (end == tok->inp) {
763 tok->done = E_EOF;
764 return EOF;
765 }
766 }
767 if (tok->start == NULL)
768 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000769 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000770 tok->lineno++;
771 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000772 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000773 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000775 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 if (tok->nextprompt != NULL)
777 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000778 if (new == NULL)
779 tok->done = E_INTR;
780 else if (*new == '\0') {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000781 PyMem_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000782 tok->done = E_EOF;
783 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000784#if !defined(PGEN) && defined(Py_USING_UNICODE)
785 else if (tok_stdin_decode(tok, &new) != 0)
786 PyMem_FREE(new);
787#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000788 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000789 size_t start = tok->start - tok->buf;
790 size_t oldlen = tok->cur - tok->buf;
791 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000792 char *buf = tok->buf;
793 PyMem_RESIZE(buf, char, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000794 tok->lineno++;
795 if (buf == NULL) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000796 PyMem_DEL(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000797 tok->buf = NULL;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000798 PyMem_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000799 tok->done = E_NOMEM;
800 return EOF;
801 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000802 tok->buf = buf;
803 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000804 tok->line_start = tok->cur;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000805 strcpy(tok->buf + oldlen, new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000806 PyMem_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000807 tok->inp = tok->buf + newlen;
808 tok->end = tok->inp + 1;
809 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000810 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 else {
812 tok->lineno++;
813 if (tok->buf != NULL)
Guido van Rossumb18618d2000-05-03 23:44:39 +0000814 PyMem_DEL(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000815 tok->buf = new;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000816 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000818 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000819 tok->inp = strchr(tok->buf, '\0');
820 tok->end = tok->inp + 1;
821 }
822 }
823 else {
824 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000825 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000826 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 if (tok->start == NULL) {
828 if (tok->buf == NULL) {
Guido van Rossum86bea461997-04-29 21:03:06 +0000829 tok->buf = PyMem_NEW(char, BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 if (tok->buf == NULL) {
831 tok->done = E_NOMEM;
832 return EOF;
833 }
834 tok->end = tok->buf + BUFSIZ;
835 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000836 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
837 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000838 tok->done = E_EOF;
839 done = 1;
840 }
841 else {
842 tok->done = E_OK;
843 tok->inp = strchr(tok->buf, '\0');
844 done = tok->inp[-1] == '\n';
845 }
846 }
847 else {
848 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000849 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000850 tok->done = E_EOF;
851 done = 1;
852 }
853 else
854 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000855 }
856 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000857 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000858 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 Py_ssize_t curstart = tok->start == NULL ? -1 :
860 tok->start - tok->buf;
861 Py_ssize_t curvalid = tok->inp - tok->buf;
862 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 char *newbuf = tok->buf;
Guido van Rossum86bea461997-04-29 21:03:06 +0000864 PyMem_RESIZE(newbuf, char, newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000865 if (newbuf == NULL) {
866 tok->done = E_NOMEM;
867 tok->cur = tok->inp;
868 return EOF;
869 }
870 tok->buf = newbuf;
871 tok->inp = tok->buf + curvalid;
872 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000873 tok->start = curstart < 0 ? NULL :
874 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000875 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000876 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000877 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000878 /* Break out early on decoding
879 errors, as tok->buf will be NULL
880 */
881 if (tok->decoding_erred)
882 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000883 /* Last line does not end in \n,
884 fake one */
885 strcpy(tok->inp, "\n");
886 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000887 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000888 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000889 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 tok->cur = tok->buf + cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000891 tok->line_start = tok->cur;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000892 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000893 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000894 pt = tok->inp - 2;
895 if (pt >= tok->buf && *pt == '\r') {
896 *pt++ = '\n';
897 *pt = '\0';
898 tok->inp = pt;
899 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000900 }
901 if (tok->done != E_OK) {
902 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000903 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000904 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000905 return EOF;
906 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000907 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000908 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000909}
910
911
912/* Back-up one character */
913
914static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000915tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916{
917 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000918 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000919 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000920 if (*tok->cur != c)
921 *tok->cur = c;
922 }
923}
924
925
926/* Return the token corresponding to a single character */
927
928int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000929PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000930{
931 switch (c) {
932 case '(': return LPAR;
933 case ')': return RPAR;
934 case '[': return LSQB;
935 case ']': return RSQB;
936 case ':': return COLON;
937 case ',': return COMMA;
938 case ';': return SEMI;
939 case '+': return PLUS;
940 case '-': return MINUS;
941 case '*': return STAR;
942 case '/': return SLASH;
943 case '|': return VBAR;
944 case '&': return AMPER;
945 case '<': return LESS;
946 case '>': return GREATER;
947 case '=': return EQUAL;
948 case '.': return DOT;
949 case '%': return PERCENT;
950 case '`': return BACKQUOTE;
951 case '{': return LBRACE;
952 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000953 case '^': return CIRCUMFLEX;
954 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000955 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000956 default: return OP;
957 }
958}
959
960
Guido van Rossumfbab9051991-10-20 20:25:03 +0000961int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000962PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000963{
964 switch (c1) {
965 case '=':
966 switch (c2) {
967 case '=': return EQEQUAL;
968 }
969 break;
970 case '!':
971 switch (c2) {
972 case '=': return NOTEQUAL;
973 }
974 break;
975 case '<':
976 switch (c2) {
977 case '>': return NOTEQUAL;
978 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000979 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000980 }
981 break;
982 case '>':
983 switch (c2) {
984 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000985 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000986 }
987 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000988 case '+':
989 switch (c2) {
990 case '=': return PLUSEQUAL;
991 }
992 break;
993 case '-':
994 switch (c2) {
995 case '=': return MINEQUAL;
996 }
997 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000998 case '*':
999 switch (c2) {
1000 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001001 case '=': return STAREQUAL;
1002 }
1003 break;
1004 case '/':
1005 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001006 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001007 case '=': return SLASHEQUAL;
1008 }
1009 break;
1010 case '|':
1011 switch (c2) {
1012 case '=': return VBAREQUAL;
1013 }
1014 break;
1015 case '%':
1016 switch (c2) {
1017 case '=': return PERCENTEQUAL;
1018 }
1019 break;
1020 case '&':
1021 switch (c2) {
1022 case '=': return AMPEREQUAL;
1023 }
1024 break;
1025 case '^':
1026 switch (c2) {
1027 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001028 }
1029 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001030 }
1031 return OP;
1032}
1033
Thomas Wouters434d0822000-08-24 20:11:32 +00001034int
1035PyToken_ThreeChars(int c1, int c2, int c3)
1036{
1037 switch (c1) {
1038 case '<':
1039 switch (c2) {
1040 case '<':
1041 switch (c3) {
1042 case '=':
1043 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001044 }
1045 break;
1046 }
1047 break;
1048 case '>':
1049 switch (c2) {
1050 case '>':
1051 switch (c3) {
1052 case '=':
1053 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001054 }
1055 break;
1056 }
1057 break;
1058 case '*':
1059 switch (c2) {
1060 case '*':
1061 switch (c3) {
1062 case '=':
1063 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001064 }
1065 break;
1066 }
1067 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001068 case '/':
1069 switch (c2) {
1070 case '/':
1071 switch (c3) {
1072 case '=':
1073 return DOUBLESLASHEQUAL;
1074 }
1075 break;
1076 }
1077 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001078 }
1079 return OP;
1080}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001081
Guido van Rossum926f13a1998-04-09 21:38:06 +00001082static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001083indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001084{
1085 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001086 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001087 tok->cur = tok->inp;
1088 return 1;
1089 }
1090 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001091 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1092 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001093 tok->altwarning = 0;
1094 }
1095 return 0;
1096}
1097
1098
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001099/* Get next token, after space stripping etc. */
1100
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001101static int
1102tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103{
1104 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001105 int blankline;
1106
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001107 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001108 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001109 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001110 blankline = 0;
1111
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112 /* Get indentation level */
1113 if (tok->atbol) {
1114 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001115 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001116 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117 for (;;) {
1118 c = tok_nextc(tok);
1119 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001120 col++, altcol++;
1121 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001122 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001123 altcol = (altcol/tok->alttabsize + 1)
1124 * tok->alttabsize;
1125 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001126 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001127 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001128 else
1129 break;
1130 }
1131 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001132 if (c == '#' || c == '\n') {
1133 /* Lines with only whitespace and/or comments
1134 shouldn't affect the indentation and are
1135 not passed to the parser as NEWLINE tokens,
1136 except *totally* empty lines in interactive
1137 mode, which signal the end of a command group. */
1138 if (col == 0 && c == '\n' && tok->prompt != NULL)
1139 blankline = 0; /* Let it through */
1140 else
1141 blankline = 1; /* Ignore completely */
1142 /* We can't jump back right here since we still
1143 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001144 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001145 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001146 if (col == tok->indstack[tok->indent]) {
1147 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001148 if (altcol != tok->altindstack[tok->indent]) {
1149 if (indenterror(tok))
1150 return ERRORTOKEN;
1151 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001152 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153 else if (col > tok->indstack[tok->indent]) {
1154 /* Indent -- always one */
1155 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001156 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001157 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001158 return ERRORTOKEN;
1159 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001160 if (altcol <= tok->altindstack[tok->indent]) {
1161 if (indenterror(tok))
1162 return ERRORTOKEN;
1163 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001164 tok->pendin++;
1165 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001166 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001167 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001168 else /* col < tok->indstack[tok->indent] */ {
1169 /* Dedent -- any number, must be consistent */
1170 while (tok->indent > 0 &&
1171 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001172 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001173 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001174 }
1175 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001176 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001177 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001178 return ERRORTOKEN;
1179 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001180 if (altcol != tok->altindstack[tok->indent]) {
1181 if (indenterror(tok))
1182 return ERRORTOKEN;
1183 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001184 }
1185 }
1186 }
1187
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001188 tok->start = tok->cur;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189
1190 /* Return pending indents/dedents */
1191 if (tok->pendin != 0) {
1192 if (tok->pendin < 0) {
1193 tok->pendin++;
1194 return DEDENT;
1195 }
1196 else {
1197 tok->pendin--;
1198 return INDENT;
1199 }
1200 }
1201
1202 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001203 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 /* Skip spaces */
1205 do {
1206 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001207 } while (c == ' ' || c == '\t' || c == '\014');
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001208
1209 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001210 tok->start = tok->cur - 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211
Guido van Rossumab5ca152000-03-31 00:52:27 +00001212 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001213 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001214 static char *tabforms[] = {
1215 "tab-width:", /* Emacs */
1216 ":tabstop=", /* vim, full form */
1217 ":ts=", /* vim, abbreviated form */
1218 "set tabsize=", /* will vi never die? */
1219 /* more templates can be added here to support other editors */
1220 };
1221 char cbuf[80];
1222 char *tp, **cp;
1223 tp = cbuf;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001224 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001225 *tp++ = c = tok_nextc(tok);
1226 } while (c != EOF && c != '\n' &&
1227 tp - cbuf + 1 < sizeof(cbuf));
1228 *tp = '\0';
1229 for (cp = tabforms;
1230 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1231 cp++) {
1232 if ((tp = strstr(cbuf, *cp))) {
1233 int newsize = atoi(tp + strlen(*cp));
1234
1235 if (newsize >= 1 && newsize <= 40) {
1236 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001237 if (Py_VerboseFlag)
1238 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001239 "Tab size set to %d\n",
1240 newsize);
1241 }
1242 }
1243 }
1244 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001245 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001246 }
1247
1248 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001249 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001250 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001251 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001252
1253 /* Identifier (most frequent token!) */
1254 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001255 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001256 switch (c) {
1257 case 'r':
1258 case 'R':
1259 c = tok_nextc(tok);
1260 if (c == '"' || c == '\'')
1261 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001262 break;
1263 case 'u':
1264 case 'U':
1265 c = tok_nextc(tok);
1266 if (c == 'r' || c == 'R')
1267 c = tok_nextc(tok);
1268 if (c == '"' || c == '\'')
1269 goto letter_quote;
1270 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001271 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001272 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001274 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001275 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001276 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 *p_end = tok->cur;
1278 return NAME;
1279 }
1280
1281 /* Newline */
1282 if (c == '\n') {
1283 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001284 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001285 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001286 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001288 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 return NEWLINE;
1290 }
1291
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001292 /* Period or number starting with period? */
1293 if (c == '.') {
1294 c = tok_nextc(tok);
1295 if (isdigit(c)) {
1296 goto fraction;
1297 }
1298 else {
1299 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001300 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001301 *p_end = tok->cur;
1302 return DOT;
1303 }
1304 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001305
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 /* Number */
1307 if (isdigit(c)) {
1308 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001309 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 c = tok_nextc(tok);
1311 if (c == '.')
1312 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001313#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001314 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001315 goto imaginary;
1316#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001317 if (c == 'x' || c == 'X') {
1318 /* Hex */
1319 do {
1320 c = tok_nextc(tok);
1321 } while (isxdigit(c));
1322 }
1323 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001324 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001325 /* Octal; c is first char of it */
1326 /* There's no 'isoctdigit' macro, sigh */
1327 while ('0' <= c && c < '8') {
1328 c = tok_nextc(tok);
1329 }
Tim Petersd507dab2001-08-30 20:51:59 +00001330 if (isdigit(c)) {
1331 found_decimal = 1;
1332 do {
1333 c = tok_nextc(tok);
1334 } while (isdigit(c));
1335 }
1336 if (c == '.')
1337 goto fraction;
1338 else if (c == 'e' || c == 'E')
1339 goto exponent;
1340#ifndef WITHOUT_COMPLEX
1341 else if (c == 'j' || c == 'J')
1342 goto imaginary;
1343#endif
1344 else if (found_decimal) {
1345 tok->done = E_TOKEN;
1346 tok_backup(tok, c);
1347 return ERRORTOKEN;
1348 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001349 }
Guido van Rossumf023c461991-05-05 20:16:20 +00001350 if (c == 'l' || c == 'L')
1351 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001352 }
1353 else {
1354 /* Decimal */
1355 do {
1356 c = tok_nextc(tok);
1357 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001358 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001359 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001360 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001361 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001362 if (c == '.') {
1363 fraction:
1364 /* Fraction */
1365 do {
1366 c = tok_nextc(tok);
1367 } while (isdigit(c));
1368 }
1369 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001370 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001371 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001372 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001373 if (c == '+' || c == '-')
1374 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001375 if (!isdigit(c)) {
1376 tok->done = E_TOKEN;
1377 tok_backup(tok, c);
1378 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001379 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001380 do {
1381 c = tok_nextc(tok);
1382 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001383 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001384#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001385 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001386 /* Imaginary part */
1387 imaginary:
1388 c = tok_nextc(tok);
1389#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001390 }
1391 }
1392 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001393 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001394 *p_end = tok->cur;
1395 return NUMBER;
1396 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001397
1398 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001399 /* String */
1400 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001401 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001402 int quote = c;
1403 int triple = 0;
1404 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001405 for (;;) {
1406 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 if (c == '\n') {
1408 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001409 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001410 tok_backup(tok, c);
1411 return ERRORTOKEN;
1412 }
1413 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001414 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001415 }
1416 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001417 if (triple)
1418 tok->done = E_EOFS;
1419 else
1420 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001421 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 return ERRORTOKEN;
1423 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001424 else if (c == quote) {
1425 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001426 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001427 c = tok_nextc(tok);
1428 if (c == quote) {
1429 triple = 1;
1430 tripcount = 0;
1431 continue;
1432 }
1433 tok_backup(tok, c);
1434 }
1435 if (!triple || tripcount == 3)
1436 break;
1437 }
1438 else if (c == '\\') {
1439 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001440 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001441 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001442 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001443 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001444 return ERRORTOKEN;
1445 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001446 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001447 else
1448 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001449 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001450 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001451 *p_end = tok->cur;
1452 return STRING;
1453 }
1454
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 /* Line continuation */
1456 if (c == '\\') {
1457 c = tok_nextc(tok);
1458 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001459 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001460 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001461 return ERRORTOKEN;
1462 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001463 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001464 goto again; /* Read next line */
1465 }
1466
Guido van Rossumfbab9051991-10-20 20:25:03 +00001467 /* Check for two-character token */
1468 {
1469 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001470 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001471 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001472 int c3 = tok_nextc(tok);
1473 int token3 = PyToken_ThreeChars(c, c2, c3);
1474 if (token3 != OP) {
1475 token = token3;
1476 } else {
1477 tok_backup(tok, c3);
1478 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001479 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001480 *p_end = tok->cur;
1481 return token;
1482 }
1483 tok_backup(tok, c2);
1484 }
1485
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001486 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001487 switch (c) {
1488 case '(':
1489 case '[':
1490 case '{':
1491 tok->level++;
1492 break;
1493 case ')':
1494 case ']':
1495 case '}':
1496 tok->level--;
1497 break;
1498 }
1499
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001500 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001501 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001502 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001503 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001504}
1505
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001506int
1507PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1508{
1509 int result = tok_get(tok, p_start, p_end);
1510 if (tok->decoding_erred) {
1511 result = ERRORTOKEN;
1512 tok->done = E_DECODE;
1513 }
1514 return result;
1515}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001516
Guido van Rossum408027e1996-12-30 16:17:54 +00001517#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001518
1519void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001520tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521{
Guido van Rossum86bea461997-04-29 21:03:06 +00001522 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001523 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1524 printf("(%.*s)", (int)(end - start), start);
1525}
1526
1527#endif