blob: 02f33e22cb2404b8430d974ffd293b81990ccf41 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Line reader defined elsewhere.  It returns a malloc'ed string including
   the trailing \n, an empty malloc'ed string for EOF, or NULL if the read
   was interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);
25
/* Tab size handed to every new tokenizer state.
   Don't ever change this -- it would break the portability of Python code. */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
/* Map a char, which may be signed, onto a nonnegative int.
   XXX This assumes characters are 8 bits wide. */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c) ((c) & 0xff)
#else
#define Py_CHARMASK(c) (c)
#endif
36
/* Forward declarations of helpers that are used before they are defined. */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* Bitwise operators, shifts and power */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* Augmented assignments */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
102
103
104/* Create and initialize a new tok_state structure */
105
106static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000107tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 if (tok == NULL)
112 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000123 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000124 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000125 tok->altwarning = 1;
126 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000133 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000137#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000138 return tok;
139}
140
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141#ifdef PGEN
142
143static char *
144decoding_fgets(char *s, int size, struct tok_state *tok)
145{
146 return fgets(s, size, tok->fp);
147}
148
149static int
150decoding_feof(struct tok_state *tok)
151{
152 return feof(tok->fp);
153}
154
/* PGEN build: strings are used as they are; STR is returned unchanged
   and TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
160
161#else /* PGEN */
162
163static char *
164error_ret(struct tok_state *tok) /* XXX */
165{
166 tok->decoding_erred = 1;
167 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000168 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 tok->buf = NULL;
170 return NULL; /* as if it were EOF */
171}
172
173static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000174new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000176 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
180 }
181 return result;
182}
183
/* Map the common spellings of utf-8 and latin-1 onto a canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the literal "utf-8" or "iso-8859-1"
   when S names (or starts with the name of) one of those encodings, and
   S itself otherwise.  The returned literals must not be freed. */
static char *
get_normal_name(char *s) /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* The <ctype.h> functions need a value representable as unsigned
           char; a plain char that happens to be negative is undefined
           behavior, so read the byte as unsigned. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 return r;
295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
306{
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314#if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325#endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
329 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000330 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000331 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
338}
339
340/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000341 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343 On entry, tok->decoding_buffer will be one of:
344 1) NULL: need to call tok->decoding_readline to get a new line
345 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
346 stored the result in tok->decoding_buffer
347 3) PyStringObject *: previous call to fp_readl did not have enough room
348 (in the s buffer) to copy entire contents of the line read
349 by tok->decoding_readline. tok->decoding_buffer has the overflow.
350 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 reached): see tok_nextc and its calls to decoding_fgets.
353*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000354
355static char *
356fp_readl(char *s, int size, struct tok_state *tok)
357{
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000358 PyObject* utf8 = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000359 PyObject* buf = tok->decoding_buffer;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000360 char *str;
Martin v. Löwisf5adf1e2006-02-16 14:35:38 +0000361 Py_ssize_t utf8len;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000362
363 /* Ask for one less byte so we can terminate it */
364 assert(size > 0);
365 size--;
366
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000367 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000368 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000369 if (buf == NULL)
370 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000371 } else {
372 tok->decoding_buffer = NULL;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000373 if (PyString_CheckExact(buf))
374 utf8 = buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000375 }
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 if (utf8 == NULL) {
377 utf8 = PyUnicode_AsUTF8String(buf);
378 Py_DECREF(buf);
379 if (utf8 == NULL)
380 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000381 }
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000382 str = PyString_AsString(utf8);
383 utf8len = PyString_GET_SIZE(utf8);
384 if (utf8len > size) {
385 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
386 if (tok->decoding_buffer == NULL) {
387 Py_DECREF(utf8);
388 return error_ret(tok);
389 }
390 utf8len = size;
391 }
392 memcpy(s, str, utf8len);
393 s[utf8len] = '\0';
394 Py_DECREF(utf8);
395 if (utf8len == 0) return NULL; /* EOF */
396 return s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000397}
398
399/* Set the readline function for TOK to a StreamReader's
400 readline function. The StreamReader is named ENC.
401
402 This function is called from check_bom and check_coding_spec.
403
404 ENC is usually identical to the future value of tok->encoding,
405 except for the (currently unsupported) case of UTF-16.
406
407 Return 1 on success, 0 on failure. */
408
409static int
410fp_setreadl(struct tok_state *tok, const char* enc)
411{
412 PyObject *reader, *stream, *readline;
413
Martin v. Löwis95292d62002-12-11 14:04:59 +0000414 /* XXX: constify filename argument. */
415 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000416 if (stream == NULL)
417 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000418
419 reader = PyCodec_StreamReader(enc, stream, NULL);
420 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000421 if (reader == NULL)
422 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000423
424 readline = PyObject_GetAttrString(reader, "readline");
425 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000426 if (readline == NULL)
427 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000428
429 tok->decoding_readline = readline;
430 return 1;
431}
432
433/* Fetch the next byte from TOK. */
434
435static int fp_getc(struct tok_state *tok) {
436 return getc(tok->fp);
437}
438
439/* Unfetch the last byte back into TOK. */
440
441static void fp_ungetc(int c, struct tok_state *tok) {
442 ungetc(c, tok->fp);
443}
444
445/* Read a line of input from TOK. Determine encoding
446 if necessary. */
447
448static char *
449decoding_fgets(char *s, int size, struct tok_state *tok)
450{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000451 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000452 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000453 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454 if (tok->decoding_state < 0) {
455 /* We already have a codec associated with
456 this input. */
457 line = fp_readl(s, size, tok);
458 break;
459 } else if (tok->decoding_state > 0) {
460 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000461 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463 break;
464 } else {
465 /* We have not yet determined the encoding.
466 If an encoding is found, use the file-pointer
467 reader functions from now on. */
468 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
469 return error_ret(tok);
470 assert(tok->decoding_state != 0);
471 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000472 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000473 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
474 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
475 return error_ret(tok);
476 }
477 }
478#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000479 /* The default encoding is ASCII, so make sure we don't have any
480 non-ASCII bytes in it. */
481 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000482 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000483 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000484 if (*c > 127) {
485 badchar = *c;
486 break;
487 }
488 }
489 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000490 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000491 /* Need to add 1 to the line number, since this line
492 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000493 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000494 "Non-ASCII character '\\x%.2x' "
495 "in file %.200s on line %i, "
496 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000497 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000498 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000499 PyErr_SetString(PyExc_SyntaxError, buf);
500 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000501 }
502#endif
503 return line;
504}
505
506static int
507decoding_feof(struct tok_state *tok)
508{
509 if (tok->decoding_state >= 0) {
510 return feof(tok->fp);
511 } else {
512 PyObject* buf = tok->decoding_buffer;
513 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000514 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515 if (buf == NULL) {
516 error_ret(tok);
517 return 1;
518 } else {
519 tok->decoding_buffer = buf;
520 }
521 }
522 return PyObject_Length(buf) == 0;
523 }
524}
525
526/* Fetch a byte from TOK, using the string buffer. */
527
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000528static int
529buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000530 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000531}
532
533/* Unfetch a byte from TOK, using the string buffer. */
534
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000535static void
536buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000538 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000539}
540
541/* Set the readline function for TOK to ENC. For the string-based
542 tokenizer, this means to just record the encoding. */
543
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000544static int
545buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546 tok->enc = enc;
547 return 1;
548}
549
550/* Return a UTF-8 encoding Python string object from the
551 C byte string STR, which is encoded with ENC. */
552
553static PyObject *
554translate_into_utf8(const char* str, const char* enc) {
555 PyObject *utf8;
556 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
557 if (buf == NULL)
558 return NULL;
559 utf8 = PyUnicode_AsUTF8String(buf);
560 Py_DECREF(buf);
561 return utf8;
562}
563
564/* Decode a byte string STR for use as the buffer of TOK.
565 Look for encoding declarations inside STR, and record them
566 inside TOK. */
567
568static const char *
569decode_str(const char *str, struct tok_state *tok)
570{
571 PyObject* utf8 = NULL;
572 const char *s;
573 int lineno = 0;
574 tok->enc = NULL;
575 tok->str = str;
576 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000577 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000579 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 if (tok->enc != NULL) {
581 utf8 = translate_into_utf8(str, tok->enc);
582 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000583 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000584 str = PyString_AsString(utf8);
585 }
586 for (s = str;; s++) {
587 if (*s == '\0') break;
588 else if (*s == '\n') {
589 lineno++;
590 if (lineno == 2) break;
591 }
592 }
593 tok->enc = NULL;
594 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000595 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596 if (tok->enc != NULL) {
597 assert(utf8 == NULL);
598 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000599 if (utf8 == NULL) {
600 PyErr_Format(PyExc_SyntaxError,
601 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000602 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000603 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604 str = PyString_AsString(utf8);
605 }
606 assert(tok->decoding_buffer == NULL);
607 tok->decoding_buffer = utf8; /* CAUTION */
608 return str;
609}
610
611#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000612
613/* Set up tokenizer for string */
614
615struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000616PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000617{
618 struct tok_state *tok = tok_new();
619 if (tok == NULL)
620 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000622 if (str == NULL) {
623 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000624 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000625 }
626
Martin v. Löwis95292d62002-12-11 14:04:59 +0000627 /* XXX: constify members. */
628 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000629 return tok;
630}
631
632
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000633/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000634
635struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000636PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000637{
638 struct tok_state *tok = tok_new();
639 if (tok == NULL)
640 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000641 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000642 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000643 return NULL;
644 }
645 tok->cur = tok->inp = tok->buf;
646 tok->end = tok->buf + BUFSIZ;
647 tok->fp = fp;
648 tok->prompt = ps1;
649 tok->nextprompt = ps2;
650 return tok;
651}
652
653
654/* Free a tok_state structure */
655
656void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000657PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000659 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000660 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000661#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000662 Py_XDECREF(tok->decoding_readline);
663 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000664#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000665 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000666 PyMem_FREE(tok->buf);
667 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000668}
669
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000670#if !defined(PGEN)
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000671static int
672tok_stdin_decode(struct tok_state *tok, char **inp)
673{
674 PyObject *enc, *sysstdin, *decoded, *utf8;
675 const char *encoding;
676 char *converted;
677
678 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
679 return 0;
680 sysstdin = PySys_GetObject("stdin");
681 if (sysstdin == NULL || !PyFile_Check(sysstdin))
682 return 0;
683
684 enc = ((PyFileObject *)sysstdin)->f_encoding;
685 if (enc == NULL || !PyString_Check(enc))
686 return 0;
687 Py_INCREF(enc);
688
689 encoding = PyString_AsString(enc);
690 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
691 if (decoded == NULL)
692 goto error_clear;
693
694 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
695 Py_DECREF(decoded);
696 if (utf8 == NULL)
697 goto error_clear;
698
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000699 assert(PyString_Check(utf8));
700 converted = new_string(PyString_AS_STRING(utf8),
701 PyString_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000702 Py_DECREF(utf8);
703 if (converted == NULL)
704 goto error_nomem;
705
706 PyMem_FREE(*inp);
707 *inp = converted;
708 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000709 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000710 tok->encoding = new_string(encoding, strlen(encoding));
711 if (tok->encoding == NULL)
712 goto error_nomem;
713
714 Py_DECREF(enc);
715 return 0;
716
717error_nomem:
718 Py_DECREF(enc);
719 tok->done = E_NOMEM;
720 return -1;
721
722error_clear:
723 /* Fallback to iso-8859-1: for backward compatibility */
724 Py_DECREF(enc);
725 PyErr_Clear();
726 return 0;
727}
728#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729
730/* Get next char, updating state; error code goes into tok->done */
731
732static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000733tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000734{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000735 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000736 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000737 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000738 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000739 if (tok->done != E_OK)
740 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000741 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000742 char *end = strchr(tok->inp, '\n');
743 if (end != NULL)
744 end++;
745 else {
746 end = strchr(tok->inp, '\0');
747 if (end == tok->inp) {
748 tok->done = E_EOF;
749 return EOF;
750 }
751 }
752 if (tok->start == NULL)
753 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000754 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000755 tok->lineno++;
756 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000757 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000760 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761 if (tok->nextprompt != NULL)
762 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000763 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000764 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000765 else if (*newtok == '\0') {
766 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000767 tok->done = E_EOF;
768 }
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000769#if !defined(PGEN)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000770 else if (tok_stdin_decode(tok, &newtok) != 0)
771 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000772#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000773 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000774 size_t start = tok->start - tok->buf;
775 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000776 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000777 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000778 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 tok->lineno++;
780 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000781 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000782 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000783 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000784 tok->done = E_NOMEM;
785 return EOF;
786 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000787 tok->buf = buf;
788 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000789 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000790 strcpy(tok->buf + oldlen, newtok);
791 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000792 tok->inp = tok->buf + newlen;
793 tok->end = tok->inp + 1;
794 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000795 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000796 else {
797 tok->lineno++;
798 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000799 PyMem_FREE(tok->buf);
800 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000801 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000802 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000803 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000804 tok->inp = strchr(tok->buf, '\0');
805 tok->end = tok->inp + 1;
806 }
807 }
808 else {
809 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000811 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000812 if (tok->start == NULL) {
813 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000814 tok->buf = (char *)
815 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 if (tok->buf == NULL) {
817 tok->done = E_NOMEM;
818 return EOF;
819 }
820 tok->end = tok->buf + BUFSIZ;
821 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000822 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
823 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000824 tok->done = E_EOF;
825 done = 1;
826 }
827 else {
828 tok->done = E_OK;
829 tok->inp = strchr(tok->buf, '\0');
830 done = tok->inp[-1] == '\n';
831 }
832 }
833 else {
834 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000835 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000836 tok->done = E_EOF;
837 done = 1;
838 }
839 else
840 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000841 }
842 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000843 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000845 Py_ssize_t curstart = tok->start == NULL ? -1 :
846 tok->start - tok->buf;
847 Py_ssize_t curvalid = tok->inp - tok->buf;
848 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000849 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000850 newbuf = (char *)PyMem_REALLOC(newbuf,
851 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000852 if (newbuf == NULL) {
853 tok->done = E_NOMEM;
854 tok->cur = tok->inp;
855 return EOF;
856 }
857 tok->buf = newbuf;
858 tok->inp = tok->buf + curvalid;
859 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 tok->start = curstart < 0 ? NULL :
861 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000862 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000864 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000865 /* Break out early on decoding
866 errors, as tok->buf will be NULL
867 */
868 if (tok->decoding_erred)
869 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000870 /* Last line does not end in \n,
871 fake one */
872 strcpy(tok->inp, "\n");
873 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000874 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000875 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000876 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000877 if (tok->buf != NULL) {
878 tok->cur = tok->buf + cur;
879 tok->line_start = tok->cur;
880 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000881 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000882 pt = tok->inp - 2;
883 if (pt >= tok->buf && *pt == '\r') {
884 *pt++ = '\n';
885 *pt = '\0';
886 tok->inp = pt;
887 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000888 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000889 }
890 if (tok->done != E_OK) {
891 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000892 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000893 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000894 return EOF;
895 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000897 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000898}
899
900
901/* Back-up one character */
902
903static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000904tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000905{
906 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000907 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000908 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000909 if (*tok->cur != c)
910 *tok->cur = c;
911 }
912}
913
914
915/* Return the token corresponding to a single character */
916
917int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000918PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000919{
920 switch (c) {
921 case '(': return LPAR;
922 case ')': return RPAR;
923 case '[': return LSQB;
924 case ']': return RSQB;
925 case ':': return COLON;
926 case ',': return COMMA;
927 case ';': return SEMI;
928 case '+': return PLUS;
929 case '-': return MINUS;
930 case '*': return STAR;
931 case '/': return SLASH;
932 case '|': return VBAR;
933 case '&': return AMPER;
934 case '<': return LESS;
935 case '>': return GREATER;
936 case '=': return EQUAL;
937 case '.': return DOT;
938 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000939 case '{': return LBRACE;
940 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000941 case '^': return CIRCUMFLEX;
942 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000943 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000944 default: return OP;
945 }
946}
947
948
Guido van Rossumfbab9051991-10-20 20:25:03 +0000949int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000950PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000951{
952 switch (c1) {
953 case '=':
954 switch (c2) {
955 case '=': return EQEQUAL;
956 }
957 break;
958 case '!':
959 switch (c2) {
960 case '=': return NOTEQUAL;
961 }
962 break;
963 case '<':
964 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000965 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000966 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000967 }
968 break;
969 case '>':
970 switch (c2) {
971 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000972 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000973 }
974 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000975 case '+':
976 switch (c2) {
977 case '=': return PLUSEQUAL;
978 }
979 break;
980 case '-':
981 switch (c2) {
982 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000983 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000984 }
985 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000986 case '*':
987 switch (c2) {
988 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000989 case '=': return STAREQUAL;
990 }
991 break;
992 case '/':
993 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000994 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000995 case '=': return SLASHEQUAL;
996 }
997 break;
998 case '|':
999 switch (c2) {
1000 case '=': return VBAREQUAL;
1001 }
1002 break;
1003 case '%':
1004 switch (c2) {
1005 case '=': return PERCENTEQUAL;
1006 }
1007 break;
1008 case '&':
1009 switch (c2) {
1010 case '=': return AMPEREQUAL;
1011 }
1012 break;
1013 case '^':
1014 switch (c2) {
1015 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001016 }
1017 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001018 }
1019 return OP;
1020}
1021
Thomas Wouters434d0822000-08-24 20:11:32 +00001022int
1023PyToken_ThreeChars(int c1, int c2, int c3)
1024{
1025 switch (c1) {
1026 case '<':
1027 switch (c2) {
1028 case '<':
1029 switch (c3) {
1030 case '=':
1031 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001032 }
1033 break;
1034 }
1035 break;
1036 case '>':
1037 switch (c2) {
1038 case '>':
1039 switch (c3) {
1040 case '=':
1041 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001042 }
1043 break;
1044 }
1045 break;
1046 case '*':
1047 switch (c2) {
1048 case '*':
1049 switch (c3) {
1050 case '=':
1051 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001052 }
1053 break;
1054 }
1055 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001056 case '/':
1057 switch (c2) {
1058 case '/':
1059 switch (c3) {
1060 case '=':
1061 return DOUBLESLASHEQUAL;
1062 }
1063 break;
1064 }
1065 break;
Georg Brandldde00282007-03-18 19:01:53 +00001066 case '.':
1067 switch (c2) {
1068 case '.':
1069 switch (c3) {
1070 case '.':
1071 return ELLIPSIS;
1072 }
1073 break;
1074 }
1075 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001076 }
1077 return OP;
1078}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001079
Guido van Rossum926f13a1998-04-09 21:38:06 +00001080static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001081indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001082{
1083 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001084 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001085 tok->cur = tok->inp;
1086 return 1;
1087 }
1088 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001089 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1090 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001091 tok->altwarning = 0;
1092 }
1093 return 0;
1094}
1095
1096
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097/* Get next token, after space stripping etc. */
1098
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001099static int
1100tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101{
1102 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001103 int blankline;
1104
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001105 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001106 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001107 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001108 blankline = 0;
1109
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001110 /* Get indentation level */
1111 if (tok->atbol) {
1112 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001113 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001114 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001115 for (;;) {
1116 c = tok_nextc(tok);
1117 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001118 col++, altcol++;
1119 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001120 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001121 altcol = (altcol/tok->alttabsize + 1)
1122 * tok->alttabsize;
1123 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001124 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001125 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001126 else
1127 break;
1128 }
1129 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001130 if (c == '#' || c == '\n') {
1131 /* Lines with only whitespace and/or comments
1132 shouldn't affect the indentation and are
1133 not passed to the parser as NEWLINE tokens,
1134 except *totally* empty lines in interactive
1135 mode, which signal the end of a command group. */
1136 if (col == 0 && c == '\n' && tok->prompt != NULL)
1137 blankline = 0; /* Let it through */
1138 else
1139 blankline = 1; /* Ignore completely */
1140 /* We can't jump back right here since we still
1141 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001142 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001143 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001144 if (col == tok->indstack[tok->indent]) {
1145 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001146 if (altcol != tok->altindstack[tok->indent]) {
1147 if (indenterror(tok))
1148 return ERRORTOKEN;
1149 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001150 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001151 else if (col > tok->indstack[tok->indent]) {
1152 /* Indent -- always one */
1153 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001154 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001155 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001156 return ERRORTOKEN;
1157 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001158 if (altcol <= tok->altindstack[tok->indent]) {
1159 if (indenterror(tok))
1160 return ERRORTOKEN;
1161 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001162 tok->pendin++;
1163 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001164 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001165 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001166 else /* col < tok->indstack[tok->indent] */ {
1167 /* Dedent -- any number, must be consistent */
1168 while (tok->indent > 0 &&
1169 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001170 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001171 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001172 }
1173 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001174 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001175 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001176 return ERRORTOKEN;
1177 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001178 if (altcol != tok->altindstack[tok->indent]) {
1179 if (indenterror(tok))
1180 return ERRORTOKEN;
1181 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001182 }
1183 }
1184 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001185
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001186 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001187
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001188 /* Return pending indents/dedents */
1189 if (tok->pendin != 0) {
1190 if (tok->pendin < 0) {
1191 tok->pendin++;
1192 return DEDENT;
1193 }
1194 else {
1195 tok->pendin--;
1196 return INDENT;
1197 }
1198 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001199
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001200 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001201 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001202 /* Skip spaces */
1203 do {
1204 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001205 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001206
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001208 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001209
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001210 /* Skip comment */
1211 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001212 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001213 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001214
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001215 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001216 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001218 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001219
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 /* Identifier (most frequent token!) */
1221 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001222 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001223 switch (c) {
1224 case 'r':
1225 case 'R':
1226 c = tok_nextc(tok);
1227 if (c == '"' || c == '\'')
1228 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001229 break;
1230 case 'u':
1231 case 'U':
1232 c = tok_nextc(tok);
1233 if (c == 'r' || c == 'R')
1234 c = tok_nextc(tok);
1235 if (c == '"' || c == '\'')
1236 goto letter_quote;
1237 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001238 case 'b':
1239 case 'B':
1240 c = tok_nextc(tok);
1241 if (c == 'r' || c == 'R')
1242 c = tok_nextc(tok);
1243 if (c == '"' || c == '\'')
1244 goto letter_quote;
1245 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001246 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001247 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001248 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001249 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001250 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001251 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001252 *p_end = tok->cur;
1253 return NAME;
1254 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001255
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001256 /* Newline */
1257 if (c == '\n') {
1258 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001259 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001260 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001261 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001262 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001263 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264 return NEWLINE;
1265 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001266
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001267 /* Period or number starting with period? */
1268 if (c == '.') {
1269 c = tok_nextc(tok);
1270 if (isdigit(c)) {
1271 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001272 } else if (c == '.') {
1273 c = tok_nextc(tok);
1274 if (c == '.') {
1275 *p_start = tok->start;
1276 *p_end = tok->cur;
1277 return ELLIPSIS;
1278 } else {
1279 tok_backup(tok, c);
1280 }
1281 tok_backup(tok, '.');
1282 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001283 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001284 }
Georg Brandldde00282007-03-18 19:01:53 +00001285 *p_start = tok->start;
1286 *p_end = tok->cur;
1287 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001288 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001289
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001290 /* Number */
1291 if (isdigit(c)) {
1292 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001293 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 c = tok_nextc(tok);
1295 if (c == '.')
1296 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001297#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001298 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001299 goto imaginary;
1300#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001301 if (c == 'x' || c == 'X') {
1302 /* Hex */
1303 do {
1304 c = tok_nextc(tok);
1305 } while (isxdigit(c));
1306 }
1307 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001308 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 /* Octal; c is first char of it */
1310 /* There's no 'isoctdigit' macro, sigh */
1311 while ('0' <= c && c < '8') {
1312 c = tok_nextc(tok);
1313 }
Tim Petersd507dab2001-08-30 20:51:59 +00001314 if (isdigit(c)) {
1315 found_decimal = 1;
1316 do {
1317 c = tok_nextc(tok);
1318 } while (isdigit(c));
1319 }
1320 if (c == '.')
1321 goto fraction;
1322 else if (c == 'e' || c == 'E')
1323 goto exponent;
1324#ifndef WITHOUT_COMPLEX
1325 else if (c == 'j' || c == 'J')
1326 goto imaginary;
1327#endif
1328 else if (found_decimal) {
1329 tok->done = E_TOKEN;
1330 tok_backup(tok, c);
1331 return ERRORTOKEN;
1332 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001333 }
1334 }
1335 else {
1336 /* Decimal */
1337 do {
1338 c = tok_nextc(tok);
1339 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001340 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001341 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001342 if (c == '.') {
1343 fraction:
1344 /* Fraction */
1345 do {
1346 c = tok_nextc(tok);
1347 } while (isdigit(c));
1348 }
1349 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001350 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001351 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001352 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001353 if (c == '+' || c == '-')
1354 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001355 if (!isdigit(c)) {
1356 tok->done = E_TOKEN;
1357 tok_backup(tok, c);
1358 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001359 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001360 do {
1361 c = tok_nextc(tok);
1362 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001364#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001365 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001366 /* Imaginary part */
1367 imaginary:
1368 c = tok_nextc(tok);
1369#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001370 }
1371 }
1372 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001373 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001374 *p_end = tok->cur;
1375 return NUMBER;
1376 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001377
1378 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001379 /* String */
1380 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001381 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001382 int quote = c;
1383 int triple = 0;
1384 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 for (;;) {
1386 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001387 if (c == '\n') {
1388 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001389 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001390 tok_backup(tok, c);
1391 return ERRORTOKEN;
1392 }
1393 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001394 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001395 }
1396 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001397 if (triple)
1398 tok->done = E_EOFS;
1399 else
1400 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001401 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 return ERRORTOKEN;
1403 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001404 else if (c == quote) {
1405 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001406 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 c = tok_nextc(tok);
1408 if (c == quote) {
1409 triple = 1;
1410 tripcount = 0;
1411 continue;
1412 }
1413 tok_backup(tok, c);
1414 }
1415 if (!triple || tripcount == 3)
1416 break;
1417 }
1418 else if (c == '\\') {
1419 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001420 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001421 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001422 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001423 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 return ERRORTOKEN;
1425 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001426 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001427 else
1428 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001429 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001430 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001431 *p_end = tok->cur;
1432 return STRING;
1433 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001434
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001435 /* Line continuation */
1436 if (c == '\\') {
1437 c = tok_nextc(tok);
1438 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001439 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001440 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001441 return ERRORTOKEN;
1442 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001443 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001444 goto again; /* Read next line */
1445 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001446
Guido van Rossumfbab9051991-10-20 20:25:03 +00001447 /* Check for two-character token */
1448 {
1449 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001450 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001451 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001452 int c3 = tok_nextc(tok);
1453 int token3 = PyToken_ThreeChars(c, c2, c3);
1454 if (token3 != OP) {
1455 token = token3;
1456 } else {
1457 tok_backup(tok, c3);
1458 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001459 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001460 *p_end = tok->cur;
1461 return token;
1462 }
1463 tok_backup(tok, c2);
1464 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001465
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001466 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001467 switch (c) {
1468 case '(':
1469 case '[':
1470 case '{':
1471 tok->level++;
1472 break;
1473 case ')':
1474 case ']':
1475 case '}':
1476 tok->level--;
1477 break;
1478 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001479
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001480 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001481 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001482 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001483 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001484}
1485
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001486int
1487PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1488{
1489 int result = tok_get(tok, p_start, p_end);
1490 if (tok->decoding_erred) {
1491 result = ERRORTOKEN;
1492 tok->done = E_DECODE;
1493 }
1494 return result;
1495}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001496
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout, followed by the
   token text in parentheses for NAME, NUMBER, STRING and OP tokens.
   The text is the byte range [start, end). */
void
tok_dump(int type, char *start, char *end)
{
    int has_text = (type == NAME || type == NUMBER ||
                    type == STRING || type == OP);

    printf("%s", _PyParser_TokenNames[type]);
    if (!has_text)
        return;
    printf("(%.*s)", (int)(end - start), start);
}

#endif