blob: 469de27b62241b484b938497eb0b76b8c6933039 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Printable names of the token types, indexed by token number. */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* One-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",

    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000106tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107{
Neal Norwitz2c4e4f92006-04-10 06:42:25 +0000108 struct tok_state *tok = PyMem_MALLOC(sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000109 if (tok == NULL)
110 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000121 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000122 tok->filename = NULL;
123 tok->altwarning = 0;
124 tok->alterror = 0;
125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000130 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000131 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000132#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000135#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000136 return tok;
137}
138
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144 return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150 return feof(tok->fp);
151}
152
/* pgen build: strings are used as-is; STR is returned unchanged and
   TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000166 PyObject_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
169}
170
171static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000173{
Tim Petersc9d78aa2006-03-26 23:27:58 +0000174 char* result = (char *)PyObject_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
178 }
179 return result;
180}
181
/* Map the common spellings of the UTF-8 and Latin-1 codec names onto
   one canonical spelling each.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names (or starts with the name of, followed by
   '-') one of those codecs; otherwise returns S itself.  Callers tell
   the two cases apart by comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Convert through unsigned char: handing tolower() a negative
           value (a byte >= 0x80 where char is signed) is undefined
           behaviour. */
        int c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
204
205/* Return the coding spec in S, or NULL if none is found. */
206
207static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000208get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000209{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
218 }
219 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
229
230 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000231 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000232 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 t++;
234
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000239 PyObject_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000240 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241 }
242 return r;
243 }
244 }
245 }
246 return NULL;
247}
248
249/* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000255check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 int set_readline(struct tok_state *, const char *))
257{
Tim Peters17db21f2002-09-03 15:39:58 +0000258 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000259 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000260
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000264 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
Martin v. Löwis019934b2002-08-07 12:33:18 +0000273#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000279 else
Tim Petersc9d78aa2006-03-26 23:27:58 +0000280 PyObject_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000281#else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000286 PyObject_FREE(cs);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000287#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
Tim Petersc9d78aa2006-03-26 23:27:58 +0000291 PyObject_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 }
293 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 return r;
301}
302
303/* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
306
307static int
308check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
312{
313 int ch = get_char(tok);
314 tok->decoding_state = 1;
315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
318 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
319 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
320#if 0
321 /* Disable support for UTF-16 BOMs until a decision
322 is made whether this needs to be supported. */
323 } else if (ch == 0xFE) {
324 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
325 if (!set_readline(tok, "utf-16-be")) return 0;
326 tok->decoding_state = -1;
327 } else if (ch == 0xFF) {
328 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
329 if (!set_readline(tok, "utf-16-le")) return 0;
330 tok->decoding_state = -1;
331#endif
332 } else {
333 unget_char(ch, tok);
334 return 1;
335 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000336 if (tok->encoding != NULL)
Tim Petersc9d78aa2006-03-26 23:27:58 +0000337 PyObject_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000338 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
339 return 1;
340 NON_BOM:
341 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
342 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
343 return 1;
344}
345
/* Read a line of text from TOK into S, using the stream in TOK.
   At most SIZE-1 bytes are stored, followed by a NUL.
   Return NULL on failure or at end of input, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called
        tok->decoding_readline and stored the result in
        tok->decoding_buffer
     3) PyStringObject *: a previous call to fp_readl did not have
        enough room (in the s buffer) to copy the entire contents of
        the line read by tok->decoding_readline;
        tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded
        buffer) until the buffer ends with a '\n' (or until the end of
        the file is reached): see tok_nextc and its calls to
        decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Take over whatever was parked in the tokenizer; an exact
           string is leftover UTF-8 that needs no re-encoding. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    nbytes = PyString_GET_SIZE(encoded);
    bytes = PyString_AsString(encoded);
    if (nbytes > size) {
        /* Does not fit: park the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);

    /* An empty line from the reader means EOF. */
    return (nbytes == 0) ? NULL : s;
#endif
}
410
411/* Set the readline function for TOK to a StreamReader's
412 readline function. The StreamReader is named ENC.
413
414 This function is called from check_bom and check_coding_spec.
415
416 ENC is usually identical to the future value of tok->encoding,
417 except for the (currently unsupported) case of UTF-16.
418
419 Return 1 on success, 0 on failure. */
420
421static int
422fp_setreadl(struct tok_state *tok, const char* enc)
423{
424 PyObject *reader, *stream, *readline;
425
Martin v. Löwis95292d62002-12-11 14:04:59 +0000426 /* XXX: constify filename argument. */
427 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000428 if (stream == NULL)
429 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430
431 reader = PyCodec_StreamReader(enc, stream, NULL);
432 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000433 if (reader == NULL)
434 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000435
436 readline = PyObject_GetAttrString(reader, "readline");
437 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000438 if (readline == NULL)
439 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000440
441 tok->decoding_readline = readline;
442 return 1;
443}
444
445/* Fetch the next byte from TOK. */
446
447static int fp_getc(struct tok_state *tok) {
448 return getc(tok->fp);
449}
450
451/* Unfetch the last byte back into TOK. */
452
453static void fp_ungetc(int c, struct tok_state *tok) {
454 ungetc(c, tok->fp);
455}
456
457/* Read a line of input from TOK. Determine encoding
458 if necessary. */
459
460static char *
461decoding_fgets(char *s, int size, struct tok_state *tok)
462{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000463 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000464 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000465 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000466 if (tok->decoding_state < 0) {
467 /* We already have a codec associated with
468 this input. */
469 line = fp_readl(s, size, tok);
470 break;
471 } else if (tok->decoding_state > 0) {
472 /* We want a 'raw' read. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000473 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 break;
476 } else {
477 /* We have not yet determined the encoding.
478 If an encoding is found, use the file-pointer
479 reader functions from now on. */
480 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
481 return error_ret(tok);
482 assert(tok->decoding_state != 0);
483 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000484 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
486 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
487 return error_ret(tok);
488 }
489 }
490#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000491 /* The default encoding is ASCII, so make sure we don't have any
492 non-ASCII bytes in it. */
493 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000495 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 if (*c > 127) {
497 badchar = *c;
498 break;
499 }
500 }
501 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000502 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000503 /* Need to add 1 to the line number, since this line
504 has not been counted, yet. */
Tim Petersc9d78aa2006-03-26 23:27:58 +0000505 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000506 "Non-ASCII character '\\x%.2x' "
507 "in file %.200s on line %i, "
508 "but no encoding declared; "
Tim Petersc9d78aa2006-03-26 23:27:58 +0000509 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000510 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000511 PyErr_SetString(PyExc_SyntaxError, buf);
512 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000513 }
514#endif
515 return line;
516}
517
518static int
519decoding_feof(struct tok_state *tok)
520{
521 if (tok->decoding_state >= 0) {
522 return feof(tok->fp);
523 } else {
524 PyObject* buf = tok->decoding_buffer;
525 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000526 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527 if (buf == NULL) {
528 error_ret(tok);
529 return 1;
530 } else {
531 tok->decoding_buffer = buf;
532 }
533 }
534 return PyObject_Length(buf) == 0;
535 }
536}
537
538/* Fetch a byte from TOK, using the string buffer. */
539
Tim Petersc9d78aa2006-03-26 23:27:58 +0000540static int
541buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000542 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543}
544
545/* Unfetch a byte from TOK, using the string buffer. */
546
Tim Petersc9d78aa2006-03-26 23:27:58 +0000547static void
548buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000550 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551}
552
553/* Set the readline function for TOK to ENC. For the string-based
554 tokenizer, this means to just record the encoding. */
555
Tim Petersc9d78aa2006-03-26 23:27:58 +0000556static int
557buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558 tok->enc = enc;
559 return 1;
560}
561
/* Return a UTF-8 encoded Python string object built from the
   NUL-terminated C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding
   fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;

    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577
578/* Decode a byte string STR for use as the buffer of TOK.
579 Look for encoding declarations inside STR, and record them
580 inside TOK. */
581
582static const char *
583decode_str(const char *str, struct tok_state *tok)
584{
585 PyObject* utf8 = NULL;
586 const char *s;
587 int lineno = 0;
588 tok->enc = NULL;
589 tok->str = str;
590 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000591 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000593 assert(str);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000594#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595 if (tok->enc != NULL) {
596 utf8 = translate_into_utf8(str, tok->enc);
597 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000598 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599 str = PyString_AsString(utf8);
600 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000601#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000602 for (s = str;; s++) {
603 if (*s == '\0') break;
604 else if (*s == '\n') {
605 lineno++;
606 if (lineno == 2) break;
607 }
608 }
609 tok->enc = NULL;
610 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000611 return error_ret(tok);
Martin v. Löwis019934b2002-08-07 12:33:18 +0000612#ifdef Py_USING_UNICODE
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000613 if (tok->enc != NULL) {
614 assert(utf8 == NULL);
615 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000616 if (utf8 == NULL) {
617 PyErr_Format(PyExc_SyntaxError,
618 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000619 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000620 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621 str = PyString_AsString(utf8);
622 }
Martin v. Löwis019934b2002-08-07 12:33:18 +0000623#endif
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000624 assert(tok->decoding_buffer == NULL);
625 tok->decoding_buffer = utf8; /* CAUTION */
626 return str;
627}
628
629#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000630
631/* Set up tokenizer for string */
632
633struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000634PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000635{
636 struct tok_state *tok = tok_new();
637 if (tok == NULL)
638 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000640 if (str == NULL) {
641 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000643 }
644
Martin v. Löwis95292d62002-12-11 14:04:59 +0000645 /* XXX: constify members. */
646 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000647 return tok;
648}
649
650
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000651/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652
653struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000654PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000655{
656 struct tok_state *tok = tok_new();
657 if (tok == NULL)
658 return NULL;
Tim Petersc9d78aa2006-03-26 23:27:58 +0000659 if ((tok->buf = (char *)PyObject_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000660 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000661 return NULL;
662 }
663 tok->cur = tok->inp = tok->buf;
664 tok->end = tok->buf + BUFSIZ;
665 tok->fp = fp;
666 tok->prompt = ps1;
667 tok->nextprompt = ps2;
668 return tok;
669}
670
671
672/* Free a tok_state structure */
673
674void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000675PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000676{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000677 if (tok->encoding != NULL)
Tim Petersc9d78aa2006-03-26 23:27:58 +0000678 PyObject_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000679#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000680 Py_XDECREF(tok->decoding_readline);
681 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000682#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000683 if (tok->fp != NULL && tok->buf != NULL)
Tim Petersc9d78aa2006-03-26 23:27:58 +0000684 PyObject_FREE(tok->buf);
685 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000686}
687
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000688#if !defined(PGEN) && defined(Py_USING_UNICODE)
689static int
690tok_stdin_decode(struct tok_state *tok, char **inp)
691{
692 PyObject *enc, *sysstdin, *decoded, *utf8;
693 const char *encoding;
694 char *converted;
695
696 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
697 return 0;
698 sysstdin = PySys_GetObject("stdin");
699 if (sysstdin == NULL || !PyFile_Check(sysstdin))
700 return 0;
701
702 enc = ((PyFileObject *)sysstdin)->f_encoding;
703 if (enc == NULL || !PyString_Check(enc))
704 return 0;
705 Py_INCREF(enc);
706
707 encoding = PyString_AsString(enc);
708 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
709 if (decoded == NULL)
710 goto error_clear;
711
712 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
713 Py_DECREF(decoded);
714 if (utf8 == NULL)
715 goto error_clear;
716
Neal Norwitz2aa9a5d2006-03-20 01:53:23 +0000717 assert(PyString_Check(utf8));
718 converted = new_string(PyString_AS_STRING(utf8),
719 PyString_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000720 Py_DECREF(utf8);
721 if (converted == NULL)
722 goto error_nomem;
723
Neal Norwitz2c4e4f92006-04-10 06:42:25 +0000724 PyObject_FREE(*inp);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000725 *inp = converted;
726 if (tok->encoding != NULL)
Tim Petersc9d78aa2006-03-26 23:27:58 +0000727 PyObject_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000728 tok->encoding = new_string(encoding, strlen(encoding));
729 if (tok->encoding == NULL)
730 goto error_nomem;
731
732 Py_DECREF(enc);
733 return 0;
734
735error_nomem:
736 Py_DECREF(enc);
737 tok->done = E_NOMEM;
738 return -1;
739
740error_clear:
741 /* Fallback to iso-8859-1: for backward compatibility */
742 Py_DECREF(enc);
743 PyErr_Clear();
744 return 0;
745}
746#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747
748/* Get next char, updating state; error code goes into tok->done */
749
750static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000751tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000754 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000755 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000756 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000757 if (tok->done != E_OK)
758 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000760 char *end = strchr(tok->inp, '\n');
761 if (end != NULL)
762 end++;
763 else {
764 end = strchr(tok->inp, '\0');
765 if (end == tok->inp) {
766 tok->done = E_EOF;
767 return EOF;
768 }
769 }
770 if (tok->start == NULL)
771 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000772 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000773 tok->lineno++;
774 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000775 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777 if (tok->prompt != NULL) {
Martin v. Löwis566f6af2002-10-26 14:39:10 +0000778 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000779 if (tok->nextprompt != NULL)
780 tok->prompt = tok->nextprompt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000781 if (new == NULL)
782 tok->done = E_INTR;
783 else if (*new == '\0') {
Neal Norwitz2c4e4f92006-04-10 06:42:25 +0000784 PyObject_FREE(new);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000785 tok->done = E_EOF;
786 }
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000787#if !defined(PGEN) && defined(Py_USING_UNICODE)
788 else if (tok_stdin_decode(tok, &new) != 0)
Neal Norwitz2c4e4f92006-04-10 06:42:25 +0000789 PyObject_FREE(new);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000790#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000791 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000792 size_t start = tok->start - tok->buf;
793 size_t oldlen = tok->cur - tok->buf;
794 size_t newlen = oldlen + strlen(new);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000795 char *buf = tok->buf;
Tim Petersc9d78aa2006-03-26 23:27:58 +0000796 buf = (char *)PyObject_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000797 tok->lineno++;
798 if (buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000799 PyObject_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000800 tok->buf = NULL;
Neal Norwitz2c4e4f92006-04-10 06:42:25 +0000801 PyObject_FREE(new);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000802 tok->done = E_NOMEM;
803 return EOF;
804 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000805 tok->buf = buf;
806 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000807 tok->line_start = tok->cur;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000808 strcpy(tok->buf + oldlen, new);
Neal Norwitz2c4e4f92006-04-10 06:42:25 +0000809 PyObject_FREE(new);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000810 tok->inp = tok->buf + newlen;
811 tok->end = tok->inp + 1;
812 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000813 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000814 else {
815 tok->lineno++;
816 if (tok->buf != NULL)
Tim Petersc9d78aa2006-03-26 23:27:58 +0000817 PyObject_FREE(tok->buf);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000818 tok->buf = new;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000819 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000820 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000821 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000822 tok->inp = strchr(tok->buf, '\0');
823 tok->end = tok->inp + 1;
824 }
825 }
826 else {
827 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000828 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000829 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000830 if (tok->start == NULL) {
831 if (tok->buf == NULL) {
Tim Petersc9d78aa2006-03-26 23:27:58 +0000832 tok->buf = (char *)
833 PyObject_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000834 if (tok->buf == NULL) {
835 tok->done = E_NOMEM;
836 return EOF;
837 }
838 tok->end = tok->buf + BUFSIZ;
839 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000840 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
841 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000842 tok->done = E_EOF;
843 done = 1;
844 }
845 else {
846 tok->done = E_OK;
847 tok->inp = strchr(tok->buf, '\0');
848 done = tok->inp[-1] == '\n';
849 }
850 }
851 else {
852 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000853 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000854 tok->done = E_EOF;
855 done = 1;
856 }
857 else
858 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000859 }
860 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000861 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000863 Py_ssize_t curstart = tok->start == NULL ? -1 :
864 tok->start - tok->buf;
865 Py_ssize_t curvalid = tok->inp - tok->buf;
866 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000867 char *newbuf = tok->buf;
Tim Petersc9d78aa2006-03-26 23:27:58 +0000868 newbuf = (char *)PyObject_REALLOC(newbuf,
869 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000870 if (newbuf == NULL) {
871 tok->done = E_NOMEM;
872 tok->cur = tok->inp;
873 return EOF;
874 }
875 tok->buf = newbuf;
876 tok->inp = tok->buf + curvalid;
877 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000878 tok->start = curstart < 0 ? NULL :
879 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000880 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000881 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000882 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000883 /* Break out early on decoding
884 errors, as tok->buf will be NULL
885 */
886 if (tok->decoding_erred)
887 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000888 /* Last line does not end in \n,
889 fake one */
890 strcpy(tok->inp, "\n");
891 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000892 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000893 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000894 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000895 tok->cur = tok->buf + cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000896 tok->line_start = tok->cur;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000897 /* replace "\r\n" with "\n" */
Guido van Rossum2d45be11997-04-11 19:16:25 +0000898 /* For Mac we leave the \r, giving a syntax error */
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000899 pt = tok->inp - 2;
900 if (pt >= tok->buf && *pt == '\r') {
901 *pt++ = '\n';
902 *pt = '\0';
903 tok->inp = pt;
904 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000905 }
906 if (tok->done != E_OK) {
907 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000908 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000909 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000910 return EOF;
911 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000912 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000913 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000914}
915
916
917/* Back-up one character */
918
919static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000920tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000921{
922 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000923 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000924 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000925 if (*tok->cur != c)
926 *tok->cur = c;
927 }
928}
929
930
931/* Return the token corresponding to a single character */
932
933int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000934PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000935{
936 switch (c) {
937 case '(': return LPAR;
938 case ')': return RPAR;
939 case '[': return LSQB;
940 case ']': return RSQB;
941 case ':': return COLON;
942 case ',': return COMMA;
943 case ';': return SEMI;
944 case '+': return PLUS;
945 case '-': return MINUS;
946 case '*': return STAR;
947 case '/': return SLASH;
948 case '|': return VBAR;
949 case '&': return AMPER;
950 case '<': return LESS;
951 case '>': return GREATER;
952 case '=': return EQUAL;
953 case '.': return DOT;
954 case '%': return PERCENT;
955 case '`': return BACKQUOTE;
956 case '{': return LBRACE;
957 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000958 case '^': return CIRCUMFLEX;
959 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000960 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000961 default: return OP;
962 }
963}
964
965
Guido van Rossumfbab9051991-10-20 20:25:03 +0000966int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000967PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000968{
969 switch (c1) {
970 case '=':
971 switch (c2) {
972 case '=': return EQEQUAL;
973 }
974 break;
975 case '!':
976 switch (c2) {
977 case '=': return NOTEQUAL;
978 }
979 break;
980 case '<':
981 switch (c2) {
982 case '>': return NOTEQUAL;
983 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000984 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000985 }
986 break;
987 case '>':
988 switch (c2) {
989 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000990 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000991 }
992 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000993 case '+':
994 switch (c2) {
995 case '=': return PLUSEQUAL;
996 }
997 break;
998 case '-':
999 switch (c2) {
1000 case '=': return MINEQUAL;
1001 }
1002 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001003 case '*':
1004 switch (c2) {
1005 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001006 case '=': return STAREQUAL;
1007 }
1008 break;
1009 case '/':
1010 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001011 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001012 case '=': return SLASHEQUAL;
1013 }
1014 break;
1015 case '|':
1016 switch (c2) {
1017 case '=': return VBAREQUAL;
1018 }
1019 break;
1020 case '%':
1021 switch (c2) {
1022 case '=': return PERCENTEQUAL;
1023 }
1024 break;
1025 case '&':
1026 switch (c2) {
1027 case '=': return AMPEREQUAL;
1028 }
1029 break;
1030 case '^':
1031 switch (c2) {
1032 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001033 }
1034 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001035 }
1036 return OP;
1037}
1038
Thomas Wouters434d0822000-08-24 20:11:32 +00001039int
1040PyToken_ThreeChars(int c1, int c2, int c3)
1041{
1042 switch (c1) {
1043 case '<':
1044 switch (c2) {
1045 case '<':
1046 switch (c3) {
1047 case '=':
1048 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001049 }
1050 break;
1051 }
1052 break;
1053 case '>':
1054 switch (c2) {
1055 case '>':
1056 switch (c3) {
1057 case '=':
1058 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001059 }
1060 break;
1061 }
1062 break;
1063 case '*':
1064 switch (c2) {
1065 case '*':
1066 switch (c3) {
1067 case '=':
1068 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001069 }
1070 break;
1071 }
1072 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001073 case '/':
1074 switch (c2) {
1075 case '/':
1076 switch (c3) {
1077 case '=':
1078 return DOUBLESLASHEQUAL;
1079 }
1080 break;
1081 }
1082 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001083 }
1084 return OP;
1085}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001086
Guido van Rossum926f13a1998-04-09 21:38:06 +00001087static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001088indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001089{
1090 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001091 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001092 tok->cur = tok->inp;
1093 return 1;
1094 }
1095 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001096 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1097 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001098 tok->altwarning = 0;
1099 }
1100 return 0;
1101}
1102
1103
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001104/* Get next token, after space stripping etc. */
1105
/* Scan the input held by tok and return the type of the next token.

   *p_start and *p_end are first reset to NULL.  For tokens that carry
   text (NAME, NUMBER, STRING, NEWLINE and the operator/punctuation
   tokens) they are then set to the start of the token text and to one
   past its end.  INDENT, DEDENT, ENDMARKER and ERRORTOKEN are returned
   with both still NULL.

   Errors are reported by storing an E_* code in tok->done and
   returning ERRORTOKEN.  When the input is exhausted the result is
   ENDMARKER if tok->done is E_EOF and ERRORTOKEN otherwise. */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001106static int
1107tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001108{
1109 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001110 int blankline;
1111
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001112 *p_start = *p_end = NULL;
    /* Re-entered (via goto) after a newline that produced no NEWLINE
       token: a blank/comment-only line or a newline inside brackets. */
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001113 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001114 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001115 blankline = 0;
1116
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117 /* Get indentation level */
1118 if (tok->atbol) {
    /* col is the column computed with tok->tabsize, altcol the same
       column computed with tok->alttabsize. */
1119 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001120 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001121 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001122 for (;;) {
1123 c = tok_nextc(tok);
1124 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001125 col++, altcol++;
1126 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001127 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001128 altcol = (altcol/tok->alttabsize + 1)
1129 * tok->alttabsize;
1130 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001131 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001132 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001133 else
1134 break;
1135 }
    /* c is the first character that is not a space, tab or formfeed;
       un-read it so the token scanner below sees it. */
1136 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001137 if (c == '#' || c == '\n') {
1138 /* Lines with only whitespace and/or comments
1139 shouldn't affect the indentation and are
1140 not passed to the parser as NEWLINE tokens,
1141 except *totally* empty lines in interactive
1142 mode, which signal the end of a command group. */
1143 if (col == 0 && c == '\n' && tok->prompt != NULL)
1144 blankline = 0; /* Let it through */
1145 else
1146 blankline = 1; /* Ignore completely */
1147 /* We can't jump back right here since we still
1148 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 }
    /* The indentation stack is only consulted outside brackets
       (level 0) and for lines that are not being ignored.  Whenever
       col and altcol disagree about how this line relates to the
       stack, indenterror() decides whether that is an error. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001150 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001151 if (col == tok->indstack[tok->indent]) {
1152 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001153 if (altcol != tok->altindstack[tok->indent]) {
1154 if (indenterror(tok))
1155 return ERRORTOKEN;
1156 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001157 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001158 else if (col > tok->indstack[tok->indent]) {
1159 /* Indent -- always one */
1160 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001161 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001162 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001163 return ERRORTOKEN;
1164 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001165 if (altcol <= tok->altindstack[tok->indent]) {
1166 if (indenterror(tok))
1167 return ERRORTOKEN;
1168 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001169 tok->pendin++;
1170 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001171 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001172 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001173 else /* col < tok->indstack[tok->indent] */ {
1174 /* Dedent -- any number, must be consistent */
1175 while (tok->indent > 0 &&
1176 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001177 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001178 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001179 }
1180 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001181 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001182 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001183 return ERRORTOKEN;
1184 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001185 if (altcol != tok->altindstack[tok->indent]) {
1186 if (indenterror(tok))
1187 return ERRORTOKEN;
1188 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189 }
1190 }
1191 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001192
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001193 tok->start = tok->cur;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001194
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001195 /* Return pending indents/dedents */
    /* pendin > 0 means that many INDENT tokens are still owed,
       pendin < 0 that many DEDENT tokens; one is handed out per call. */
1196 if (tok->pendin != 0) {
1197 if (tok->pendin < 0) {
1198 tok->pendin++;
1199 return DEDENT;
1200 }
1201 else {
1202 tok->pendin--;
1203 return INDENT;
1204 }
1205 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001206
    /* Re-entered (via goto) after a backslash-newline continuation. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001208 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 /* Skip spaces */
1210 do {
1211 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001212 } while (c == ' ' || c == '\t' || c == '\014');
Tim Petersc9d78aa2006-03-26 23:27:58 +00001213
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001214 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001215 tok->start = tok->cur - 1;
Tim Petersc9d78aa2006-03-26 23:27:58 +00001216
Guido van Rossumab5ca152000-03-31 00:52:27 +00001217 /* Skip comment, while looking for tab-setting magic */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001218 if (c == '#') {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001219 static char *tabforms[] = {
1220 "tab-width:", /* Emacs */
1221 ":tabstop=", /* vim, full form */
1222 ":ts=", /* vim, abbreviated form */
1223 "set tabsize=", /* will vi never die? */
1224 /* more templates can be added here to support other editors */
1225 };
1226 char cbuf[80];
1227 char *tp, **cp;
1228 tp = cbuf;
    /* Copy the comment text (at most sizeof(cbuf)-1 characters) into
       cbuf so it can be searched for the markers listed in tabforms. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 do {
Guido van Rossumab5ca152000-03-31 00:52:27 +00001230 *tp++ = c = tok_nextc(tok);
1231 } while (c != EOF && c != '\n' &&
1232 tp - cbuf + 1 < sizeof(cbuf));
1233 *tp = '\0';
Tim Petersc9d78aa2006-03-26 23:27:58 +00001234 for (cp = tabforms;
Guido van Rossumab5ca152000-03-31 00:52:27 +00001235 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1236 cp++) {
1237 if ((tp = strstr(cbuf, *cp))) {
1238 int newsize = atoi(tp + strlen(*cp));
1239
    /* Only tab sizes from 1 to 40 are accepted; anything else is
       silently ignored. */
1240 if (newsize >= 1 && newsize <= 40) {
1241 tok->tabsize = newsize;
Guido van Rossum6c981ad2000-04-03 23:02:17 +00001242 if (Py_VerboseFlag)
1243 PySys_WriteStderr(
Guido van Rossumab5ca152000-03-31 00:52:27 +00001244 "Tab size set to %d\n",
1245 newsize);
1246 }
1247 }
1248 }
    /* Discard whatever remains of the comment line. */
1249 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001250 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001251 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001252
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001253 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001254 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001255 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001256 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001257
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001258 /* Identifier (most frequent token!) */
1259 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001260 /* Process r"", u"" and ur"" */
    /* If the prefix letter(s) are directly followed by a quote this is
       a string literal, handled at letter_quote; otherwise scanning
       continues as an ordinary identifier. */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001261 switch (c) {
1262 case 'r':
1263 case 'R':
1264 c = tok_nextc(tok);
1265 if (c == '"' || c == '\'')
1266 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001267 break;
1268 case 'u':
1269 case 'U':
1270 c = tok_nextc(tok);
1271 if (c == 'r' || c == 'R')
1272 c = tok_nextc(tok);
1273 if (c == '"' || c == '\'')
1274 goto letter_quote;
1275 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001276 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001277 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001278 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001279 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001281 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 *p_end = tok->cur;
1283 return NAME;
1284 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001285
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 /* Newline */
1287 if (c == '\n') {
1288 tok->atbol = 1;
    /* Ignored lines and newlines inside brackets yield no NEWLINE
       token; start over on the next line instead. */
Guido van Rossuma849b831993-05-12 11:35:44 +00001289 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001290 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001291 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001293 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 return NEWLINE;
1295 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001296
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001297 /* Period or number starting with period? */
1298 if (c == '.') {
1299 c = tok_nextc(tok);
1300 if (isdigit(c)) {
1301 goto fraction;
1302 }
1303 else {
1304 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001305 *p_start = tok->start;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001306 *p_end = tok->cur;
1307 return DOT;
1308 }
1309 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001310
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001311 /* Number */
1312 if (isdigit(c)) {
1313 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001314 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001315 c = tok_nextc(tok);
1316 if (c == '.')
1317 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001318#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001319 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001320 goto imaginary;
1321#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 if (c == 'x' || c == 'X') {
1323 /* Hex */
1324 do {
1325 c = tok_nextc(tok);
1326 } while (isxdigit(c));
1327 }
1328 else {
    /* found_decimal is set when a digit 8 or 9 follows the leading 0.
       That is only acceptable if the literal then continues as a
       float or imaginary number; otherwise E_TOKEN is reported. */
Tim Petersd507dab2001-08-30 20:51:59 +00001329 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001330 /* Octal; c is first char of it */
1331 /* There's no 'isoctdigit' macro, sigh */
1332 while ('0' <= c && c < '8') {
1333 c = tok_nextc(tok);
1334 }
Tim Petersd507dab2001-08-30 20:51:59 +00001335 if (isdigit(c)) {
1336 found_decimal = 1;
1337 do {
1338 c = tok_nextc(tok);
1339 } while (isdigit(c));
1340 }
1341 if (c == '.')
1342 goto fraction;
1343 else if (c == 'e' || c == 'E')
1344 goto exponent;
1345#ifndef WITHOUT_COMPLEX
1346 else if (c == 'j' || c == 'J')
1347 goto imaginary;
1348#endif
1349 else if (found_decimal) {
1350 tok->done = E_TOKEN;
1351 tok_backup(tok, c);
1352 return ERRORTOKEN;
1353 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 }
    /* A trailing 'l' or 'L' suffix is consumed as part of the number. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001355 if (c == 'l' || c == 'L')
1356 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 }
1358 else {
1359 /* Decimal */
1360 do {
1361 c = tok_nextc(tok);
1362 } while (isdigit(c));
Guido van Rossumf023c461991-05-05 20:16:20 +00001363 if (c == 'l' || c == 'L')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001365 else {
Tim Peters9aa70d92001-08-27 19:19:28 +00001366 /* Accept floating point numbers. */
    /* The labels fraction, exponent and imaginary below are also jump
       targets for the leading-'.' and leading-'0' cases above. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001367 if (c == '.') {
1368 fraction:
1369 /* Fraction */
1370 do {
1371 c = tok_nextc(tok);
1372 } while (isdigit(c));
1373 }
1374 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001375 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001376 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001377 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001378 if (c == '+' || c == '-')
1379 c = tok_nextc(tok);
    /* An exponent marker must be followed by at least one digit. */
Tim Peters9aa70d92001-08-27 19:19:28 +00001380 if (!isdigit(c)) {
1381 tok->done = E_TOKEN;
1382 tok_backup(tok, c);
1383 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001384 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001385 do {
1386 c = tok_nextc(tok);
1387 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001388 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001389#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001390 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001391 /* Imaginary part */
1392 imaginary:
1393 c = tok_nextc(tok);
1394#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 }
1396 }
1397 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001399 *p_end = tok->cur;
1400 return NUMBER;
1401 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001402
1403 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001404 /* String */
1405 if (c == '\'' || c == '"') {
    /* quote2 is the value tok->cur - tok->start has right after
       reading the character that immediately follows the opening
       quote.  It is used to recognise an empty string and the opening
       of a triple-quoted string.  tripcount counts consecutive quote
       characters seen inside a triple-quoted string; three in a row
       end it. */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001406 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 int quote = c;
1408 int triple = 0;
1409 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001410 for (;;) {
1411 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001412 if (c == '\n') {
    /* A raw newline is only allowed inside a triple-quoted string. */
1413 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001414 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001415 tok_backup(tok, c);
1416 return ERRORTOKEN;
1417 }
1418 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001419 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001420 }
1421 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001422 if (triple)
1423 tok->done = E_EOFS;
1424 else
1425 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001426 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001427 return ERRORTOKEN;
1428 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001429 else if (c == quote) {
1430 tripcount++;
    /* Second quote right after the opening one: either the start of a
       triple-quoted string (a third quote follows) or an empty
       string. */
Guido van Rossum35685241998-02-16 15:42:50 +00001431 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001432 c = tok_nextc(tok);
1433 if (c == quote) {
1434 triple = 1;
1435 tripcount = 0;
1436 continue;
1437 }
1438 tok_backup(tok, c);
1439 }
1440 if (!triple || tripcount == 3)
1441 break;
1442 }
1443 else if (c == '\\') {
    /* The character after a backslash is consumed unexamined, so an
       escaped quote or newline does not end the string. */
1444 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001445 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001446 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001447 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001448 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001449 return ERRORTOKEN;
1450 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001451 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001452 else
1453 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001454 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001455 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001456 *p_end = tok->cur;
1457 return STRING;
1458 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001459
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001460 /* Line continuation */
1461 if (c == '\\') {
1462 c = tok_nextc(tok);
    /* A backslash must be the last character before the newline. */
1463 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001464 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001465 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001466 return ERRORTOKEN;
1467 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001468 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001469 goto again; /* Read next line */
1470 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001471
Guido van Rossumfbab9051991-10-20 20:25:03 +00001472 /* Check for two-character token */
1473 {
1474 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001475 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001476 if (token != OP) {
    /* A two-character operator matched; see whether one more
       character extends it to a three-character operator. */
Thomas Wouters434d0822000-08-24 20:11:32 +00001477 int c3 = tok_nextc(tok);
1478 int token3 = PyToken_ThreeChars(c, c2, c3);
1479 if (token3 != OP) {
1480 token = token3;
1481 } else {
1482 tok_backup(tok, c3);
1483 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001484 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001485 *p_end = tok->cur;
1486 return token;
1487 }
1488 tok_backup(tok, c2);
1489 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001490
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001491 /* Keep track of parentheses nesting level */
    /* While tok->level > 0, newlines produce no NEWLINE token and
       indentation is not compared against the stack (see above). */
Guido van Rossuma849b831993-05-12 11:35:44 +00001492 switch (c) {
1493 case '(':
1494 case '[':
1495 case '{':
1496 tok->level++;
1497 break;
1498 case ')':
1499 case ']':
1500 case '}':
1501 tok->level--;
1502 break;
1503 }
Tim Petersc9d78aa2006-03-26 23:27:58 +00001504
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001505 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001506 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001507 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001508 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001509}
1510
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001511int
1512PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1513{
1514 int result = tok_get(tok, p_start, p_end);
1515 if (tok->decoding_erred) {
1516 result = ERRORTOKEN;
1517 tok->done = E_DECODE;
1518 }
1519 return result;
1520}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521
Guido van Rossum408027e1996-12-30 16:17:54 +00001522#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001523
1524void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001525tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001526{
Guido van Rossum86bea461997-04-29 21:03:06 +00001527 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001528 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1529 printf("(%.*s)", (int)(end - start), start);
1530}
1531
1532#endif