blob: 8f30fefc5ca918be132a70b2e97cd6048adeadff [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero when C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.
   Every use of the argument is parenthesized so that an expression such
   as `ch & 0xff` expands with the intended precedence.  The argument is
   still evaluated several times, so it must have no side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero when C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128.
   Every use of the argument is parenthesized so that an expression such
   as `ch & 0xff` expands with the intended precedence.  The argument is
   still evaluated several times, so it must have no side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Py_CHARMASK(c): turn a plain char, which may be signed, into a
   nonnegative int.  When plain char is signed the low eight bits are
   kept; when the compiler reports that char is already unsigned the
   value is passed through untouched. */
/* XXX This assumes characters are 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c) ((c) & 0xff)
#else
#define Py_CHARMASK(c) (c)
#endif
49
Guido van Rossum3f5da241990-12-20 15:06:42 +000050/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000051static struct tok_state *tok_new(void);
52static int tok_nextc(struct tok_state *tok);
53static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000054
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* generic and sentinel entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
115
116
117/* Create and initialize a new tok_state structure */
118
119static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000120tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000121{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000122 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
123 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000124 if (tok == NULL)
125 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000126 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000127 tok->done = E_OK;
128 tok->fp = NULL;
129 tok->tabsize = TABSIZE;
130 tok->indent = 0;
131 tok->indstack[0] = 0;
132 tok->atbol = 1;
133 tok->pendin = 0;
134 tok->prompt = tok->nextprompt = NULL;
135 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000136 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000137 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000138 tok->altwarning = 1;
139 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000140 tok->alttabsize = 1;
141 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000142 tok->decoding_state = 0;
143 tok->decoding_erred = 0;
144 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000145 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000146 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000147#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000148 tok->decoding_readline = NULL;
149 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000150#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000151 return tok;
152}
153
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000154#ifdef PGEN
155
156static char *
157decoding_fgets(char *s, int size, struct tok_state *tok)
158{
159 return fgets(s, size, tok->fp);
160}
161
162static int
163decoding_feof(struct tok_state *tok)
164{
165 return feof(tok->fp);
166}
167
/* PGEN build: strings are used as given; TOK is left untouched and STR
   is handed back unchanged. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
173
174#else /* PGEN */
175
176static char *
177error_ret(struct tok_state *tok) /* XXX */
178{
179 tok->decoding_erred = 1;
180 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000181 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000182 tok->buf = NULL;
183 return NULL; /* as if it were EOF */
184}
185
186static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000187new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000189 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000190 if (result != NULL) {
191 memcpy(result, s, len);
192 result[len] = '\0';
193 }
194 return result;
195}
196
/* Map common spellings of the UTF-8 and Latin-1 encoding names to a
   canonical form.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"); otherwise returns S itself, so callers can detect
   "no change" by pointer comparison.  The returned literals must not be
   modified or freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions are only defined for unsigned char
           values (or EOF); a negative plain char would be undefined
           behavior, so convert before calling tolower(). */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
219
220/* Return the coding spec in S, or NULL if none is found. */
221
222static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000224{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000226 /* Coding spec must be in a comment, and that comment must be
227 * the only statement on the source code line. */
228 for (i = 0; i < size - 6; i++) {
229 if (s[i] == '#')
230 break;
231 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
232 return NULL;
233 }
234 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 const char* t = s + i;
236 if (strncmp(t, "coding", 6) == 0) {
237 const char* begin = NULL;
238 t += 6;
239 if (t[0] != ':' && t[0] != '=')
240 continue;
241 do {
242 t++;
243 } while (t[0] == '\x20' || t[0] == '\t');
244
245 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000246 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000247 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000248 t++;
249
250 if (begin < t) {
251 char* r = new_string(begin, t - begin);
252 char* q = get_normal_name(r);
253 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000254 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000255 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 }
257 return r;
258 }
259 }
260 }
261 return NULL;
262}
263
264/* Check whether the line contains a coding spec. If it does,
265 invoke the set_readline function for the new encoding.
266 This function receives the tok_state and the new encoding.
267 Return 1 on success, 0 on failure. */
268
269static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000270check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000271 int set_readline(struct tok_state *, const char *))
272{
Tim Peters17db21f2002-09-03 15:39:58 +0000273 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000275
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000276 if (tok->cont_line)
277 /* It's a continuation line, so it can't be a coding spec. */
278 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000279 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000280 if (cs != NULL) {
281 tok->read_coding_spec = 1;
282 if (tok->encoding == NULL) {
283 assert(tok->decoding_state == 1); /* raw */
284 if (strcmp(cs, "utf-8") == 0 ||
285 strcmp(cs, "iso-8859-1") == 0) {
286 tok->encoding = cs;
287 } else {
288 r = set_readline(tok, cs);
289 if (r) {
290 tok->encoding = cs;
291 tok->decoding_state = -1;
292 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000293 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000294 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000295 }
296 } else { /* then, compare cs with BOM */
297 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000298 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000299 }
300 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000301 if (!r) {
302 cs = tok->encoding;
303 if (!cs)
304 cs = "with BOM";
305 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
306 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000307 return r;
308}
309
310/* See whether the file starts with a BOM. If it does,
311 invoke the set_readline function with the new encoding.
312 Return 1 on success, 0 on failure. */
313
314static int
315check_bom(int get_char(struct tok_state *),
316 void unget_char(int, struct tok_state *),
317 int set_readline(struct tok_state *, const char *),
318 struct tok_state *tok)
319{
320 int ch = get_char(tok);
321 tok->decoding_state = 1;
322 if (ch == EOF) {
323 return 1;
324 } else if (ch == 0xEF) {
325 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
326 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
327#if 0
328 /* Disable support for UTF-16 BOMs until a decision
329 is made whether this needs to be supported. */
330 } else if (ch == 0xFE) {
331 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
332 if (!set_readline(tok, "utf-16-be")) return 0;
333 tok->decoding_state = -1;
334 } else if (ch == 0xFF) {
335 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
336 if (!set_readline(tok, "utf-16-le")) return 0;
337 tok->decoding_state = -1;
338#endif
339 } else {
340 unget_char(ch, tok);
341 return 1;
342 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000343 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000344 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000345 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
346 return 1;
347 NON_BOM:
348 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
349 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
350 return 1;
351}
352
353/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000354 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000355
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000356 On entry, tok->decoding_buffer will be one of:
357 1) NULL: need to call tok->decoding_readline to get a new line
358 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
359 stored the result in tok->decoding_buffer
360 3) PyStringObject *: previous call to fp_readl did not have enough room
361 (in the s buffer) to copy entire contents of the line read
362 by tok->decoding_readline. tok->decoding_buffer has the overflow.
363 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000365 reached): see tok_nextc and its calls to decoding_fgets.
366*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000367
368static char *
369fp_readl(char *s, int size, struct tok_state *tok)
370{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000371 PyObject* bufobj = tok->decoding_buffer;
372 const char *buf;
373 Py_ssize_t buflen;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000374 int allocated = 0;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000375
376 /* Ask for one less byte so we can terminate it */
377 assert(size > 0);
378 size--;
379
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000380 if (bufobj == NULL) {
381 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
382 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000383 goto error;
384 allocated = 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000385 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000386 if (PyObject_AsCharBuffer(bufobj, &buf, &buflen) < 0) {
387 goto error;
388 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000389 if (buflen > size) {
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000390 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000391 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
392 buflen-size);
393 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000394 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000395 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000397 memcpy(s, buf, buflen);
398 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000399 if (buflen == 0) /* EOF */
400 s = NULL;
401 if (allocated) {
402 Py_DECREF(bufobj);
403 }
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000404 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000405
406error:
407 if (allocated) {
408 Py_XDECREF(bufobj);
409 }
410 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000411}
412
413/* Set the readline function for TOK to a StreamReader's
414 readline function. The StreamReader is named ENC.
415
416 This function is called from check_bom and check_coding_spec.
417
418 ENC is usually identical to the future value of tok->encoding,
419 except for the (currently unsupported) case of UTF-16.
420
421 Return 1 on success, 0 on failure. */
422
423static int
424fp_setreadl(struct tok_state *tok, const char* enc)
425{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000426 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000427
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000428 io = PyImport_ImportModule("io");
429 if (io == NULL)
430 goto cleanup;
431
432 stream = PyObject_CallMethod(io, "open", "ssis",
433 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000434 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000435 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000436
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000437 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000438 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000439 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000440
441 cleanup:
442 Py_XDECREF(stream);
443 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000444 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000445}
446
447/* Fetch the next byte from TOK. */
448
449static int fp_getc(struct tok_state *tok) {
450 return getc(tok->fp);
451}
452
453/* Unfetch the last byte back into TOK. */
454
455static void fp_ungetc(int c, struct tok_state *tok) {
456 ungetc(c, tok->fp);
457}
458
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.

   This is a structural check only: the lead byte selects the expected
   length and each following byte must lie in 0x80..0xBF.  S must be
   NUL-terminated; bytes after the terminator are never read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Walk forward: a truncated sequence hits the NUL terminator
       (which is < 0x80) and stops the scan before any byte beyond
       the end of the string is touched. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
486
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000487/* Read a line of input from TOK. Determine encoding
488 if necessary. */
489
490static char *
491decoding_fgets(char *s, int size, struct tok_state *tok)
492{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000493 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000494 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000495 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000496 if (tok->decoding_state < 0) {
497 /* We already have a codec associated with
498 this input. */
499 line = fp_readl(s, size, tok);
500 break;
501 } else if (tok->decoding_state > 0) {
502 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000504 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505 break;
506 } else {
507 /* We have not yet determined the encoding.
508 If an encoding is found, use the file-pointer
509 reader functions from now on. */
510 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
511 return error_ret(tok);
512 assert(tok->decoding_state != 0);
513 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000514 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000515 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
516 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
517 return error_ret(tok);
518 }
519 }
520#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000521 /* The default encoding is UTF-8, so make sure we don't have any
522 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000523 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000525 int length;
526 for (c = (unsigned char *)line; *c; c += length)
527 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528 badchar = *c;
529 break;
530 }
531 }
532 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000533 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000534 /* Need to add 1 to the line number, since this line
535 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000536 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000537 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000538 "in file %.200s on line %i, "
539 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000540 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000541 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000542 PyErr_SetString(PyExc_SyntaxError, buf);
543 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544 }
545#endif
546 return line;
547}
548
549static int
550decoding_feof(struct tok_state *tok)
551{
552 if (tok->decoding_state >= 0) {
553 return feof(tok->fp);
554 } else {
555 PyObject* buf = tok->decoding_buffer;
556 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000557 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558 if (buf == NULL) {
559 error_ret(tok);
560 return 1;
561 } else {
562 tok->decoding_buffer = buf;
563 }
564 }
565 return PyObject_Length(buf) == 0;
566 }
567}
568
569/* Fetch a byte from TOK, using the string buffer. */
570
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000571static int
572buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000573 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574}
575
576/* Unfetch a byte from TOK, using the string buffer. */
577
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000578static void
579buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000581 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582}
583
584/* Set the readline function for TOK to ENC. For the string-based
585 tokenizer, this means to just record the encoding. */
586
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000587static int
588buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589 tok->enc = enc;
590 return 1;
591}
592
593/* Return a UTF-8 encoding Python string object from the
594 C byte string STR, which is encoded with ENC. */
595
596static PyObject *
597translate_into_utf8(const char* str, const char* enc) {
598 PyObject *utf8;
599 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
600 if (buf == NULL)
601 return NULL;
602 utf8 = PyUnicode_AsUTF8String(buf);
603 Py_DECREF(buf);
604 return utf8;
605}
606
607/* Decode a byte string STR for use as the buffer of TOK.
608 Look for encoding declarations inside STR, and record them
609 inside TOK. */
610
611static const char *
612decode_str(const char *str, struct tok_state *tok)
613{
614 PyObject* utf8 = NULL;
615 const char *s;
616 int lineno = 0;
617 tok->enc = NULL;
618 tok->str = str;
619 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000620 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000622 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623 if (tok->enc != NULL) {
624 utf8 = translate_into_utf8(str, tok->enc);
625 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000626 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627 str = PyString_AsString(utf8);
628 }
629 for (s = str;; s++) {
630 if (*s == '\0') break;
631 else if (*s == '\n') {
632 lineno++;
633 if (lineno == 2) break;
634 }
635 }
636 tok->enc = NULL;
637 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000638 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 if (tok->enc != NULL) {
640 assert(utf8 == NULL);
641 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000642 if (utf8 == NULL) {
643 PyErr_Format(PyExc_SyntaxError,
644 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000645 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000646 }
Neal Norwitzf7f28fc2007-08-11 21:31:25 +0000647 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 }
649 assert(tok->decoding_buffer == NULL);
650 tok->decoding_buffer = utf8; /* CAUTION */
651 return str;
652}
653
654#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000655
656/* Set up tokenizer for string */
657
658struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000659PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660{
661 struct tok_state *tok = tok_new();
662 if (tok == NULL)
663 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000665 if (str == NULL) {
666 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000668 }
669
Martin v. Löwis95292d62002-12-11 14:04:59 +0000670 /* XXX: constify members. */
671 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000672 return tok;
673}
674
675
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000676/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677
678struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000679PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000680{
681 struct tok_state *tok = tok_new();
682 if (tok == NULL)
683 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000684 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000685 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000686 return NULL;
687 }
688 tok->cur = tok->inp = tok->buf;
689 tok->end = tok->buf + BUFSIZ;
690 tok->fp = fp;
691 tok->prompt = ps1;
692 tok->nextprompt = ps2;
693 return tok;
694}
695
696
697/* Free a tok_state structure */
698
699void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000700PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000701{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000702 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000703 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000704#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000705 Py_XDECREF(tok->decoding_readline);
706 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000707#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000708 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000709 PyMem_FREE(tok->buf);
710 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711}
712
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000713/* Get next char, updating state; error code goes into tok->done */
714
715static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000716tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000717{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000718 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000719 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000720 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000721 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000722 if (tok->done != E_OK)
723 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000724 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000725 char *end = strchr(tok->inp, '\n');
726 if (end != NULL)
727 end++;
728 else {
729 end = strchr(tok->inp, '\0');
730 if (end == tok->inp) {
731 tok->done = E_EOF;
732 return EOF;
733 }
734 }
735 if (tok->start == NULL)
736 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000737 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000738 tok->lineno++;
739 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000740 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000741 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000743 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000744 if (tok->nextprompt != NULL)
745 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000746 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000747 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000748 else if (*newtok == '\0') {
749 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750 tok->done = E_EOF;
751 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000752 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000753 size_t start = tok->start - tok->buf;
754 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000755 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000756 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000757 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000758 tok->lineno++;
759 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000760 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000761 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000762 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000763 tok->done = E_NOMEM;
764 return EOF;
765 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000766 tok->buf = buf;
767 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000768 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000769 strcpy(tok->buf + oldlen, newtok);
770 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000771 tok->inp = tok->buf + newlen;
772 tok->end = tok->inp + 1;
773 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000774 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000775 else {
776 tok->lineno++;
777 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000778 PyMem_FREE(tok->buf);
779 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000780 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000781 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000782 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 tok->inp = strchr(tok->buf, '\0');
784 tok->end = tok->inp + 1;
785 }
786 }
787 else {
788 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000789 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000790 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000791 if (tok->start == NULL) {
792 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000793 tok->buf = (char *)
794 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000795 if (tok->buf == NULL) {
796 tok->done = E_NOMEM;
797 return EOF;
798 }
799 tok->end = tok->buf + BUFSIZ;
800 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000801 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
802 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000803 tok->done = E_EOF;
804 done = 1;
805 }
806 else {
807 tok->done = E_OK;
808 tok->inp = strchr(tok->buf, '\0');
809 done = tok->inp[-1] == '\n';
810 }
811 }
812 else {
813 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000814 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000815 tok->done = E_EOF;
816 done = 1;
817 }
818 else
819 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000820 }
821 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000822 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000823 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000824 Py_ssize_t curstart = tok->start == NULL ? -1 :
825 tok->start - tok->buf;
826 Py_ssize_t curvalid = tok->inp - tok->buf;
827 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000828 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000829 newbuf = (char *)PyMem_REALLOC(newbuf,
830 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000831 if (newbuf == NULL) {
832 tok->done = E_NOMEM;
833 tok->cur = tok->inp;
834 return EOF;
835 }
836 tok->buf = newbuf;
837 tok->inp = tok->buf + curvalid;
838 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000839 tok->start = curstart < 0 ? NULL :
840 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000841 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000842 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000843 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000844 /* Break out early on decoding
845 errors, as tok->buf will be NULL
846 */
847 if (tok->decoding_erred)
848 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000849 /* Last line does not end in \n,
850 fake one */
851 strcpy(tok->inp, "\n");
852 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000853 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000854 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000855 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000856 if (tok->buf != NULL) {
857 tok->cur = tok->buf + cur;
858 tok->line_start = tok->cur;
859 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000860 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000861 pt = tok->inp - 2;
862 if (pt >= tok->buf && *pt == '\r') {
863 *pt++ = '\n';
864 *pt = '\0';
865 tok->inp = pt;
866 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000867 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000868 }
869 if (tok->done != E_OK) {
870 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000871 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000872 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873 return EOF;
874 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000875 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000876 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000877}
878
879
880/* Back-up one character */
881
882static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000883tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000884{
885 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000886 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000887 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000888 if (*tok->cur != c)
889 *tok->cur = c;
890 }
891}
892
893
894/* Return the token corresponding to a single character */
895
896int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000897PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000898{
899 switch (c) {
900 case '(': return LPAR;
901 case ')': return RPAR;
902 case '[': return LSQB;
903 case ']': return RSQB;
904 case ':': return COLON;
905 case ',': return COMMA;
906 case ';': return SEMI;
907 case '+': return PLUS;
908 case '-': return MINUS;
909 case '*': return STAR;
910 case '/': return SLASH;
911 case '|': return VBAR;
912 case '&': return AMPER;
913 case '<': return LESS;
914 case '>': return GREATER;
915 case '=': return EQUAL;
916 case '.': return DOT;
917 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000918 case '{': return LBRACE;
919 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000920 case '^': return CIRCUMFLEX;
921 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000922 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000923 default: return OP;
924 }
925}
926
927
Guido van Rossumfbab9051991-10-20 20:25:03 +0000928int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000929PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000930{
931 switch (c1) {
932 case '=':
933 switch (c2) {
934 case '=': return EQEQUAL;
935 }
936 break;
937 case '!':
938 switch (c2) {
939 case '=': return NOTEQUAL;
940 }
941 break;
942 case '<':
943 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000944 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000945 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000946 }
947 break;
948 case '>':
949 switch (c2) {
950 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000951 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000952 }
953 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000954 case '+':
955 switch (c2) {
956 case '=': return PLUSEQUAL;
957 }
958 break;
959 case '-':
960 switch (c2) {
961 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000962 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000963 }
964 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000965 case '*':
966 switch (c2) {
967 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000968 case '=': return STAREQUAL;
969 }
970 break;
971 case '/':
972 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000973 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000974 case '=': return SLASHEQUAL;
975 }
976 break;
977 case '|':
978 switch (c2) {
979 case '=': return VBAREQUAL;
980 }
981 break;
982 case '%':
983 switch (c2) {
984 case '=': return PERCENTEQUAL;
985 }
986 break;
987 case '&':
988 switch (c2) {
989 case '=': return AMPEREQUAL;
990 }
991 break;
992 case '^':
993 switch (c2) {
994 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000995 }
996 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000997 }
998 return OP;
999}
1000
Thomas Wouters434d0822000-08-24 20:11:32 +00001001int
1002PyToken_ThreeChars(int c1, int c2, int c3)
1003{
1004 switch (c1) {
1005 case '<':
1006 switch (c2) {
1007 case '<':
1008 switch (c3) {
1009 case '=':
1010 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001011 }
1012 break;
1013 }
1014 break;
1015 case '>':
1016 switch (c2) {
1017 case '>':
1018 switch (c3) {
1019 case '=':
1020 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001021 }
1022 break;
1023 }
1024 break;
1025 case '*':
1026 switch (c2) {
1027 case '*':
1028 switch (c3) {
1029 case '=':
1030 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001031 }
1032 break;
1033 }
1034 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001035 case '/':
1036 switch (c2) {
1037 case '/':
1038 switch (c3) {
1039 case '=':
1040 return DOUBLESLASHEQUAL;
1041 }
1042 break;
1043 }
1044 break;
Georg Brandldde00282007-03-18 19:01:53 +00001045 case '.':
1046 switch (c2) {
1047 case '.':
1048 switch (c3) {
1049 case '.':
1050 return ELLIPSIS;
1051 }
1052 break;
1053 }
1054 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001055 }
1056 return OP;
1057}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001058
Guido van Rossum926f13a1998-04-09 21:38:06 +00001059static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001060indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001061{
1062 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001063 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001064 tok->cur = tok->inp;
1065 return 1;
1066 }
1067 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001068 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1069 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001070 tok->altwarning = 0;
1071 }
1072 return 0;
1073}
1074
Martin v. Löwis47383402007-08-15 07:32:56 +00001075#ifdef PGEN
1076#define verify_identifier(s,e) 1
1077#else
1078/* Verify that the identifier follows PEP 3131. */
1079static int
1080verify_identifier(char *start, char *end)
1081{
1082 PyObject *s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1083 int result = PyUnicode_IsIdentifier(s);
1084 Py_DECREF(s);
1085 return result;
1086}
1087#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001088
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001089/* Get next token, after space stripping etc. */
1090
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001091static int
1092tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001093{
1094 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001095 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001096
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001097 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001098 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001099 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001100 blankline = 0;
1101
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001102 /* Get indentation level */
1103 if (tok->atbol) {
1104 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001105 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001106 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001107 for (;;) {
1108 c = tok_nextc(tok);
1109 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001110 col++, altcol++;
1111 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001113 altcol = (altcol/tok->alttabsize + 1)
1114 * tok->alttabsize;
1115 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001116 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001117 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001118 else
1119 break;
1120 }
1121 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001122 if (c == '#' || c == '\n') {
1123 /* Lines with only whitespace and/or comments
1124 shouldn't affect the indentation and are
1125 not passed to the parser as NEWLINE tokens,
1126 except *totally* empty lines in interactive
1127 mode, which signal the end of a command group. */
1128 if (col == 0 && c == '\n' && tok->prompt != NULL)
1129 blankline = 0; /* Let it through */
1130 else
1131 blankline = 1; /* Ignore completely */
1132 /* We can't jump back right here since we still
1133 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001134 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001135 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001136 if (col == tok->indstack[tok->indent]) {
1137 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001138 if (altcol != tok->altindstack[tok->indent]) {
1139 if (indenterror(tok))
1140 return ERRORTOKEN;
1141 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001142 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 else if (col > tok->indstack[tok->indent]) {
1144 /* Indent -- always one */
1145 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001146 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001147 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001148 return ERRORTOKEN;
1149 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001150 if (altcol <= tok->altindstack[tok->indent]) {
1151 if (indenterror(tok))
1152 return ERRORTOKEN;
1153 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001154 tok->pendin++;
1155 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001156 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001157 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001158 else /* col < tok->indstack[tok->indent] */ {
1159 /* Dedent -- any number, must be consistent */
1160 while (tok->indent > 0 &&
1161 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001162 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001163 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001164 }
1165 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001166 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001167 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001168 return ERRORTOKEN;
1169 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001170 if (altcol != tok->altindstack[tok->indent]) {
1171 if (indenterror(tok))
1172 return ERRORTOKEN;
1173 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001174 }
1175 }
1176 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001177
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001178 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001179
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001180 /* Return pending indents/dedents */
1181 if (tok->pendin != 0) {
1182 if (tok->pendin < 0) {
1183 tok->pendin++;
1184 return DEDENT;
1185 }
1186 else {
1187 tok->pendin--;
1188 return INDENT;
1189 }
1190 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001191
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001192 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001193 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001194 /* Skip spaces */
1195 do {
1196 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001197 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001198
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001199 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001200 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001201
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001202 /* Skip comment */
1203 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001204 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001205 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001206
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001208 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001210 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001211
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001212 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001213 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001214 if (is_potential_identifier_start(c)) {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001215 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001216 switch (c) {
1217 case 'r':
1218 case 'R':
1219 c = tok_nextc(tok);
1220 if (c == '"' || c == '\'')
1221 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001222 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001223 case 'b':
1224 case 'B':
1225 c = tok_nextc(tok);
1226 if (c == 'r' || c == 'R')
1227 c = tok_nextc(tok);
1228 if (c == '"' || c == '\'')
1229 goto letter_quote;
1230 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001231 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001232 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001233 if (c >= 128)
1234 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001235 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001236 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001237 tok_backup(tok, c);
Martin v. Löwis47383402007-08-15 07:32:56 +00001238 if (nonascii &&
1239 !verify_identifier(tok->start, tok->cur)) {
1240 tok->done = E_IDENTIFIER;
1241 return ERRORTOKEN;
1242 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001243 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001244 *p_end = tok->cur;
1245 return NAME;
1246 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001247
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001248 /* Newline */
1249 if (c == '\n') {
1250 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001251 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001252 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001253 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001254 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001255 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001256 return NEWLINE;
1257 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001258
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001259 /* Period or number starting with period? */
1260 if (c == '.') {
1261 c = tok_nextc(tok);
1262 if (isdigit(c)) {
1263 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001264 } else if (c == '.') {
1265 c = tok_nextc(tok);
1266 if (c == '.') {
1267 *p_start = tok->start;
1268 *p_end = tok->cur;
1269 return ELLIPSIS;
1270 } else {
1271 tok_backup(tok, c);
1272 }
1273 tok_backup(tok, '.');
1274 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001275 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001276 }
Georg Brandldde00282007-03-18 19:01:53 +00001277 *p_start = tok->start;
1278 *p_end = tok->cur;
1279 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001280 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001281
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 /* Number */
1283 if (isdigit(c)) {
1284 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001285 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 c = tok_nextc(tok);
1287 if (c == '.')
1288 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001289#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001290 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001291 goto imaginary;
1292#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001293 if (c == 'x' || c == 'X') {
1294 /* Hex */
1295 do {
1296 c = tok_nextc(tok);
1297 } while (isxdigit(c));
1298 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001299 else if (c == 'o' || c == 'O') {
1300 /* Octal */
1301 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001303 } while ('0' <= c && c < '8');
1304 }
1305 else if (c == 'b' || c == 'B') {
1306 /* Binary */
1307 do {
1308 c = tok_nextc(tok);
1309 } while (c == '0' || c == '1');
1310 }
1311 else {
1312 int nonzero = 0;
1313 /* maybe old-style octal; c is first char of it */
1314 /* in any case, allow '0' as a literal */
1315 while (c == '0')
1316 c = tok_nextc(tok);
1317 while (isdigit(c)) {
1318 nonzero = 1;
1319 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001320 }
1321 if (c == '.')
1322 goto fraction;
1323 else if (c == 'e' || c == 'E')
1324 goto exponent;
1325#ifndef WITHOUT_COMPLEX
1326 else if (c == 'j' || c == 'J')
1327 goto imaginary;
1328#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001329 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001330 tok->done = E_TOKEN;
1331 tok_backup(tok, c);
1332 return ERRORTOKEN;
1333 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 }
1335 }
1336 else {
1337 /* Decimal */
1338 do {
1339 c = tok_nextc(tok);
1340 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001341 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001342 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001343 if (c == '.') {
1344 fraction:
1345 /* Fraction */
1346 do {
1347 c = tok_nextc(tok);
1348 } while (isdigit(c));
1349 }
1350 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001351 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001352 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001354 if (c == '+' || c == '-')
1355 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001356 if (!isdigit(c)) {
1357 tok->done = E_TOKEN;
1358 tok_backup(tok, c);
1359 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001360 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001361 do {
1362 c = tok_nextc(tok);
1363 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001365#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001366 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001367 /* Imaginary part */
1368 imaginary:
1369 c = tok_nextc(tok);
1370#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 }
1372 }
1373 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001374 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001375 *p_end = tok->cur;
1376 return NUMBER;
1377 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001378
1379 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001380 /* String */
1381 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001382 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001383 int quote = c;
1384 int triple = 0;
1385 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001386 for (;;) {
1387 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001388 if (c == '\n') {
1389 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001390 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001391 tok_backup(tok, c);
1392 return ERRORTOKEN;
1393 }
1394 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001395 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001396 }
1397 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001398 if (triple)
1399 tok->done = E_EOFS;
1400 else
1401 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001402 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 return ERRORTOKEN;
1404 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001405 else if (c == quote) {
1406 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001407 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001408 c = tok_nextc(tok);
1409 if (c == quote) {
1410 triple = 1;
1411 tripcount = 0;
1412 continue;
1413 }
1414 tok_backup(tok, c);
1415 }
1416 if (!triple || tripcount == 3)
1417 break;
1418 }
1419 else if (c == '\\') {
1420 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001422 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001423 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001424 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001425 return ERRORTOKEN;
1426 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001427 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001428 else
1429 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001430 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001431 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001432 *p_end = tok->cur;
1433 return STRING;
1434 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001435
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001436 /* Line continuation */
1437 if (c == '\\') {
1438 c = tok_nextc(tok);
1439 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001440 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001441 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001442 return ERRORTOKEN;
1443 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001444 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001445 goto again; /* Read next line */
1446 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001447
Guido van Rossumfbab9051991-10-20 20:25:03 +00001448 /* Check for two-character token */
1449 {
1450 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001451 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001452 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001453 int c3 = tok_nextc(tok);
1454 int token3 = PyToken_ThreeChars(c, c2, c3);
1455 if (token3 != OP) {
1456 token = token3;
1457 } else {
1458 tok_backup(tok, c3);
1459 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001460 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001461 *p_end = tok->cur;
1462 return token;
1463 }
1464 tok_backup(tok, c2);
1465 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001466
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001467 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001468 switch (c) {
1469 case '(':
1470 case '[':
1471 case '{':
1472 tok->level++;
1473 break;
1474 case ')':
1475 case ']':
1476 case '}':
1477 tok->level--;
1478 break;
1479 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001480
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001481 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001482 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001483 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001484 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001485}
1486
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001487int
1488PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1489{
1490 int result = tok_get(tok, p_start, p_end);
1491 if (tok->decoding_erred) {
1492 result = ERRORTOKEN;
1493 tok->done = E_DECODE;
1494 }
1495 return result;
1496}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001497
#ifdef Py_DEBUG

/* Debugging aid: print the name of a token type to stdout and, for
 * NAME, NUMBER, STRING and OP tokens, the token text delimited by
 * start and end (end is one past the last character) in parentheses.
 */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif