blob: 9cbc8fe20f879a6752241b1c934cc2b20633cc33 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C can begin an identifier: an ASCII letter or an underscore.
   The argument is parenthesized at every use so that an arbitrary
   expression (e.g. a conditional expression) may be passed safely.
   It is still evaluated several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
    ((c) >= 'a' && (c) <= 'z')\
    || ((c) >= 'A' && (c) <= 'Z')\
    || (c) == '_')

/* True if C can continue an identifier: an ASCII letter, an ASCII digit
   or an underscore.  Same evaluation caveat as above. */
#define is_potential_identifier_char(c) (\
    ((c) >= 'a' && (c) <= 'z')\
    || ((c) >= 'A' && (c) <= 'Z')\
    || ((c) >= '0' && (c) <= '9')\
    || (c) == '_')
31
Martin v. Löwis566f6af2002-10-26 14:39:10 +000032extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000033/* Return malloc'ed string including trailing \n;
34 empty malloc'ed string for EOF;
35 NULL if interrupted */
36
/* Width of a tab stop when computing indentation.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Py_CHARMASK(c): convert a possibly signed character to a nonnegative int.
   When plain char is signed the value is masked to its low 8 bits;
   when the compiler reports an unsigned char type no masking is needed. */
/* XXX This assumes characters are 8 bits wide */
#ifndef __CHAR_UNSIGNED__
#define Py_CHARMASK(c) ((c) & 0xff)
#else
#define Py_CHARMASK(c) (c)
#endif
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000049static struct tok_state *tok_new(void);
50static int tok_nextc(struct tok_state *tok);
51static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000052
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Later additions */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",

    /* Pseudo-tokens closing the table */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
113
114
115/* Create and initialize a new tok_state structure */
116
117static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000118tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000119{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000120 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
121 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122 if (tok == NULL)
123 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000124 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 tok->done = E_OK;
126 tok->fp = NULL;
127 tok->tabsize = TABSIZE;
128 tok->indent = 0;
129 tok->indstack[0] = 0;
130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000134 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000135 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000136 tok->altwarning = 1;
137 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->alttabsize = 1;
139 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->decoding_state = 0;
141 tok->decoding_erred = 0;
142 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->decoding_readline = NULL;
147 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000149 return tok;
150}
151
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000152#ifdef PGEN
153
154static char *
155decoding_fgets(char *s, int size, struct tok_state *tok)
156{
157 return fgets(s, size, tok->fp);
158}
159
160static int
161decoding_feof(struct tok_state *tok)
162{
163 return feof(tok->fp);
164}
165
/* PGEN build: strings are used as given; STR is returned unchanged
   and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
171
172#else /* PGEN */
173
174static char *
175error_ret(struct tok_state *tok) /* XXX */
176{
177 tok->decoding_erred = 1;
178 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000179 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180 tok->buf = NULL;
181 return NULL; /* as if it were EOF */
182}
183
184static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000186{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 if (result != NULL) {
189 memcpy(result, s, len);
190 result[len] = '\0';
191 }
192 return result;
193}
194
/* Map the common spellings of the utf-8 and latin-1 encoding names to a
   canonical form.

   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the string literal "utf-8" or
   "iso-8859-1" when S names (a variant of) one of those encodings, and S
   itself otherwise.  Callers can therefore detect normalization by
   comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
217
218/* Return the coding spec in S, or NULL if none is found. */
219
220static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000221get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000224 /* Coding spec must be in a comment, and that comment must be
225 * the only statement on the source code line. */
226 for (i = 0; i < size - 6; i++) {
227 if (s[i] == '#')
228 break;
229 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
230 return NULL;
231 }
232 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 const char* t = s + i;
234 if (strncmp(t, "coding", 6) == 0) {
235 const char* begin = NULL;
236 t += 6;
237 if (t[0] != ':' && t[0] != '=')
238 continue;
239 do {
240 t++;
241 } while (t[0] == '\x20' || t[0] == '\t');
242
243 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000244 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000245 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246 t++;
247
248 if (begin < t) {
249 char* r = new_string(begin, t - begin);
250 char* q = get_normal_name(r);
251 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000253 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254 }
255 return r;
256 }
257 }
258 }
259 return NULL;
260}
261
262/* Check whether the line contains a coding spec. If it does,
263 invoke the set_readline function for the new encoding.
264 This function receives the tok_state and the new encoding.
265 Return 1 on success, 0 on failure. */
266
267static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000268check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000269 int set_readline(struct tok_state *, const char *))
270{
Tim Peters17db21f2002-09-03 15:39:58 +0000271 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000273
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000274 if (tok->cont_line)
275 /* It's a continuation line, so it can't be a coding spec. */
276 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000278 if (cs != NULL) {
279 tok->read_coding_spec = 1;
280 if (tok->encoding == NULL) {
281 assert(tok->decoding_state == 1); /* raw */
282 if (strcmp(cs, "utf-8") == 0 ||
283 strcmp(cs, "iso-8859-1") == 0) {
284 tok->encoding = cs;
285 } else {
286 r = set_readline(tok, cs);
287 if (r) {
288 tok->encoding = cs;
289 tok->decoding_state = -1;
290 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000291 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
294 } else { /* then, compare cs with BOM */
295 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000297 }
298 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000299 if (!r) {
300 cs = tok->encoding;
301 if (!cs)
302 cs = "with BOM";
303 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
304 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000305 return r;
306}
307
308/* See whether the file starts with a BOM. If it does,
309 invoke the set_readline function with the new encoding.
310 Return 1 on success, 0 on failure. */
311
312static int
313check_bom(int get_char(struct tok_state *),
314 void unget_char(int, struct tok_state *),
315 int set_readline(struct tok_state *, const char *),
316 struct tok_state *tok)
317{
318 int ch = get_char(tok);
319 tok->decoding_state = 1;
320 if (ch == EOF) {
321 return 1;
322 } else if (ch == 0xEF) {
323 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
324 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
325#if 0
326 /* Disable support for UTF-16 BOMs until a decision
327 is made whether this needs to be supported. */
328 } else if (ch == 0xFE) {
329 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
330 if (!set_readline(tok, "utf-16-be")) return 0;
331 tok->decoding_state = -1;
332 } else if (ch == 0xFF) {
333 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
334 if (!set_readline(tok, "utf-16-le")) return 0;
335 tok->decoding_state = -1;
336#endif
337 } else {
338 unget_char(ch, tok);
339 return 1;
340 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000341 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
344 return 1;
345 NON_BOM:
346 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
347 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
348 return 1;
349}
350
351/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
357 stored the result in tok->decoding_buffer
358 3) PyStringObject *: previous call to fp_readl did not have enough room
359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
361 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363 reached): see tok_nextc and its calls to decoding_fgets.
364*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
366static char *
367fp_readl(char *s, int size, struct tok_state *tok)
368{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000369 PyObject* bufobj = tok->decoding_buffer;
370 const char *buf;
371 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000372
373 /* Ask for one less byte so we can terminate it */
374 assert(size > 0);
375 size--;
376
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000377 if (bufobj == NULL) {
378 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
379 if (bufobj == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000380 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000381 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000382 if (PyObject_AsCharBuffer(bufobj, &buf, &buflen) < 0)
383 return error_ret(tok);
384 if (buflen > size) {
385 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
386 buflen-size);
387 if (tok->decoding_buffer == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388 return error_ret(tok);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000389 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000390 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000391 memcpy(s, buf, buflen);
392 s[buflen] = '\0';
393 if (buflen == 0) return NULL; /* EOF */
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000394 return s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000395}
396
397/* Set the readline function for TOK to a StreamReader's
398 readline function. The StreamReader is named ENC.
399
400 This function is called from check_bom and check_coding_spec.
401
402 ENC is usually identical to the future value of tok->encoding,
403 except for the (currently unsupported) case of UTF-16.
404
405 Return 1 on success, 0 on failure. */
406
407static int
408fp_setreadl(struct tok_state *tok, const char* enc)
409{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000410 PyObject *readline = NULL, *stream = NULL, *io = NULL;
411 int ok = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000412
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000413 io = PyImport_ImportModule("io");
414 if (io == NULL)
415 goto cleanup;
416
417 stream = PyObject_CallMethod(io, "open", "ssis",
418 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000419 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000420 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000421
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000422 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000423 if (readline == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000424 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
426 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000427 ok = 1;
428
429 cleanup:
430 Py_XDECREF(stream);
431 Py_XDECREF(io);
432 return ok;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000433}
434
435/* Fetch the next byte from TOK. */
436
437static int fp_getc(struct tok_state *tok) {
438 return getc(tok->fp);
439}
440
441/* Unfetch the last byte back into TOK. */
442
443static void fp_ungetc(int c, struct tok_state *tok) {
444 ungetc(c, tok->fp);
445}
446
447/* Read a line of input from TOK. Determine encoding
448 if necessary. */
449
450static char *
451decoding_fgets(char *s, int size, struct tok_state *tok)
452{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000453 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000454 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000455 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000456 if (tok->decoding_state < 0) {
457 /* We already have a codec associated with
458 this input. */
459 line = fp_readl(s, size, tok);
460 break;
461 } else if (tok->decoding_state > 0) {
462 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000463 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465 break;
466 } else {
467 /* We have not yet determined the encoding.
468 If an encoding is found, use the file-pointer
469 reader functions from now on. */
470 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
471 return error_ret(tok);
472 assert(tok->decoding_state != 0);
473 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000474 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
476 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
477 return error_ret(tok);
478 }
479 }
480#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000481 /* The default encoding is ASCII, so make sure we don't have any
482 non-ASCII bytes in it. */
483 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000484 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000485 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486 if (*c > 127) {
487 badchar = *c;
488 break;
489 }
490 }
491 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000492 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000493 /* Need to add 1 to the line number, since this line
494 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000495 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000496 "Non-ASCII character '\\x%.2x' "
497 "in file %.200s on line %i, "
498 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000500 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000501 PyErr_SetString(PyExc_SyntaxError, buf);
502 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000503 }
504#endif
505 return line;
506}
507
508static int
509decoding_feof(struct tok_state *tok)
510{
511 if (tok->decoding_state >= 0) {
512 return feof(tok->fp);
513 } else {
514 PyObject* buf = tok->decoding_buffer;
515 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000516 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517 if (buf == NULL) {
518 error_ret(tok);
519 return 1;
520 } else {
521 tok->decoding_buffer = buf;
522 }
523 }
524 return PyObject_Length(buf) == 0;
525 }
526}
527
528/* Fetch a byte from TOK, using the string buffer. */
529
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000530static int
531buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000532 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000533}
534
535/* Unfetch a byte from TOK, using the string buffer. */
536
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000537static void
538buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000539 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000540 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541}
542
543/* Set the readline function for TOK to ENC. For the string-based
544 tokenizer, this means to just record the encoding. */
545
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000546static int
547buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000548 tok->enc = enc;
549 return 1;
550}
551
552/* Return a UTF-8 encoding Python string object from the
553 C byte string STR, which is encoded with ENC. */
554
555static PyObject *
556translate_into_utf8(const char* str, const char* enc) {
557 PyObject *utf8;
558 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
559 if (buf == NULL)
560 return NULL;
561 utf8 = PyUnicode_AsUTF8String(buf);
562 Py_DECREF(buf);
563 return utf8;
564}
565
566/* Decode a byte string STR for use as the buffer of TOK.
567 Look for encoding declarations inside STR, and record them
568 inside TOK. */
569
570static const char *
571decode_str(const char *str, struct tok_state *tok)
572{
573 PyObject* utf8 = NULL;
574 const char *s;
575 int lineno = 0;
576 tok->enc = NULL;
577 tok->str = str;
578 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000579 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000581 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582 if (tok->enc != NULL) {
583 utf8 = translate_into_utf8(str, tok->enc);
584 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000585 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586 str = PyString_AsString(utf8);
587 }
588 for (s = str;; s++) {
589 if (*s == '\0') break;
590 else if (*s == '\n') {
591 lineno++;
592 if (lineno == 2) break;
593 }
594 }
595 tok->enc = NULL;
596 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000597 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598 if (tok->enc != NULL) {
599 assert(utf8 == NULL);
600 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000601 if (utf8 == NULL) {
602 PyErr_Format(PyExc_SyntaxError,
603 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000604 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000605 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606 str = PyString_AsString(utf8);
607 }
608 assert(tok->decoding_buffer == NULL);
609 tok->decoding_buffer = utf8; /* CAUTION */
610 return str;
611}
612
613#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000614
615/* Set up tokenizer for string */
616
617struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000618PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000619{
620 struct tok_state *tok = tok_new();
621 if (tok == NULL)
622 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000624 if (str == NULL) {
625 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000626 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000627 }
628
Martin v. Löwis95292d62002-12-11 14:04:59 +0000629 /* XXX: constify members. */
630 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631 return tok;
632}
633
634
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000635/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636
637struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000638PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000639{
640 struct tok_state *tok = tok_new();
641 if (tok == NULL)
642 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000643 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000644 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000645 return NULL;
646 }
647 tok->cur = tok->inp = tok->buf;
648 tok->end = tok->buf + BUFSIZ;
649 tok->fp = fp;
650 tok->prompt = ps1;
651 tok->nextprompt = ps2;
652 return tok;
653}
654
655
656/* Free a tok_state structure */
657
658void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000659PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000662 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000663#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664 Py_XDECREF(tok->decoding_readline);
665 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000666#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000667 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000668 PyMem_FREE(tok->buf);
669 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000670}
671
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000672/* Get next char, updating state; error code goes into tok->done */
673
674static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000675tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000676{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000677 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000678 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000679 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000680 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000681 if (tok->done != E_OK)
682 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000683 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000684 char *end = strchr(tok->inp, '\n');
685 if (end != NULL)
686 end++;
687 else {
688 end = strchr(tok->inp, '\0');
689 if (end == tok->inp) {
690 tok->done = E_EOF;
691 return EOF;
692 }
693 }
694 if (tok->start == NULL)
695 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000696 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000697 tok->lineno++;
698 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000699 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000700 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000701 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000702 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000703 if (tok->nextprompt != NULL)
704 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000705 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000706 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000707 else if (*newtok == '\0') {
708 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000709 tok->done = E_EOF;
710 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000711 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000712 size_t start = tok->start - tok->buf;
713 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000714 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000715 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000716 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000717 tok->lineno++;
718 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000719 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000720 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000721 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000722 tok->done = E_NOMEM;
723 return EOF;
724 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000725 tok->buf = buf;
726 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000727 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000728 strcpy(tok->buf + oldlen, newtok);
729 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000730 tok->inp = tok->buf + newlen;
731 tok->end = tok->inp + 1;
732 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000733 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000734 else {
735 tok->lineno++;
736 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000737 PyMem_FREE(tok->buf);
738 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000739 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000740 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000741 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000742 tok->inp = strchr(tok->buf, '\0');
743 tok->end = tok->inp + 1;
744 }
745 }
746 else {
747 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000748 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000749 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000750 if (tok->start == NULL) {
751 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000752 tok->buf = (char *)
753 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000754 if (tok->buf == NULL) {
755 tok->done = E_NOMEM;
756 return EOF;
757 }
758 tok->end = tok->buf + BUFSIZ;
759 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000760 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
761 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000762 tok->done = E_EOF;
763 done = 1;
764 }
765 else {
766 tok->done = E_OK;
767 tok->inp = strchr(tok->buf, '\0');
768 done = tok->inp[-1] == '\n';
769 }
770 }
771 else {
772 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000773 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000774 tok->done = E_EOF;
775 done = 1;
776 }
777 else
778 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 }
780 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000781 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000782 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000783 Py_ssize_t curstart = tok->start == NULL ? -1 :
784 tok->start - tok->buf;
785 Py_ssize_t curvalid = tok->inp - tok->buf;
786 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000787 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000788 newbuf = (char *)PyMem_REALLOC(newbuf,
789 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000790 if (newbuf == NULL) {
791 tok->done = E_NOMEM;
792 tok->cur = tok->inp;
793 return EOF;
794 }
795 tok->buf = newbuf;
796 tok->inp = tok->buf + curvalid;
797 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000798 tok->start = curstart < 0 ? NULL :
799 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000800 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000801 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000802 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000803 /* Break out early on decoding
804 errors, as tok->buf will be NULL
805 */
806 if (tok->decoding_erred)
807 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000808 /* Last line does not end in \n,
809 fake one */
810 strcpy(tok->inp, "\n");
811 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000812 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000813 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000814 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000815 if (tok->buf != NULL) {
816 tok->cur = tok->buf + cur;
817 tok->line_start = tok->cur;
818 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000819 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000820 pt = tok->inp - 2;
821 if (pt >= tok->buf && *pt == '\r') {
822 *pt++ = '\n';
823 *pt = '\0';
824 tok->inp = pt;
825 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000826 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000827 }
828 if (tok->done != E_OK) {
829 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000830 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000831 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000832 return EOF;
833 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000834 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000835 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000836}
837
838
839/* Back-up one character */
840
841static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000842tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000843{
844 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000845 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000846 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000847 if (*tok->cur != c)
848 *tok->cur = c;
849 }
850}
851
852
853/* Return the token corresponding to a single character */
854
855int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000856PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857{
858 switch (c) {
859 case '(': return LPAR;
860 case ')': return RPAR;
861 case '[': return LSQB;
862 case ']': return RSQB;
863 case ':': return COLON;
864 case ',': return COMMA;
865 case ';': return SEMI;
866 case '+': return PLUS;
867 case '-': return MINUS;
868 case '*': return STAR;
869 case '/': return SLASH;
870 case '|': return VBAR;
871 case '&': return AMPER;
872 case '<': return LESS;
873 case '>': return GREATER;
874 case '=': return EQUAL;
875 case '.': return DOT;
876 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000877 case '{': return LBRACE;
878 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000879 case '^': return CIRCUMFLEX;
880 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000881 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000882 default: return OP;
883 }
884}
885
886
Guido van Rossumfbab9051991-10-20 20:25:03 +0000887int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000888PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000889{
890 switch (c1) {
891 case '=':
892 switch (c2) {
893 case '=': return EQEQUAL;
894 }
895 break;
896 case '!':
897 switch (c2) {
898 case '=': return NOTEQUAL;
899 }
900 break;
901 case '<':
902 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000903 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000904 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000905 }
906 break;
907 case '>':
908 switch (c2) {
909 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000910 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000911 }
912 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000913 case '+':
914 switch (c2) {
915 case '=': return PLUSEQUAL;
916 }
917 break;
918 case '-':
919 switch (c2) {
920 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000921 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000922 }
923 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000924 case '*':
925 switch (c2) {
926 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000927 case '=': return STAREQUAL;
928 }
929 break;
930 case '/':
931 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000932 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000933 case '=': return SLASHEQUAL;
934 }
935 break;
936 case '|':
937 switch (c2) {
938 case '=': return VBAREQUAL;
939 }
940 break;
941 case '%':
942 switch (c2) {
943 case '=': return PERCENTEQUAL;
944 }
945 break;
946 case '&':
947 switch (c2) {
948 case '=': return AMPEREQUAL;
949 }
950 break;
951 case '^':
952 switch (c2) {
953 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000954 }
955 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000956 }
957 return OP;
958}
959
Thomas Wouters434d0822000-08-24 20:11:32 +0000960int
961PyToken_ThreeChars(int c1, int c2, int c3)
962{
963 switch (c1) {
964 case '<':
965 switch (c2) {
966 case '<':
967 switch (c3) {
968 case '=':
969 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000970 }
971 break;
972 }
973 break;
974 case '>':
975 switch (c2) {
976 case '>':
977 switch (c3) {
978 case '=':
979 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000980 }
981 break;
982 }
983 break;
984 case '*':
985 switch (c2) {
986 case '*':
987 switch (c3) {
988 case '=':
989 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000990 }
991 break;
992 }
993 break;
Guido van Rossum4668b002001-08-08 05:00:18 +0000994 case '/':
995 switch (c2) {
996 case '/':
997 switch (c3) {
998 case '=':
999 return DOUBLESLASHEQUAL;
1000 }
1001 break;
1002 }
1003 break;
Georg Brandldde00282007-03-18 19:01:53 +00001004 case '.':
1005 switch (c2) {
1006 case '.':
1007 switch (c3) {
1008 case '.':
1009 return ELLIPSIS;
1010 }
1011 break;
1012 }
1013 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001014 }
1015 return OP;
1016}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001017
Guido van Rossum926f13a1998-04-09 21:38:06 +00001018static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001019indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001020{
1021 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001022 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001023 tok->cur = tok->inp;
1024 return 1;
1025 }
1026 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001027 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1028 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001029 tok->altwarning = 0;
1030 }
1031 return 0;
1032}
1033
1034
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001035/* Get next token, after space stripping etc. */
1036
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001037static int
1038tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001039{
1040 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001041 int blankline;
1042
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001043 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001044 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001045 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001046 blankline = 0;
1047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001048 /* Get indentation level */
1049 if (tok->atbol) {
1050 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001051 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001052 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001053 for (;;) {
1054 c = tok_nextc(tok);
1055 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001056 col++, altcol++;
1057 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001058 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001059 altcol = (altcol/tok->alttabsize + 1)
1060 * tok->alttabsize;
1061 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001062 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001063 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001064 else
1065 break;
1066 }
1067 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001068 if (c == '#' || c == '\n') {
1069 /* Lines with only whitespace and/or comments
1070 shouldn't affect the indentation and are
1071 not passed to the parser as NEWLINE tokens,
1072 except *totally* empty lines in interactive
1073 mode, which signal the end of a command group. */
1074 if (col == 0 && c == '\n' && tok->prompt != NULL)
1075 blankline = 0; /* Let it through */
1076 else
1077 blankline = 1; /* Ignore completely */
1078 /* We can't jump back right here since we still
1079 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001080 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001081 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001082 if (col == tok->indstack[tok->indent]) {
1083 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001084 if (altcol != tok->altindstack[tok->indent]) {
1085 if (indenterror(tok))
1086 return ERRORTOKEN;
1087 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001088 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001089 else if (col > tok->indstack[tok->indent]) {
1090 /* Indent -- always one */
1091 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001092 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001093 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001094 return ERRORTOKEN;
1095 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001096 if (altcol <= tok->altindstack[tok->indent]) {
1097 if (indenterror(tok))
1098 return ERRORTOKEN;
1099 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001100 tok->pendin++;
1101 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001102 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001104 else /* col < tok->indstack[tok->indent] */ {
1105 /* Dedent -- any number, must be consistent */
1106 while (tok->indent > 0 &&
1107 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001108 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001109 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001110 }
1111 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001112 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001113 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001114 return ERRORTOKEN;
1115 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001116 if (altcol != tok->altindstack[tok->indent]) {
1117 if (indenterror(tok))
1118 return ERRORTOKEN;
1119 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001120 }
1121 }
1122 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001123
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001124 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001125
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001126 /* Return pending indents/dedents */
1127 if (tok->pendin != 0) {
1128 if (tok->pendin < 0) {
1129 tok->pendin++;
1130 return DEDENT;
1131 }
1132 else {
1133 tok->pendin--;
1134 return INDENT;
1135 }
1136 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001137
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001138 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001139 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001140 /* Skip spaces */
1141 do {
1142 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001143 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001144
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001145 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001146 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001147
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001148 /* Skip comment */
1149 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001150 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001151 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001152
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001153 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001154 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001155 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001156 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001157
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001158 /* Identifier (most frequent token!) */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001159 if (is_potential_identifier_start(c)) {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001160 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001161 switch (c) {
1162 case 'r':
1163 case 'R':
1164 c = tok_nextc(tok);
1165 if (c == '"' || c == '\'')
1166 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001167 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001168 case 'b':
1169 case 'B':
1170 c = tok_nextc(tok);
1171 if (c == 'r' || c == 'R')
1172 c = tok_nextc(tok);
1173 if (c == '"' || c == '\'')
1174 goto letter_quote;
1175 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001176 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001177 while (is_potential_identifier_char(c)) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001178 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001179 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001180 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001181 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001182 *p_end = tok->cur;
1183 return NAME;
1184 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001185
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001186 /* Newline */
1187 if (c == '\n') {
1188 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001189 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001190 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001191 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001192 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001193 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001194 return NEWLINE;
1195 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001196
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001197 /* Period or number starting with period? */
1198 if (c == '.') {
1199 c = tok_nextc(tok);
1200 if (isdigit(c)) {
1201 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001202 } else if (c == '.') {
1203 c = tok_nextc(tok);
1204 if (c == '.') {
1205 *p_start = tok->start;
1206 *p_end = tok->cur;
1207 return ELLIPSIS;
1208 } else {
1209 tok_backup(tok, c);
1210 }
1211 tok_backup(tok, '.');
1212 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001213 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001214 }
Georg Brandldde00282007-03-18 19:01:53 +00001215 *p_start = tok->start;
1216 *p_end = tok->cur;
1217 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001218 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001219
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220 /* Number */
1221 if (isdigit(c)) {
1222 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001223 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001224 c = tok_nextc(tok);
1225 if (c == '.')
1226 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001227#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001228 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001229 goto imaginary;
1230#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001231 if (c == 'x' || c == 'X') {
1232 /* Hex */
1233 do {
1234 c = tok_nextc(tok);
1235 } while (isxdigit(c));
1236 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001237 else if (c == 'o' || c == 'O') {
1238 /* Octal */
1239 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001240 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001241 } while ('0' <= c && c < '8');
1242 }
1243 else if (c == 'b' || c == 'B') {
1244 /* Binary */
1245 do {
1246 c = tok_nextc(tok);
1247 } while (c == '0' || c == '1');
1248 }
1249 else {
1250 int nonzero = 0;
1251 /* maybe old-style octal; c is first char of it */
1252 /* in any case, allow '0' as a literal */
1253 while (c == '0')
1254 c = tok_nextc(tok);
1255 while (isdigit(c)) {
1256 nonzero = 1;
1257 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001258 }
1259 if (c == '.')
1260 goto fraction;
1261 else if (c == 'e' || c == 'E')
1262 goto exponent;
1263#ifndef WITHOUT_COMPLEX
1264 else if (c == 'j' || c == 'J')
1265 goto imaginary;
1266#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001267 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001268 tok->done = E_TOKEN;
1269 tok_backup(tok, c);
1270 return ERRORTOKEN;
1271 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 }
1273 }
1274 else {
1275 /* Decimal */
1276 do {
1277 c = tok_nextc(tok);
1278 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001279 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001280 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001281 if (c == '.') {
1282 fraction:
1283 /* Fraction */
1284 do {
1285 c = tok_nextc(tok);
1286 } while (isdigit(c));
1287 }
1288 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001289 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001290 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001291 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001292 if (c == '+' || c == '-')
1293 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001294 if (!isdigit(c)) {
1295 tok->done = E_TOKEN;
1296 tok_backup(tok, c);
1297 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001298 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001299 do {
1300 c = tok_nextc(tok);
1301 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001303#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001304 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001305 /* Imaginary part */
1306 imaginary:
1307 c = tok_nextc(tok);
1308#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 }
1310 }
1311 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001312 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 *p_end = tok->cur;
1314 return NUMBER;
1315 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001316
1317 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001318 /* String */
1319 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001320 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001321 int quote = c;
1322 int triple = 0;
1323 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 for (;;) {
1325 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001326 if (c == '\n') {
1327 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001328 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001329 tok_backup(tok, c);
1330 return ERRORTOKEN;
1331 }
1332 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001333 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001334 }
1335 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001336 if (triple)
1337 tok->done = E_EOFS;
1338 else
1339 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001340 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001341 return ERRORTOKEN;
1342 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001343 else if (c == quote) {
1344 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001345 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001346 c = tok_nextc(tok);
1347 if (c == quote) {
1348 triple = 1;
1349 tripcount = 0;
1350 continue;
1351 }
1352 tok_backup(tok, c);
1353 }
1354 if (!triple || tripcount == 3)
1355 break;
1356 }
1357 else if (c == '\\') {
1358 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001359 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001360 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001361 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001362 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363 return ERRORTOKEN;
1364 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001366 else
1367 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001369 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001370 *p_end = tok->cur;
1371 return STRING;
1372 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001373
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001374 /* Line continuation */
1375 if (c == '\\') {
1376 c = tok_nextc(tok);
1377 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001378 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001379 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 return ERRORTOKEN;
1381 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001382 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001383 goto again; /* Read next line */
1384 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001385
Guido van Rossumfbab9051991-10-20 20:25:03 +00001386 /* Check for two-character token */
1387 {
1388 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001389 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001390 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001391 int c3 = tok_nextc(tok);
1392 int token3 = PyToken_ThreeChars(c, c2, c3);
1393 if (token3 != OP) {
1394 token = token3;
1395 } else {
1396 tok_backup(tok, c3);
1397 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001399 *p_end = tok->cur;
1400 return token;
1401 }
1402 tok_backup(tok, c2);
1403 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001404
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001405 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001406 switch (c) {
1407 case '(':
1408 case '[':
1409 case '{':
1410 tok->level++;
1411 break;
1412 case ')':
1413 case ']':
1414 case '}':
1415 tok->level--;
1416 break;
1417 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001418
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001419 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001420 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001422 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423}
1424
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001425int
1426PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1427{
1428 int result = tok_get(tok, p_start, p_end);
1429 if (tok->decoding_erred) {
1430 result = ERRORTOKEN;
1431 tok->done = E_DECODE;
1432 }
1433 return result;
1434}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001435
Guido van Rossum408027e1996-12-30 16:17:54 +00001436#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001437
1438void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001439tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001440{
Guido van Rossum86bea461997-04-29 21:03:06 +00001441 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001442 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1443 printf("(%.*s)", (int)(end - start), start);
1444}
1445
1446#endif