blob: e7dada63bc7074d0b5db8400ac05b9c6b2d82447 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True when C may begin an identifier: an ASCII letter or underscore.
   The argument is parenthesized at every use so that an expression
   argument (e.g. a conditional expression) is compared as a whole.
   Note that C is still evaluated more than once, so it must not have
   side effects. */
#define is_potential_identifier_start(c) (\
                          ((c) >= 'a' && (c) <= 'z')\
                       || ((c) >= 'A' && (c) <= 'Z')\
                       || (c) == '_')
25
/* True when C may appear inside an identifier: an ASCII letter, an
   ASCII digit or underscore.  The argument is parenthesized at every
   use so that an expression argument is compared as a whole.  C is
   still evaluated more than once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
                          ((c) >= 'a' && (c) <= 'z')\
                       || ((c) >= 'A' && (c) <= 'Z')\
                       || ((c) >= '0' && (c) <= '9')\
                       || (c) == '_')
31
Martin v. Löwis566f6af2002-10-26 14:39:10 +000032extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000033/* Return malloc'ed string including trailing \n;
34 empty malloc'ed string for EOF;
35 NULL if interrupted */
36
Guido van Rossum4fe87291992-02-26 15:24:44 +000037/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000038#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000039
/* Py_CHARMASK(c): turn a char that may be signed into a nonnegative
   int.  When the compiler announces that plain char is unsigned the
   value is passed through untouched; otherwise the low eight bits are
   kept.
   XXX The masking form assumes characters are 8 bits wide. */
#if defined(__CHAR_UNSIGNED__)
#  define Py_CHARMASK(c)    (c)
#else
#  define Py_CHARMASK(c)    ((c) & 0xff)
#endif
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000049static struct tok_state *tok_new(void);
50static int tok_nextc(struct tok_state *tok);
51static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000052
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* basic tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* more operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* later additions */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",

    /* pseudo tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
113
114
115/* Create and initialize a new tok_state structure */
116
117static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000118tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000119{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000120 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
121 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122 if (tok == NULL)
123 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000124 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 tok->done = E_OK;
126 tok->fp = NULL;
127 tok->tabsize = TABSIZE;
128 tok->indent = 0;
129 tok->indstack[0] = 0;
130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000134 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000135 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000136 tok->altwarning = 1;
137 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->alttabsize = 1;
139 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->decoding_state = 0;
141 tok->decoding_erred = 0;
142 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->decoding_readline = NULL;
147 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000149 return tok;
150}
151
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000152#ifdef PGEN
153
154static char *
155decoding_fgets(char *s, int size, struct tok_state *tok)
156{
157 return fgets(s, size, tok->fp);
158}
159
160static int
161decoding_feof(struct tok_state *tok)
162{
163 return feof(tok->fp);
164}
165
/* PGEN build: the input string is used as is; TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
171
172#else /* PGEN */
173
174static char *
175error_ret(struct tok_state *tok) /* XXX */
176{
177 tok->decoding_erred = 1;
178 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000179 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180 tok->buf = NULL;
181 return NULL; /* as if it were EOF */
182}
183
184static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000186{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 if (result != NULL) {
189 memcpy(result, s, len);
190 result[len] = '\0';
191 }
192 return result;
193}
194
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"); otherwise returns S itself, so callers can detect
   "no change" by pointer comparison.  The returned literals must not
   be modified or freed.

   A NULL argument (e.g. a failed allocation in the caller) is passed
   through as NULL instead of being dereferenced. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    if (s == NULL)
        return NULL;

    for (i = 0; i < 12; i++) {
        /* <ctype.h> functions need a value representable as unsigned
           char; a negative plain char would be undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
217
218/* Return the coding spec in S, or NULL if none is found. */
219
220static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000221get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000224 /* Coding spec must be in a comment, and that comment must be
225 * the only statement on the source code line. */
226 for (i = 0; i < size - 6; i++) {
227 if (s[i] == '#')
228 break;
229 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
230 return NULL;
231 }
232 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 const char* t = s + i;
234 if (strncmp(t, "coding", 6) == 0) {
235 const char* begin = NULL;
236 t += 6;
237 if (t[0] != ':' && t[0] != '=')
238 continue;
239 do {
240 t++;
241 } while (t[0] == '\x20' || t[0] == '\t');
242
243 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000244 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000245 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246 t++;
247
248 if (begin < t) {
249 char* r = new_string(begin, t - begin);
250 char* q = get_normal_name(r);
251 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000253 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254 }
255 return r;
256 }
257 }
258 }
259 return NULL;
260}
261
262/* Check whether the line contains a coding spec. If it does,
263 invoke the set_readline function for the new encoding.
264 This function receives the tok_state and the new encoding.
265 Return 1 on success, 0 on failure. */
266
267static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000268check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000269 int set_readline(struct tok_state *, const char *))
270{
Tim Peters17db21f2002-09-03 15:39:58 +0000271 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000273
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000274 if (tok->cont_line)
275 /* It's a continuation line, so it can't be a coding spec. */
276 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000278 if (cs != NULL) {
279 tok->read_coding_spec = 1;
280 if (tok->encoding == NULL) {
281 assert(tok->decoding_state == 1); /* raw */
282 if (strcmp(cs, "utf-8") == 0 ||
283 strcmp(cs, "iso-8859-1") == 0) {
284 tok->encoding = cs;
285 } else {
286 r = set_readline(tok, cs);
287 if (r) {
288 tok->encoding = cs;
289 tok->decoding_state = -1;
290 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000291 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
294 } else { /* then, compare cs with BOM */
295 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000297 }
298 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000299 if (!r) {
300 cs = tok->encoding;
301 if (!cs)
302 cs = "with BOM";
303 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
304 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000305 return r;
306}
307
308/* See whether the file starts with a BOM. If it does,
309 invoke the set_readline function with the new encoding.
310 Return 1 on success, 0 on failure. */
311
312static int
313check_bom(int get_char(struct tok_state *),
314 void unget_char(int, struct tok_state *),
315 int set_readline(struct tok_state *, const char *),
316 struct tok_state *tok)
317{
318 int ch = get_char(tok);
319 tok->decoding_state = 1;
320 if (ch == EOF) {
321 return 1;
322 } else if (ch == 0xEF) {
323 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
324 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
325#if 0
326 /* Disable support for UTF-16 BOMs until a decision
327 is made whether this needs to be supported. */
328 } else if (ch == 0xFE) {
329 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
330 if (!set_readline(tok, "utf-16-be")) return 0;
331 tok->decoding_state = -1;
332 } else if (ch == 0xFF) {
333 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
334 if (!set_readline(tok, "utf-16-le")) return 0;
335 tok->decoding_state = -1;
336#endif
337 } else {
338 unget_char(ch, tok);
339 return 1;
340 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000341 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
344 return 1;
345 NON_BOM:
346 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
347 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
348 return 1;
349}
350
351/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
357 stored the result in tok->decoding_buffer
358 3) PyStringObject *: previous call to fp_readl did not have enough room
359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
361 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363 reached): see tok_nextc and its calls to decoding_fgets.
364*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
366static char *
367fp_readl(char *s, int size, struct tok_state *tok)
368{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000369 PyObject* bufobj = tok->decoding_buffer;
370 const char *buf;
371 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000372
373 /* Ask for one less byte so we can terminate it */
374 assert(size > 0);
375 size--;
376
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000377 if (bufobj == NULL) {
378 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
379 if (bufobj == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000380 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000381 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000382 if (PyObject_AsCharBuffer(bufobj, &buf, &buflen) < 0)
383 return error_ret(tok);
384 if (buflen > size) {
385 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
386 buflen-size);
387 if (tok->decoding_buffer == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388 return error_ret(tok);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000389 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000390 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000391 memcpy(s, buf, buflen);
392 s[buflen] = '\0';
393 if (buflen == 0) return NULL; /* EOF */
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000394 return s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000395}
396
397/* Set the readline function for TOK to a StreamReader's
398 readline function. The StreamReader is named ENC.
399
400 This function is called from check_bom and check_coding_spec.
401
402 ENC is usually identical to the future value of tok->encoding,
403 except for the (currently unsupported) case of UTF-16.
404
405 Return 1 on success, 0 on failure. */
406
407static int
408fp_setreadl(struct tok_state *tok, const char* enc)
409{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000410 PyObject *readline = NULL, *stream = NULL, *io = NULL;
411 int ok = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000412
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000413 io = PyImport_ImportModule("io");
414 if (io == NULL)
415 goto cleanup;
416
417 stream = PyObject_CallMethod(io, "open", "ssis",
418 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000419 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000420 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000421
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000422 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000423 if (readline == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000424 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
426 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000427 ok = 1;
428
429 cleanup:
430 Py_XDECREF(stream);
431 Py_XDECREF(io);
432 return ok;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000433}
434
435/* Fetch the next byte from TOK. */
436
437static int fp_getc(struct tok_state *tok) {
438 return getc(tok->fp);
439}
440
441/* Unfetch the last byte back into TOK. */
442
443static void fp_ungetc(int c, struct tok_state *tok) {
444 ungetc(c, tok->fp);
445}
446
447/* Read a line of input from TOK. Determine encoding
448 if necessary. */
449
450static char *
451decoding_fgets(char *s, int size, struct tok_state *tok)
452{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000453 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000454 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000455 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000456 if (tok->decoding_state < 0) {
457 /* We already have a codec associated with
458 this input. */
459 line = fp_readl(s, size, tok);
460 break;
461 } else if (tok->decoding_state > 0) {
462 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000463 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465 break;
466 } else {
467 /* We have not yet determined the encoding.
468 If an encoding is found, use the file-pointer
469 reader functions from now on. */
470 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
471 return error_ret(tok);
472 assert(tok->decoding_state != 0);
473 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000474 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
476 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
477 return error_ret(tok);
478 }
479 }
480#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000481 /* The default encoding is ASCII, so make sure we don't have any
482 non-ASCII bytes in it. */
483 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000484 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000485 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000486 if (*c > 127) {
487 badchar = *c;
488 break;
489 }
490 }
491 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000492 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000493 /* Need to add 1 to the line number, since this line
494 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000495 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000496 "Non-ASCII character '\\x%.2x' "
497 "in file %.200s on line %i, "
498 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000500 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000501 PyErr_SetString(PyExc_SyntaxError, buf);
502 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000503 }
504#endif
505 return line;
506}
507
508static int
509decoding_feof(struct tok_state *tok)
510{
511 if (tok->decoding_state >= 0) {
512 return feof(tok->fp);
513 } else {
514 PyObject* buf = tok->decoding_buffer;
515 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000516 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000517 if (buf == NULL) {
518 error_ret(tok);
519 return 1;
520 } else {
521 tok->decoding_buffer = buf;
522 }
523 }
524 return PyObject_Length(buf) == 0;
525 }
526}
527
528/* Fetch a byte from TOK, using the string buffer. */
529
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000530static int
531buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000532 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000533}
534
535/* Unfetch a byte from TOK, using the string buffer. */
536
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000537static void
538buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000539 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000540 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000541}
542
543/* Set the readline function for TOK to ENC. For the string-based
544 tokenizer, this means to just record the encoding. */
545
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000546static int
547buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000548 tok->enc = enc;
549 return 1;
550}
551
552/* Return a UTF-8 encoding Python string object from the
553 C byte string STR, which is encoded with ENC. */
554
555static PyObject *
556translate_into_utf8(const char* str, const char* enc) {
557 PyObject *utf8;
558 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
559 if (buf == NULL)
560 return NULL;
561 utf8 = PyUnicode_AsUTF8String(buf);
562 Py_DECREF(buf);
563 return utf8;
564}
565
566/* Decode a byte string STR for use as the buffer of TOK.
567 Look for encoding declarations inside STR, and record them
568 inside TOK. */
569
570static const char *
571decode_str(const char *str, struct tok_state *tok)
572{
573 PyObject* utf8 = NULL;
574 const char *s;
575 int lineno = 0;
576 tok->enc = NULL;
577 tok->str = str;
578 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000579 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000581 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000582 if (tok->enc != NULL) {
583 utf8 = translate_into_utf8(str, tok->enc);
584 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000585 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586 str = PyString_AsString(utf8);
587 }
588 for (s = str;; s++) {
589 if (*s == '\0') break;
590 else if (*s == '\n') {
591 lineno++;
592 if (lineno == 2) break;
593 }
594 }
595 tok->enc = NULL;
596 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000597 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598 if (tok->enc != NULL) {
599 assert(utf8 == NULL);
600 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000601 if (utf8 == NULL) {
602 PyErr_Format(PyExc_SyntaxError,
603 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000604 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000605 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606 str = PyString_AsString(utf8);
607 }
608 assert(tok->decoding_buffer == NULL);
609 tok->decoding_buffer = utf8; /* CAUTION */
610 return str;
611}
612
613#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000614
615/* Set up tokenizer for string */
616
617struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000618PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000619{
620 struct tok_state *tok = tok_new();
621 if (tok == NULL)
622 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000624 if (str == NULL) {
625 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000626 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000627 }
628
Martin v. Löwis95292d62002-12-11 14:04:59 +0000629 /* XXX: constify members. */
630 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000631 return tok;
632}
633
634
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000635/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000636
637struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000638PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000639{
640 struct tok_state *tok = tok_new();
641 if (tok == NULL)
642 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000643 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000644 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000645 return NULL;
646 }
647 tok->cur = tok->inp = tok->buf;
648 tok->end = tok->buf + BUFSIZ;
649 tok->fp = fp;
650 tok->prompt = ps1;
651 tok->nextprompt = ps2;
652 return tok;
653}
654
655
656/* Free a tok_state structure */
657
658void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000659PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000661 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000662 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000663#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000664 Py_XDECREF(tok->decoding_readline);
665 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000666#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000667 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000668 PyMem_FREE(tok->buf);
669 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000670}
671
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000672#if !defined(PGEN)
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000673static int
674tok_stdin_decode(struct tok_state *tok, char **inp)
675{
676 PyObject *enc, *sysstdin, *decoded, *utf8;
677 const char *encoding;
678 char *converted;
679
680 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
681 return 0;
682 sysstdin = PySys_GetObject("stdin");
683 if (sysstdin == NULL || !PyFile_Check(sysstdin))
684 return 0;
685
686 enc = ((PyFileObject *)sysstdin)->f_encoding;
687 if (enc == NULL || !PyString_Check(enc))
688 return 0;
689 Py_INCREF(enc);
690
691 encoding = PyString_AsString(enc);
692 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
693 if (decoded == NULL)
694 goto error_clear;
695
696 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
697 Py_DECREF(decoded);
698 if (utf8 == NULL)
699 goto error_clear;
700
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000701 assert(PyBytes_Check(utf8));
702 converted = new_string(PyBytes_AS_STRING(utf8),
703 PyBytes_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000704 Py_DECREF(utf8);
705 if (converted == NULL)
706 goto error_nomem;
707
708 PyMem_FREE(*inp);
709 *inp = converted;
710 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000711 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000712 tok->encoding = new_string(encoding, strlen(encoding));
713 if (tok->encoding == NULL)
714 goto error_nomem;
715
716 Py_DECREF(enc);
717 return 0;
718
719error_nomem:
720 Py_DECREF(enc);
721 tok->done = E_NOMEM;
722 return -1;
723
724error_clear:
725 /* Fallback to iso-8859-1: for backward compatibility */
726 Py_DECREF(enc);
727 PyErr_Clear();
728 return 0;
729}
730#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000731
732/* Get next char, updating state; error code goes into tok->done */
733
734static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000735tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000736{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000737 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000738 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000739 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000740 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000741 if (tok->done != E_OK)
742 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000743 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000744 char *end = strchr(tok->inp, '\n');
745 if (end != NULL)
746 end++;
747 else {
748 end = strchr(tok->inp, '\0');
749 if (end == tok->inp) {
750 tok->done = E_EOF;
751 return EOF;
752 }
753 }
754 if (tok->start == NULL)
755 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000756 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000757 tok->lineno++;
758 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000759 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000761 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000762 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000763 if (tok->nextprompt != NULL)
764 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000765 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000766 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000767 else if (*newtok == '\0') {
768 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769 tok->done = E_EOF;
770 }
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000771#if !defined(PGEN)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000772 else if (tok_stdin_decode(tok, &newtok) != 0)
773 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000774#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000775 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000776 size_t start = tok->start - tok->buf;
777 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000778 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000779 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000780 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000781 tok->lineno++;
782 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000783 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000784 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000785 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000786 tok->done = E_NOMEM;
787 return EOF;
788 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000789 tok->buf = buf;
790 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000791 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000792 strcpy(tok->buf + oldlen, newtok);
793 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000794 tok->inp = tok->buf + newlen;
795 tok->end = tok->inp + 1;
796 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000797 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000798 else {
799 tok->lineno++;
800 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000801 PyMem_FREE(tok->buf);
802 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000803 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000804 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000805 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000806 tok->inp = strchr(tok->buf, '\0');
807 tok->end = tok->inp + 1;
808 }
809 }
810 else {
811 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000812 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000813 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000814 if (tok->start == NULL) {
815 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000816 tok->buf = (char *)
817 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000818 if (tok->buf == NULL) {
819 tok->done = E_NOMEM;
820 return EOF;
821 }
822 tok->end = tok->buf + BUFSIZ;
823 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000824 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
825 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000826 tok->done = E_EOF;
827 done = 1;
828 }
829 else {
830 tok->done = E_OK;
831 tok->inp = strchr(tok->buf, '\0');
832 done = tok->inp[-1] == '\n';
833 }
834 }
835 else {
836 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000837 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000838 tok->done = E_EOF;
839 done = 1;
840 }
841 else
842 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000843 }
844 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000845 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000846 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000847 Py_ssize_t curstart = tok->start == NULL ? -1 :
848 tok->start - tok->buf;
849 Py_ssize_t curvalid = tok->inp - tok->buf;
850 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000851 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000852 newbuf = (char *)PyMem_REALLOC(newbuf,
853 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000854 if (newbuf == NULL) {
855 tok->done = E_NOMEM;
856 tok->cur = tok->inp;
857 return EOF;
858 }
859 tok->buf = newbuf;
860 tok->inp = tok->buf + curvalid;
861 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 tok->start = curstart < 0 ? NULL :
863 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000864 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000865 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000866 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000867 /* Break out early on decoding
868 errors, as tok->buf will be NULL
869 */
870 if (tok->decoding_erred)
871 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000872 /* Last line does not end in \n,
873 fake one */
874 strcpy(tok->inp, "\n");
875 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000876 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000877 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000878 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000879 if (tok->buf != NULL) {
880 tok->cur = tok->buf + cur;
881 tok->line_start = tok->cur;
882 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000883 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000884 pt = tok->inp - 2;
885 if (pt >= tok->buf && *pt == '\r') {
886 *pt++ = '\n';
887 *pt = '\0';
888 tok->inp = pt;
889 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000890 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000891 }
892 if (tok->done != E_OK) {
893 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000894 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000895 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896 return EOF;
897 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000898 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000899 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000900}
901
902
903/* Back-up one character */
904
905static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000906tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000907{
908 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000909 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000910 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000911 if (*tok->cur != c)
912 *tok->cur = c;
913 }
914}
915
916
917/* Return the token corresponding to a single character */
918
919int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000920PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000921{
922 switch (c) {
923 case '(': return LPAR;
924 case ')': return RPAR;
925 case '[': return LSQB;
926 case ']': return RSQB;
927 case ':': return COLON;
928 case ',': return COMMA;
929 case ';': return SEMI;
930 case '+': return PLUS;
931 case '-': return MINUS;
932 case '*': return STAR;
933 case '/': return SLASH;
934 case '|': return VBAR;
935 case '&': return AMPER;
936 case '<': return LESS;
937 case '>': return GREATER;
938 case '=': return EQUAL;
939 case '.': return DOT;
940 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000941 case '{': return LBRACE;
942 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000943 case '^': return CIRCUMFLEX;
944 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000945 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000946 default: return OP;
947 }
948}
949
950
Guido van Rossumfbab9051991-10-20 20:25:03 +0000951int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000952PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000953{
954 switch (c1) {
955 case '=':
956 switch (c2) {
957 case '=': return EQEQUAL;
958 }
959 break;
960 case '!':
961 switch (c2) {
962 case '=': return NOTEQUAL;
963 }
964 break;
965 case '<':
966 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000967 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000968 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000969 }
970 break;
971 case '>':
972 switch (c2) {
973 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000974 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000975 }
976 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000977 case '+':
978 switch (c2) {
979 case '=': return PLUSEQUAL;
980 }
981 break;
982 case '-':
983 switch (c2) {
984 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000985 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000986 }
987 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000988 case '*':
989 switch (c2) {
990 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000991 case '=': return STAREQUAL;
992 }
993 break;
994 case '/':
995 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000996 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000997 case '=': return SLASHEQUAL;
998 }
999 break;
1000 case '|':
1001 switch (c2) {
1002 case '=': return VBAREQUAL;
1003 }
1004 break;
1005 case '%':
1006 switch (c2) {
1007 case '=': return PERCENTEQUAL;
1008 }
1009 break;
1010 case '&':
1011 switch (c2) {
1012 case '=': return AMPEREQUAL;
1013 }
1014 break;
1015 case '^':
1016 switch (c2) {
1017 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001018 }
1019 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001020 }
1021 return OP;
1022}
1023
Thomas Wouters434d0822000-08-24 20:11:32 +00001024int
1025PyToken_ThreeChars(int c1, int c2, int c3)
1026{
1027 switch (c1) {
1028 case '<':
1029 switch (c2) {
1030 case '<':
1031 switch (c3) {
1032 case '=':
1033 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001034 }
1035 break;
1036 }
1037 break;
1038 case '>':
1039 switch (c2) {
1040 case '>':
1041 switch (c3) {
1042 case '=':
1043 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001044 }
1045 break;
1046 }
1047 break;
1048 case '*':
1049 switch (c2) {
1050 case '*':
1051 switch (c3) {
1052 case '=':
1053 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001054 }
1055 break;
1056 }
1057 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001058 case '/':
1059 switch (c2) {
1060 case '/':
1061 switch (c3) {
1062 case '=':
1063 return DOUBLESLASHEQUAL;
1064 }
1065 break;
1066 }
1067 break;
Georg Brandldde00282007-03-18 19:01:53 +00001068 case '.':
1069 switch (c2) {
1070 case '.':
1071 switch (c3) {
1072 case '.':
1073 return ELLIPSIS;
1074 }
1075 break;
1076 }
1077 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001078 }
1079 return OP;
1080}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001081
Guido van Rossum926f13a1998-04-09 21:38:06 +00001082static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001083indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001084{
1085 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001086 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001087 tok->cur = tok->inp;
1088 return 1;
1089 }
1090 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001091 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1092 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001093 tok->altwarning = 0;
1094 }
1095 return 0;
1096}
1097
1098
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001099/* Get next token, after space stripping etc. */
1100
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001101static int
1102tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103{
1104 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001105 int blankline;
1106
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001107 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001108 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001109 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001110 blankline = 0;
1111
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001112 /* Get indentation level */
1113 if (tok->atbol) {
1114 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001115 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001116 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117 for (;;) {
1118 c = tok_nextc(tok);
1119 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001120 col++, altcol++;
1121 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001122 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001123 altcol = (altcol/tok->alttabsize + 1)
1124 * tok->alttabsize;
1125 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001126 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001127 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001128 else
1129 break;
1130 }
1131 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001132 if (c == '#' || c == '\n') {
1133 /* Lines with only whitespace and/or comments
1134 shouldn't affect the indentation and are
1135 not passed to the parser as NEWLINE tokens,
1136 except *totally* empty lines in interactive
1137 mode, which signal the end of a command group. */
1138 if (col == 0 && c == '\n' && tok->prompt != NULL)
1139 blankline = 0; /* Let it through */
1140 else
1141 blankline = 1; /* Ignore completely */
1142 /* We can't jump back right here since we still
1143 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001144 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001145 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001146 if (col == tok->indstack[tok->indent]) {
1147 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001148 if (altcol != tok->altindstack[tok->indent]) {
1149 if (indenterror(tok))
1150 return ERRORTOKEN;
1151 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001152 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153 else if (col > tok->indstack[tok->indent]) {
1154 /* Indent -- always one */
1155 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001156 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001157 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001158 return ERRORTOKEN;
1159 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001160 if (altcol <= tok->altindstack[tok->indent]) {
1161 if (indenterror(tok))
1162 return ERRORTOKEN;
1163 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001164 tok->pendin++;
1165 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001166 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001167 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001168 else /* col < tok->indstack[tok->indent] */ {
1169 /* Dedent -- any number, must be consistent */
1170 while (tok->indent > 0 &&
1171 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001172 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001173 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001174 }
1175 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001176 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001177 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001178 return ERRORTOKEN;
1179 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001180 if (altcol != tok->altindstack[tok->indent]) {
1181 if (indenterror(tok))
1182 return ERRORTOKEN;
1183 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001184 }
1185 }
1186 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001187
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001188 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001189
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001190 /* Return pending indents/dedents */
1191 if (tok->pendin != 0) {
1192 if (tok->pendin < 0) {
1193 tok->pendin++;
1194 return DEDENT;
1195 }
1196 else {
1197 tok->pendin--;
1198 return INDENT;
1199 }
1200 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001201
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001202 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001203 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 /* Skip spaces */
1205 do {
1206 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001207 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001208
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001210 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001211
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001212 /* Skip comment */
1213 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001214 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001215 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001216
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001218 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001219 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001220 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001221
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 /* Identifier (most frequent token!) */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001223 if (is_potential_identifier_start(c)) {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001224 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001225 switch (c) {
1226 case 'r':
1227 case 'R':
1228 c = tok_nextc(tok);
1229 if (c == '"' || c == '\'')
1230 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001231 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001232 case 'b':
1233 case 'B':
1234 c = tok_nextc(tok);
1235 if (c == 'r' || c == 'R')
1236 c = tok_nextc(tok);
1237 if (c == '"' || c == '\'')
1238 goto letter_quote;
1239 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001240 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001241 while (is_potential_identifier_char(c)) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001242 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001243 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001244 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001245 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001246 *p_end = tok->cur;
1247 return NAME;
1248 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001249
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001250 /* Newline */
1251 if (c == '\n') {
1252 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001253 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001254 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001255 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001256 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001257 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001258 return NEWLINE;
1259 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001260
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001261 /* Period or number starting with period? */
1262 if (c == '.') {
1263 c = tok_nextc(tok);
1264 if (isdigit(c)) {
1265 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001266 } else if (c == '.') {
1267 c = tok_nextc(tok);
1268 if (c == '.') {
1269 *p_start = tok->start;
1270 *p_end = tok->cur;
1271 return ELLIPSIS;
1272 } else {
1273 tok_backup(tok, c);
1274 }
1275 tok_backup(tok, '.');
1276 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001277 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001278 }
Georg Brandldde00282007-03-18 19:01:53 +00001279 *p_start = tok->start;
1280 *p_end = tok->cur;
1281 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001282 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001283
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 /* Number */
1285 if (isdigit(c)) {
1286 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001287 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 c = tok_nextc(tok);
1289 if (c == '.')
1290 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001291#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001292 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001293 goto imaginary;
1294#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001295 if (c == 'x' || c == 'X') {
1296 /* Hex */
1297 do {
1298 c = tok_nextc(tok);
1299 } while (isxdigit(c));
1300 }
1301 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001302 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001303 /* Octal; c is first char of it */
1304 /* There's no 'isoctdigit' macro, sigh */
1305 while ('0' <= c && c < '8') {
1306 c = tok_nextc(tok);
1307 }
Tim Petersd507dab2001-08-30 20:51:59 +00001308 if (isdigit(c)) {
1309 found_decimal = 1;
1310 do {
1311 c = tok_nextc(tok);
1312 } while (isdigit(c));
1313 }
1314 if (c == '.')
1315 goto fraction;
1316 else if (c == 'e' || c == 'E')
1317 goto exponent;
1318#ifndef WITHOUT_COMPLEX
1319 else if (c == 'j' || c == 'J')
1320 goto imaginary;
1321#endif
1322 else if (found_decimal) {
1323 tok->done = E_TOKEN;
1324 tok_backup(tok, c);
1325 return ERRORTOKEN;
1326 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 }
1328 }
1329 else {
1330 /* Decimal */
1331 do {
1332 c = tok_nextc(tok);
1333 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001334 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001335 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001336 if (c == '.') {
1337 fraction:
1338 /* Fraction */
1339 do {
1340 c = tok_nextc(tok);
1341 } while (isdigit(c));
1342 }
1343 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001344 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001345 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001347 if (c == '+' || c == '-')
1348 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001349 if (!isdigit(c)) {
1350 tok->done = E_TOKEN;
1351 tok_backup(tok, c);
1352 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001353 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001354 do {
1355 c = tok_nextc(tok);
1356 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001358#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001359 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001360 /* Imaginary part */
1361 imaginary:
1362 c = tok_nextc(tok);
1363#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 }
1365 }
1366 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001367 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 *p_end = tok->cur;
1369 return NUMBER;
1370 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001371
1372 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001373 /* String */
1374 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001375 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001376 int quote = c;
1377 int triple = 0;
1378 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001379 for (;;) {
1380 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001381 if (c == '\n') {
1382 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001383 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001384 tok_backup(tok, c);
1385 return ERRORTOKEN;
1386 }
1387 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001388 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001389 }
1390 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001391 if (triple)
1392 tok->done = E_EOFS;
1393 else
1394 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001395 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001396 return ERRORTOKEN;
1397 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 else if (c == quote) {
1399 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001400 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001401 c = tok_nextc(tok);
1402 if (c == quote) {
1403 triple = 1;
1404 tripcount = 0;
1405 continue;
1406 }
1407 tok_backup(tok, c);
1408 }
1409 if (!triple || tripcount == 3)
1410 break;
1411 }
1412 else if (c == '\\') {
1413 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001415 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001416 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001417 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 return ERRORTOKEN;
1419 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001420 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001421 else
1422 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001424 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001425 *p_end = tok->cur;
1426 return STRING;
1427 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001428
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001429 /* Line continuation */
1430 if (c == '\\') {
1431 c = tok_nextc(tok);
1432 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001433 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001434 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001435 return ERRORTOKEN;
1436 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001437 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001438 goto again; /* Read next line */
1439 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001440
Guido van Rossumfbab9051991-10-20 20:25:03 +00001441 /* Check for two-character token */
1442 {
1443 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001444 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001445 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001446 int c3 = tok_nextc(tok);
1447 int token3 = PyToken_ThreeChars(c, c2, c3);
1448 if (token3 != OP) {
1449 token = token3;
1450 } else {
1451 tok_backup(tok, c3);
1452 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001453 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001454 *p_end = tok->cur;
1455 return token;
1456 }
1457 tok_backup(tok, c2);
1458 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001460 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001461 switch (c) {
1462 case '(':
1463 case '[':
1464 case '{':
1465 tok->level++;
1466 break;
1467 case ')':
1468 case ']':
1469 case '}':
1470 tok->level--;
1471 break;
1472 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001473
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001475 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001476 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001477 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478}
1479
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001480int
1481PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1482{
1483 int result = tok_get(tok, p_start, p_end);
1484 if (tok->decoding_erred) {
1485 result = ERRORTOKEN;
1486 tok->done = E_DECODE;
1487 }
1488 return result;
1489}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001490
Guido van Rossum408027e1996-12-30 16:17:54 +00001491#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001492
1493void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001494tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001495{
Guido van Rossum86bea461997-04-29 21:03:06 +00001496 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001497 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1498 printf("(%.*s)", (int)(end - start), start);
1499}
1500
1501#endif