blob: 00bb38ad195680bb37f596c825b5d1b5f6259d51 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier, judged on ASCII alone: a letter
   or an underscore.  The argument is fully parenthesised so that an
   expression can be passed, but it is still evaluated several times and
   therefore must not have side effects. */
#define is_potential_identifier_start(c) (\
    ((c) >= 'a' && (c) <= 'z')\
    || ((c) >= 'A' && (c) <= 'Z')\
    || (c) == '_')
25
/* Nonzero if C may appear inside an identifier, judged on ASCII alone:
   a letter, a decimal digit or an underscore.  The argument is fully
   parenthesised so that an expression can be passed, but it is still
   evaluated several times and therefore must not have side effects. */
#define is_potential_identifier_char(c) (\
    ((c) >= 'a' && (c) <= 'z')\
    || ((c) >= 'A' && (c) <= 'Z')\
    || ((c) >= '0' && (c) <= '9')\
    || (c) == '_')
31
Martin v. Löwis566f6af2002-10-26 14:39:10 +000032extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000033/* Return malloc'ed string including trailing \n;
34 empty malloc'ed string for EOF;
35 NULL if interrupted */
36
Guido van Rossum4fe87291992-02-26 15:24:44 +000037/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000038#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000039
/* Convert a possibly signed character to a nonnegative int in the range
   0..255, which is what the <ctype.h> functions require. */
/* XXX This assumes characters are 8 bits wide */
/* The value is always masked and narrowed, instead of relying on the
   compiler-specific __CHAR_UNSIGNED__ macro to decide whether masking is
   needed. */
#define Py_CHARMASK(c) ((unsigned char)((c) & 0xff))
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000049static struct tok_state *tok_new(void);
50static int tok_nextc(struct tok_state *tok);
51static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000052
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* Remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* Trailing entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
113
114
115/* Create and initialize a new tok_state structure */
116
117static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000118tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000119{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000120 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
121 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122 if (tok == NULL)
123 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000124 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 tok->done = E_OK;
126 tok->fp = NULL;
127 tok->tabsize = TABSIZE;
128 tok->indent = 0;
129 tok->indstack[0] = 0;
130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000134 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000135 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000136 tok->altwarning = 1;
137 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->alttabsize = 1;
139 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->decoding_state = 0;
141 tok->decoding_erred = 0;
142 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->decoding_readline = NULL;
147 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000149 return tok;
150}
151
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000152#ifdef PGEN
153
154static char *
155decoding_fgets(char *s, int size, struct tok_state *tok)
156{
157 return fgets(s, size, tok->fp);
158}
159
160static int
161decoding_feof(struct tok_state *tok)
162{
163 return feof(tok->fp);
164}
165
/* pgen build: strings are used as they are; STR is returned unchanged
   and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
171
172#else /* PGEN */
173
174static char *
175error_ret(struct tok_state *tok) /* XXX */
176{
177 tok->decoding_erred = 1;
178 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000179 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180 tok->buf = NULL;
181 return NULL; /* as if it were EOF */
182}
183
184static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000186{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 if (result != NULL) {
189 memcpy(result, s, len);
190 result[len] = '\0';
191 }
192 return result;
193}
194
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   At most the first 12 characters of S are lower-cased, with '_'
   replaced by '-', and compared against the known spellings.  Returns
   the literal "utf-8" or "iso-8859-1" on a match (these must not be
   freed), otherwise S itself, unchanged.

   Each byte is converted to unsigned char before being handed to
   tolower(): passing a negative char value (any byte >= 0x80 where char
   is signed) is undefined behaviour. */
static char *
get_normal_name(char *s)    /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    return s;
}
217
218/* Return the coding spec in S, or NULL if none is found. */
219
220static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000221get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000224 /* Coding spec must be in a comment, and that comment must be
225 * the only statement on the source code line. */
226 for (i = 0; i < size - 6; i++) {
227 if (s[i] == '#')
228 break;
229 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
230 return NULL;
231 }
232 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 const char* t = s + i;
234 if (strncmp(t, "coding", 6) == 0) {
235 const char* begin = NULL;
236 t += 6;
237 if (t[0] != ':' && t[0] != '=')
238 continue;
239 do {
240 t++;
241 } while (t[0] == '\x20' || t[0] == '\t');
242
243 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000244 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000245 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246 t++;
247
248 if (begin < t) {
249 char* r = new_string(begin, t - begin);
250 char* q = get_normal_name(r);
251 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000253 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254 }
255 return r;
256 }
257 }
258 }
259 return NULL;
260}
261
262/* Check whether the line contains a coding spec. If it does,
263 invoke the set_readline function for the new encoding.
264 This function receives the tok_state and the new encoding.
265 Return 1 on success, 0 on failure. */
266
267static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000268check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000269 int set_readline(struct tok_state *, const char *))
270{
Tim Peters17db21f2002-09-03 15:39:58 +0000271 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000273
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000274 if (tok->cont_line)
275 /* It's a continuation line, so it can't be a coding spec. */
276 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000278 if (cs != NULL) {
279 tok->read_coding_spec = 1;
280 if (tok->encoding == NULL) {
281 assert(tok->decoding_state == 1); /* raw */
282 if (strcmp(cs, "utf-8") == 0 ||
283 strcmp(cs, "iso-8859-1") == 0) {
284 tok->encoding = cs;
285 } else {
286 r = set_readline(tok, cs);
287 if (r) {
288 tok->encoding = cs;
289 tok->decoding_state = -1;
290 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000291 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
294 } else { /* then, compare cs with BOM */
295 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000297 }
298 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000299 if (!r) {
300 cs = tok->encoding;
301 if (!cs)
302 cs = "with BOM";
303 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
304 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000305 return r;
306}
307
308/* See whether the file starts with a BOM. If it does,
309 invoke the set_readline function with the new encoding.
310 Return 1 on success, 0 on failure. */
311
312static int
313check_bom(int get_char(struct tok_state *),
314 void unget_char(int, struct tok_state *),
315 int set_readline(struct tok_state *, const char *),
316 struct tok_state *tok)
317{
318 int ch = get_char(tok);
319 tok->decoding_state = 1;
320 if (ch == EOF) {
321 return 1;
322 } else if (ch == 0xEF) {
323 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
324 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
325#if 0
326 /* Disable support for UTF-16 BOMs until a decision
327 is made whether this needs to be supported. */
328 } else if (ch == 0xFE) {
329 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
330 if (!set_readline(tok, "utf-16-be")) return 0;
331 tok->decoding_state = -1;
332 } else if (ch == 0xFF) {
333 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
334 if (!set_readline(tok, "utf-16-le")) return 0;
335 tok->decoding_state = -1;
336#endif
337 } else {
338 unget_char(ch, tok);
339 return 1;
340 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000341 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
344 return 1;
345 NON_BOM:
346 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
347 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
348 return 1;
349}
350
351/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
357 stored the result in tok->decoding_buffer
358 3) PyStringObject *: previous call to fp_readl did not have enough room
359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
361 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363 reached): see tok_nextc and its calls to decoding_fgets.
364*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
366static char *
367fp_readl(char *s, int size, struct tok_state *tok)
368{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000369 PyObject* bufobj = tok->decoding_buffer;
370 const char *buf;
371 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000372
373 /* Ask for one less byte so we can terminate it */
374 assert(size > 0);
375 size--;
376
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000377 if (bufobj == NULL) {
378 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
379 if (bufobj == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000380 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000381 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000382 if (PyObject_AsCharBuffer(bufobj, &buf, &buflen) < 0)
383 return error_ret(tok);
384 if (buflen > size) {
385 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
386 buflen-size);
387 if (tok->decoding_buffer == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388 return error_ret(tok);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000389 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000390 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000391 memcpy(s, buf, buflen);
392 s[buflen] = '\0';
393 if (buflen == 0) return NULL; /* EOF */
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000394 return s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000395}
396
397/* Set the readline function for TOK to a StreamReader's
398 readline function. The StreamReader is named ENC.
399
400 This function is called from check_bom and check_coding_spec.
401
402 ENC is usually identical to the future value of tok->encoding,
403 except for the (currently unsupported) case of UTF-16.
404
405 Return 1 on success, 0 on failure. */
406
407static int
408fp_setreadl(struct tok_state *tok, const char* enc)
409{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000410 PyObject *readline = NULL, *stream = NULL, *io = NULL;
411 int ok = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000412
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000413 io = PyImport_ImportModule("io");
414 if (io == NULL)
415 goto cleanup;
416
417 stream = PyObject_CallMethod(io, "open", "ssis",
418 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000419 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000420 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000421
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000422 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000423 if (readline == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000424 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
426 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000427 ok = 1;
428
429 cleanup:
430 Py_XDECREF(stream);
431 Py_XDECREF(io);
432 return ok;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000433}
434
435/* Fetch the next byte from TOK. */
436
437static int fp_getc(struct tok_state *tok) {
438 return getc(tok->fp);
439}
440
441/* Unfetch the last byte back into TOK. */
442
443static void fp_ungetc(int c, struct tok_state *tok) {
444 ungetc(c, tok->fp);
445}
446
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined from left
   to right and the scan stops at the first byte that is not one, so a
   sequence cut short by the terminator is rejected without reading
   beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
474
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475/* Read a line of input from TOK. Determine encoding
476 if necessary. */
477
478static char *
479decoding_fgets(char *s, int size, struct tok_state *tok)
480{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000481 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000482 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000483 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000484 if (tok->decoding_state < 0) {
485 /* We already have a codec associated with
486 this input. */
487 line = fp_readl(s, size, tok);
488 break;
489 } else if (tok->decoding_state > 0) {
490 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000491 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000492 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493 break;
494 } else {
495 /* We have not yet determined the encoding.
496 If an encoding is found, use the file-pointer
497 reader functions from now on. */
498 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
499 return error_ret(tok);
500 assert(tok->decoding_state != 0);
501 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000502 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000503 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
504 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
505 return error_ret(tok);
506 }
507 }
508#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000509 /* The default encoding is UTF-8, so make sure we don't have any
510 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000511 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000512 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000513 int length;
514 for (c = (unsigned char *)line; *c; c += length)
515 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516 badchar = *c;
517 break;
518 }
519 }
520 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000521 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000522 /* Need to add 1 to the line number, since this line
523 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000524 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000525 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000526 "in file %.200s on line %i, "
527 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000528 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000529 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000530 PyErr_SetString(PyExc_SyntaxError, buf);
531 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532 }
533#endif
534 return line;
535}
536
537static int
538decoding_feof(struct tok_state *tok)
539{
540 if (tok->decoding_state >= 0) {
541 return feof(tok->fp);
542 } else {
543 PyObject* buf = tok->decoding_buffer;
544 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000545 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546 if (buf == NULL) {
547 error_ret(tok);
548 return 1;
549 } else {
550 tok->decoding_buffer = buf;
551 }
552 }
553 return PyObject_Length(buf) == 0;
554 }
555}
556
557/* Fetch a byte from TOK, using the string buffer. */
558
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000559static int
560buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000561 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000562}
563
564/* Unfetch a byte from TOK, using the string buffer. */
565
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000566static void
567buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000568 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000569 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000570}
571
572/* Set the readline function for TOK to ENC. For the string-based
573 tokenizer, this means to just record the encoding. */
574
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000575static int
576buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577 tok->enc = enc;
578 return 1;
579}
580
581/* Return a UTF-8 encoding Python string object from the
582 C byte string STR, which is encoded with ENC. */
583
584static PyObject *
585translate_into_utf8(const char* str, const char* enc) {
586 PyObject *utf8;
587 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
588 if (buf == NULL)
589 return NULL;
590 utf8 = PyUnicode_AsUTF8String(buf);
591 Py_DECREF(buf);
592 return utf8;
593}
594
595/* Decode a byte string STR for use as the buffer of TOK.
596 Look for encoding declarations inside STR, and record them
597 inside TOK. */
598
599static const char *
600decode_str(const char *str, struct tok_state *tok)
601{
602 PyObject* utf8 = NULL;
603 const char *s;
604 int lineno = 0;
605 tok->enc = NULL;
606 tok->str = str;
607 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000608 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000609 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000610 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611 if (tok->enc != NULL) {
612 utf8 = translate_into_utf8(str, tok->enc);
613 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000614 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615 str = PyString_AsString(utf8);
616 }
617 for (s = str;; s++) {
618 if (*s == '\0') break;
619 else if (*s == '\n') {
620 lineno++;
621 if (lineno == 2) break;
622 }
623 }
624 tok->enc = NULL;
625 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000626 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627 if (tok->enc != NULL) {
628 assert(utf8 == NULL);
629 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000630 if (utf8 == NULL) {
631 PyErr_Format(PyExc_SyntaxError,
632 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000633 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000634 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000635 str = PyString_AsString(utf8);
636 }
637 assert(tok->decoding_buffer == NULL);
638 tok->decoding_buffer = utf8; /* CAUTION */
639 return str;
640}
641
642#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000643
644/* Set up tokenizer for string */
645
646struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000647PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000648{
649 struct tok_state *tok = tok_new();
650 if (tok == NULL)
651 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000652 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000653 if (str == NULL) {
654 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000655 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000656 }
657
Martin v. Löwis95292d62002-12-11 14:04:59 +0000658 /* XXX: constify members. */
659 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000660 return tok;
661}
662
663
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000664/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000665
666struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000667PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000668{
669 struct tok_state *tok = tok_new();
670 if (tok == NULL)
671 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000672 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000673 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000674 return NULL;
675 }
676 tok->cur = tok->inp = tok->buf;
677 tok->end = tok->buf + BUFSIZ;
678 tok->fp = fp;
679 tok->prompt = ps1;
680 tok->nextprompt = ps2;
681 return tok;
682}
683
684
685/* Free a tok_state structure */
686
687void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000688PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000689{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000690 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000691 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000692#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000693 Py_XDECREF(tok->decoding_readline);
694 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000695#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000696 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000697 PyMem_FREE(tok->buf);
698 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000699}
700
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000701/* Get next char, updating state; error code goes into tok->done */
702
703static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000704tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000705{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000706 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000707 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000708 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000709 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000710 if (tok->done != E_OK)
711 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000713 char *end = strchr(tok->inp, '\n');
714 if (end != NULL)
715 end++;
716 else {
717 end = strchr(tok->inp, '\0');
718 if (end == tok->inp) {
719 tok->done = E_EOF;
720 return EOF;
721 }
722 }
723 if (tok->start == NULL)
724 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000725 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000726 tok->lineno++;
727 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000728 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000730 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000731 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000732 if (tok->nextprompt != NULL)
733 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000734 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000735 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000736 else if (*newtok == '\0') {
737 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000738 tok->done = E_EOF;
739 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000740 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000741 size_t start = tok->start - tok->buf;
742 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000743 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000744 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000745 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000746 tok->lineno++;
747 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000748 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000749 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000750 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000751 tok->done = E_NOMEM;
752 return EOF;
753 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000754 tok->buf = buf;
755 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000756 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000757 strcpy(tok->buf + oldlen, newtok);
758 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000759 tok->inp = tok->buf + newlen;
760 tok->end = tok->inp + 1;
761 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000762 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000763 else {
764 tok->lineno++;
765 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000766 PyMem_FREE(tok->buf);
767 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000768 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000769 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000770 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000771 tok->inp = strchr(tok->buf, '\0');
772 tok->end = tok->inp + 1;
773 }
774 }
775 else {
776 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000777 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000778 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 if (tok->start == NULL) {
780 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000781 tok->buf = (char *)
782 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 if (tok->buf == NULL) {
784 tok->done = E_NOMEM;
785 return EOF;
786 }
787 tok->end = tok->buf + BUFSIZ;
788 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000789 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
790 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000791 tok->done = E_EOF;
792 done = 1;
793 }
794 else {
795 tok->done = E_OK;
796 tok->inp = strchr(tok->buf, '\0');
797 done = tok->inp[-1] == '\n';
798 }
799 }
800 else {
801 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000802 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000803 tok->done = E_EOF;
804 done = 1;
805 }
806 else
807 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000808 }
809 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000810 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000812 Py_ssize_t curstart = tok->start == NULL ? -1 :
813 tok->start - tok->buf;
814 Py_ssize_t curvalid = tok->inp - tok->buf;
815 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000816 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000817 newbuf = (char *)PyMem_REALLOC(newbuf,
818 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000819 if (newbuf == NULL) {
820 tok->done = E_NOMEM;
821 tok->cur = tok->inp;
822 return EOF;
823 }
824 tok->buf = newbuf;
825 tok->inp = tok->buf + curvalid;
826 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 tok->start = curstart < 0 ? NULL :
828 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000829 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000830 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000831 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000832 /* Break out early on decoding
833 errors, as tok->buf will be NULL
834 */
835 if (tok->decoding_erred)
836 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000837 /* Last line does not end in \n,
838 fake one */
839 strcpy(tok->inp, "\n");
840 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000841 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000842 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000843 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000844 if (tok->buf != NULL) {
845 tok->cur = tok->buf + cur;
846 tok->line_start = tok->cur;
847 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000848 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000849 pt = tok->inp - 2;
850 if (pt >= tok->buf && *pt == '\r') {
851 *pt++ = '\n';
852 *pt = '\0';
853 tok->inp = pt;
854 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000855 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856 }
857 if (tok->done != E_OK) {
858 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000859 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000860 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861 return EOF;
862 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000863 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000864 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000865}
866
867
868/* Back-up one character */
869
870static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000871tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000872{
873 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000874 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000875 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000876 if (*tok->cur != c)
877 *tok->cur = c;
878 }
879}
880
881
882/* Return the token corresponding to a single character */
883
884int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000885PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000886{
887 switch (c) {
888 case '(': return LPAR;
889 case ')': return RPAR;
890 case '[': return LSQB;
891 case ']': return RSQB;
892 case ':': return COLON;
893 case ',': return COMMA;
894 case ';': return SEMI;
895 case '+': return PLUS;
896 case '-': return MINUS;
897 case '*': return STAR;
898 case '/': return SLASH;
899 case '|': return VBAR;
900 case '&': return AMPER;
901 case '<': return LESS;
902 case '>': return GREATER;
903 case '=': return EQUAL;
904 case '.': return DOT;
905 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000906 case '{': return LBRACE;
907 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000908 case '^': return CIRCUMFLEX;
909 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000910 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000911 default: return OP;
912 }
913}
914
915
Guido van Rossumfbab9051991-10-20 20:25:03 +0000916int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000917PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000918{
919 switch (c1) {
920 case '=':
921 switch (c2) {
922 case '=': return EQEQUAL;
923 }
924 break;
925 case '!':
926 switch (c2) {
927 case '=': return NOTEQUAL;
928 }
929 break;
930 case '<':
931 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000932 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000933 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000934 }
935 break;
936 case '>':
937 switch (c2) {
938 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000939 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000940 }
941 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000942 case '+':
943 switch (c2) {
944 case '=': return PLUSEQUAL;
945 }
946 break;
947 case '-':
948 switch (c2) {
949 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000950 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000951 }
952 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000953 case '*':
954 switch (c2) {
955 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000956 case '=': return STAREQUAL;
957 }
958 break;
959 case '/':
960 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000961 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000962 case '=': return SLASHEQUAL;
963 }
964 break;
965 case '|':
966 switch (c2) {
967 case '=': return VBAREQUAL;
968 }
969 break;
970 case '%':
971 switch (c2) {
972 case '=': return PERCENTEQUAL;
973 }
974 break;
975 case '&':
976 switch (c2) {
977 case '=': return AMPEREQUAL;
978 }
979 break;
980 case '^':
981 switch (c2) {
982 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000983 }
984 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000985 }
986 return OP;
987}
988
Thomas Wouters434d0822000-08-24 20:11:32 +0000989int
990PyToken_ThreeChars(int c1, int c2, int c3)
991{
992 switch (c1) {
993 case '<':
994 switch (c2) {
995 case '<':
996 switch (c3) {
997 case '=':
998 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +0000999 }
1000 break;
1001 }
1002 break;
1003 case '>':
1004 switch (c2) {
1005 case '>':
1006 switch (c3) {
1007 case '=':
1008 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001009 }
1010 break;
1011 }
1012 break;
1013 case '*':
1014 switch (c2) {
1015 case '*':
1016 switch (c3) {
1017 case '=':
1018 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001019 }
1020 break;
1021 }
1022 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001023 case '/':
1024 switch (c2) {
1025 case '/':
1026 switch (c3) {
1027 case '=':
1028 return DOUBLESLASHEQUAL;
1029 }
1030 break;
1031 }
1032 break;
Georg Brandldde00282007-03-18 19:01:53 +00001033 case '.':
1034 switch (c2) {
1035 case '.':
1036 switch (c3) {
1037 case '.':
1038 return ELLIPSIS;
1039 }
1040 break;
1041 }
1042 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001043 }
1044 return OP;
1045}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001046
Guido van Rossum926f13a1998-04-09 21:38:06 +00001047static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001048indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001049{
1050 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001051 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001052 tok->cur = tok->inp;
1053 return 1;
1054 }
1055 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001056 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1057 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001058 tok->altwarning = 0;
1059 }
1060 return 0;
1061}
1062
1063
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001064/* Get next token, after space stripping etc. */
1065
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001066static int
1067tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001068{
1069 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001070 int blankline;
1071
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001072 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001073 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001074 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001075 blankline = 0;
1076
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001077 /* Get indentation level */
1078 if (tok->atbol) {
1079 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001080 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001081 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001082 for (;;) {
1083 c = tok_nextc(tok);
1084 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001085 col++, altcol++;
1086 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001088 altcol = (altcol/tok->alttabsize + 1)
1089 * tok->alttabsize;
1090 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001091 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001092 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001093 else
1094 break;
1095 }
1096 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001097 if (c == '#' || c == '\n') {
1098 /* Lines with only whitespace and/or comments
1099 shouldn't affect the indentation and are
1100 not passed to the parser as NEWLINE tokens,
1101 except *totally* empty lines in interactive
1102 mode, which signal the end of a command group. */
1103 if (col == 0 && c == '\n' && tok->prompt != NULL)
1104 blankline = 0; /* Let it through */
1105 else
1106 blankline = 1; /* Ignore completely */
1107 /* We can't jump back right here since we still
1108 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001109 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001110 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001111 if (col == tok->indstack[tok->indent]) {
1112 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001113 if (altcol != tok->altindstack[tok->indent]) {
1114 if (indenterror(tok))
1115 return ERRORTOKEN;
1116 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001118 else if (col > tok->indstack[tok->indent]) {
1119 /* Indent -- always one */
1120 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001121 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001122 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001123 return ERRORTOKEN;
1124 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001125 if (altcol <= tok->altindstack[tok->indent]) {
1126 if (indenterror(tok))
1127 return ERRORTOKEN;
1128 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001129 tok->pendin++;
1130 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001131 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001132 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001133 else /* col < tok->indstack[tok->indent] */ {
1134 /* Dedent -- any number, must be consistent */
1135 while (tok->indent > 0 &&
1136 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001137 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001138 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001139 }
1140 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001141 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001142 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 return ERRORTOKEN;
1144 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001145 if (altcol != tok->altindstack[tok->indent]) {
1146 if (indenterror(tok))
1147 return ERRORTOKEN;
1148 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001149 }
1150 }
1151 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001152
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001153 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001154
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001155 /* Return pending indents/dedents */
1156 if (tok->pendin != 0) {
1157 if (tok->pendin < 0) {
1158 tok->pendin++;
1159 return DEDENT;
1160 }
1161 else {
1162 tok->pendin--;
1163 return INDENT;
1164 }
1165 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001166
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001167 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001168 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001169 /* Skip spaces */
1170 do {
1171 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001172 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001173
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001174 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001175 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001176
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001177 /* Skip comment */
1178 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001179 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001180 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001181
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001182 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001183 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001184 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001185 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001186
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001187 /* Identifier (most frequent token!) */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001188 if (is_potential_identifier_start(c)) {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001189 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001190 switch (c) {
1191 case 'r':
1192 case 'R':
1193 c = tok_nextc(tok);
1194 if (c == '"' || c == '\'')
1195 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001196 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001197 case 'b':
1198 case 'B':
1199 c = tok_nextc(tok);
1200 if (c == 'r' || c == 'R')
1201 c = tok_nextc(tok);
1202 if (c == '"' || c == '\'')
1203 goto letter_quote;
1204 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001205 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001206 while (is_potential_identifier_char(c)) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001208 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001210 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211 *p_end = tok->cur;
1212 return NAME;
1213 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001214
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001215 /* Newline */
1216 if (c == '\n') {
1217 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001218 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001219 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001220 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001222 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001223 return NEWLINE;
1224 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001225
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001226 /* Period or number starting with period? */
1227 if (c == '.') {
1228 c = tok_nextc(tok);
1229 if (isdigit(c)) {
1230 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001231 } else if (c == '.') {
1232 c = tok_nextc(tok);
1233 if (c == '.') {
1234 *p_start = tok->start;
1235 *p_end = tok->cur;
1236 return ELLIPSIS;
1237 } else {
1238 tok_backup(tok, c);
1239 }
1240 tok_backup(tok, '.');
1241 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001242 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001243 }
Georg Brandldde00282007-03-18 19:01:53 +00001244 *p_start = tok->start;
1245 *p_end = tok->cur;
1246 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001247 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001248
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001249 /* Number */
1250 if (isdigit(c)) {
1251 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001252 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001253 c = tok_nextc(tok);
1254 if (c == '.')
1255 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001256#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001257 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001258 goto imaginary;
1259#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001260 if (c == 'x' || c == 'X') {
1261 /* Hex */
1262 do {
1263 c = tok_nextc(tok);
1264 } while (isxdigit(c));
1265 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001266 else if (c == 'o' || c == 'O') {
1267 /* Octal */
1268 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001270 } while ('0' <= c && c < '8');
1271 }
1272 else if (c == 'b' || c == 'B') {
1273 /* Binary */
1274 do {
1275 c = tok_nextc(tok);
1276 } while (c == '0' || c == '1');
1277 }
1278 else {
1279 int nonzero = 0;
1280 /* maybe old-style octal; c is first char of it */
1281 /* in any case, allow '0' as a literal */
1282 while (c == '0')
1283 c = tok_nextc(tok);
1284 while (isdigit(c)) {
1285 nonzero = 1;
1286 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001287 }
1288 if (c == '.')
1289 goto fraction;
1290 else if (c == 'e' || c == 'E')
1291 goto exponent;
1292#ifndef WITHOUT_COMPLEX
1293 else if (c == 'j' || c == 'J')
1294 goto imaginary;
1295#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001296 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001297 tok->done = E_TOKEN;
1298 tok_backup(tok, c);
1299 return ERRORTOKEN;
1300 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001301 }
1302 }
1303 else {
1304 /* Decimal */
1305 do {
1306 c = tok_nextc(tok);
1307 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001308 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001309 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001310 if (c == '.') {
1311 fraction:
1312 /* Fraction */
1313 do {
1314 c = tok_nextc(tok);
1315 } while (isdigit(c));
1316 }
1317 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001318 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001319 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001320 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001321 if (c == '+' || c == '-')
1322 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001323 if (!isdigit(c)) {
1324 tok->done = E_TOKEN;
1325 tok_backup(tok, c);
1326 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001327 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001328 do {
1329 c = tok_nextc(tok);
1330 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001331 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001332#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001333 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001334 /* Imaginary part */
1335 imaginary:
1336 c = tok_nextc(tok);
1337#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001338 }
1339 }
1340 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001341 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001342 *p_end = tok->cur;
1343 return NUMBER;
1344 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001345
1346 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001347 /* String */
1348 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001349 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001350 int quote = c;
1351 int triple = 0;
1352 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 for (;;) {
1354 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001355 if (c == '\n') {
1356 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001357 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001358 tok_backup(tok, c);
1359 return ERRORTOKEN;
1360 }
1361 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001362 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001363 }
1364 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001365 if (triple)
1366 tok->done = E_EOFS;
1367 else
1368 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001369 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001370 return ERRORTOKEN;
1371 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001372 else if (c == quote) {
1373 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001374 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001375 c = tok_nextc(tok);
1376 if (c == quote) {
1377 triple = 1;
1378 tripcount = 0;
1379 continue;
1380 }
1381 tok_backup(tok, c);
1382 }
1383 if (!triple || tripcount == 3)
1384 break;
1385 }
1386 else if (c == '\\') {
1387 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001388 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001389 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001390 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001391 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001392 return ERRORTOKEN;
1393 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001394 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001395 else
1396 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001398 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001399 *p_end = tok->cur;
1400 return STRING;
1401 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001402
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 /* Line continuation */
1404 if (c == '\\') {
1405 c = tok_nextc(tok);
1406 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001407 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001408 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001409 return ERRORTOKEN;
1410 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001411 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001412 goto again; /* Read next line */
1413 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001414
Guido van Rossumfbab9051991-10-20 20:25:03 +00001415 /* Check for two-character token */
1416 {
1417 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001418 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001419 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001420 int c3 = tok_nextc(tok);
1421 int token3 = PyToken_ThreeChars(c, c2, c3);
1422 if (token3 != OP) {
1423 token = token3;
1424 } else {
1425 tok_backup(tok, c3);
1426 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001427 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001428 *p_end = tok->cur;
1429 return token;
1430 }
1431 tok_backup(tok, c2);
1432 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001433
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001434 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001435 switch (c) {
1436 case '(':
1437 case '[':
1438 case '{':
1439 tok->level++;
1440 break;
1441 case ')':
1442 case ']':
1443 case '}':
1444 tok->level--;
1445 break;
1446 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001447
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001448 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001449 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001450 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001451 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001452}
1453
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001454int
1455PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1456{
1457 int result = tok_get(tok, p_start, p_end);
1458 if (tok->decoding_erred) {
1459 result = ERRORTOKEN;
1460 tok->done = E_DECODE;
1461 }
1462 return result;
1463}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001464
#ifdef Py_DEBUG

/* Debugging aid: print the name of token `type` to stdout.  For NAME,
   NUMBER, STRING and OP the token text between start and end follows
   in parentheses.  No newline is written. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif
1475#endif