blob: 2e700bcfe256f6c640fb0b3ba098abd286f1a87f [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C is an ASCII letter or an underscore, i.e. a character
   that may begin an identifier.
   The argument is parenthesized at every use so that expression
   arguments containing low-precedence operators (for example
   "x & 0xff" or "a ? b : c") are classified correctly.
   C is evaluated several times; it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_')
25
/* Nonzero if C is an ASCII letter, an ASCII digit or an underscore,
   i.e. a character that may appear inside an identifier.
   The argument is parenthesized at every use so that expression
   arguments containing low-precedence operators are classified
   correctly.
   C is evaluated several times; it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_')
31
Martin v. Löwis566f6af2002-10-26 14:39:10 +000032extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000033/* Return malloc'ed string including trailing \n;
34 empty malloc'ed string for EOF;
35 NULL if interrupted */
36
Guido van Rossum4fe87291992-02-26 15:24:44 +000037/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000038#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000039
/* Map a char, which may be signed, onto a nonnegative int. */
/* XXX Relies on characters being 8 bits wide. */
#if defined(__CHAR_UNSIGNED__)
/* plain char is already unsigned: the value is used unchanged */
#  define Py_CHARMASK(c) (c)
#else
/* keep only the low 8 bits so that negative chars become 128..255 */
#  define Py_CHARMASK(c) ((c) & 0xff)
#endif
47
Guido van Rossum3f5da241990-12-20 15:06:42 +000048/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000049static struct tok_state *tok_new(void);
50static int tok_nextc(struct tok_state *tok);
51static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000052
/* Token names.  The leading comment on each row gives the array index
   of the first entry on that row. */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL",
    /* 43 */ "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL",
    /* 46 */ "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    /* 52 */ "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
113
114
115/* Create and initialize a new tok_state structure */
116
117static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000118tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000119{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000120 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
121 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000122 if (tok == NULL)
123 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000124 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000125 tok->done = E_OK;
126 tok->fp = NULL;
127 tok->tabsize = TABSIZE;
128 tok->indent = 0;
129 tok->indstack[0] = 0;
130 tok->atbol = 1;
131 tok->pendin = 0;
132 tok->prompt = tok->nextprompt = NULL;
133 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000134 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000135 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000136 tok->altwarning = 1;
137 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000138 tok->alttabsize = 1;
139 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->decoding_state = 0;
141 tok->decoding_erred = 0;
142 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000144 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000146 tok->decoding_readline = NULL;
147 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000148#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000149 return tok;
150}
151
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000152#ifdef PGEN
153
154static char *
155decoding_fgets(char *s, int size, struct tok_state *tok)
156{
157 return fgets(s, size, tok->fp);
158}
159
160static int
161decoding_feof(struct tok_state *tok)
162{
163 return feof(tok->fp);
164}
165
/* pgen build: the input string is used as is; TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
171
172#else /* PGEN */
173
174static char *
175error_ret(struct tok_state *tok) /* XXX */
176{
177 tok->decoding_erred = 1;
178 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000179 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000180 tok->buf = NULL;
181 return NULL; /* as if it were EOF */
182}
183
184static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000186{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 if (result != NULL) {
189 memcpy(result, s, len);
190 result[len] = '\0';
191 }
192 return result;
193}
194
/* Map the spellings of UTF-8 and Latin-1 encoding names onto one
   canonical name each.

   At most the first 12 characters of S are examined, after folding
   them to lower case and replacing '_' with '-'.  Returns the string
   constant "utf-8" or "iso-8859-1" when S is recognized (optionally
   followed by "-" and a suffix); otherwise returns S itself, so the
   caller can detect "no change" by comparing pointers. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value to
           tolower() is undefined, and plain char may be signed. */
        int c = (unsigned char)s[i];
        if (c == '\0') break;
        else if (c == '_') buf[i] = '-';
        else buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
    else return s;
}
217
218/* Return the coding spec in S, or NULL if none is found. */
219
220static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000221get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000224 /* Coding spec must be in a comment, and that comment must be
225 * the only statement on the source code line. */
226 for (i = 0; i < size - 6; i++) {
227 if (s[i] == '#')
228 break;
229 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
230 return NULL;
231 }
232 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000233 const char* t = s + i;
234 if (strncmp(t, "coding", 6) == 0) {
235 const char* begin = NULL;
236 t += 6;
237 if (t[0] != ':' && t[0] != '=')
238 continue;
239 do {
240 t++;
241 } while (t[0] == '\x20' || t[0] == '\t');
242
243 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000244 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000245 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000246 t++;
247
248 if (begin < t) {
249 char* r = new_string(begin, t - begin);
250 char* q = get_normal_name(r);
251 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000253 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254 }
255 return r;
256 }
257 }
258 }
259 return NULL;
260}
261
262/* Check whether the line contains a coding spec. If it does,
263 invoke the set_readline function for the new encoding.
264 This function receives the tok_state and the new encoding.
265 Return 1 on success, 0 on failure. */
266
267static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000268check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000269 int set_readline(struct tok_state *, const char *))
270{
Tim Peters17db21f2002-09-03 15:39:58 +0000271 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000273
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000274 if (tok->cont_line)
275 /* It's a continuation line, so it can't be a coding spec. */
276 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000278 if (cs != NULL) {
279 tok->read_coding_spec = 1;
280 if (tok->encoding == NULL) {
281 assert(tok->decoding_state == 1); /* raw */
282 if (strcmp(cs, "utf-8") == 0 ||
283 strcmp(cs, "iso-8859-1") == 0) {
284 tok->encoding = cs;
285 } else {
286 r = set_readline(tok, cs);
287 if (r) {
288 tok->encoding = cs;
289 tok->decoding_state = -1;
290 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000291 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
294 } else { /* then, compare cs with BOM */
295 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000297 }
298 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000299 if (!r) {
300 cs = tok->encoding;
301 if (!cs)
302 cs = "with BOM";
303 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
304 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000305 return r;
306}
307
308/* See whether the file starts with a BOM. If it does,
309 invoke the set_readline function with the new encoding.
310 Return 1 on success, 0 on failure. */
311
312static int
313check_bom(int get_char(struct tok_state *),
314 void unget_char(int, struct tok_state *),
315 int set_readline(struct tok_state *, const char *),
316 struct tok_state *tok)
317{
318 int ch = get_char(tok);
319 tok->decoding_state = 1;
320 if (ch == EOF) {
321 return 1;
322 } else if (ch == 0xEF) {
323 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
324 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
325#if 0
326 /* Disable support for UTF-16 BOMs until a decision
327 is made whether this needs to be supported. */
328 } else if (ch == 0xFE) {
329 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
330 if (!set_readline(tok, "utf-16-be")) return 0;
331 tok->decoding_state = -1;
332 } else if (ch == 0xFF) {
333 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
334 if (!set_readline(tok, "utf-16-le")) return 0;
335 tok->decoding_state = -1;
336#endif
337 } else {
338 unget_char(ch, tok);
339 return 1;
340 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000341 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
344 return 1;
345 NON_BOM:
346 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
347 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
348 return 1;
349}
350
351/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
357 stored the result in tok->decoding_buffer
358 3) PyStringObject *: previous call to fp_readl did not have enough room
359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
361 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000363 reached): see tok_nextc and its calls to decoding_fgets.
364*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365
366static char *
367fp_readl(char *s, int size, struct tok_state *tok)
368{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000369 PyObject* bufobj = tok->decoding_buffer;
370 const char *buf;
371 Py_ssize_t buflen;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000372 int allocated = 0;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000373
374 /* Ask for one less byte so we can terminate it */
375 assert(size > 0);
376 size--;
377
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000378 if (bufobj == NULL) {
379 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
380 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000381 goto error;
382 allocated = 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000383 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000384 if (PyObject_AsCharBuffer(bufobj, &buf, &buflen) < 0) {
385 goto error;
386 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000387 if (buflen > size) {
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000388 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000389 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
390 buflen-size);
391 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000392 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000393 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000394 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000395 memcpy(s, buf, buflen);
396 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000397 if (buflen == 0) /* EOF */
398 s = NULL;
399 if (allocated) {
400 Py_DECREF(bufobj);
401 }
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000402 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000403
404error:
405 if (allocated) {
406 Py_XDECREF(bufobj);
407 }
408 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000409}
410
411/* Set the readline function for TOK to a StreamReader's
412 readline function. The StreamReader is named ENC.
413
414 This function is called from check_bom and check_coding_spec.
415
416 ENC is usually identical to the future value of tok->encoding,
417 except for the (currently unsupported) case of UTF-16.
418
419 Return 1 on success, 0 on failure. */
420
421static int
422fp_setreadl(struct tok_state *tok, const char* enc)
423{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000424 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000425
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000426 io = PyImport_ImportModule("io");
427 if (io == NULL)
428 goto cleanup;
429
430 stream = PyObject_CallMethod(io, "open", "ssis",
431 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000432 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000433 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000434
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000435 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000436 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000437 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000438
439 cleanup:
440 Py_XDECREF(stream);
441 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000442 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000443}
444
445/* Fetch the next byte from TOK. */
446
447static int fp_getc(struct tok_state *tok) {
448 return getc(tok->fp);
449}
450
451/* Unfetch the last byte back into TOK. */
452
453static void fp_ungetc(int c, struct tok_state *tok) {
454 ungetc(c, tok->fp);
455}
456
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the
   sequence if yes, 0 if not.

   s must point into a NUL-terminated string.  Continuation bytes are
   examined front to back, so a truncated sequence is rejected at the
   terminating NUL and nothing beyond the terminator is read. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
484
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000485/* Read a line of input from TOK. Determine encoding
486 if necessary. */
487
488static char *
489decoding_fgets(char *s, int size, struct tok_state *tok)
490{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000491 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000492 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000493 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494 if (tok->decoding_state < 0) {
495 /* We already have a codec associated with
496 this input. */
497 line = fp_readl(s, size, tok);
498 break;
499 } else if (tok->decoding_state > 0) {
500 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000501 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000502 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000503 break;
504 } else {
505 /* We have not yet determined the encoding.
506 If an encoding is found, use the file-pointer
507 reader functions from now on. */
508 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
509 return error_ret(tok);
510 assert(tok->decoding_state != 0);
511 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000512 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000513 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
514 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
515 return error_ret(tok);
516 }
517 }
518#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000519 /* The default encoding is UTF-8, so make sure we don't have any
520 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000521 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000522 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000523 int length;
524 for (c = (unsigned char *)line; *c; c += length)
525 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526 badchar = *c;
527 break;
528 }
529 }
530 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000531 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000532 /* Need to add 1 to the line number, since this line
533 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000534 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000535 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000536 "in file %.200s on line %i, "
537 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000538 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000539 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000540 PyErr_SetString(PyExc_SyntaxError, buf);
541 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542 }
543#endif
544 return line;
545}
546
547static int
548decoding_feof(struct tok_state *tok)
549{
550 if (tok->decoding_state >= 0) {
551 return feof(tok->fp);
552 } else {
553 PyObject* buf = tok->decoding_buffer;
554 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000555 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556 if (buf == NULL) {
557 error_ret(tok);
558 return 1;
559 } else {
560 tok->decoding_buffer = buf;
561 }
562 }
563 return PyObject_Length(buf) == 0;
564 }
565}
566
567/* Fetch a byte from TOK, using the string buffer. */
568
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000569static int
570buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000571 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572}
573
574/* Unfetch a byte from TOK, using the string buffer. */
575
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000576static void
577buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000579 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000580}
581
582/* Set the readline function for TOK to ENC. For the string-based
583 tokenizer, this means to just record the encoding. */
584
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000585static int
586buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 tok->enc = enc;
588 return 1;
589}
590
591/* Return a UTF-8 encoding Python string object from the
592 C byte string STR, which is encoded with ENC. */
593
594static PyObject *
595translate_into_utf8(const char* str, const char* enc) {
596 PyObject *utf8;
597 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
598 if (buf == NULL)
599 return NULL;
600 utf8 = PyUnicode_AsUTF8String(buf);
601 Py_DECREF(buf);
602 return utf8;
603}
604
605/* Decode a byte string STR for use as the buffer of TOK.
606 Look for encoding declarations inside STR, and record them
607 inside TOK. */
608
609static const char *
610decode_str(const char *str, struct tok_state *tok)
611{
612 PyObject* utf8 = NULL;
613 const char *s;
614 int lineno = 0;
615 tok->enc = NULL;
616 tok->str = str;
617 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000618 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000620 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621 if (tok->enc != NULL) {
622 utf8 = translate_into_utf8(str, tok->enc);
623 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000624 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 str = PyString_AsString(utf8);
626 }
627 for (s = str;; s++) {
628 if (*s == '\0') break;
629 else if (*s == '\n') {
630 lineno++;
631 if (lineno == 2) break;
632 }
633 }
634 tok->enc = NULL;
635 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000636 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000637 if (tok->enc != NULL) {
638 assert(utf8 == NULL);
639 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000640 if (utf8 == NULL) {
641 PyErr_Format(PyExc_SyntaxError,
642 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000643 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000644 }
Neal Norwitzf7f28fc2007-08-11 21:31:25 +0000645 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646 }
647 assert(tok->decoding_buffer == NULL);
648 tok->decoding_buffer = utf8; /* CAUTION */
649 return str;
650}
651
652#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000653
654/* Set up tokenizer for string */
655
656struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000657PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000658{
659 struct tok_state *tok = tok_new();
660 if (tok == NULL)
661 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000662 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000663 if (str == NULL) {
664 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000666 }
667
Martin v. Löwis95292d62002-12-11 14:04:59 +0000668 /* XXX: constify members. */
669 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000670 return tok;
671}
672
673
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000674/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000675
676struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000677PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000678{
679 struct tok_state *tok = tok_new();
680 if (tok == NULL)
681 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000682 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000683 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000684 return NULL;
685 }
686 tok->cur = tok->inp = tok->buf;
687 tok->end = tok->buf + BUFSIZ;
688 tok->fp = fp;
689 tok->prompt = ps1;
690 tok->nextprompt = ps2;
691 return tok;
692}
693
694
695/* Free a tok_state structure */
696
697void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000698PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000699{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000700 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000701 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000702#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000703 Py_XDECREF(tok->decoding_readline);
704 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000705#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000706 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000707 PyMem_FREE(tok->buf);
708 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000709}
710
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711/* Get next char, updating state; error code goes into tok->done */
712
713static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000714tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000715{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000716 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000717 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000718 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000719 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000720 if (tok->done != E_OK)
721 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000723 char *end = strchr(tok->inp, '\n');
724 if (end != NULL)
725 end++;
726 else {
727 end = strchr(tok->inp, '\0');
728 if (end == tok->inp) {
729 tok->done = E_EOF;
730 return EOF;
731 }
732 }
733 if (tok->start == NULL)
734 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000735 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000736 tok->lineno++;
737 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000738 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000740 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000741 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742 if (tok->nextprompt != NULL)
743 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000744 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000745 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000746 else if (*newtok == '\0') {
747 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748 tok->done = E_EOF;
749 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000750 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000751 size_t start = tok->start - tok->buf;
752 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000753 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000754 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000755 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000756 tok->lineno++;
757 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000758 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000759 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000760 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000761 tok->done = E_NOMEM;
762 return EOF;
763 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000764 tok->buf = buf;
765 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000766 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000767 strcpy(tok->buf + oldlen, newtok);
768 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000769 tok->inp = tok->buf + newlen;
770 tok->end = tok->inp + 1;
771 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000772 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000773 else {
774 tok->lineno++;
775 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000776 PyMem_FREE(tok->buf);
777 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000778 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000780 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000781 tok->inp = strchr(tok->buf, '\0');
782 tok->end = tok->inp + 1;
783 }
784 }
785 else {
786 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000787 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000788 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000789 if (tok->start == NULL) {
790 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000791 tok->buf = (char *)
792 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000793 if (tok->buf == NULL) {
794 tok->done = E_NOMEM;
795 return EOF;
796 }
797 tok->end = tok->buf + BUFSIZ;
798 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000799 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
800 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000801 tok->done = E_EOF;
802 done = 1;
803 }
804 else {
805 tok->done = E_OK;
806 tok->inp = strchr(tok->buf, '\0');
807 done = tok->inp[-1] == '\n';
808 }
809 }
810 else {
811 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000812 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000813 tok->done = E_EOF;
814 done = 1;
815 }
816 else
817 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000818 }
819 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000820 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000821 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000822 Py_ssize_t curstart = tok->start == NULL ? -1 :
823 tok->start - tok->buf;
824 Py_ssize_t curvalid = tok->inp - tok->buf;
825 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000826 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000827 newbuf = (char *)PyMem_REALLOC(newbuf,
828 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000829 if (newbuf == NULL) {
830 tok->done = E_NOMEM;
831 tok->cur = tok->inp;
832 return EOF;
833 }
834 tok->buf = newbuf;
835 tok->inp = tok->buf + curvalid;
836 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000837 tok->start = curstart < 0 ? NULL :
838 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000839 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000840 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000841 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000842 /* Break out early on decoding
843 errors, as tok->buf will be NULL
844 */
845 if (tok->decoding_erred)
846 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000847 /* Last line does not end in \n,
848 fake one */
849 strcpy(tok->inp, "\n");
850 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000851 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000852 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000853 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000854 if (tok->buf != NULL) {
855 tok->cur = tok->buf + cur;
856 tok->line_start = tok->cur;
857 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000858 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000859 pt = tok->inp - 2;
860 if (pt >= tok->buf && *pt == '\r') {
861 *pt++ = '\n';
862 *pt = '\0';
863 tok->inp = pt;
864 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000865 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000866 }
867 if (tok->done != E_OK) {
868 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000869 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000870 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000871 return EOF;
872 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000874 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000875}
876
877
878/* Back-up one character */
879
880static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000881tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000882{
883 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000884 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000885 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000886 if (*tok->cur != c)
887 *tok->cur = c;
888 }
889}
890
891
892/* Return the token corresponding to a single character */
893
894int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000895PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896{
897 switch (c) {
898 case '(': return LPAR;
899 case ')': return RPAR;
900 case '[': return LSQB;
901 case ']': return RSQB;
902 case ':': return COLON;
903 case ',': return COMMA;
904 case ';': return SEMI;
905 case '+': return PLUS;
906 case '-': return MINUS;
907 case '*': return STAR;
908 case '/': return SLASH;
909 case '|': return VBAR;
910 case '&': return AMPER;
911 case '<': return LESS;
912 case '>': return GREATER;
913 case '=': return EQUAL;
914 case '.': return DOT;
915 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916 case '{': return LBRACE;
917 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000918 case '^': return CIRCUMFLEX;
919 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000920 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000921 default: return OP;
922 }
923}
924
925
Guido van Rossumfbab9051991-10-20 20:25:03 +0000926int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000927PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000928{
929 switch (c1) {
930 case '=':
931 switch (c2) {
932 case '=': return EQEQUAL;
933 }
934 break;
935 case '!':
936 switch (c2) {
937 case '=': return NOTEQUAL;
938 }
939 break;
940 case '<':
941 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000942 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000943 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000944 }
945 break;
946 case '>':
947 switch (c2) {
948 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000949 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000950 }
951 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000952 case '+':
953 switch (c2) {
954 case '=': return PLUSEQUAL;
955 }
956 break;
957 case '-':
958 switch (c2) {
959 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000960 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000961 }
962 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000963 case '*':
964 switch (c2) {
965 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000966 case '=': return STAREQUAL;
967 }
968 break;
969 case '/':
970 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000971 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000972 case '=': return SLASHEQUAL;
973 }
974 break;
975 case '|':
976 switch (c2) {
977 case '=': return VBAREQUAL;
978 }
979 break;
980 case '%':
981 switch (c2) {
982 case '=': return PERCENTEQUAL;
983 }
984 break;
985 case '&':
986 switch (c2) {
987 case '=': return AMPEREQUAL;
988 }
989 break;
990 case '^':
991 switch (c2) {
992 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000993 }
994 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000995 }
996 return OP;
997}
998
Thomas Wouters434d0822000-08-24 20:11:32 +0000999int
1000PyToken_ThreeChars(int c1, int c2, int c3)
1001{
1002 switch (c1) {
1003 case '<':
1004 switch (c2) {
1005 case '<':
1006 switch (c3) {
1007 case '=':
1008 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001009 }
1010 break;
1011 }
1012 break;
1013 case '>':
1014 switch (c2) {
1015 case '>':
1016 switch (c3) {
1017 case '=':
1018 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001019 }
1020 break;
1021 }
1022 break;
1023 case '*':
1024 switch (c2) {
1025 case '*':
1026 switch (c3) {
1027 case '=':
1028 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001029 }
1030 break;
1031 }
1032 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001033 case '/':
1034 switch (c2) {
1035 case '/':
1036 switch (c3) {
1037 case '=':
1038 return DOUBLESLASHEQUAL;
1039 }
1040 break;
1041 }
1042 break;
Georg Brandldde00282007-03-18 19:01:53 +00001043 case '.':
1044 switch (c2) {
1045 case '.':
1046 switch (c3) {
1047 case '.':
1048 return ELLIPSIS;
1049 }
1050 break;
1051 }
1052 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001053 }
1054 return OP;
1055}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001056
Guido van Rossum926f13a1998-04-09 21:38:06 +00001057static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001058indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001059{
1060 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001061 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001062 tok->cur = tok->inp;
1063 return 1;
1064 }
1065 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001066 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1067 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001068 tok->altwarning = 0;
1069 }
1070 return 0;
1071}
1072
1073
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001074/* Get next token, after space stripping etc. */
1075
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001076static int
1077tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001078{
1079 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001080 int blankline;
1081
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001082 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001083 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001084 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001085 blankline = 0;
1086
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087 /* Get indentation level */
1088 if (tok->atbol) {
1089 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001090 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001092 for (;;) {
1093 c = tok_nextc(tok);
1094 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001095 col++, altcol++;
1096 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001098 altcol = (altcol/tok->alttabsize + 1)
1099 * tok->alttabsize;
1100 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001101 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001102 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001103 else
1104 break;
1105 }
1106 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001107 if (c == '#' || c == '\n') {
1108 /* Lines with only whitespace and/or comments
1109 shouldn't affect the indentation and are
1110 not passed to the parser as NEWLINE tokens,
1111 except *totally* empty lines in interactive
1112 mode, which signal the end of a command group. */
1113 if (col == 0 && c == '\n' && tok->prompt != NULL)
1114 blankline = 0; /* Let it through */
1115 else
1116 blankline = 1; /* Ignore completely */
1117 /* We can't jump back right here since we still
1118 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001119 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001120 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001121 if (col == tok->indstack[tok->indent]) {
1122 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001123 if (altcol != tok->altindstack[tok->indent]) {
1124 if (indenterror(tok))
1125 return ERRORTOKEN;
1126 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001127 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001128 else if (col > tok->indstack[tok->indent]) {
1129 /* Indent -- always one */
1130 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001131 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001132 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001133 return ERRORTOKEN;
1134 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001135 if (altcol <= tok->altindstack[tok->indent]) {
1136 if (indenterror(tok))
1137 return ERRORTOKEN;
1138 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001139 tok->pendin++;
1140 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001141 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001142 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 else /* col < tok->indstack[tok->indent] */ {
1144 /* Dedent -- any number, must be consistent */
1145 while (tok->indent > 0 &&
1146 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001147 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001148 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001149 }
1150 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001151 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001152 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153 return ERRORTOKEN;
1154 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001155 if (altcol != tok->altindstack[tok->indent]) {
1156 if (indenterror(tok))
1157 return ERRORTOKEN;
1158 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001159 }
1160 }
1161 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001162
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001163 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001164
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001165 /* Return pending indents/dedents */
1166 if (tok->pendin != 0) {
1167 if (tok->pendin < 0) {
1168 tok->pendin++;
1169 return DEDENT;
1170 }
1171 else {
1172 tok->pendin--;
1173 return INDENT;
1174 }
1175 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001176
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001177 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001178 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001179 /* Skip spaces */
1180 do {
1181 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001182 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001183
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001184 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001185 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001186
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001187 /* Skip comment */
1188 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001189 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001190 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001191
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001192 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001193 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001194 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001195 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001196
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001197 /* Identifier (most frequent token!) */
Martin v. Löwis5b222132007-06-10 09:51:05 +00001198 if (is_potential_identifier_start(c)) {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001199 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001200 switch (c) {
1201 case 'r':
1202 case 'R':
1203 c = tok_nextc(tok);
1204 if (c == '"' || c == '\'')
1205 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001206 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001207 case 'b':
1208 case 'B':
1209 c = tok_nextc(tok);
1210 if (c == 'r' || c == 'R')
1211 c = tok_nextc(tok);
1212 if (c == '"' || c == '\'')
1213 goto letter_quote;
1214 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001215 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001216 while (is_potential_identifier_char(c)) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001218 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001219 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001220 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 *p_end = tok->cur;
1222 return NAME;
1223 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001224
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001225 /* Newline */
1226 if (c == '\n') {
1227 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001228 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001229 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001230 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001231 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001232 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 return NEWLINE;
1234 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001235
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001236 /* Period or number starting with period? */
1237 if (c == '.') {
1238 c = tok_nextc(tok);
1239 if (isdigit(c)) {
1240 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001241 } else if (c == '.') {
1242 c = tok_nextc(tok);
1243 if (c == '.') {
1244 *p_start = tok->start;
1245 *p_end = tok->cur;
1246 return ELLIPSIS;
1247 } else {
1248 tok_backup(tok, c);
1249 }
1250 tok_backup(tok, '.');
1251 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001252 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001253 }
Georg Brandldde00282007-03-18 19:01:53 +00001254 *p_start = tok->start;
1255 *p_end = tok->cur;
1256 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001257 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001258
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001259 /* Number */
1260 if (isdigit(c)) {
1261 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001262 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001263 c = tok_nextc(tok);
1264 if (c == '.')
1265 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001266#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001267 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001268 goto imaginary;
1269#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001270 if (c == 'x' || c == 'X') {
1271 /* Hex */
1272 do {
1273 c = tok_nextc(tok);
1274 } while (isxdigit(c));
1275 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001276 else if (c == 'o' || c == 'O') {
1277 /* Octal */
1278 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001279 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001280 } while ('0' <= c && c < '8');
1281 }
1282 else if (c == 'b' || c == 'B') {
1283 /* Binary */
1284 do {
1285 c = tok_nextc(tok);
1286 } while (c == '0' || c == '1');
1287 }
1288 else {
1289 int nonzero = 0;
1290 /* maybe old-style octal; c is first char of it */
1291 /* in any case, allow '0' as a literal */
1292 while (c == '0')
1293 c = tok_nextc(tok);
1294 while (isdigit(c)) {
1295 nonzero = 1;
1296 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001297 }
1298 if (c == '.')
1299 goto fraction;
1300 else if (c == 'e' || c == 'E')
1301 goto exponent;
1302#ifndef WITHOUT_COMPLEX
1303 else if (c == 'j' || c == 'J')
1304 goto imaginary;
1305#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001306 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001307 tok->done = E_TOKEN;
1308 tok_backup(tok, c);
1309 return ERRORTOKEN;
1310 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001311 }
1312 }
1313 else {
1314 /* Decimal */
1315 do {
1316 c = tok_nextc(tok);
1317 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001318 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001319 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001320 if (c == '.') {
1321 fraction:
1322 /* Fraction */
1323 do {
1324 c = tok_nextc(tok);
1325 } while (isdigit(c));
1326 }
1327 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001328 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001329 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001330 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001331 if (c == '+' || c == '-')
1332 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001333 if (!isdigit(c)) {
1334 tok->done = E_TOKEN;
1335 tok_backup(tok, c);
1336 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001337 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001338 do {
1339 c = tok_nextc(tok);
1340 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001341 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001342#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001343 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001344 /* Imaginary part */
1345 imaginary:
1346 c = tok_nextc(tok);
1347#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001348 }
1349 }
1350 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001351 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001352 *p_end = tok->cur;
1353 return NUMBER;
1354 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001355
1356 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001357 /* String */
1358 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001359 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001360 int quote = c;
1361 int triple = 0;
1362 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363 for (;;) {
1364 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001365 if (c == '\n') {
1366 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001367 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001368 tok_backup(tok, c);
1369 return ERRORTOKEN;
1370 }
1371 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001372 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001373 }
1374 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001375 if (triple)
1376 tok->done = E_EOFS;
1377 else
1378 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001379 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 return ERRORTOKEN;
1381 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001382 else if (c == quote) {
1383 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001384 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001385 c = tok_nextc(tok);
1386 if (c == quote) {
1387 triple = 1;
1388 tripcount = 0;
1389 continue;
1390 }
1391 tok_backup(tok, c);
1392 }
1393 if (!triple || tripcount == 3)
1394 break;
1395 }
1396 else if (c == '\\') {
1397 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001398 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001399 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001400 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001401 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 return ERRORTOKEN;
1403 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001405 else
1406 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001407 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001408 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001409 *p_end = tok->cur;
1410 return STRING;
1411 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 /* Line continuation */
1414 if (c == '\\') {
1415 c = tok_nextc(tok);
1416 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001417 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001418 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001419 return ERRORTOKEN;
1420 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001421 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 goto again; /* Read next line */
1423 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001424
Guido van Rossumfbab9051991-10-20 20:25:03 +00001425 /* Check for two-character token */
1426 {
1427 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001428 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001429 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001430 int c3 = tok_nextc(tok);
1431 int token3 = PyToken_ThreeChars(c, c2, c3);
1432 if (token3 != OP) {
1433 token = token3;
1434 } else {
1435 tok_backup(tok, c3);
1436 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001437 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001438 *p_end = tok->cur;
1439 return token;
1440 }
1441 tok_backup(tok, c2);
1442 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001443
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001444 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001445 switch (c) {
1446 case '(':
1447 case '[':
1448 case '{':
1449 tok->level++;
1450 break;
1451 case ')':
1452 case ']':
1453 case '}':
1454 tok->level--;
1455 break;
1456 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001457
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001458 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001459 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001460 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001461 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001462}
1463
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001464int
1465PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1466{
1467 int result = tok_get(tok, p_start, p_end);
1468 if (tok->decoding_erred) {
1469 result = ERRORTOKEN;
1470 tok->done = E_DECODE;
1471 }
1472 return result;
1473}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474
Guido van Rossum408027e1996-12-30 16:17:54 +00001475#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001476
1477void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001478tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001479{
Guido van Rossum86bea461997-04-29 21:03:06 +00001480 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001481 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1482 printf("(%.*s)", (int)(end - start), start);
1483}
1484
1485#endif