blob: a43094b2806adbb08b1b84ec338cf8c13f1029b8 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
Guido van Rossumcf57d8b1998-01-19 22:07:46 +000029/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c) (c)
33#else
34#define Py_CHARMASK(c) ((c) & 0xff)
35#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names: printable name for each token type. */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Brackets and separators */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* One-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* Comparisons and remaining multi-character operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
102
103
104/* Create and initialize a new tok_state structure */
105
106static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000107tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 if (tok == NULL)
112 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000123 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000124 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000125 tok->altwarning = 1;
126 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000133 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000137#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000138 return tok;
139}
140
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141#ifdef PGEN
142
143static char *
144decoding_fgets(char *s, int size, struct tok_state *tok)
145{
146 return fgets(s, size, tok->fp);
147}
148
149static int
150decoding_feof(struct tok_state *tok)
151{
152 return feof(tok->fp);
153}
154
/* pgen build: strings are used as they are; STR is returned unchanged
   and TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;
    return unchanged;
}
160
161#else /* PGEN */
162
163static char *
164error_ret(struct tok_state *tok) /* XXX */
165{
166 tok->decoding_erred = 1;
167 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000168 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 tok->buf = NULL;
170 return NULL; /* as if it were EOF */
171}
172
173static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000174new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000176 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
180 }
181 return result;
182}
183
/* Normalize the spelling of a codec name, for utf-8 and latin-1 only.

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the literal "utf-8" for "utf-8"
   and names starting with "utf-8-", the literal "iso-8859-1" for
   "latin-1", "iso-8859-1", "iso-latin-1" and names starting with one
   of those followed by '-', and S itself for anything else.  Callers
   can therefore test "result != s" to see whether a canonical name
   was substituted.  S must not be NULL. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative
           value (a byte >= 0x80 where char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 return r;
295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
306{
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314#if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325#endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
329 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000330 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000331 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
338}
339
340/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000341 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343 On entry, tok->decoding_buffer will be one of:
344 1) NULL: need to call tok->decoding_readline to get a new line
345 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
346 stored the result in tok->decoding_buffer
347 3) PyStringObject *: previous call to fp_readl did not have enough room
348 (in the s buffer) to copy entire contents of the line read
349 by tok->decoding_readline. tok->decoding_buffer has the overflow.
350 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 reached): see tok_nextc and its calls to decoding_fgets.
353*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000354
355static char *
356fp_readl(char *s, int size, struct tok_state *tok)
357{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000358 PyObject* bufobj = tok->decoding_buffer;
359 const char *buf;
360 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000361
362 /* Ask for one less byte so we can terminate it */
363 assert(size > 0);
364 size--;
365
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000366 if (bufobj == NULL) {
367 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
368 if (bufobj == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000369 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000371 if (PyObject_AsCharBuffer(bufobj, &buf, &buflen) < 0)
372 return error_ret(tok);
373 if (buflen > size) {
374 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
375 buflen-size);
376 if (tok->decoding_buffer == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000377 return error_ret(tok);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000378 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000379 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000380 memcpy(s, buf, buflen);
381 s[buflen] = '\0';
382 if (buflen == 0) return NULL; /* EOF */
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000383 return s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384}
385
386/* Set the readline function for TOK to a StreamReader's
387 readline function. The StreamReader is named ENC.
388
389 This function is called from check_bom and check_coding_spec.
390
391 ENC is usually identical to the future value of tok->encoding,
392 except for the (currently unsupported) case of UTF-16.
393
394 Return 1 on success, 0 on failure. */
395
396static int
397fp_setreadl(struct tok_state *tok, const char* enc)
398{
399 PyObject *reader, *stream, *readline;
400
Martin v. Löwis95292d62002-12-11 14:04:59 +0000401 /* XXX: constify filename argument. */
402 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000403 if (stream == NULL)
404 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000405
406 reader = PyCodec_StreamReader(enc, stream, NULL);
407 Py_DECREF(stream);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000408 if (reader == NULL)
409 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000410
411 readline = PyObject_GetAttrString(reader, "readline");
412 Py_DECREF(reader);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000413 if (readline == NULL)
414 return 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000415
416 tok->decoding_readline = readline;
417 return 1;
418}
419
420/* Fetch the next byte from TOK. */
421
422static int fp_getc(struct tok_state *tok) {
423 return getc(tok->fp);
424}
425
426/* Unfetch the last byte back into TOK. */
427
428static void fp_ungetc(int c, struct tok_state *tok) {
429 ungetc(c, tok->fp);
430}
431
432/* Read a line of input from TOK. Determine encoding
433 if necessary. */
434
435static char *
436decoding_fgets(char *s, int size, struct tok_state *tok)
437{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000438 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000439 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000440 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000441 if (tok->decoding_state < 0) {
442 /* We already have a codec associated with
443 this input. */
444 line = fp_readl(s, size, tok);
445 break;
446 } else if (tok->decoding_state > 0) {
447 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000448 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000450 break;
451 } else {
452 /* We have not yet determined the encoding.
453 If an encoding is found, use the file-pointer
454 reader functions from now on. */
455 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
456 return error_ret(tok);
457 assert(tok->decoding_state != 0);
458 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000459 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
461 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
462 return error_ret(tok);
463 }
464 }
465#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000466 /* The default encoding is ASCII, so make sure we don't have any
467 non-ASCII bytes in it. */
468 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000469 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000470 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000471 if (*c > 127) {
472 badchar = *c;
473 break;
474 }
475 }
476 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000477 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000478 /* Need to add 1 to the line number, since this line
479 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000480 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000481 "Non-ASCII character '\\x%.2x' "
482 "in file %.200s on line %i, "
483 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000484 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000485 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000486 PyErr_SetString(PyExc_SyntaxError, buf);
487 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488 }
489#endif
490 return line;
491}
492
493static int
494decoding_feof(struct tok_state *tok)
495{
496 if (tok->decoding_state >= 0) {
497 return feof(tok->fp);
498 } else {
499 PyObject* buf = tok->decoding_buffer;
500 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000501 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000502 if (buf == NULL) {
503 error_ret(tok);
504 return 1;
505 } else {
506 tok->decoding_buffer = buf;
507 }
508 }
509 return PyObject_Length(buf) == 0;
510 }
511}
512
513/* Fetch a byte from TOK, using the string buffer. */
514
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000515static int
516buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000517 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000518}
519
520/* Unfetch a byte from TOK, using the string buffer. */
521
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000522static void
523buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000525 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000526}
527
528/* Set the readline function for TOK to ENC. For the string-based
529 tokenizer, this means to just record the encoding. */
530
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000531static int
532buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000533 tok->enc = enc;
534 return 1;
535}
536
537/* Return a UTF-8 encoding Python string object from the
538 C byte string STR, which is encoded with ENC. */
539
540static PyObject *
541translate_into_utf8(const char* str, const char* enc) {
542 PyObject *utf8;
543 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
544 if (buf == NULL)
545 return NULL;
546 utf8 = PyUnicode_AsUTF8String(buf);
547 Py_DECREF(buf);
548 return utf8;
549}
550
551/* Decode a byte string STR for use as the buffer of TOK.
552 Look for encoding declarations inside STR, and record them
553 inside TOK. */
554
555static const char *
556decode_str(const char *str, struct tok_state *tok)
557{
558 PyObject* utf8 = NULL;
559 const char *s;
560 int lineno = 0;
561 tok->enc = NULL;
562 tok->str = str;
563 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000564 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000566 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000567 if (tok->enc != NULL) {
568 utf8 = translate_into_utf8(str, tok->enc);
569 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000570 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 str = PyString_AsString(utf8);
572 }
573 for (s = str;; s++) {
574 if (*s == '\0') break;
575 else if (*s == '\n') {
576 lineno++;
577 if (lineno == 2) break;
578 }
579 }
580 tok->enc = NULL;
581 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000582 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000583 if (tok->enc != NULL) {
584 assert(utf8 == NULL);
585 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000586 if (utf8 == NULL) {
587 PyErr_Format(PyExc_SyntaxError,
588 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000589 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000590 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591 str = PyString_AsString(utf8);
592 }
593 assert(tok->decoding_buffer == NULL);
594 tok->decoding_buffer = utf8; /* CAUTION */
595 return str;
596}
597
598#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000599
600/* Set up tokenizer for string */
601
602struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000603PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000604{
605 struct tok_state *tok = tok_new();
606 if (tok == NULL)
607 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000609 if (str == NULL) {
610 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000612 }
613
Martin v. Löwis95292d62002-12-11 14:04:59 +0000614 /* XXX: constify members. */
615 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000616 return tok;
617}
618
619
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000620/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000621
622struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000623PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000624{
625 struct tok_state *tok = tok_new();
626 if (tok == NULL)
627 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000628 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000629 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000630 return NULL;
631 }
632 tok->cur = tok->inp = tok->buf;
633 tok->end = tok->buf + BUFSIZ;
634 tok->fp = fp;
635 tok->prompt = ps1;
636 tok->nextprompt = ps2;
637 return tok;
638}
639
640
641/* Free a tok_state structure */
642
643void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000644PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000645{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000646 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000647 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000648#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000649 Py_XDECREF(tok->decoding_readline);
650 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000651#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000652 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000653 PyMem_FREE(tok->buf);
654 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000655}
656
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000657#if !defined(PGEN)
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000658static int
659tok_stdin_decode(struct tok_state *tok, char **inp)
660{
661 PyObject *enc, *sysstdin, *decoded, *utf8;
662 const char *encoding;
663 char *converted;
664
665 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
666 return 0;
667 sysstdin = PySys_GetObject("stdin");
668 if (sysstdin == NULL || !PyFile_Check(sysstdin))
669 return 0;
670
671 enc = ((PyFileObject *)sysstdin)->f_encoding;
672 if (enc == NULL || !PyString_Check(enc))
673 return 0;
674 Py_INCREF(enc);
675
676 encoding = PyString_AsString(enc);
677 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
678 if (decoded == NULL)
679 goto error_clear;
680
681 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
682 Py_DECREF(decoded);
683 if (utf8 == NULL)
684 goto error_clear;
685
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000686 assert(PyBytes_Check(utf8));
687 converted = new_string(PyBytes_AS_STRING(utf8),
688 PyBytes_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000689 Py_DECREF(utf8);
690 if (converted == NULL)
691 goto error_nomem;
692
693 PyMem_FREE(*inp);
694 *inp = converted;
695 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000696 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000697 tok->encoding = new_string(encoding, strlen(encoding));
698 if (tok->encoding == NULL)
699 goto error_nomem;
700
701 Py_DECREF(enc);
702 return 0;
703
704error_nomem:
705 Py_DECREF(enc);
706 tok->done = E_NOMEM;
707 return -1;
708
709error_clear:
710 /* Fallback to iso-8859-1: for backward compatibility */
711 Py_DECREF(enc);
712 PyErr_Clear();
713 return 0;
714}
715#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000716
717/* Get next char, updating state; error code goes into tok->done */
718
719static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000720tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000721{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000722 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000723 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000724 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000725 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000726 if (tok->done != E_OK)
727 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000728 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000729 char *end = strchr(tok->inp, '\n');
730 if (end != NULL)
731 end++;
732 else {
733 end = strchr(tok->inp, '\0');
734 if (end == tok->inp) {
735 tok->done = E_EOF;
736 return EOF;
737 }
738 }
739 if (tok->start == NULL)
740 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000741 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000742 tok->lineno++;
743 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000744 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000745 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000746 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000747 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748 if (tok->nextprompt != NULL)
749 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000750 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000751 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000752 else if (*newtok == '\0') {
753 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754 tok->done = E_EOF;
755 }
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000756#if !defined(PGEN)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000757 else if (tok_stdin_decode(tok, &newtok) != 0)
758 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000759#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000760 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000761 size_t start = tok->start - tok->buf;
762 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000763 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000764 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000765 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000766 tok->lineno++;
767 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000768 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000769 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000770 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000771 tok->done = E_NOMEM;
772 return EOF;
773 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000774 tok->buf = buf;
775 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000776 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000777 strcpy(tok->buf + oldlen, newtok);
778 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000779 tok->inp = tok->buf + newlen;
780 tok->end = tok->inp + 1;
781 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000782 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 else {
784 tok->lineno++;
785 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000786 PyMem_FREE(tok->buf);
787 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000788 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000789 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000790 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000791 tok->inp = strchr(tok->buf, '\0');
792 tok->end = tok->inp + 1;
793 }
794 }
795 else {
796 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000797 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000798 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000799 if (tok->start == NULL) {
800 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000801 tok->buf = (char *)
802 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000803 if (tok->buf == NULL) {
804 tok->done = E_NOMEM;
805 return EOF;
806 }
807 tok->end = tok->buf + BUFSIZ;
808 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000809 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
810 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000811 tok->done = E_EOF;
812 done = 1;
813 }
814 else {
815 tok->done = E_OK;
816 tok->inp = strchr(tok->buf, '\0');
817 done = tok->inp[-1] == '\n';
818 }
819 }
820 else {
821 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000822 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000823 tok->done = E_EOF;
824 done = 1;
825 }
826 else
827 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000828 }
829 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000830 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000831 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000832 Py_ssize_t curstart = tok->start == NULL ? -1 :
833 tok->start - tok->buf;
834 Py_ssize_t curvalid = tok->inp - tok->buf;
835 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000836 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000837 newbuf = (char *)PyMem_REALLOC(newbuf,
838 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000839 if (newbuf == NULL) {
840 tok->done = E_NOMEM;
841 tok->cur = tok->inp;
842 return EOF;
843 }
844 tok->buf = newbuf;
845 tok->inp = tok->buf + curvalid;
846 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000847 tok->start = curstart < 0 ? NULL :
848 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000849 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000850 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000851 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000852 /* Break out early on decoding
853 errors, as tok->buf will be NULL
854 */
855 if (tok->decoding_erred)
856 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000857 /* Last line does not end in \n,
858 fake one */
859 strcpy(tok->inp, "\n");
860 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000861 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000862 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000864 if (tok->buf != NULL) {
865 tok->cur = tok->buf + cur;
866 tok->line_start = tok->cur;
867 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000868 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000869 pt = tok->inp - 2;
870 if (pt >= tok->buf && *pt == '\r') {
871 *pt++ = '\n';
872 *pt = '\0';
873 tok->inp = pt;
874 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000875 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000876 }
877 if (tok->done != E_OK) {
878 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000879 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000880 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000881 return EOF;
882 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000883 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000884 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000885}
886
887
888/* Back-up one character */
889
890static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000891tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000892{
893 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000894 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000895 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896 if (*tok->cur != c)
897 *tok->cur = c;
898 }
899}
900
901
902/* Return the token corresponding to a single character */
903
904int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000905PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000906{
907 switch (c) {
908 case '(': return LPAR;
909 case ')': return RPAR;
910 case '[': return LSQB;
911 case ']': return RSQB;
912 case ':': return COLON;
913 case ',': return COMMA;
914 case ';': return SEMI;
915 case '+': return PLUS;
916 case '-': return MINUS;
917 case '*': return STAR;
918 case '/': return SLASH;
919 case '|': return VBAR;
920 case '&': return AMPER;
921 case '<': return LESS;
922 case '>': return GREATER;
923 case '=': return EQUAL;
924 case '.': return DOT;
925 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000926 case '{': return LBRACE;
927 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000928 case '^': return CIRCUMFLEX;
929 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000930 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000931 default: return OP;
932 }
933}
934
935
Guido van Rossumfbab9051991-10-20 20:25:03 +0000936int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000937PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000938{
939 switch (c1) {
940 case '=':
941 switch (c2) {
942 case '=': return EQEQUAL;
943 }
944 break;
945 case '!':
946 switch (c2) {
947 case '=': return NOTEQUAL;
948 }
949 break;
950 case '<':
951 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000952 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000953 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000954 }
955 break;
956 case '>':
957 switch (c2) {
958 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000959 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000960 }
961 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000962 case '+':
963 switch (c2) {
964 case '=': return PLUSEQUAL;
965 }
966 break;
967 case '-':
968 switch (c2) {
969 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000970 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000971 }
972 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000973 case '*':
974 switch (c2) {
975 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000976 case '=': return STAREQUAL;
977 }
978 break;
979 case '/':
980 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000981 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000982 case '=': return SLASHEQUAL;
983 }
984 break;
985 case '|':
986 switch (c2) {
987 case '=': return VBAREQUAL;
988 }
989 break;
990 case '%':
991 switch (c2) {
992 case '=': return PERCENTEQUAL;
993 }
994 break;
995 case '&':
996 switch (c2) {
997 case '=': return AMPEREQUAL;
998 }
999 break;
1000 case '^':
1001 switch (c2) {
1002 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001003 }
1004 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001005 }
1006 return OP;
1007}
1008
Thomas Wouters434d0822000-08-24 20:11:32 +00001009int
1010PyToken_ThreeChars(int c1, int c2, int c3)
1011{
1012 switch (c1) {
1013 case '<':
1014 switch (c2) {
1015 case '<':
1016 switch (c3) {
1017 case '=':
1018 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001019 }
1020 break;
1021 }
1022 break;
1023 case '>':
1024 switch (c2) {
1025 case '>':
1026 switch (c3) {
1027 case '=':
1028 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001029 }
1030 break;
1031 }
1032 break;
1033 case '*':
1034 switch (c2) {
1035 case '*':
1036 switch (c3) {
1037 case '=':
1038 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001039 }
1040 break;
1041 }
1042 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001043 case '/':
1044 switch (c2) {
1045 case '/':
1046 switch (c3) {
1047 case '=':
1048 return DOUBLESLASHEQUAL;
1049 }
1050 break;
1051 }
1052 break;
Georg Brandldde00282007-03-18 19:01:53 +00001053 case '.':
1054 switch (c2) {
1055 case '.':
1056 switch (c3) {
1057 case '.':
1058 return ELLIPSIS;
1059 }
1060 break;
1061 }
1062 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001063 }
1064 return OP;
1065}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001066
Guido van Rossum926f13a1998-04-09 21:38:06 +00001067static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001068indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001069{
1070 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001071 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001072 tok->cur = tok->inp;
1073 return 1;
1074 }
1075 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001076 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1077 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001078 tok->altwarning = 0;
1079 }
1080 return 0;
1081}
1082
1083
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001084/* Get next token, after space stripping etc. */
1085
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001086static int
1087tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001088{
1089 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001090 int blankline;
1091
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001092 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001093 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001094 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001095 blankline = 0;
1096
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001097 /* Get indentation level */
1098 if (tok->atbol) {
1099 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001100 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001102 for (;;) {
1103 c = tok_nextc(tok);
1104 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001105 col++, altcol++;
1106 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001107 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001108 altcol = (altcol/tok->alttabsize + 1)
1109 * tok->alttabsize;
1110 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001111 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001112 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001113 else
1114 break;
1115 }
1116 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001117 if (c == '#' || c == '\n') {
1118 /* Lines with only whitespace and/or comments
1119 shouldn't affect the indentation and are
1120 not passed to the parser as NEWLINE tokens,
1121 except *totally* empty lines in interactive
1122 mode, which signal the end of a command group. */
1123 if (col == 0 && c == '\n' && tok->prompt != NULL)
1124 blankline = 0; /* Let it through */
1125 else
1126 blankline = 1; /* Ignore completely */
1127 /* We can't jump back right here since we still
1128 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001129 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001130 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001131 if (col == tok->indstack[tok->indent]) {
1132 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001133 if (altcol != tok->altindstack[tok->indent]) {
1134 if (indenterror(tok))
1135 return ERRORTOKEN;
1136 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001137 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001138 else if (col > tok->indstack[tok->indent]) {
1139 /* Indent -- always one */
1140 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001141 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001142 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001143 return ERRORTOKEN;
1144 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001145 if (altcol <= tok->altindstack[tok->indent]) {
1146 if (indenterror(tok))
1147 return ERRORTOKEN;
1148 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001149 tok->pendin++;
1150 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001151 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001152 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153 else /* col < tok->indstack[tok->indent] */ {
1154 /* Dedent -- any number, must be consistent */
1155 while (tok->indent > 0 &&
1156 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001157 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001158 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001159 }
1160 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001161 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001162 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001163 return ERRORTOKEN;
1164 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001165 if (altcol != tok->altindstack[tok->indent]) {
1166 if (indenterror(tok))
1167 return ERRORTOKEN;
1168 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001169 }
1170 }
1171 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001172
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001173 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001174
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001175 /* Return pending indents/dedents */
1176 if (tok->pendin != 0) {
1177 if (tok->pendin < 0) {
1178 tok->pendin++;
1179 return DEDENT;
1180 }
1181 else {
1182 tok->pendin--;
1183 return INDENT;
1184 }
1185 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001186
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001187 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001188 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189 /* Skip spaces */
1190 do {
1191 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001192 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001193
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001194 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001195 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001196
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001197 /* Skip comment */
1198 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001199 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001200 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001201
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001202 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001203 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001205 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001206
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001207 /* Identifier (most frequent token!) */
1208 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001209 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001210 switch (c) {
1211 case 'r':
1212 case 'R':
1213 c = tok_nextc(tok);
1214 if (c == '"' || c == '\'')
1215 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001216 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001217 case 'b':
1218 case 'B':
1219 c = tok_nextc(tok);
1220 if (c == 'r' || c == 'R')
1221 c = tok_nextc(tok);
1222 if (c == '"' || c == '\'')
1223 goto letter_quote;
1224 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001225 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001226 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001227 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001228 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001230 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001231 *p_end = tok->cur;
1232 return NAME;
1233 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001234
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001235 /* Newline */
1236 if (c == '\n') {
1237 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001238 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001239 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001240 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001241 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001242 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001243 return NEWLINE;
1244 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001245
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001246 /* Period or number starting with period? */
1247 if (c == '.') {
1248 c = tok_nextc(tok);
1249 if (isdigit(c)) {
1250 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001251 } else if (c == '.') {
1252 c = tok_nextc(tok);
1253 if (c == '.') {
1254 *p_start = tok->start;
1255 *p_end = tok->cur;
1256 return ELLIPSIS;
1257 } else {
1258 tok_backup(tok, c);
1259 }
1260 tok_backup(tok, '.');
1261 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001262 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001263 }
Georg Brandldde00282007-03-18 19:01:53 +00001264 *p_start = tok->start;
1265 *p_end = tok->cur;
1266 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001267 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001268
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 /* Number */
1270 if (isdigit(c)) {
1271 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001272 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273 c = tok_nextc(tok);
1274 if (c == '.')
1275 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001276#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001277 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001278 goto imaginary;
1279#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 if (c == 'x' || c == 'X') {
1281 /* Hex */
1282 do {
1283 c = tok_nextc(tok);
1284 } while (isxdigit(c));
1285 }
1286 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001287 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 /* Octal; c is first char of it */
1289 /* There's no 'isoctdigit' macro, sigh */
1290 while ('0' <= c && c < '8') {
1291 c = tok_nextc(tok);
1292 }
Tim Petersd507dab2001-08-30 20:51:59 +00001293 if (isdigit(c)) {
1294 found_decimal = 1;
1295 do {
1296 c = tok_nextc(tok);
1297 } while (isdigit(c));
1298 }
1299 if (c == '.')
1300 goto fraction;
1301 else if (c == 'e' || c == 'E')
1302 goto exponent;
1303#ifndef WITHOUT_COMPLEX
1304 else if (c == 'j' || c == 'J')
1305 goto imaginary;
1306#endif
1307 else if (found_decimal) {
1308 tok->done = E_TOKEN;
1309 tok_backup(tok, c);
1310 return ERRORTOKEN;
1311 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001312 }
1313 }
1314 else {
1315 /* Decimal */
1316 do {
1317 c = tok_nextc(tok);
1318 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001319 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001320 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001321 if (c == '.') {
1322 fraction:
1323 /* Fraction */
1324 do {
1325 c = tok_nextc(tok);
1326 } while (isdigit(c));
1327 }
1328 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001329 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001330 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001331 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001332 if (c == '+' || c == '-')
1333 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001334 if (!isdigit(c)) {
1335 tok->done = E_TOKEN;
1336 tok_backup(tok, c);
1337 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001338 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001339 do {
1340 c = tok_nextc(tok);
1341 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001342 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001343#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001344 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001345 /* Imaginary part */
1346 imaginary:
1347 c = tok_nextc(tok);
1348#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001349 }
1350 }
1351 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001352 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 *p_end = tok->cur;
1354 return NUMBER;
1355 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001356
1357 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001358 /* String */
1359 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001360 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001361 int quote = c;
1362 int triple = 0;
1363 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 for (;;) {
1365 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001366 if (c == '\n') {
1367 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001368 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001369 tok_backup(tok, c);
1370 return ERRORTOKEN;
1371 }
1372 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001373 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001374 }
1375 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001376 if (triple)
1377 tok->done = E_EOFS;
1378 else
1379 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001380 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001381 return ERRORTOKEN;
1382 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001383 else if (c == quote) {
1384 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001385 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001386 c = tok_nextc(tok);
1387 if (c == quote) {
1388 triple = 1;
1389 tripcount = 0;
1390 continue;
1391 }
1392 tok_backup(tok, c);
1393 }
1394 if (!triple || tripcount == 3)
1395 break;
1396 }
1397 else if (c == '\\') {
1398 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001399 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001400 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001401 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001402 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 return ERRORTOKEN;
1404 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001405 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001406 else
1407 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001408 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001409 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001410 *p_end = tok->cur;
1411 return STRING;
1412 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001413
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414 /* Line continuation */
1415 if (c == '\\') {
1416 c = tok_nextc(tok);
1417 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001418 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001419 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001420 return ERRORTOKEN;
1421 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001422 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423 goto again; /* Read next line */
1424 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001425
Guido van Rossumfbab9051991-10-20 20:25:03 +00001426 /* Check for two-character token */
1427 {
1428 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001429 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001430 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001431 int c3 = tok_nextc(tok);
1432 int token3 = PyToken_ThreeChars(c, c2, c3);
1433 if (token3 != OP) {
1434 token = token3;
1435 } else {
1436 tok_backup(tok, c3);
1437 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001438 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001439 *p_end = tok->cur;
1440 return token;
1441 }
1442 tok_backup(tok, c2);
1443 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001444
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001445 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001446 switch (c) {
1447 case '(':
1448 case '[':
1449 case '{':
1450 tok->level++;
1451 break;
1452 case ')':
1453 case ']':
1454 case '}':
1455 tok->level--;
1456 break;
1457 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001458
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001459 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001460 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001461 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001462 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463}
1464
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001465int
1466PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1467{
1468 int result = tok_get(tok, p_start, p_end);
1469 if (tok->decoding_erred) {
1470 result = ERRORTOKEN;
1471 tok->done = E_DECODE;
1472 }
1473 return result;
1474}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001475
#ifdef Py_DEBUG

/* Debugging aid: print a token to stdout.

   Writes the name of token type `type` (looked up in
   _PyParser_TokenNames) and, for NAME, NUMBER, STRING and OP tokens,
   the token text delimited by start and end in parentheses. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif /* Py_DEBUG */