blob: f3eeb2c2521843379074540773d25634e7297a76 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
Martin v. Löwis566f6af2002-10-26 14:39:10 +000021extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000022/* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
Guido van Rossum4fe87291992-02-26 15:24:44 +000026/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000027#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000028
/* Py_CHARMASK(c): turn a char that may be signed into a nonnegative int,
   e.g. before handing it to a <ctype.h> classification function. */
/* XXX This assumes characters are 8 bits wide */
#if defined(__CHAR_UNSIGNED__)
#  define Py_CHARMASK(c) (c)
#else
#  define Py_CHARMASK(c) ((c) & 0xff)
#endif
36
Guido van Rossum3f5da241990-12-20 15:06:42 +000037/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000038static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000041
/* Token names.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* End marker, atoms and layout tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* Comparison operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* Bitwise, shift and power operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment operators */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Later additions */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",

    /* Generic and bookkeeping entries */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
102
103
104/* Create and initialize a new tok_state structure */
105
106static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000107tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000108{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000111 if (tok == NULL)
112 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000123 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000124 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000125 tok->altwarning = 1;
126 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000132 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000133 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000134#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000137#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000138 return tok;
139}
140
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141#ifdef PGEN
142
143static char *
144decoding_fgets(char *s, int size, struct tok_state *tok)
145{
146 return fgets(s, size, tok->fp);
147}
148
149static int
150decoding_feof(struct tok_state *tok)
151{
152 return feof(tok->fp);
153}
154
/* PGEN build: the input string is used as-is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
160
161#else /* PGEN */
162
163static char *
164error_ret(struct tok_state *tok) /* XXX */
165{
166 tok->decoding_erred = 1;
167 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000168 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000169 tok->buf = NULL;
170 return NULL; /* as if it were EOF */
171}
172
173static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000174new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000176 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
180 }
181 return result;
182}
183
/* Map the common spellings of UTF-8 and Latin-1 onto a canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is replaced by '-' before comparing.  Returns the string
   literal "utf-8" or "iso-8859-1" for a recognised spelling (including
   variants that continue with "-..."), and S itself, unchanged, for
   anything else.  The caller can tell the two cases apart by comparing
   the result with S. */
static char *
get_normal_name(char *s) /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing a negative value to
           tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";

    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";

    return s;
}
206
207/* Return the coding spec in S, or NULL if none is found. */
208
209static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000210get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000211{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000212 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000233 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000234 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q = get_normal_name(r);
240 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000241 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000242 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000243 }
244 return r;
245 }
246 }
247 }
248 return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252 invoke the set_readline function for the new encoding.
253 This function receives the tok_state and the new encoding.
254 Return 1 on success, 0 on failure. */
255
256static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 int set_readline(struct tok_state *, const char *))
259{
Tim Peters17db21f2002-09-03 15:39:58 +0000260 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000261 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000262
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000263 if (tok->cont_line)
264 /* It's a continuation line, so it can't be a coding spec. */
265 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000266 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 if (cs != NULL) {
268 tok->read_coding_spec = 1;
269 if (tok->encoding == NULL) {
270 assert(tok->decoding_state == 1); /* raw */
271 if (strcmp(cs, "utf-8") == 0 ||
272 strcmp(cs, "iso-8859-1") == 0) {
273 tok->encoding = cs;
274 } else {
275 r = set_readline(tok, cs);
276 if (r) {
277 tok->encoding = cs;
278 tok->decoding_state = -1;
279 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000280 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 }
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 }
287 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 return r;
295}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
301static int
302check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
306{
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314#if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325#endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
329 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000330 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000331 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
338}
339
340/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000341 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000343 On entry, tok->decoding_buffer will be one of:
344 1) NULL: need to call tok->decoding_readline to get a new line
345 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
346 stored the result in tok->decoding_buffer
347 3) PyStringObject *: previous call to fp_readl did not have enough room
348 (in the s buffer) to copy entire contents of the line read
349 by tok->decoding_readline. tok->decoding_buffer has the overflow.
350 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000352 reached): see tok_nextc and its calls to decoding_fgets.
353*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000354
355static char *
356fp_readl(char *s, int size, struct tok_state *tok)
357{
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000358 PyObject* bufobj = tok->decoding_buffer;
359 const char *buf;
360 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000361
362 /* Ask for one less byte so we can terminate it */
363 assert(size > 0);
364 size--;
365
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000366 if (bufobj == NULL) {
367 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
368 if (bufobj == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000369 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000371 if (PyObject_AsCharBuffer(bufobj, &buf, &buflen) < 0)
372 return error_ret(tok);
373 if (buflen > size) {
374 tok->decoding_buffer = PyBytes_FromStringAndSize(buf+size,
375 buflen-size);
376 if (tok->decoding_buffer == NULL)
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000377 return error_ret(tok);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000378 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000379 }
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000380 memcpy(s, buf, buflen);
381 s[buflen] = '\0';
382 if (buflen == 0) return NULL; /* EOF */
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000383 return s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384}
385
386/* Set the readline function for TOK to a StreamReader's
387 readline function. The StreamReader is named ENC.
388
389 This function is called from check_bom and check_coding_spec.
390
391 ENC is usually identical to the future value of tok->encoding,
392 except for the (currently unsupported) case of UTF-16.
393
394 Return 1 on success, 0 on failure. */
395
396static int
397fp_setreadl(struct tok_state *tok, const char* enc)
398{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000399 PyObject *readline = NULL, *stream = NULL, *io = NULL;
400 int ok = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000401
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000402 io = PyImport_ImportModule("io");
403 if (io == NULL)
404 goto cleanup;
405
406 stream = PyObject_CallMethod(io, "open", "ssis",
407 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000408 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000409 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000410
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000411 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000412 if (readline == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000413 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000414
415 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000416 ok = 1;
417
418 cleanup:
419 Py_XDECREF(stream);
420 Py_XDECREF(io);
421 return ok;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000422}
423
424/* Fetch the next byte from TOK. */
425
426static int fp_getc(struct tok_state *tok) {
427 return getc(tok->fp);
428}
429
430/* Unfetch the last byte back into TOK. */
431
432static void fp_ungetc(int c, struct tok_state *tok) {
433 ungetc(c, tok->fp);
434}
435
436/* Read a line of input from TOK. Determine encoding
437 if necessary. */
438
439static char *
440decoding_fgets(char *s, int size, struct tok_state *tok)
441{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000442 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000443 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000444 for (;;) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000445 if (tok->decoding_state < 0) {
446 /* We already have a codec associated with
447 this input. */
448 line = fp_readl(s, size, tok);
449 break;
450 } else if (tok->decoding_state > 0) {
451 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000452 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000453 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000454 break;
455 } else {
456 /* We have not yet determined the encoding.
457 If an encoding is found, use the file-pointer
458 reader functions from now on. */
459 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
460 return error_ret(tok);
461 assert(tok->decoding_state != 0);
462 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000463 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
465 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
466 return error_ret(tok);
467 }
468 }
469#ifndef PGEN
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000470 /* The default encoding is ASCII, so make sure we don't have any
471 non-ASCII bytes in it. */
472 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000473 unsigned char *c;
Jack Jansencf0a2cf2002-08-05 14:14:05 +0000474 for (c = (unsigned char *)line; *c; c++)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475 if (*c > 127) {
476 badchar = *c;
477 break;
478 }
479 }
480 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000481 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000482 /* Need to add 1 to the line number, since this line
483 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000484 sprintf(buf,
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000485 "Non-ASCII character '\\x%.2x' "
486 "in file %.200s on line %i, "
487 "but no encoding declared; "
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000488 "see http://www.python.org/peps/pep-0263.html for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000489 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000490 PyErr_SetString(PyExc_SyntaxError, buf);
491 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000492 }
493#endif
494 return line;
495}
496
497static int
498decoding_feof(struct tok_state *tok)
499{
500 if (tok->decoding_state >= 0) {
501 return feof(tok->fp);
502 } else {
503 PyObject* buf = tok->decoding_buffer;
504 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000505 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000506 if (buf == NULL) {
507 error_ret(tok);
508 return 1;
509 } else {
510 tok->decoding_buffer = buf;
511 }
512 }
513 return PyObject_Length(buf) == 0;
514 }
515}
516
517/* Fetch a byte from TOK, using the string buffer. */
518
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000519static int
520buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000521 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000522}
523
524/* Unfetch a byte from TOK, using the string buffer. */
525
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000526static void
527buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000529 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000530}
531
532/* Set the readline function for TOK to ENC. For the string-based
533 tokenizer, this means to just record the encoding. */
534
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000535static int
536buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537 tok->enc = enc;
538 return 1;
539}
540
541/* Return a UTF-8 encoding Python string object from the
542 C byte string STR, which is encoded with ENC. */
543
544static PyObject *
545translate_into_utf8(const char* str, const char* enc) {
546 PyObject *utf8;
547 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
548 if (buf == NULL)
549 return NULL;
550 utf8 = PyUnicode_AsUTF8String(buf);
551 Py_DECREF(buf);
552 return utf8;
553}
554
555/* Decode a byte string STR for use as the buffer of TOK.
556 Look for encoding declarations inside STR, and record them
557 inside TOK. */
558
559static const char *
560decode_str(const char *str, struct tok_state *tok)
561{
562 PyObject* utf8 = NULL;
563 const char *s;
564 int lineno = 0;
565 tok->enc = NULL;
566 tok->str = str;
567 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000568 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000569 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000570 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 if (tok->enc != NULL) {
572 utf8 = translate_into_utf8(str, tok->enc);
573 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000574 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000575 str = PyString_AsString(utf8);
576 }
577 for (s = str;; s++) {
578 if (*s == '\0') break;
579 else if (*s == '\n') {
580 lineno++;
581 if (lineno == 2) break;
582 }
583 }
584 tok->enc = NULL;
585 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000586 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 if (tok->enc != NULL) {
588 assert(utf8 == NULL);
589 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000590 if (utf8 == NULL) {
591 PyErr_Format(PyExc_SyntaxError,
592 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000593 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000594 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000595 str = PyString_AsString(utf8);
596 }
597 assert(tok->decoding_buffer == NULL);
598 tok->decoding_buffer = utf8; /* CAUTION */
599 return str;
600}
601
602#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000603
604/* Set up tokenizer for string */
605
606struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000607PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000608{
609 struct tok_state *tok = tok_new();
610 if (tok == NULL)
611 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000612 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000613 if (str == NULL) {
614 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000616 }
617
Martin v. Löwis95292d62002-12-11 14:04:59 +0000618 /* XXX: constify members. */
619 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000620 return tok;
621}
622
623
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000624/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000625
626struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000627PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000628{
629 struct tok_state *tok = tok_new();
630 if (tok == NULL)
631 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000632 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000633 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000634 return NULL;
635 }
636 tok->cur = tok->inp = tok->buf;
637 tok->end = tok->buf + BUFSIZ;
638 tok->fp = fp;
639 tok->prompt = ps1;
640 tok->nextprompt = ps2;
641 return tok;
642}
643
644
645/* Free a tok_state structure */
646
647void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000648PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000649{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000650 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000651 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000652#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000653 Py_XDECREF(tok->decoding_readline);
654 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000655#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000656 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000657 PyMem_FREE(tok->buf);
658 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000659}
660
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000661#if !defined(PGEN)
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000662static int
663tok_stdin_decode(struct tok_state *tok, char **inp)
664{
665 PyObject *enc, *sysstdin, *decoded, *utf8;
666 const char *encoding;
667 char *converted;
668
669 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
670 return 0;
671 sysstdin = PySys_GetObject("stdin");
672 if (sysstdin == NULL || !PyFile_Check(sysstdin))
673 return 0;
674
675 enc = ((PyFileObject *)sysstdin)->f_encoding;
676 if (enc == NULL || !PyString_Check(enc))
677 return 0;
678 Py_INCREF(enc);
679
680 encoding = PyString_AsString(enc);
681 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
682 if (decoded == NULL)
683 goto error_clear;
684
685 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
686 Py_DECREF(decoded);
687 if (utf8 == NULL)
688 goto error_clear;
689
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000690 assert(PyBytes_Check(utf8));
691 converted = new_string(PyBytes_AS_STRING(utf8),
692 PyBytes_GET_SIZE(utf8));
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000693 Py_DECREF(utf8);
694 if (converted == NULL)
695 goto error_nomem;
696
697 PyMem_FREE(*inp);
698 *inp = converted;
699 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000700 PyMem_FREE(tok->encoding);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000701 tok->encoding = new_string(encoding, strlen(encoding));
702 if (tok->encoding == NULL)
703 goto error_nomem;
704
705 Py_DECREF(enc);
706 return 0;
707
708error_nomem:
709 Py_DECREF(enc);
710 tok->done = E_NOMEM;
711 return -1;
712
713error_clear:
714 /* Fallback to iso-8859-1: for backward compatibility */
715 Py_DECREF(enc);
716 PyErr_Clear();
717 return 0;
718}
719#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000720
721/* Get next char, updating state; error code goes into tok->done */
722
723static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000724tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000725{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000726 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000727 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000728 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000729 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000730 if (tok->done != E_OK)
731 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000732 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000733 char *end = strchr(tok->inp, '\n');
734 if (end != NULL)
735 end++;
736 else {
737 end = strchr(tok->inp, '\0');
738 if (end == tok->inp) {
739 tok->done = E_EOF;
740 return EOF;
741 }
742 }
743 if (tok->start == NULL)
744 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000745 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000746 tok->lineno++;
747 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000748 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000751 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000752 if (tok->nextprompt != NULL)
753 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000754 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000755 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000756 else if (*newtok == '\0') {
757 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000758 tok->done = E_EOF;
759 }
Guido van Rossum8d30cc02007-05-03 17:49:24 +0000760#if !defined(PGEN)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000761 else if (tok_stdin_decode(tok, &newtok) != 0)
762 PyMem_FREE(newtok);
Hye-Shik Chang7df44b32004-08-04 17:36:41 +0000763#endif
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000764 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000765 size_t start = tok->start - tok->buf;
766 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000767 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000768 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000769 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000770 tok->lineno++;
771 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000772 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000773 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000774 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000775 tok->done = E_NOMEM;
776 return EOF;
777 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000778 tok->buf = buf;
779 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000780 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000781 strcpy(tok->buf + oldlen, newtok);
782 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000783 tok->inp = tok->buf + newlen;
784 tok->end = tok->inp + 1;
785 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000786 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000787 else {
788 tok->lineno++;
789 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000790 PyMem_FREE(tok->buf);
791 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000792 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000793 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000794 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000795 tok->inp = strchr(tok->buf, '\0');
796 tok->end = tok->inp + 1;
797 }
798 }
799 else {
800 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000802 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000803 if (tok->start == NULL) {
804 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000805 tok->buf = (char *)
806 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000807 if (tok->buf == NULL) {
808 tok->done = E_NOMEM;
809 return EOF;
810 }
811 tok->end = tok->buf + BUFSIZ;
812 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000813 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
814 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000815 tok->done = E_EOF;
816 done = 1;
817 }
818 else {
819 tok->done = E_OK;
820 tok->inp = strchr(tok->buf, '\0');
821 done = tok->inp[-1] == '\n';
822 }
823 }
824 else {
825 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000826 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000827 tok->done = E_EOF;
828 done = 1;
829 }
830 else
831 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000832 }
833 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000834 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000835 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000836 Py_ssize_t curstart = tok->start == NULL ? -1 :
837 tok->start - tok->buf;
838 Py_ssize_t curvalid = tok->inp - tok->buf;
839 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000840 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000841 newbuf = (char *)PyMem_REALLOC(newbuf,
842 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000843 if (newbuf == NULL) {
844 tok->done = E_NOMEM;
845 tok->cur = tok->inp;
846 return EOF;
847 }
848 tok->buf = newbuf;
849 tok->inp = tok->buf + curvalid;
850 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000851 tok->start = curstart < 0 ? NULL :
852 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000853 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000854 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000855 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000856 /* Break out early on decoding
857 errors, as tok->buf will be NULL
858 */
859 if (tok->decoding_erred)
860 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000861 /* Last line does not end in \n,
862 fake one */
863 strcpy(tok->inp, "\n");
864 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000865 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000866 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000867 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000868 if (tok->buf != NULL) {
869 tok->cur = tok->buf + cur;
870 tok->line_start = tok->cur;
871 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000872 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000873 pt = tok->inp - 2;
874 if (pt >= tok->buf && *pt == '\r') {
875 *pt++ = '\n';
876 *pt = '\0';
877 tok->inp = pt;
878 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000879 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000880 }
881 if (tok->done != E_OK) {
882 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000883 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000884 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000885 return EOF;
886 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000887 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000888 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000889}
890
891
892/* Back-up one character */
893
894static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000895tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896{
897 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000898 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000899 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000900 if (*tok->cur != c)
901 *tok->cur = c;
902 }
903}
904
905
906/* Return the token corresponding to a single character */
907
908int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000909PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000910{
911 switch (c) {
912 case '(': return LPAR;
913 case ')': return RPAR;
914 case '[': return LSQB;
915 case ']': return RSQB;
916 case ':': return COLON;
917 case ',': return COMMA;
918 case ';': return SEMI;
919 case '+': return PLUS;
920 case '-': return MINUS;
921 case '*': return STAR;
922 case '/': return SLASH;
923 case '|': return VBAR;
924 case '&': return AMPER;
925 case '<': return LESS;
926 case '>': return GREATER;
927 case '=': return EQUAL;
928 case '.': return DOT;
929 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000930 case '{': return LBRACE;
931 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000932 case '^': return CIRCUMFLEX;
933 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000934 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000935 default: return OP;
936 }
937}
938
939
Guido van Rossumfbab9051991-10-20 20:25:03 +0000940int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000941PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000942{
943 switch (c1) {
944 case '=':
945 switch (c2) {
946 case '=': return EQEQUAL;
947 }
948 break;
949 case '!':
950 switch (c2) {
951 case '=': return NOTEQUAL;
952 }
953 break;
954 case '<':
955 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +0000956 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000957 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000958 }
959 break;
960 case '>':
961 switch (c2) {
962 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000963 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +0000964 }
965 break;
Thomas Wouters434d0822000-08-24 20:11:32 +0000966 case '+':
967 switch (c2) {
968 case '=': return PLUSEQUAL;
969 }
970 break;
971 case '-':
972 switch (c2) {
973 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +0000974 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +0000975 }
976 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +0000977 case '*':
978 switch (c2) {
979 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +0000980 case '=': return STAREQUAL;
981 }
982 break;
983 case '/':
984 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +0000985 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +0000986 case '=': return SLASHEQUAL;
987 }
988 break;
989 case '|':
990 switch (c2) {
991 case '=': return VBAREQUAL;
992 }
993 break;
994 case '%':
995 switch (c2) {
996 case '=': return PERCENTEQUAL;
997 }
998 break;
999 case '&':
1000 switch (c2) {
1001 case '=': return AMPEREQUAL;
1002 }
1003 break;
1004 case '^':
1005 switch (c2) {
1006 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001007 }
1008 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001009 }
1010 return OP;
1011}
1012
Thomas Wouters434d0822000-08-24 20:11:32 +00001013int
1014PyToken_ThreeChars(int c1, int c2, int c3)
1015{
1016 switch (c1) {
1017 case '<':
1018 switch (c2) {
1019 case '<':
1020 switch (c3) {
1021 case '=':
1022 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001023 }
1024 break;
1025 }
1026 break;
1027 case '>':
1028 switch (c2) {
1029 case '>':
1030 switch (c3) {
1031 case '=':
1032 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001033 }
1034 break;
1035 }
1036 break;
1037 case '*':
1038 switch (c2) {
1039 case '*':
1040 switch (c3) {
1041 case '=':
1042 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001043 }
1044 break;
1045 }
1046 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001047 case '/':
1048 switch (c2) {
1049 case '/':
1050 switch (c3) {
1051 case '=':
1052 return DOUBLESLASHEQUAL;
1053 }
1054 break;
1055 }
1056 break;
Georg Brandldde00282007-03-18 19:01:53 +00001057 case '.':
1058 switch (c2) {
1059 case '.':
1060 switch (c3) {
1061 case '.':
1062 return ELLIPSIS;
1063 }
1064 break;
1065 }
1066 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001067 }
1068 return OP;
1069}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001070
Guido van Rossum926f13a1998-04-09 21:38:06 +00001071static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001072indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001073{
1074 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001075 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001076 tok->cur = tok->inp;
1077 return 1;
1078 }
1079 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001080 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1081 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001082 tok->altwarning = 0;
1083 }
1084 return 0;
1085}
1086
1087
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001088/* Get next token, after space stripping etc. */
1089
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001090static int
1091tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001092{
1093 register int c;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001094 int blankline;
1095
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001096 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001097 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001098 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001099 blankline = 0;
1100
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101 /* Get indentation level */
1102 if (tok->atbol) {
1103 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001104 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001105 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001106 for (;;) {
1107 c = tok_nextc(tok);
1108 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001109 col++, altcol++;
1110 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001111 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001112 altcol = (altcol/tok->alttabsize + 1)
1113 * tok->alttabsize;
1114 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001115 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001116 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001117 else
1118 break;
1119 }
1120 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001121 if (c == '#' || c == '\n') {
1122 /* Lines with only whitespace and/or comments
1123 shouldn't affect the indentation and are
1124 not passed to the parser as NEWLINE tokens,
1125 except *totally* empty lines in interactive
1126 mode, which signal the end of a command group. */
1127 if (col == 0 && c == '\n' && tok->prompt != NULL)
1128 blankline = 0; /* Let it through */
1129 else
1130 blankline = 1; /* Ignore completely */
1131 /* We can't jump back right here since we still
1132 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001133 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001134 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001135 if (col == tok->indstack[tok->indent]) {
1136 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001137 if (altcol != tok->altindstack[tok->indent]) {
1138 if (indenterror(tok))
1139 return ERRORTOKEN;
1140 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001141 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001142 else if (col > tok->indstack[tok->indent]) {
1143 /* Indent -- always one */
1144 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001145 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001146 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001147 return ERRORTOKEN;
1148 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001149 if (altcol <= tok->altindstack[tok->indent]) {
1150 if (indenterror(tok))
1151 return ERRORTOKEN;
1152 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001153 tok->pendin++;
1154 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001155 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001156 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001157 else /* col < tok->indstack[tok->indent] */ {
1158 /* Dedent -- any number, must be consistent */
1159 while (tok->indent > 0 &&
1160 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001161 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001162 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001163 }
1164 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001165 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001166 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001167 return ERRORTOKEN;
1168 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001169 if (altcol != tok->altindstack[tok->indent]) {
1170 if (indenterror(tok))
1171 return ERRORTOKEN;
1172 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001173 }
1174 }
1175 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001176
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001177 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001178
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001179 /* Return pending indents/dedents */
1180 if (tok->pendin != 0) {
1181 if (tok->pendin < 0) {
1182 tok->pendin++;
1183 return DEDENT;
1184 }
1185 else {
1186 tok->pendin--;
1187 return INDENT;
1188 }
1189 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001190
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001191 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001192 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001193 /* Skip spaces */
1194 do {
1195 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001196 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001197
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001198 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001199 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001200
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001201 /* Skip comment */
1202 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001203 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001205
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001206 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001207 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001208 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001209 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001210
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001211 /* Identifier (most frequent token!) */
1212 if (isalpha(c) || c == '_') {
Guido van Rossum86016cb2000-03-10 22:56:54 +00001213 /* Process r"", u"" and ur"" */
Guido van Rossum5026cb41997-04-25 17:32:00 +00001214 switch (c) {
1215 case 'r':
1216 case 'R':
1217 c = tok_nextc(tok);
1218 if (c == '"' || c == '\'')
1219 goto letter_quote;
Guido van Rossum86016cb2000-03-10 22:56:54 +00001220 break;
Thomas Wouters00e41de2007-02-23 19:56:57 +00001221 case 'b':
1222 case 'B':
1223 c = tok_nextc(tok);
1224 if (c == 'r' || c == 'R')
1225 c = tok_nextc(tok);
1226 if (c == '"' || c == '\'')
1227 goto letter_quote;
1228 break;
Guido van Rossum5026cb41997-04-25 17:32:00 +00001229 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001230 while (isalnum(c) || c == '_') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001231 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001232 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001234 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001235 *p_end = tok->cur;
1236 return NAME;
1237 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001238
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001239 /* Newline */
1240 if (c == '\n') {
1241 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001242 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001243 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001244 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001245 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001246 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001247 return NEWLINE;
1248 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001249
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001250 /* Period or number starting with period? */
1251 if (c == '.') {
1252 c = tok_nextc(tok);
1253 if (isdigit(c)) {
1254 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001255 } else if (c == '.') {
1256 c = tok_nextc(tok);
1257 if (c == '.') {
1258 *p_start = tok->start;
1259 *p_end = tok->cur;
1260 return ELLIPSIS;
1261 } else {
1262 tok_backup(tok, c);
1263 }
1264 tok_backup(tok, '.');
1265 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001266 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001267 }
Georg Brandldde00282007-03-18 19:01:53 +00001268 *p_start = tok->start;
1269 *p_end = tok->cur;
1270 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001271 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001272
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273 /* Number */
1274 if (isdigit(c)) {
1275 if (c == '0') {
Tim Petersd507dab2001-08-30 20:51:59 +00001276 /* Hex or octal -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 c = tok_nextc(tok);
1278 if (c == '.')
1279 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001280#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001281 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001282 goto imaginary;
1283#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 if (c == 'x' || c == 'X') {
1285 /* Hex */
1286 do {
1287 c = tok_nextc(tok);
1288 } while (isxdigit(c));
1289 }
1290 else {
Tim Petersd507dab2001-08-30 20:51:59 +00001291 int found_decimal = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 /* Octal; c is first char of it */
1293 /* There's no 'isoctdigit' macro, sigh */
1294 while ('0' <= c && c < '8') {
1295 c = tok_nextc(tok);
1296 }
Tim Petersd507dab2001-08-30 20:51:59 +00001297 if (isdigit(c)) {
1298 found_decimal = 1;
1299 do {
1300 c = tok_nextc(tok);
1301 } while (isdigit(c));
1302 }
1303 if (c == '.')
1304 goto fraction;
1305 else if (c == 'e' || c == 'E')
1306 goto exponent;
1307#ifndef WITHOUT_COMPLEX
1308 else if (c == 'j' || c == 'J')
1309 goto imaginary;
1310#endif
1311 else if (found_decimal) {
1312 tok->done = E_TOKEN;
1313 tok_backup(tok, c);
1314 return ERRORTOKEN;
1315 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 }
1317 }
1318 else {
1319 /* Decimal */
1320 do {
1321 c = tok_nextc(tok);
1322 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001323 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001324 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001325 if (c == '.') {
1326 fraction:
1327 /* Fraction */
1328 do {
1329 c = tok_nextc(tok);
1330 } while (isdigit(c));
1331 }
1332 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001333 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001334 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001335 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001336 if (c == '+' || c == '-')
1337 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001338 if (!isdigit(c)) {
1339 tok->done = E_TOKEN;
1340 tok_backup(tok, c);
1341 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001342 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001343 do {
1344 c = tok_nextc(tok);
1345 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001347#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001348 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001349 /* Imaginary part */
1350 imaginary:
1351 c = tok_nextc(tok);
1352#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 }
1354 }
1355 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001356 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 *p_end = tok->cur;
1358 return NUMBER;
1359 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001360
1361 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001362 /* String */
1363 if (c == '\'' || c == '"') {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001364 Py_ssize_t quote2 = tok->cur - tok->start + 1;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001365 int quote = c;
1366 int triple = 0;
1367 int tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 for (;;) {
1369 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001370 if (c == '\n') {
1371 if (!triple) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001372 tok->done = E_EOLS;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001373 tok_backup(tok, c);
1374 return ERRORTOKEN;
1375 }
1376 tripcount = 0;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001377 tok->cont_line = 1; /* multiline string. */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001378 }
1379 else if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001380 if (triple)
1381 tok->done = E_EOFS;
1382 else
1383 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001384 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 return ERRORTOKEN;
1386 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001387 else if (c == quote) {
1388 tripcount++;
Guido van Rossum35685241998-02-16 15:42:50 +00001389 if (tok->cur - tok->start == quote2) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001390 c = tok_nextc(tok);
1391 if (c == quote) {
1392 triple = 1;
1393 tripcount = 0;
1394 continue;
1395 }
1396 tok_backup(tok, c);
1397 }
1398 if (!triple || tripcount == 3)
1399 break;
1400 }
1401 else if (c == '\\') {
1402 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 c = tok_nextc(tok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001404 if (c == EOF) {
Skip Montanaro118ec702002-08-15 01:20:16 +00001405 tok->done = E_EOLS;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001406 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001407 return ERRORTOKEN;
1408 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001409 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001410 else
1411 tripcount = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001412 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001413 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001414 *p_end = tok->cur;
1415 return STRING;
1416 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001417
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 /* Line continuation */
1419 if (c == '\\') {
1420 c = tok_nextc(tok);
1421 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001422 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001423 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 return ERRORTOKEN;
1425 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001426 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001427 goto again; /* Read next line */
1428 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001429
Guido van Rossumfbab9051991-10-20 20:25:03 +00001430 /* Check for two-character token */
1431 {
1432 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001433 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001434 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001435 int c3 = tok_nextc(tok);
1436 int token3 = PyToken_ThreeChars(c, c2, c3);
1437 if (token3 != OP) {
1438 token = token3;
1439 } else {
1440 tok_backup(tok, c3);
1441 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001442 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001443 *p_end = tok->cur;
1444 return token;
1445 }
1446 tok_backup(tok, c2);
1447 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001448
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001449 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001450 switch (c) {
1451 case '(':
1452 case '[':
1453 case '{':
1454 tok->level++;
1455 break;
1456 case ')':
1457 case ']':
1458 case '}':
1459 tok->level--;
1460 break;
1461 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001462
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001464 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001465 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001466 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001467}
1468
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001469int
1470PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1471{
1472 int result = tok_get(tok, p_start, p_end);
1473 if (tok->decoding_erred) {
1474 result = ERRORTOKEN;
1475 tok->done = E_DECODE;
1476 }
1477 return result;
1478}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001479
#ifdef Py_DEBUG

/* Debugging aid: print the name of the token type to stdout, followed by
   the token text in parentheses for NAME, NUMBER, STRING and OP tokens.
   start and end delimit the token text. */

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif