blob: c1a60146ccabc7cdd5ae2806c3f4cb1c7c60b79a [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier: an ASCII letter, an underscore,
   or any value of 128 or above.  C is expanded several times, so it must
   not have side effects; it is parenthesized so that an arbitrary
   expression can be passed safely. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value of 128 or above.  C is expanded several
   times, so it must not have side effects; it is parenthesized so that an
   arbitrary expression can be passed safely. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT",
    "RARROW",
    "ELLIPSIS",

    /* pseudo tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->tabsize = TABSIZE;
123 tok->indent = 0;
124 tok->indstack[0] = 0;
125 tok->atbol = 1;
126 tok->pendin = 0;
127 tok->prompt = tok->nextprompt = NULL;
128 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000129 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000130 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000135 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000138 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000142 tok->decoding_readline = NULL;
143 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000144#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000145 return tok;
146}
147
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000148#ifdef PGEN
149
150static char *
151decoding_fgets(char *s, int size, struct tok_state *tok)
152{
153 return fgets(s, size, tok->fp);
154}
155
156static int
157decoding_feof(struct tok_state *tok)
158{
159 return feof(tok->fp);
160}
161
/* PGEN build: no decoding is performed; STR is handed back unchanged and
   TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
167
168#else /* PGEN */
169
170static char *
171error_ret(struct tok_state *tok) /* XXX */
172{
173 tok->decoding_erred = 1;
174 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000175 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176 tok->buf = NULL;
177 return NULL; /* as if it were EOF */
178}
179
180static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000181new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000182{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000183 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000184 if (result != NULL) {
185 memcpy(result, s, len);
186 result[len] = '\0';
187 }
188 return result;
189}
190
/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   the name (or the name followed by '-') matches one of the known
   spellings; otherwise returns S itself, unchanged. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for unsigned char values and
               EOF; plain char may be negative for bytes >= 0x80. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
219
220/* Return the coding spec in S, or NULL if none is found. */
221
222static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000224{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000226 /* Coding spec must be in a comment, and that comment must be
227 * the only statement on the source code line. */
228 for (i = 0; i < size - 6; i++) {
229 if (s[i] == '#')
230 break;
231 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
232 return NULL;
233 }
234 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 const char* t = s + i;
236 if (strncmp(t, "coding", 6) == 0) {
237 const char* begin = NULL;
238 t += 6;
239 if (t[0] != ':' && t[0] != '=')
240 continue;
241 do {
242 t++;
243 } while (t[0] == '\x20' || t[0] == '\t');
244
245 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000246 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000247 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000248 t++;
249
250 if (begin < t) {
251 char* r = new_string(begin, t - begin);
252 char* q = get_normal_name(r);
253 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000254 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000255 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 }
257 return r;
258 }
259 }
260 }
261 return NULL;
262}
263
264/* Check whether the line contains a coding spec. If it does,
265 invoke the set_readline function for the new encoding.
266 This function receives the tok_state and the new encoding.
267 Return 1 on success, 0 on failure. */
268
269static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000270check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000271 int set_readline(struct tok_state *, const char *))
272{
Tim Peters17db21f2002-09-03 15:39:58 +0000273 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000275
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000276 if (tok->cont_line)
277 /* It's a continuation line, so it can't be a coding spec. */
278 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000279 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000280 if (cs != NULL) {
281 tok->read_coding_spec = 1;
282 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000283 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000284 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 tok->encoding = cs;
286 } else {
287 r = set_readline(tok, cs);
288 if (r) {
289 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000290 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000291 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000292 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 } else { /* then, compare cs with BOM */
296 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000298 }
299 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000300 if (!r) {
301 cs = tok->encoding;
302 if (!cs)
303 cs = "with BOM";
304 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
305 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000306 return r;
307}
308
309/* See whether the file starts with a BOM. If it does,
310 invoke the set_readline function with the new encoding.
311 Return 1 on success, 0 on failure. */
312
313static int
314check_bom(int get_char(struct tok_state *),
315 void unget_char(int, struct tok_state *),
316 int set_readline(struct tok_state *, const char *),
317 struct tok_state *tok)
318{
319 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000320 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000321 if (ch == EOF) {
322 return 1;
323 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000324 ch = get_char(tok);
325 if (ch != 0xBB) {
326 unget_char(ch, tok);
327 unget_char(0xEF, tok);
328 /* any token beginning with '\xEF' is a bad token */
329 return 1;
330 }
331 ch = get_char(tok);
332 if (ch != 0xBF) {
333 unget_char(ch, tok);
334 unget_char(0xBB, tok);
335 unget_char(0xEF, tok);
336 /* any token beginning with '\xEF' is a bad token */
337 return 1;
338 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000339#if 0
340 /* Disable support for UTF-16 BOMs until a decision
341 is made whether this needs to be supported. */
342 } else if (ch == 0xFE) {
Benjamin Peterson8f6713f2009-11-13 02:29:35 +0000343 ch = get_char(tok);
344 if (ch != 0xFF)
345 goto NON_BOM;
346 if (!set_readline(tok, "utf-16-be"))
347 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000348 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000349 } else if (ch == 0xFF) {
Benjamin Peterson8f6713f2009-11-13 02:29:35 +0000350 ch = get_char(tok);
351 if (ch != 0xFE)
352 goto NON_BOM;
353 if (!set_readline(tok, "utf-16-le"))
354 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000355 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000356#endif
357 } else {
358 unget_char(ch, tok);
359 return 1;
360 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000361 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000364 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365 return 1;
366}
367
368/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000369 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000371 On entry, tok->decoding_buffer will be one of:
372 1) NULL: need to call tok->decoding_readline to get a new line
373 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
374 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000375 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 (in the s buffer) to copy entire contents of the line read
377 by tok->decoding_readline. tok->decoding_buffer has the overflow.
378 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000380 reached): see tok_nextc and its calls to decoding_fgets.
381*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000382
383static char *
384fp_readl(char *s, int size, struct tok_state *tok)
385{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000386 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000387 const char *buf;
388 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000389
390 /* Ask for one less byte so we can terminate it */
391 assert(size > 0);
392 size--;
393
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000394 if (tok->decoding_buffer) {
395 bufobj = tok->decoding_buffer;
396 Py_INCREF(bufobj);
397 }
398 else
399 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000400 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
401 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000402 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000403 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000404 if (PyUnicode_CheckExact(bufobj))
405 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000406 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000407 if (buf == NULL) {
408 goto error;
409 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000410 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000411 else
412 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000413 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000414 if (buf == NULL) {
415 goto error;
416 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000417 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000418 }
419
420 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000421 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000422 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000423 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000424 buflen-size);
425 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000427 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000428 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000429 else
430 tok->decoding_buffer = NULL;
431
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000432 memcpy(s, buf, buflen);
433 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000434 if (buflen == 0) /* EOF */
435 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000437 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000438
439error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000440 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000441 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000442}
443
444/* Set the readline function for TOK to a StreamReader's
445 readline function. The StreamReader is named ENC.
446
447 This function is called from check_bom and check_coding_spec.
448
449 ENC is usually identical to the future value of tok->encoding,
450 except for the (currently unsupported) case of UTF-16.
451
452 Return 1 on success, 0 on failure. */
453
454static int
455fp_setreadl(struct tok_state *tok, const char* enc)
456{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000457 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458
Christian Heimes819b8bf2008-01-03 23:05:47 +0000459 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000460 if (io == NULL)
461 goto cleanup;
462
Brett Cannon8a9583e2008-09-04 05:04:25 +0000463 if (tok->filename)
464 stream = PyObject_CallMethod(io, "open", "ssis",
465 tok->filename, "r", -1, enc);
466 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000467 stream = PyObject_CallMethod(io, "open", "isisOOO",
468 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000469 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000470 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000471
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000472 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000473 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000475
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000476 /* The file has been reopened; parsing will restart from
477 * the beginning of the file, we have to reset the line number.
478 * But this function has been called from inside tok_nextc() which
479 * will increment lineno before it returns. So we set it -1 so that
480 * the next call to tok_nextc() will start with tok->lineno == 0.
481 */
482 tok->lineno = -1;
483
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000484 cleanup:
485 Py_XDECREF(stream);
486 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000487 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488}
489
490/* Fetch the next byte from TOK. */
491
492static int fp_getc(struct tok_state *tok) {
493 return getc(tok->fp);
494}
495
496/* Unfetch the last byte back into TOK. */
497
498static void fp_ungetc(int c, struct tok_state *tok) {
499 ungetc(c, tok->fp);
500}
501
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence if yes,
   0 if not.

   Only the shape is checked: the lead byte selects how many
   continuation bytes (0x80..0xBF) must follow.  The continuation bytes
   are examined front to back, so for a NUL-terminated string a truncated
   sequence is rejected at the terminator and nothing beyond it is
   read. */
static int valid_utf8(const unsigned char* s)
{
    int trailing;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        trailing = 1;
    else if (*s < 0xF0)
        trailing = 2;
    else if (*s < 0xF8)
        trailing = 3;
    else
        return 0;
    for (i = 1; i <= trailing; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return trailing + 1;
}
529
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000530/* Read a line of input from TOK. Determine encoding
531 if necessary. */
532
533static char *
534decoding_fgets(char *s, int size, struct tok_state *tok)
535{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000536 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000537 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000538 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000539 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000540 /* We already have a codec associated with
541 this input. */
542 line = fp_readl(s, size, tok);
543 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000544 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000546 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000548 break;
549 } else {
550 /* We have not yet determined the encoding.
551 If an encoding is found, use the file-pointer
552 reader functions from now on. */
553 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
554 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000555 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000556 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000557 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
559 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
560 return error_ret(tok);
561 }
562 }
563#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000564 /* The default encoding is UTF-8, so make sure we don't have any
565 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000566 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000567 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000568 int length;
569 for (c = (unsigned char *)line; *c; c += length)
570 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000571 badchar = *c;
572 break;
573 }
574 }
575 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000576 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000577 /* Need to add 1 to the line number, since this line
578 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000579 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000580 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000581 "in file %.200s on line %i, "
582 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000583 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000584 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000585 PyErr_SetString(PyExc_SyntaxError, buf);
586 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000587 }
588#endif
589 return line;
590}
591
592static int
593decoding_feof(struct tok_state *tok)
594{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000595 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596 return feof(tok->fp);
597 } else {
598 PyObject* buf = tok->decoding_buffer;
599 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000600 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601 if (buf == NULL) {
602 error_ret(tok);
603 return 1;
604 } else {
605 tok->decoding_buffer = buf;
606 }
607 }
608 return PyObject_Length(buf) == 0;
609 }
610}
611
612/* Fetch a byte from TOK, using the string buffer. */
613
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000614static int
615buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000616 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000617}
618
619/* Unfetch a byte from TOK, using the string buffer. */
620
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000621static void
622buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000623 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000624 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625}
626
627/* Set the readline function for TOK to ENC. For the string-based
628 tokenizer, this means to just record the encoding. */
629
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000630static int
631buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632 tok->enc = enc;
633 return 1;
634}
635
636/* Return a UTF-8 encoding Python string object from the
637 C byte string STR, which is encoded with ENC. */
638
639static PyObject *
640translate_into_utf8(const char* str, const char* enc) {
641 PyObject *utf8;
642 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
643 if (buf == NULL)
644 return NULL;
645 utf8 = PyUnicode_AsUTF8String(buf);
646 Py_DECREF(buf);
647 return utf8;
648}
649
650/* Decode a byte string STR for use as the buffer of TOK.
651 Look for encoding declarations inside STR, and record them
652 inside TOK. */
653
654static const char *
655decode_str(const char *str, struct tok_state *tok)
656{
657 PyObject* utf8 = NULL;
658 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000659 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000660 int lineno = 0;
661 tok->enc = NULL;
662 tok->str = str;
663 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000664 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000666 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000667 if (tok->enc != NULL) {
668 utf8 = translate_into_utf8(str, tok->enc);
669 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000670 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000671 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000672 }
673 for (s = str;; s++) {
674 if (*s == '\0') break;
675 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000676 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000677 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000678 lineno++;
679 if (lineno == 2) break;
680 }
681 }
682 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000683 /* need to check line 1 and 2 separately since check_coding_spec
684 assumes a single line as input */
685 if (newl[0]) {
686 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
687 return error_ret(tok);
688 if (tok->enc == NULL && newl[1]) {
689 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
690 tok, buf_setreadl))
691 return error_ret(tok);
692 }
693 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000694 if (tok->enc != NULL) {
695 assert(utf8 == NULL);
696 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Petersond76c8da2009-06-28 17:35:48 +0000697 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000698 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000699 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000700 }
701 assert(tok->decoding_buffer == NULL);
702 tok->decoding_buffer = utf8; /* CAUTION */
703 return str;
704}
705
706#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000707
708/* Set up tokenizer for string */
709
710struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000711PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712{
713 struct tok_state *tok = tok_new();
714 if (tok == NULL)
715 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000716 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000717 if (str == NULL) {
718 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000720 }
721
Martin v. Löwis95292d62002-12-11 14:04:59 +0000722 /* XXX: constify members. */
723 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000724 return tok;
725}
726
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000727struct tok_state *
728PyTokenizer_FromUTF8(const char *str)
729{
730 struct tok_state *tok = tok_new();
731 if (tok == NULL)
732 return NULL;
733 tok->decoding_state = STATE_RAW;
734 tok->read_coding_spec = 1;
735 tok->enc = NULL;
736 tok->str = str;
737 tok->encoding = (char *)PyMem_MALLOC(6);
738 if (!tok->encoding) {
739 PyTokenizer_Free(tok);
740 return NULL;
741 }
742 strcpy(tok->encoding, "utf-8");
743
744 /* XXX: constify members. */
745 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
746 return tok;
747}
748
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000749
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000750/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751
752struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000753PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754{
755 struct tok_state *tok = tok_new();
756 if (tok == NULL)
757 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000758 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000759 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760 return NULL;
761 }
762 tok->cur = tok->inp = tok->buf;
763 tok->end = tok->buf + BUFSIZ;
764 tok->fp = fp;
765 tok->prompt = ps1;
766 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000767 if (enc != NULL) {
768 /* Must copy encoding declaration since it
769 gets copied into the parse tree. */
770 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
771 if (!tok->encoding) {
772 PyTokenizer_Free(tok);
773 return NULL;
774 }
775 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000776 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000777 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000778 return tok;
779}
780
781
782/* Free a tok_state structure */
783
784void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000785PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000786{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000787 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000788 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000789#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000790 Py_XDECREF(tok->decoding_readline);
791 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000792#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000793 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000794 PyMem_FREE(tok->buf);
795 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000796}
797
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000798/* Get next char, updating state; error code goes into tok->done */
799
800static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000801tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000803 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000804 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000805 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000806 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000807 if (tok->done != E_OK)
808 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000810 char *end = strchr(tok->inp, '\n');
811 if (end != NULL)
812 end++;
813 else {
814 end = strchr(tok->inp, '\0');
815 if (end == tok->inp) {
816 tok->done = E_EOF;
817 return EOF;
818 }
819 }
820 if (tok->start == NULL)
821 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000822 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000823 tok->lineno++;
824 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000825 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000826 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000827 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000828 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000829#ifndef PGEN
830 if (tok->encoding && newtok && *newtok) {
831 /* Recode to UTF-8 */
832 Py_ssize_t buflen;
833 const char* buf;
834 PyObject *u = translate_into_utf8(newtok, tok->encoding);
835 PyMem_FREE(newtok);
836 if (!u) {
837 tok->done = E_DECODE;
838 return EOF;
839 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000840 buflen = PyBytes_GET_SIZE(u);
841 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000842 if (!buf) {
843 Py_DECREF(u);
844 tok->done = E_DECODE;
845 return EOF;
846 }
847 newtok = PyMem_MALLOC(buflen+1);
848 strcpy(newtok, buf);
849 Py_DECREF(u);
850 }
851#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852 if (tok->nextprompt != NULL)
853 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000854 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000855 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000856 else if (*newtok == '\0') {
857 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858 tok->done = E_EOF;
859 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000861 size_t start = tok->start - tok->buf;
862 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000863 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000864 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000865 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000866 tok->lineno++;
867 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000868 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000869 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000870 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000871 tok->done = E_NOMEM;
872 return EOF;
873 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 tok->buf = buf;
875 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000876 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000877 strcpy(tok->buf + oldlen, newtok);
878 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000879 tok->inp = tok->buf + newlen;
880 tok->end = tok->inp + 1;
881 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000882 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000883 else {
884 tok->lineno++;
885 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000886 PyMem_FREE(tok->buf);
887 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000888 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000890 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000891 tok->inp = strchr(tok->buf, '\0');
892 tok->end = tok->inp + 1;
893 }
894 }
895 else {
896 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000897 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000898 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000899 if (tok->start == NULL) {
900 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000901 tok->buf = (char *)
902 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000903 if (tok->buf == NULL) {
904 tok->done = E_NOMEM;
905 return EOF;
906 }
907 tok->end = tok->buf + BUFSIZ;
908 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000909 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
910 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000911 tok->done = E_EOF;
912 done = 1;
913 }
914 else {
915 tok->done = E_OK;
916 tok->inp = strchr(tok->buf, '\0');
917 done = tok->inp[-1] == '\n';
918 }
919 }
920 else {
921 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000922 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000923 tok->done = E_EOF;
924 done = 1;
925 }
926 else
927 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000928 }
929 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000930 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000931 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000932 Py_ssize_t curstart = tok->start == NULL ? -1 :
933 tok->start - tok->buf;
934 Py_ssize_t curvalid = tok->inp - tok->buf;
935 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000936 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000937 newbuf = (char *)PyMem_REALLOC(newbuf,
938 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000939 if (newbuf == NULL) {
940 tok->done = E_NOMEM;
941 tok->cur = tok->inp;
942 return EOF;
943 }
944 tok->buf = newbuf;
945 tok->inp = tok->buf + curvalid;
946 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000947 tok->start = curstart < 0 ? NULL :
948 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000949 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000950 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000951 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000952 /* Break out early on decoding
953 errors, as tok->buf will be NULL
954 */
955 if (tok->decoding_erred)
956 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000957 /* Last line does not end in \n,
958 fake one */
959 strcpy(tok->inp, "\n");
960 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000961 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000962 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000963 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000964 if (tok->buf != NULL) {
965 tok->cur = tok->buf + cur;
966 tok->line_start = tok->cur;
967 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000968 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000969 pt = tok->inp - 2;
970 if (pt >= tok->buf && *pt == '\r') {
971 *pt++ = '\n';
972 *pt = '\0';
973 tok->inp = pt;
974 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000975 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000976 }
977 if (tok->done != E_OK) {
978 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000979 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000980 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000981 return EOF;
982 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000983 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000984 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000985}
986
987
988/* Back-up one character */
989
990static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000991tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000992{
993 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000994 if (--tok->cur < tok->buf)
Benjamin Peterson8f6713f2009-11-13 02:29:35 +0000995 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000996 if (*tok->cur != c)
997 *tok->cur = c;
998 }
999}
1000
1001
1002/* Return the token corresponding to a single character */
1003
1004int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001005PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001006{
1007 switch (c) {
1008 case '(': return LPAR;
1009 case ')': return RPAR;
1010 case '[': return LSQB;
1011 case ']': return RSQB;
1012 case ':': return COLON;
1013 case ',': return COMMA;
1014 case ';': return SEMI;
1015 case '+': return PLUS;
1016 case '-': return MINUS;
1017 case '*': return STAR;
1018 case '/': return SLASH;
1019 case '|': return VBAR;
1020 case '&': return AMPER;
1021 case '<': return LESS;
1022 case '>': return GREATER;
1023 case '=': return EQUAL;
1024 case '.': return DOT;
1025 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001026 case '{': return LBRACE;
1027 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001028 case '^': return CIRCUMFLEX;
1029 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001030 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001031 default: return OP;
1032 }
1033}
1034
1035
Guido van Rossumfbab9051991-10-20 20:25:03 +00001036int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001037PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001038{
1039 switch (c1) {
1040 case '=':
1041 switch (c2) {
1042 case '=': return EQEQUAL;
1043 }
1044 break;
1045 case '!':
1046 switch (c2) {
1047 case '=': return NOTEQUAL;
1048 }
1049 break;
1050 case '<':
1051 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001052 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001053 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001054 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001055 }
1056 break;
1057 case '>':
1058 switch (c2) {
1059 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001060 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001061 }
1062 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001063 case '+':
1064 switch (c2) {
1065 case '=': return PLUSEQUAL;
1066 }
1067 break;
1068 case '-':
1069 switch (c2) {
1070 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001071 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001072 }
1073 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001074 case '*':
1075 switch (c2) {
1076 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001077 case '=': return STAREQUAL;
1078 }
1079 break;
1080 case '/':
1081 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001082 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001083 case '=': return SLASHEQUAL;
1084 }
1085 break;
1086 case '|':
1087 switch (c2) {
1088 case '=': return VBAREQUAL;
1089 }
1090 break;
1091 case '%':
1092 switch (c2) {
1093 case '=': return PERCENTEQUAL;
1094 }
1095 break;
1096 case '&':
1097 switch (c2) {
1098 case '=': return AMPEREQUAL;
1099 }
1100 break;
1101 case '^':
1102 switch (c2) {
1103 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001104 }
1105 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001106 }
1107 return OP;
1108}
1109
Thomas Wouters434d0822000-08-24 20:11:32 +00001110int
1111PyToken_ThreeChars(int c1, int c2, int c3)
1112{
1113 switch (c1) {
1114 case '<':
1115 switch (c2) {
1116 case '<':
1117 switch (c3) {
1118 case '=':
1119 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001120 }
1121 break;
1122 }
1123 break;
1124 case '>':
1125 switch (c2) {
1126 case '>':
1127 switch (c3) {
1128 case '=':
1129 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001130 }
1131 break;
1132 }
1133 break;
1134 case '*':
1135 switch (c2) {
1136 case '*':
1137 switch (c3) {
1138 case '=':
1139 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001140 }
1141 break;
1142 }
1143 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001144 case '/':
1145 switch (c2) {
1146 case '/':
1147 switch (c3) {
1148 case '=':
1149 return DOUBLESLASHEQUAL;
1150 }
1151 break;
1152 }
1153 break;
Georg Brandldde00282007-03-18 19:01:53 +00001154 case '.':
1155 switch (c2) {
1156 case '.':
1157 switch (c3) {
1158 case '.':
1159 return ELLIPSIS;
1160 }
1161 break;
1162 }
1163 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001164 }
1165 return OP;
1166}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001167
Guido van Rossum926f13a1998-04-09 21:38:06 +00001168static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001169indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001170{
1171 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001172 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001173 tok->cur = tok->inp;
1174 return 1;
1175 }
1176 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001177 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1178 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001179 tok->altwarning = 0;
1180 }
1181 return 0;
1182}
1183
Martin v. Löwis47383402007-08-15 07:32:56 +00001184#ifdef PGEN
1185#define verify_identifier(s,e) 1
1186#else
1187/* Verify that the identifier follows PEP 3131. */
1188static int
1189verify_identifier(char *start, char *end)
1190{
Guido van Rossume3e37012007-08-29 18:54:41 +00001191 PyObject *s;
1192 int result;
1193 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1194 if (s == NULL) {
1195 PyErr_Clear();
1196 return 0;
1197 }
1198 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001199 Py_DECREF(s);
1200 return result;
1201}
1202#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001203
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001204/* Get next token, after space stripping etc. */
1205
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001206static int
1207tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001208{
1209 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001210 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001211
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001212 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001213 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001214 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001215 blankline = 0;
1216
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 /* Get indentation level */
1218 if (tok->atbol) {
1219 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001220 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001221 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001222 for (;;) {
1223 c = tok_nextc(tok);
1224 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001225 col++, altcol++;
1226 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001227 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001228 altcol = (altcol/tok->alttabsize + 1)
1229 * tok->alttabsize;
1230 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001231 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001232 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 else
1234 break;
1235 }
1236 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001237 if (c == '#' || c == '\n') {
1238 /* Lines with only whitespace and/or comments
1239 shouldn't affect the indentation and are
1240 not passed to the parser as NEWLINE tokens,
1241 except *totally* empty lines in interactive
1242 mode, which signal the end of a command group. */
1243 if (col == 0 && c == '\n' && tok->prompt != NULL)
1244 blankline = 0; /* Let it through */
1245 else
1246 blankline = 1; /* Ignore completely */
1247 /* We can't jump back right here since we still
1248 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001249 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001250 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001251 if (col == tok->indstack[tok->indent]) {
1252 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001253 if (altcol != tok->altindstack[tok->indent]) {
1254 if (indenterror(tok))
1255 return ERRORTOKEN;
1256 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001257 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001258 else if (col > tok->indstack[tok->indent]) {
1259 /* Indent -- always one */
1260 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001261 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001262 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001263 return ERRORTOKEN;
1264 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001265 if (altcol <= tok->altindstack[tok->indent]) {
1266 if (indenterror(tok))
1267 return ERRORTOKEN;
1268 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001269 tok->pendin++;
1270 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001271 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001273 else /* col < tok->indstack[tok->indent] */ {
1274 /* Dedent -- any number, must be consistent */
1275 while (tok->indent > 0 &&
1276 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001277 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001278 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001279 }
1280 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001281 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001282 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001283 return ERRORTOKEN;
1284 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001285 if (altcol != tok->altindstack[tok->indent]) {
1286 if (indenterror(tok))
1287 return ERRORTOKEN;
1288 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 }
1290 }
1291 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001292
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001293 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001294
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001295 /* Return pending indents/dedents */
1296 if (tok->pendin != 0) {
1297 if (tok->pendin < 0) {
1298 tok->pendin++;
1299 return DEDENT;
1300 }
1301 else {
1302 tok->pendin--;
1303 return INDENT;
1304 }
1305 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001306
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001307 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001308 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 /* Skip spaces */
1310 do {
1311 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001312 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001313
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001314 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001315 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001316
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001317 /* Skip comment */
1318 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001319 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001320 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001321
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001323 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001325 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001326
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001328 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001329 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001330 /* Process b"", r"" and br"" */
1331 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001332 c = tok_nextc(tok);
1333 if (c == '"' || c == '\'')
1334 goto letter_quote;
1335 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001336 if (c == 'r' || c == 'R') {
1337 c = tok_nextc(tok);
1338 if (c == '"' || c == '\'')
1339 goto letter_quote;
1340 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001341 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001342 if (c >= 128)
1343 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001345 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001346 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001347 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001348 !verify_identifier(tok->start, tok->cur)) {
1349 tok->done = E_IDENTIFIER;
1350 return ERRORTOKEN;
1351 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001352 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 *p_end = tok->cur;
1354 return NAME;
1355 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001356
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001357 /* Newline */
1358 if (c == '\n') {
1359 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001360 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001361 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001362 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001364 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365 return NEWLINE;
1366 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001367
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001368 /* Period or number starting with period? */
1369 if (c == '.') {
1370 c = tok_nextc(tok);
1371 if (isdigit(c)) {
1372 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001373 } else if (c == '.') {
1374 c = tok_nextc(tok);
1375 if (c == '.') {
1376 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001377 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001378 return ELLIPSIS;
1379 } else {
1380 tok_backup(tok, c);
1381 }
1382 tok_backup(tok, '.');
1383 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001384 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001385 }
Georg Brandldde00282007-03-18 19:01:53 +00001386 *p_start = tok->start;
1387 *p_end = tok->cur;
1388 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001389 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001390
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 /* Number */
1392 if (isdigit(c)) {
1393 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001394 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 c = tok_nextc(tok);
1396 if (c == '.')
1397 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001398#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001399 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001400 goto imaginary;
1401#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001403
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001405 c = tok_nextc(tok);
1406 if (!isxdigit(c)) {
1407 tok->done = E_TOKEN;
1408 tok_backup(tok, c);
1409 return ERRORTOKEN;
1410 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001411 do {
1412 c = tok_nextc(tok);
1413 } while (isxdigit(c));
1414 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001415 else if (c == 'o' || c == 'O') {
1416 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001417 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001418 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001419 tok->done = E_TOKEN;
1420 tok_backup(tok, c);
1421 return ERRORTOKEN;
1422 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001423 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001425 } while ('0' <= c && c < '8');
1426 }
1427 else if (c == 'b' || c == 'B') {
1428 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001429 c = tok_nextc(tok);
1430 if (c != '0' && c != '1') {
1431 tok->done = E_TOKEN;
1432 tok_backup(tok, c);
1433 return ERRORTOKEN;
1434 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001435 do {
1436 c = tok_nextc(tok);
1437 } while (c == '0' || c == '1');
1438 }
1439 else {
1440 int nonzero = 0;
1441 /* maybe old-style octal; c is first char of it */
1442 /* in any case, allow '0' as a literal */
1443 while (c == '0')
1444 c = tok_nextc(tok);
1445 while (isdigit(c)) {
1446 nonzero = 1;
1447 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001448 }
1449 if (c == '.')
1450 goto fraction;
1451 else if (c == 'e' || c == 'E')
1452 goto exponent;
1453#ifndef WITHOUT_COMPLEX
1454 else if (c == 'j' || c == 'J')
1455 goto imaginary;
1456#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001457 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001458 tok->done = E_TOKEN;
1459 tok_backup(tok, c);
1460 return ERRORTOKEN;
1461 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001462 }
1463 }
1464 else {
1465 /* Decimal */
1466 do {
1467 c = tok_nextc(tok);
1468 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001469 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001470 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001471 if (c == '.') {
1472 fraction:
1473 /* Fraction */
1474 do {
1475 c = tok_nextc(tok);
1476 } while (isdigit(c));
1477 }
1478 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001479 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001480 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001481 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001482 if (c == '+' || c == '-')
1483 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001484 if (!isdigit(c)) {
1485 tok->done = E_TOKEN;
1486 tok_backup(tok, c);
1487 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001488 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001489 do {
1490 c = tok_nextc(tok);
1491 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001492 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001493#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001494 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001495 /* Imaginary part */
1496 imaginary:
1497 c = tok_nextc(tok);
1498#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001499 }
1500 }
1501 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001502 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001503 *p_end = tok->cur;
1504 return NUMBER;
1505 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001506
1507 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001508 /* String */
1509 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001510 int quote = c;
1511 int quote_size = 1; /* 1 or 3 */
1512 int end_quote_size = 0;
1513
1514 /* Find the quote size and start of string */
1515 c = tok_nextc(tok);
1516 if (c == quote) {
1517 c = tok_nextc(tok);
1518 if (c == quote)
1519 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001520 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001521 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001522 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001523 if (c != quote)
1524 tok_backup(tok, c);
1525
1526 /* Get rest of string */
1527 while (end_quote_size != quote_size) {
1528 c = tok_nextc(tok);
1529 if (c == EOF) {
1530 if (quote_size == 3)
1531 tok->done = E_EOFS;
1532 else
1533 tok->done = E_EOLS;
1534 tok->cur = tok->inp;
1535 return ERRORTOKEN;
1536 }
1537 if (quote_size == 1 && c == '\n') {
1538 tok->done = E_EOLS;
1539 tok->cur = tok->inp;
1540 return ERRORTOKEN;
1541 }
1542 if (c == quote)
1543 end_quote_size += 1;
1544 else {
1545 end_quote_size = 0;
1546 if (c == '\\')
1547 c = tok_nextc(tok); /* skip escaped char */
1548 }
1549 }
1550
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001551 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001552 *p_end = tok->cur;
1553 return STRING;
1554 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001555
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001556 /* Line continuation */
1557 if (c == '\\') {
1558 c = tok_nextc(tok);
1559 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001560 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001561 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001562 return ERRORTOKEN;
1563 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001564 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001565 goto again; /* Read next line */
1566 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001567
Guido van Rossumfbab9051991-10-20 20:25:03 +00001568 /* Check for two-character token */
1569 {
1570 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001571 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001572 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001573 int c3 = tok_nextc(tok);
1574 int token3 = PyToken_ThreeChars(c, c2, c3);
1575 if (token3 != OP) {
1576 token = token3;
1577 } else {
1578 tok_backup(tok, c3);
1579 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001580 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001581 *p_end = tok->cur;
1582 return token;
1583 }
1584 tok_backup(tok, c2);
1585 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001586
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001587 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001588 switch (c) {
1589 case '(':
1590 case '[':
1591 case '{':
1592 tok->level++;
1593 break;
1594 case ')':
1595 case ']':
1596 case '}':
1597 tok->level--;
1598 break;
1599 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001600
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001601 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001602 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001603 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001604 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001605}
1606
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001607int
1608PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1609{
1610 int result = tok_get(tok, p_start, p_end);
1611 if (tok->decoding_erred) {
1612 result = ERRORTOKEN;
1613 tok->done = E_DECODE;
1614 }
1615 return result;
1616}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001617
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001618/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001619
1620 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001621 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001622 should be assumed to be PyUnicode_GetDefaultEncoding()).
1623
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001624 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1625 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001626*/
1627char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001628PyTokenizer_FindEncoding(int fd)
1629{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001630 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001631 FILE *fp;
1632 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001633
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001634 fd = dup(fd);
1635 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001636 return NULL;
1637 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001638 fp = fdopen(fd, "r");
1639 if (fp == NULL) {
1640 return NULL;
1641 }
1642 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1643 if (tok == NULL) {
1644 fclose(fp);
1645 return NULL;
1646 }
1647 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001648 PyTokenizer_Get(tok, &p_start, &p_end);
1649 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001650 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001651 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001652 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001653 if (encoding)
1654 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001655 }
1656 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001657 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001658}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001659
Guido van Rossum408027e1996-12-30 16:17:54 +00001660#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001661
1662void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001663tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001664{
Guido van Rossum86bea461997-04-29 21:03:06 +00001665 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001666 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1667 printf("(%.*s)", (int)(end - start), start);
1668}
1669
1670#endif