blob: 15e8185f8f6907950f04e712c37af77b4725deb5 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True when C may start an identifier: an ASCII letter, an underscore,
   or any value >= 128.
   Every use of the argument is parenthesized so that an expression such
   as `x & 0x7F` is tested as a whole.  The argument is still evaluated
   several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True when C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128.
   Every use of the argument is parenthesized so that an expression such
   as `x & 0x7F` is tested as a whole.  The argument is still evaluated
   several times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* PyOS_Readline returns a malloc'ed string including the trailing \n,
   an empty malloc'ed string for EOF, or NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined further down in this file */
static void tok_backup(struct tok_state *tok, int c);
static int tok_nextc(struct tok_state *tok);
static struct tok_state *tok_new(void);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
    /*  4 */ "NEWLINE", "INDENT", "DEDENT",
    /*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
    /* 11 */ "COLON", "COMMA", "SEMI",
    /* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
    /* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
    /* 22 */ "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE",
    /* 27 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR",
    /* 36 */ "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL",
    /* 43 */ "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL",
    /* 46 */ "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL",
    /* 49 */ "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP",
    /* 53 */ "<ERRORTOKEN>",
    /* 54 */ "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->tabsize = TABSIZE;
123 tok->indent = 0;
124 tok->indstack[0] = 0;
125 tok->atbol = 1;
126 tok->pendin = 0;
127 tok->prompt = tok->nextprompt = NULL;
128 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000129 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000130 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000135 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000138 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000142 tok->decoding_readline = NULL;
143 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000144#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000145 return tok;
146}
147
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000148#ifdef PGEN
149
150static char *
151decoding_fgets(char *s, int size, struct tok_state *tok)
152{
153 return fgets(s, size, tok->fp);
154}
155
156static int
157decoding_feof(struct tok_state *tok)
158{
159 return feof(tok->fp);
160}
161
/* PGEN build: no decoding is performed; STR is handed back untouched
   and TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
167
168#else /* PGEN */
169
170static char *
171error_ret(struct tok_state *tok) /* XXX */
172{
173 tok->decoding_erred = 1;
174 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000175 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176 tok->buf = NULL;
177 return NULL; /* as if it were EOF */
178}
179
180static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000181new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000182{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000183 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000184 if (result != NULL) {
185 memcpy(result, s, len);
186 result[len] = '\0';
187 }
188 return result;
189}
190
/* Map the common spellings of utf-8 and latin-1 to one canonical name.

   Up to the first 12 characters of S are lower-cased, with '_' turned
   into '-', and compared with the known aliases.  Returns the literal
   "utf-8" or "iso-8859-1" on a match; otherwise returns S itself, so a
   caller can detect "no change" by comparing pointers. */
static char *
get_normal_name(char *s)
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing tolower() a negative value
           (a high-bit byte where plain char is signed) is undefined. */
        int c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    if (strcmp(buf, "latin-1") == 0 ||
        strcmp(buf, "iso-8859-1") == 0 ||
        strcmp(buf, "iso-latin-1") == 0 ||
        strncmp(buf, "latin-1-", 8) == 0 ||
        strncmp(buf, "iso-8859-1-", 11) == 0 ||
        strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    return s;
}
213
214/* Return the coding spec in S, or NULL if none is found. */
215
216static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000217get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000218{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000219 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000220 /* Coding spec must be in a comment, and that comment must be
221 * the only statement on the source code line. */
222 for (i = 0; i < size - 6; i++) {
223 if (s[i] == '#')
224 break;
225 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
226 return NULL;
227 }
228 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000229 const char* t = s + i;
230 if (strncmp(t, "coding", 6) == 0) {
231 const char* begin = NULL;
232 t += 6;
233 if (t[0] != ':' && t[0] != '=')
234 continue;
235 do {
236 t++;
237 } while (t[0] == '\x20' || t[0] == '\t');
238
239 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000240 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000241 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000242 t++;
243
244 if (begin < t) {
245 char* r = new_string(begin, t - begin);
246 char* q = get_normal_name(r);
247 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000248 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000249 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 }
251 return r;
252 }
253 }
254 }
255 return NULL;
256}
257
258/* Check whether the line contains a coding spec. If it does,
259 invoke the set_readline function for the new encoding.
260 This function receives the tok_state and the new encoding.
261 Return 1 on success, 0 on failure. */
262
263static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000264check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000265 int set_readline(struct tok_state *, const char *))
266{
Tim Peters17db21f2002-09-03 15:39:58 +0000267 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000268 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000269
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000270 if (tok->cont_line)
271 /* It's a continuation line, so it can't be a coding spec. */
272 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000273 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 if (cs != NULL) {
275 tok->read_coding_spec = 1;
276 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000277 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000278 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000279 tok->encoding = cs;
280 } else {
281 r = set_readline(tok, cs);
282 if (r) {
283 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000286 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000291 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 }
293 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 return r;
301}
302
303/* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
306
307static int
308check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
312{
313 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000314 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000318 ch = get_char(tok);
319 if (ch != 0xBB) {
320 unget_char(ch, tok);
321 unget_char(0xEF, tok);
322 /* any token beginning with '\xEF' is a bad token */
323 return 1;
324 }
325 ch = get_char(tok);
326 if (ch != 0xBF) {
327 unget_char(ch, tok);
328 unget_char(0xBB, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333#if 0
334 /* Disable support for UTF-16 BOMs until a decision
335 is made whether this needs to be supported. */
336 } else if (ch == 0xFE) {
337 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
338 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000339 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340 } else if (ch == 0xFF) {
341 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
342 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000343 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344#endif
345 } else {
346 unget_char(ch, tok);
347 return 1;
348 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000349 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000350 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000352 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000353 return 1;
354}
355
356/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000357 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000359 On entry, tok->decoding_buffer will be one of:
360 1) NULL: need to call tok->decoding_readline to get a new line
361 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
362 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000363 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000364 (in the s buffer) to copy entire contents of the line read
365 by tok->decoding_readline. tok->decoding_buffer has the overflow.
366 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000368 reached): see tok_nextc and its calls to decoding_fgets.
369*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370
371static char *
372fp_readl(char *s, int size, struct tok_state *tok)
373{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000374 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000375 const char *buf;
376 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000377
378 /* Ask for one less byte so we can terminate it */
379 assert(size > 0);
380 size--;
381
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000382 if (tok->decoding_buffer) {
383 bufobj = tok->decoding_buffer;
384 Py_INCREF(bufobj);
385 }
386 else
387 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000388 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
389 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000390 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000391 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000392 if (PyUnicode_CheckExact(bufobj))
393 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000394 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000395 if (buf == NULL) {
396 goto error;
397 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000398 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000399 else
400 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000401 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000402 if (buf == NULL) {
403 goto error;
404 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000405 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000406 }
407
408 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000409 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000410 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000411 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000412 buflen-size);
413 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000414 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000415 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000416 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000417 else
418 tok->decoding_buffer = NULL;
419
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000420 memcpy(s, buf, buflen);
421 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000422 if (buflen == 0) /* EOF */
423 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000424 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000425 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426
427error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000428 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000429 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430}
431
432/* Set the readline function for TOK to a StreamReader's
433 readline function. The StreamReader is named ENC.
434
435 This function is called from check_bom and check_coding_spec.
436
437 ENC is usually identical to the future value of tok->encoding,
438 except for the (currently unsupported) case of UTF-16.
439
440 Return 1 on success, 0 on failure. */
441
442static int
443fp_setreadl(struct tok_state *tok, const char* enc)
444{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000445 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000446
Christian Heimes819b8bf2008-01-03 23:05:47 +0000447 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000448 if (io == NULL)
449 goto cleanup;
450
Brett Cannon8a9583e2008-09-04 05:04:25 +0000451 if (tok->filename)
452 stream = PyObject_CallMethod(io, "open", "ssis",
453 tok->filename, "r", -1, enc);
454 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000455 stream = PyObject_CallMethod(io, "open", "isisOOO",
456 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000457 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000458 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000460 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000461 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000463
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000464 /* The file has been reopened; parsing will restart from
465 * the beginning of the file, we have to reset the line number.
466 * But this function has been called from inside tok_nextc() which
467 * will increment lineno before it returns. So we set it -1 so that
468 * the next call to tok_nextc() will start with tok->lineno == 0.
469 */
470 tok->lineno = -1;
471
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000472 cleanup:
473 Py_XDECREF(stream);
474 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000475 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476}
477
478/* Fetch the next byte from TOK. */
479
480static int fp_getc(struct tok_state *tok) {
481 return getc(tok->fp);
482}
483
484/* Unfetch the last byte back into TOK. */
485
486static void fp_ungetc(int c, struct tok_state *tok) {
487 ungetc(c, tok->fp);
488}
489
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back, so a sequence cut short by the terminator is rejected at the
   NUL and nothing beyond it is read.  Only the byte pattern is checked;
   the decoded value is not range-checked. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* a continuation byte cannot start a sequence */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;

    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}
517
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000518/* Read a line of input from TOK. Determine encoding
519 if necessary. */
520
521static char *
522decoding_fgets(char *s, int size, struct tok_state *tok)
523{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000524 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000525 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000526 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000527 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528 /* We already have a codec associated with
529 this input. */
530 line = fp_readl(s, size, tok);
531 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000532 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000533 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000534 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000535 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000536 break;
537 } else {
538 /* We have not yet determined the encoding.
539 If an encoding is found, use the file-pointer
540 reader functions from now on. */
541 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
542 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000543 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000544 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000545 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
547 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
548 return error_ret(tok);
549 }
550 }
551#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000552 /* The default encoding is UTF-8, so make sure we don't have any
553 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000554 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000556 int length;
557 for (c = (unsigned char *)line; *c; c += length)
558 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000559 badchar = *c;
560 break;
561 }
562 }
563 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000564 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000565 /* Need to add 1 to the line number, since this line
566 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000567 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000568 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000569 "in file %.200s on line %i, "
570 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000571 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000572 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000573 PyErr_SetString(PyExc_SyntaxError, buf);
574 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000575 }
576#endif
577 return line;
578}
579
580static int
581decoding_feof(struct tok_state *tok)
582{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000583 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000584 return feof(tok->fp);
585 } else {
586 PyObject* buf = tok->decoding_buffer;
587 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000588 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589 if (buf == NULL) {
590 error_ret(tok);
591 return 1;
592 } else {
593 tok->decoding_buffer = buf;
594 }
595 }
596 return PyObject_Length(buf) == 0;
597 }
598}
599
600/* Fetch a byte from TOK, using the string buffer. */
601
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000602static int
603buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000604 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000605}
606
607/* Unfetch a byte from TOK, using the string buffer. */
608
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000609static void
610buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000611 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000612 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000613}
614
615/* Set the readline function for TOK to ENC. For the string-based
616 tokenizer, this means to just record the encoding. */
617
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000618static int
619buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000620 tok->enc = enc;
621 return 1;
622}
623
624/* Return a UTF-8 encoding Python string object from the
625 C byte string STR, which is encoded with ENC. */
626
627static PyObject *
628translate_into_utf8(const char* str, const char* enc) {
629 PyObject *utf8;
630 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
631 if (buf == NULL)
632 return NULL;
633 utf8 = PyUnicode_AsUTF8String(buf);
634 Py_DECREF(buf);
635 return utf8;
636}
637
638/* Decode a byte string STR for use as the buffer of TOK.
639 Look for encoding declarations inside STR, and record them
640 inside TOK. */
641
642static const char *
643decode_str(const char *str, struct tok_state *tok)
644{
645 PyObject* utf8 = NULL;
646 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000647 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 int lineno = 0;
649 tok->enc = NULL;
650 tok->str = str;
651 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000652 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000653 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000654 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000655 if (tok->enc != NULL) {
656 utf8 = translate_into_utf8(str, tok->enc);
657 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000658 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000659 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000660 }
661 for (s = str;; s++) {
662 if (*s == '\0') break;
663 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000664 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000665 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000666 lineno++;
667 if (lineno == 2) break;
668 }
669 }
670 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000671 /* need to check line 1 and 2 separately since check_coding_spec
672 assumes a single line as input */
673 if (newl[0]) {
674 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
675 return error_ret(tok);
676 if (tok->enc == NULL && newl[1]) {
677 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
678 tok, buf_setreadl))
679 return error_ret(tok);
680 }
681 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000682 if (tok->enc != NULL) {
683 assert(utf8 == NULL);
684 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000685 if (utf8 == NULL) {
686 PyErr_Format(PyExc_SyntaxError,
687 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000688 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000689 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000690 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000691 }
692 assert(tok->decoding_buffer == NULL);
693 tok->decoding_buffer = utf8; /* CAUTION */
694 return str;
695}
696
697#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000698
699/* Set up tokenizer for string */
700
701struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000702PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000703{
704 struct tok_state *tok = tok_new();
705 if (tok == NULL)
706 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000707 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000708 if (str == NULL) {
709 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000710 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000711 }
712
Martin v. Löwis95292d62002-12-11 14:04:59 +0000713 /* XXX: constify members. */
714 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000715 return tok;
716}
717
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000718struct tok_state *
719PyTokenizer_FromUTF8(const char *str)
720{
721 struct tok_state *tok = tok_new();
722 if (tok == NULL)
723 return NULL;
724 tok->decoding_state = STATE_RAW;
725 tok->read_coding_spec = 1;
726 tok->enc = NULL;
727 tok->str = str;
728 tok->encoding = (char *)PyMem_MALLOC(6);
729 if (!tok->encoding) {
730 PyTokenizer_Free(tok);
731 return NULL;
732 }
733 strcpy(tok->encoding, "utf-8");
734
735 /* XXX: constify members. */
736 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
737 return tok;
738}
739
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000740
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000741/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000742
743struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000744PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000745{
746 struct tok_state *tok = tok_new();
747 if (tok == NULL)
748 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000749 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000750 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000751 return NULL;
752 }
753 tok->cur = tok->inp = tok->buf;
754 tok->end = tok->buf + BUFSIZ;
755 tok->fp = fp;
756 tok->prompt = ps1;
757 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000758 if (enc != NULL) {
759 /* Must copy encoding declaration since it
760 gets copied into the parse tree. */
761 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
762 if (!tok->encoding) {
763 PyTokenizer_Free(tok);
764 return NULL;
765 }
766 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000767 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000768 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769 return tok;
770}
771
772
773/* Free a tok_state structure */
774
775void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000776PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000777{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000778 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000779 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000780#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000781 Py_XDECREF(tok->decoding_readline);
782 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000783#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000784 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000785 PyMem_FREE(tok->buf);
786 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787}
788
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000789/* Get next char, updating state; error code goes into tok->done */
790
791static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000792tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000793{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000794 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000795 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000796 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000797 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000798 if (tok->done != E_OK)
799 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000800 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000801 char *end = strchr(tok->inp, '\n');
802 if (end != NULL)
803 end++;
804 else {
805 end = strchr(tok->inp, '\0');
806 if (end == tok->inp) {
807 tok->done = E_EOF;
808 return EOF;
809 }
810 }
811 if (tok->start == NULL)
812 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000813 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000814 tok->lineno++;
815 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000816 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000817 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000819 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000820#ifndef PGEN
821 if (tok->encoding && newtok && *newtok) {
822 /* Recode to UTF-8 */
823 Py_ssize_t buflen;
824 const char* buf;
825 PyObject *u = translate_into_utf8(newtok, tok->encoding);
826 PyMem_FREE(newtok);
827 if (!u) {
828 tok->done = E_DECODE;
829 return EOF;
830 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000831 buflen = PyBytes_GET_SIZE(u);
832 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000833 if (!buf) {
834 Py_DECREF(u);
835 tok->done = E_DECODE;
836 return EOF;
837 }
838 newtok = PyMem_MALLOC(buflen+1);
839 strcpy(newtok, buf);
840 Py_DECREF(u);
841 }
842#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000843 if (tok->nextprompt != NULL)
844 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000845 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000846 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000847 else if (*newtok == '\0') {
848 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000849 tok->done = E_EOF;
850 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000851 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000852 size_t start = tok->start - tok->buf;
853 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000854 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000855 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000856 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000857 tok->lineno++;
858 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000859 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000860 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000861 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000862 tok->done = E_NOMEM;
863 return EOF;
864 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000865 tok->buf = buf;
866 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000867 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000868 strcpy(tok->buf + oldlen, newtok);
869 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000870 tok->inp = tok->buf + newlen;
871 tok->end = tok->inp + 1;
872 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000873 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000874 else {
875 tok->lineno++;
876 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000877 PyMem_FREE(tok->buf);
878 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000879 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000880 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000881 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000882 tok->inp = strchr(tok->buf, '\0');
883 tok->end = tok->inp + 1;
884 }
885 }
886 else {
887 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000888 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000889 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000890 if (tok->start == NULL) {
891 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000892 tok->buf = (char *)
893 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000894 if (tok->buf == NULL) {
895 tok->done = E_NOMEM;
896 return EOF;
897 }
898 tok->end = tok->buf + BUFSIZ;
899 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000900 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
901 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000902 tok->done = E_EOF;
903 done = 1;
904 }
905 else {
906 tok->done = E_OK;
907 tok->inp = strchr(tok->buf, '\0');
908 done = tok->inp[-1] == '\n';
909 }
910 }
911 else {
912 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000913 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000914 tok->done = E_EOF;
915 done = 1;
916 }
917 else
918 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000919 }
920 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000921 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000922 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000923 Py_ssize_t curstart = tok->start == NULL ? -1 :
924 tok->start - tok->buf;
925 Py_ssize_t curvalid = tok->inp - tok->buf;
926 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000927 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000928 newbuf = (char *)PyMem_REALLOC(newbuf,
929 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000930 if (newbuf == NULL) {
931 tok->done = E_NOMEM;
932 tok->cur = tok->inp;
933 return EOF;
934 }
935 tok->buf = newbuf;
936 tok->inp = tok->buf + curvalid;
937 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000938 tok->start = curstart < 0 ? NULL :
939 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000940 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000941 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000942 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000943 /* Break out early on decoding
944 errors, as tok->buf will be NULL
945 */
946 if (tok->decoding_erred)
947 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000948 /* Last line does not end in \n,
949 fake one */
950 strcpy(tok->inp, "\n");
951 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000952 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000953 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000954 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000955 if (tok->buf != NULL) {
956 tok->cur = tok->buf + cur;
957 tok->line_start = tok->cur;
958 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000959 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000960 pt = tok->inp - 2;
961 if (pt >= tok->buf && *pt == '\r') {
962 *pt++ = '\n';
963 *pt = '\0';
964 tok->inp = pt;
965 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000966 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000967 }
968 if (tok->done != E_OK) {
969 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000970 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000971 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000972 return EOF;
973 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000974 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000975 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000976}
977
978
979/* Back-up one character */
980
981static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000982tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000983{
984 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000985 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000986 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000987 if (*tok->cur != c)
988 *tok->cur = c;
989 }
990}
991
992
993/* Return the token corresponding to a single character */
994
995int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000996PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000997{
998 switch (c) {
999 case '(': return LPAR;
1000 case ')': return RPAR;
1001 case '[': return LSQB;
1002 case ']': return RSQB;
1003 case ':': return COLON;
1004 case ',': return COMMA;
1005 case ';': return SEMI;
1006 case '+': return PLUS;
1007 case '-': return MINUS;
1008 case '*': return STAR;
1009 case '/': return SLASH;
1010 case '|': return VBAR;
1011 case '&': return AMPER;
1012 case '<': return LESS;
1013 case '>': return GREATER;
1014 case '=': return EQUAL;
1015 case '.': return DOT;
1016 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001017 case '{': return LBRACE;
1018 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001019 case '^': return CIRCUMFLEX;
1020 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001021 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001022 default: return OP;
1023 }
1024}
1025
1026
Guido van Rossumfbab9051991-10-20 20:25:03 +00001027int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001028PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001029{
1030 switch (c1) {
1031 case '=':
1032 switch (c2) {
1033 case '=': return EQEQUAL;
1034 }
1035 break;
1036 case '!':
1037 switch (c2) {
1038 case '=': return NOTEQUAL;
1039 }
1040 break;
1041 case '<':
1042 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001043 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001044 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001045 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001046 }
1047 break;
1048 case '>':
1049 switch (c2) {
1050 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001051 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001052 }
1053 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001054 case '+':
1055 switch (c2) {
1056 case '=': return PLUSEQUAL;
1057 }
1058 break;
1059 case '-':
1060 switch (c2) {
1061 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001062 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001063 }
1064 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001065 case '*':
1066 switch (c2) {
1067 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001068 case '=': return STAREQUAL;
1069 }
1070 break;
1071 case '/':
1072 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001073 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001074 case '=': return SLASHEQUAL;
1075 }
1076 break;
1077 case '|':
1078 switch (c2) {
1079 case '=': return VBAREQUAL;
1080 }
1081 break;
1082 case '%':
1083 switch (c2) {
1084 case '=': return PERCENTEQUAL;
1085 }
1086 break;
1087 case '&':
1088 switch (c2) {
1089 case '=': return AMPEREQUAL;
1090 }
1091 break;
1092 case '^':
1093 switch (c2) {
1094 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001095 }
1096 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001097 }
1098 return OP;
1099}
1100
Thomas Wouters434d0822000-08-24 20:11:32 +00001101int
1102PyToken_ThreeChars(int c1, int c2, int c3)
1103{
1104 switch (c1) {
1105 case '<':
1106 switch (c2) {
1107 case '<':
1108 switch (c3) {
1109 case '=':
1110 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001111 }
1112 break;
1113 }
1114 break;
1115 case '>':
1116 switch (c2) {
1117 case '>':
1118 switch (c3) {
1119 case '=':
1120 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001121 }
1122 break;
1123 }
1124 break;
1125 case '*':
1126 switch (c2) {
1127 case '*':
1128 switch (c3) {
1129 case '=':
1130 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001131 }
1132 break;
1133 }
1134 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001135 case '/':
1136 switch (c2) {
1137 case '/':
1138 switch (c3) {
1139 case '=':
1140 return DOUBLESLASHEQUAL;
1141 }
1142 break;
1143 }
1144 break;
Georg Brandldde00282007-03-18 19:01:53 +00001145 case '.':
1146 switch (c2) {
1147 case '.':
1148 switch (c3) {
1149 case '.':
1150 return ELLIPSIS;
1151 }
1152 break;
1153 }
1154 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001155 }
1156 return OP;
1157}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001158
Guido van Rossum926f13a1998-04-09 21:38:06 +00001159static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001160indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001161{
1162 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001163 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001164 tok->cur = tok->inp;
1165 return 1;
1166 }
1167 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001168 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1169 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001170 tok->altwarning = 0;
1171 }
1172 return 0;
1173}
1174
Martin v. Löwis47383402007-08-15 07:32:56 +00001175#ifdef PGEN
1176#define verify_identifier(s,e) 1
1177#else
1178/* Verify that the identifier follows PEP 3131. */
1179static int
1180verify_identifier(char *start, char *end)
1181{
Guido van Rossume3e37012007-08-29 18:54:41 +00001182 PyObject *s;
1183 int result;
1184 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1185 if (s == NULL) {
1186 PyErr_Clear();
1187 return 0;
1188 }
1189 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001190 Py_DECREF(s);
1191 return result;
1192}
1193#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001194
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001195/* Get next token, after space stripping etc. */
1196
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001197static int
1198tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001199{
1200 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001201 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001202
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001203 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001204 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001205 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001206 blankline = 0;
1207
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001208 /* Get indentation level */
1209 if (tok->atbol) {
1210 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001211 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001212 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001213 for (;;) {
1214 c = tok_nextc(tok);
1215 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001216 col++, altcol++;
1217 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001218 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001219 altcol = (altcol/tok->alttabsize + 1)
1220 * tok->alttabsize;
1221 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001222 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001223 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001224 else
1225 break;
1226 }
1227 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001228 if (c == '#' || c == '\n') {
1229 /* Lines with only whitespace and/or comments
1230 shouldn't affect the indentation and are
1231 not passed to the parser as NEWLINE tokens,
1232 except *totally* empty lines in interactive
1233 mode, which signal the end of a command group. */
1234 if (col == 0 && c == '\n' && tok->prompt != NULL)
1235 blankline = 0; /* Let it through */
1236 else
1237 blankline = 1; /* Ignore completely */
1238 /* We can't jump back right here since we still
1239 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001240 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001241 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001242 if (col == tok->indstack[tok->indent]) {
1243 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001244 if (altcol != tok->altindstack[tok->indent]) {
1245 if (indenterror(tok))
1246 return ERRORTOKEN;
1247 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001248 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001249 else if (col > tok->indstack[tok->indent]) {
1250 /* Indent -- always one */
1251 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001252 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001253 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001254 return ERRORTOKEN;
1255 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001256 if (altcol <= tok->altindstack[tok->indent]) {
1257 if (indenterror(tok))
1258 return ERRORTOKEN;
1259 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001260 tok->pendin++;
1261 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001262 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001263 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001264 else /* col < tok->indstack[tok->indent] */ {
1265 /* Dedent -- any number, must be consistent */
1266 while (tok->indent > 0 &&
1267 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001268 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001269 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001270 }
1271 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001272 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001273 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001274 return ERRORTOKEN;
1275 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001276 if (altcol != tok->altindstack[tok->indent]) {
1277 if (indenterror(tok))
1278 return ERRORTOKEN;
1279 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 }
1281 }
1282 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001283
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001284 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001285
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 /* Return pending indents/dedents */
1287 if (tok->pendin != 0) {
1288 if (tok->pendin < 0) {
1289 tok->pendin++;
1290 return DEDENT;
1291 }
1292 else {
1293 tok->pendin--;
1294 return INDENT;
1295 }
1296 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001297
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001299 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300 /* Skip spaces */
1301 do {
1302 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001303 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001304
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001305 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001306 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001307
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001308 /* Skip comment */
1309 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001310 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001311 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001312
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001314 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001315 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001316 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001317
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001318 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001319 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001320 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001321 /* Process b"", r"" and br"" */
1322 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001323 c = tok_nextc(tok);
1324 if (c == '"' || c == '\'')
1325 goto letter_quote;
1326 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001327 if (c == 'r' || c == 'R') {
1328 c = tok_nextc(tok);
1329 if (c == '"' || c == '\'')
1330 goto letter_quote;
1331 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001332 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001333 if (c >= 128)
1334 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001335 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001336 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001337 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001338 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001339 !verify_identifier(tok->start, tok->cur)) {
1340 tok->done = E_IDENTIFIER;
1341 return ERRORTOKEN;
1342 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001343 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 *p_end = tok->cur;
1345 return NAME;
1346 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001347
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001348 /* Newline */
1349 if (c == '\n') {
1350 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001351 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001352 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001353 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001355 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001356 return NEWLINE;
1357 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001358
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001359 /* Period or number starting with period? */
1360 if (c == '.') {
1361 c = tok_nextc(tok);
1362 if (isdigit(c)) {
1363 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001364 } else if (c == '.') {
1365 c = tok_nextc(tok);
1366 if (c == '.') {
1367 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001368 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001369 return ELLIPSIS;
1370 } else {
1371 tok_backup(tok, c);
1372 }
1373 tok_backup(tok, '.');
1374 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001375 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001376 }
Georg Brandldde00282007-03-18 19:01:53 +00001377 *p_start = tok->start;
1378 *p_end = tok->cur;
1379 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001380 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001381
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001382 /* Number */
1383 if (isdigit(c)) {
1384 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001385 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001386 c = tok_nextc(tok);
1387 if (c == '.')
1388 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001389#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001390 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001391 goto imaginary;
1392#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001393 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001394
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001395 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001396 c = tok_nextc(tok);
1397 if (!isxdigit(c)) {
1398 tok->done = E_TOKEN;
1399 tok_backup(tok, c);
1400 return ERRORTOKEN;
1401 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 do {
1403 c = tok_nextc(tok);
1404 } while (isxdigit(c));
1405 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001406 else if (c == 'o' || c == 'O') {
1407 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001408 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001409 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001410 tok->done = E_TOKEN;
1411 tok_backup(tok, c);
1412 return ERRORTOKEN;
1413 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001414 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001416 } while ('0' <= c && c < '8');
1417 }
1418 else if (c == 'b' || c == 'B') {
1419 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001420 c = tok_nextc(tok);
1421 if (c != '0' && c != '1') {
1422 tok->done = E_TOKEN;
1423 tok_backup(tok, c);
1424 return ERRORTOKEN;
1425 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001426 do {
1427 c = tok_nextc(tok);
1428 } while (c == '0' || c == '1');
1429 }
1430 else {
1431 int nonzero = 0;
1432 /* maybe old-style octal; c is first char of it */
1433 /* in any case, allow '0' as a literal */
1434 while (c == '0')
1435 c = tok_nextc(tok);
1436 while (isdigit(c)) {
1437 nonzero = 1;
1438 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001439 }
1440 if (c == '.')
1441 goto fraction;
1442 else if (c == 'e' || c == 'E')
1443 goto exponent;
1444#ifndef WITHOUT_COMPLEX
1445 else if (c == 'j' || c == 'J')
1446 goto imaginary;
1447#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001448 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001449 tok->done = E_TOKEN;
1450 tok_backup(tok, c);
1451 return ERRORTOKEN;
1452 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001453 }
1454 }
1455 else {
1456 /* Decimal */
1457 do {
1458 c = tok_nextc(tok);
1459 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001460 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001461 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001462 if (c == '.') {
1463 fraction:
1464 /* Fraction */
1465 do {
1466 c = tok_nextc(tok);
1467 } while (isdigit(c));
1468 }
1469 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001470 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001471 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001472 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001473 if (c == '+' || c == '-')
1474 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001475 if (!isdigit(c)) {
1476 tok->done = E_TOKEN;
1477 tok_backup(tok, c);
1478 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001479 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001480 do {
1481 c = tok_nextc(tok);
1482 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001483 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001484#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001485 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001486 /* Imaginary part */
1487 imaginary:
1488 c = tok_nextc(tok);
1489#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001490 }
1491 }
1492 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001493 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001494 *p_end = tok->cur;
1495 return NUMBER;
1496 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001497
1498 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001499 /* String */
1500 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001501 int quote = c;
1502 int quote_size = 1; /* 1 or 3 */
1503 int end_quote_size = 0;
1504
1505 /* Find the quote size and start of string */
1506 c = tok_nextc(tok);
1507 if (c == quote) {
1508 c = tok_nextc(tok);
1509 if (c == quote)
1510 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001511 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001512 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001513 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001514 if (c != quote)
1515 tok_backup(tok, c);
1516
1517 /* Get rest of string */
1518 while (end_quote_size != quote_size) {
1519 c = tok_nextc(tok);
1520 if (c == EOF) {
1521 if (quote_size == 3)
1522 tok->done = E_EOFS;
1523 else
1524 tok->done = E_EOLS;
1525 tok->cur = tok->inp;
1526 return ERRORTOKEN;
1527 }
1528 if (quote_size == 1 && c == '\n') {
1529 tok->done = E_EOLS;
1530 tok->cur = tok->inp;
1531 return ERRORTOKEN;
1532 }
1533 if (c == quote)
1534 end_quote_size += 1;
1535 else {
1536 end_quote_size = 0;
1537 if (c == '\\')
1538 c = tok_nextc(tok); /* skip escaped char */
1539 }
1540 }
1541
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001542 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001543 *p_end = tok->cur;
1544 return STRING;
1545 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001546
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001547 /* Line continuation */
1548 if (c == '\\') {
1549 c = tok_nextc(tok);
1550 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001551 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001552 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001553 return ERRORTOKEN;
1554 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001555 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001556 goto again; /* Read next line */
1557 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001558
Guido van Rossumfbab9051991-10-20 20:25:03 +00001559 /* Check for two-character token */
1560 {
1561 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001562 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001563 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001564 int c3 = tok_nextc(tok);
1565 int token3 = PyToken_ThreeChars(c, c2, c3);
1566 if (token3 != OP) {
1567 token = token3;
1568 } else {
1569 tok_backup(tok, c3);
1570 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001571 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001572 *p_end = tok->cur;
1573 return token;
1574 }
1575 tok_backup(tok, c2);
1576 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001577
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001578 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001579 switch (c) {
1580 case '(':
1581 case '[':
1582 case '{':
1583 tok->level++;
1584 break;
1585 case ')':
1586 case ']':
1587 case '}':
1588 tok->level--;
1589 break;
1590 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001591
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001592 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001593 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001594 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001595 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001596}
1597
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001598int
1599PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1600{
1601 int result = tok_get(tok, p_start, p_end);
1602 if (tok->decoding_erred) {
1603 result = ERRORTOKEN;
1604 tok->done = E_DECODE;
1605 }
1606 return result;
1607}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001608
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001609/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001610
1611 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001612 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001613 should be assumed to be PyUnicode_GetDefaultEncoding()).
1614
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001615 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1616 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001617*/
1618char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001619PyTokenizer_FindEncoding(int fd)
1620{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001621 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001622 FILE *fp;
1623 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001624
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001625 fd = dup(fd);
1626 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001627 return NULL;
1628 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001629 fp = fdopen(fd, "r");
1630 if (fp == NULL) {
1631 return NULL;
1632 }
1633 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1634 if (tok == NULL) {
1635 fclose(fp);
1636 return NULL;
1637 }
1638 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001639 PyTokenizer_Get(tok, &p_start, &p_end);
1640 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001641 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001642 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001643 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001644 if (encoding)
1645 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001646 }
1647 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001648 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001649}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001650
Guido van Rossum408027e1996-12-30 16:17:54 +00001651#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001652
1653void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001654tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001655{
Guido van Rossum86bea461997-04-29 21:03:06 +00001656 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001657 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1658 printf("(%.*s)", (int)(end - start), start);
1659}
1660
1661#endif