blob: af99deff38cd0af6a3198f3d2682ffb4ae75ce22 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, an underscore, or any
   value of 128 or above.
   The argument is parenthesized at every use so that an expression such as
   `ch & 0x7f` is classified as a whole.  It is still evaluated more than
   once, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value of 128 or above.
   The argument is parenthesized at every use so that an expression such as
   `ch & 0x7f` is classified as a whole.  It is still evaluated more than
   once, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* PyOS_Readline() returns:
     - a malloc'ed string including the trailing \n;
     - an empty malloc'ed string for EOF;
     - NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);
38
/* Initial value of tok->tabsize (see tok_new).
   Don't ever change this -- it would break the portability of Python code. */
#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
/* Forward declarations of file-local helpers */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token, indexed by position in this array.
   The order is significant: this table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* The three entries below close the table. */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->tabsize = TABSIZE;
123 tok->indent = 0;
124 tok->indstack[0] = 0;
125 tok->atbol = 1;
126 tok->pendin = 0;
127 tok->prompt = tok->nextprompt = NULL;
128 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000129 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000130 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000135 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000138 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000139 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000142 tok->decoding_readline = NULL;
143 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000144#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000145 return tok;
146}
147
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000148#ifdef PGEN
149
150static char *
151decoding_fgets(char *s, int size, struct tok_state *tok)
152{
153 return fgets(s, size, tok->fp);
154}
155
156static int
157decoding_feof(struct tok_state *tok)
158{
159 return feof(tok->fp);
160}
161
/* PGEN build: the input string is used as-is; TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    (void)tok;
    return str;
}
167
168#else /* PGEN */
169
170static char *
171error_ret(struct tok_state *tok) /* XXX */
172{
173 tok->decoding_erred = 1;
174 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000175 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176 tok->buf = NULL;
177 return NULL; /* as if it were EOF */
178}
179
180static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000181new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000182{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000183 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000184 if (result != NULL) {
185 memcpy(result, s, len);
186 result[len] = '\0';
187 }
188 return result;
189}
190
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are examined, lower-cased, with '_'
   treated as '-'.  Returns the string literal "utf-8" or "iso-8859-1" when S
   is one of the recognized spellings (optionally followed by "-suffix");
   otherwise returns S itself, so callers can detect "no change" by pointer
   comparison. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() requires a value representable as unsigned char;
               plain char may be negative for bytes >= 0x80. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
219
220/* Return the coding spec in S, or NULL if none is found. */
221
222static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000223get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000224{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000226 /* Coding spec must be in a comment, and that comment must be
227 * the only statement on the source code line. */
228 for (i = 0; i < size - 6; i++) {
229 if (s[i] == '#')
230 break;
231 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
232 return NULL;
233 }
234 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000235 const char* t = s + i;
236 if (strncmp(t, "coding", 6) == 0) {
237 const char* begin = NULL;
238 t += 6;
239 if (t[0] != ':' && t[0] != '=')
240 continue;
241 do {
242 t++;
243 } while (t[0] == '\x20' || t[0] == '\t');
244
245 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000246 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000247 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000248 t++;
249
250 if (begin < t) {
251 char* r = new_string(begin, t - begin);
252 char* q = get_normal_name(r);
253 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000254 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000255 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000256 }
257 return r;
258 }
259 }
260 }
261 return NULL;
262}
263
264/* Check whether the line contains a coding spec. If it does,
265 invoke the set_readline function for the new encoding.
266 This function receives the tok_state and the new encoding.
267 Return 1 on success, 0 on failure. */
268
269static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000270check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000271 int set_readline(struct tok_state *, const char *))
272{
Tim Peters17db21f2002-09-03 15:39:58 +0000273 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000274 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000275
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000276 if (tok->cont_line)
277 /* It's a continuation line, so it can't be a coding spec. */
278 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000279 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000280 if (cs != NULL) {
281 tok->read_coding_spec = 1;
282 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000283 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000284 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 tok->encoding = cs;
286 } else {
287 r = set_readline(tok, cs);
288 if (r) {
289 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000290 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000291 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000292 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 } else { /* then, compare cs with BOM */
296 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000298 }
299 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000300 if (!r) {
301 cs = tok->encoding;
302 if (!cs)
303 cs = "with BOM";
304 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
305 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000306 return r;
307}
308
309/* See whether the file starts with a BOM. If it does,
310 invoke the set_readline function with the new encoding.
311 Return 1 on success, 0 on failure. */
312
313static int
314check_bom(int get_char(struct tok_state *),
315 void unget_char(int, struct tok_state *),
316 int set_readline(struct tok_state *, const char *),
317 struct tok_state *tok)
318{
Victor Stinner151205f2010-03-03 00:22:21 +0000319 int ch1, ch2, ch3;
320 ch1 = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000321 tok->decoding_state = STATE_RAW;
Victor Stinner151205f2010-03-03 00:22:21 +0000322 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 return 1;
Victor Stinner151205f2010-03-03 00:22:21 +0000324 } else if (ch1 == 0xEF) {
325 ch2 = get_char(tok);
326 if (ch2 != 0xBB) {
327 unget_char(ch2, tok);
328 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000329 return 1;
330 }
Victor Stinner151205f2010-03-03 00:22:21 +0000331 ch3 = get_char(tok);
332 if (ch3 != 0xBF) {
333 unget_char(ch3, tok);
334 unget_char(ch2, tok);
335 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000336 return 1;
337 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000338#if 0
339 /* Disable support for UTF-16 BOMs until a decision
340 is made whether this needs to be supported. */
Victor Stinner151205f2010-03-03 00:22:21 +0000341 } else if (ch1 == 0xFE) {
342 ch2 = get_char(tok);
343 if (ch2 != 0xFF) {
344 unget_char(ch2, tok);
345 unget_char(ch1, tok);
346 return 1;
347 }
Benjamin Peterson8f6713f2009-11-13 02:29:35 +0000348 if (!set_readline(tok, "utf-16-be"))
349 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000350 tok->decoding_state = STATE_NORMAL;
Victor Stinner151205f2010-03-03 00:22:21 +0000351 } else if (ch1 == 0xFF) {
352 ch2 = get_char(tok);
353 if (ch2 != 0xFE) {
354 unget_char(ch2, tok);
355 unget_char(ch1, tok);
356 return 1;
357 }
Benjamin Peterson8f6713f2009-11-13 02:29:35 +0000358 if (!set_readline(tok, "utf-16-le"))
359 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000360 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000361#endif
362 } else {
Victor Stinner151205f2010-03-03 00:22:21 +0000363 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000364 return 1;
365 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000366 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000368 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000369 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370 return 1;
371}
372
373/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000374 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 On entry, tok->decoding_buffer will be one of:
377 1) NULL: need to call tok->decoding_readline to get a new line
378 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
379 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000380 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000381 (in the s buffer) to copy entire contents of the line read
382 by tok->decoding_readline. tok->decoding_buffer has the overflow.
383 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000385 reached): see tok_nextc and its calls to decoding_fgets.
386*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000387
388static char *
389fp_readl(char *s, int size, struct tok_state *tok)
390{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000391 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000392 const char *buf;
393 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000394
395 /* Ask for one less byte so we can terminate it */
396 assert(size > 0);
397 size--;
398
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000399 if (tok->decoding_buffer) {
400 bufobj = tok->decoding_buffer;
401 Py_INCREF(bufobj);
402 }
403 else
404 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000405 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
406 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000407 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000408 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000409 if (PyUnicode_CheckExact(bufobj))
410 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000411 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000412 if (buf == NULL) {
413 goto error;
414 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000415 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000416 else
417 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000418 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000419 if (buf == NULL) {
420 goto error;
421 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000422 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000423 }
424
425 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000426 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000427 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000428 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000429 buflen-size);
430 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000431 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000432 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000433 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000434 else
435 tok->decoding_buffer = NULL;
436
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000437 memcpy(s, buf, buflen);
438 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000439 if (buflen == 0) /* EOF */
440 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000441 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000442 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000443
444error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000445 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000446 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000447}
448
449/* Set the readline function for TOK to a StreamReader's
450 readline function. The StreamReader is named ENC.
451
452 This function is called from check_bom and check_coding_spec.
453
454 ENC is usually identical to the future value of tok->encoding,
455 except for the (currently unsupported) case of UTF-16.
456
457 Return 1 on success, 0 on failure. */
458
459static int
460fp_setreadl(struct tok_state *tok, const char* enc)
461{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000462 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000463
Christian Heimes819b8bf2008-01-03 23:05:47 +0000464 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000465 if (io == NULL)
466 goto cleanup;
467
Brett Cannon8a9583e2008-09-04 05:04:25 +0000468 if (tok->filename)
469 stream = PyObject_CallMethod(io, "open", "ssis",
470 tok->filename, "r", -1, enc);
471 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000472 stream = PyObject_CallMethod(io, "open", "isisOOO",
473 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000474 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000475 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000477 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000478 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000479 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000480
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000481 /* The file has been reopened; parsing will restart from
482 * the beginning of the file, we have to reset the line number.
483 * But this function has been called from inside tok_nextc() which
484 * will increment lineno before it returns. So we set it -1 so that
485 * the next call to tok_nextc() will start with tok->lineno == 0.
486 */
487 tok->lineno = -1;
488
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000489 cleanup:
490 Py_XDECREF(stream);
491 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000492 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000493}
494
495/* Fetch the next byte from TOK. */
496
497static int fp_getc(struct tok_state *tok) {
498 return getc(tok->fp);
499}
500
501/* Unfetch the last byte back into TOK. */
502
503static void fp_ungetc(int c, struct tok_state *tok) {
504 ungetc(c, tok->fp);
505}
506
/* Check whether the bytes at s start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   Only the lead byte / continuation byte structure is checked.  S must be
   NUL-terminated: continuation bytes are examined front to back and the scan
   stops at the first byte that is not a continuation byte (which includes
   the terminating NUL), so a sequence truncated by the end of the string
   never causes a read past the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
534
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000535/* Read a line of input from TOK. Determine encoding
536 if necessary. */
537
538static char *
539decoding_fgets(char *s, int size, struct tok_state *tok)
540{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000541 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000542 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000543 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000544 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000545 /* We already have a codec associated with
546 this input. */
547 line = fp_readl(s, size, tok);
548 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000549 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000550 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000551 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000553 break;
554 } else {
555 /* We have not yet determined the encoding.
556 If an encoding is found, use the file-pointer
557 reader functions from now on. */
558 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
559 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000560 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000561 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000562 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
564 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
565 return error_ret(tok);
566 }
567 }
568#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000569 /* The default encoding is UTF-8, so make sure we don't have any
570 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000571 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000573 int length;
574 for (c = (unsigned char *)line; *c; c += length)
575 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000576 badchar = *c;
577 break;
578 }
579 }
580 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000581 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000582 /* Need to add 1 to the line number, since this line
583 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000584 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000585 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000586 "in file %.200s on line %i, "
587 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000588 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000589 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000590 PyErr_SetString(PyExc_SyntaxError, buf);
591 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 }
593#endif
594 return line;
595}
596
597static int
598decoding_feof(struct tok_state *tok)
599{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000600 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601 return feof(tok->fp);
602 } else {
603 PyObject* buf = tok->decoding_buffer;
604 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000605 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606 if (buf == NULL) {
607 error_ret(tok);
608 return 1;
609 } else {
610 tok->decoding_buffer = buf;
611 }
612 }
613 return PyObject_Length(buf) == 0;
614 }
615}
616
617/* Fetch a byte from TOK, using the string buffer. */
618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000619static int
620buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000621 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622}
623
624/* Unfetch a byte from TOK, using the string buffer. */
625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000626static void
627buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000628 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000629 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630}
631
632/* Set the readline function for TOK to ENC. For the string-based
633 tokenizer, this means to just record the encoding. */
634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000635static int
636buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000637 tok->enc = enc;
638 return 1;
639}
640
641/* Return a UTF-8 encoding Python string object from the
642 C byte string STR, which is encoded with ENC. */
643
644static PyObject *
645translate_into_utf8(const char* str, const char* enc) {
646 PyObject *utf8;
647 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
648 if (buf == NULL)
649 return NULL;
650 utf8 = PyUnicode_AsUTF8String(buf);
651 Py_DECREF(buf);
652 return utf8;
653}
654
655/* Decode a byte string STR for use as the buffer of TOK.
656 Look for encoding declarations inside STR, and record them
657 inside TOK. */
658
659static const char *
660decode_str(const char *str, struct tok_state *tok)
661{
662 PyObject* utf8 = NULL;
663 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000664 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665 int lineno = 0;
666 tok->enc = NULL;
667 tok->str = str;
668 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000669 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000670 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000671 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000672 if (tok->enc != NULL) {
673 utf8 = translate_into_utf8(str, tok->enc);
674 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000675 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000676 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000677 }
678 for (s = str;; s++) {
679 if (*s == '\0') break;
680 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000681 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000682 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000683 lineno++;
684 if (lineno == 2) break;
685 }
686 }
687 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000688 /* need to check line 1 and 2 separately since check_coding_spec
689 assumes a single line as input */
690 if (newl[0]) {
691 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
692 return error_ret(tok);
693 if (tok->enc == NULL && newl[1]) {
694 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
695 tok, buf_setreadl))
696 return error_ret(tok);
697 }
698 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699 if (tok->enc != NULL) {
700 assert(utf8 == NULL);
701 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Petersond76c8da2009-06-28 17:35:48 +0000702 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000703 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000704 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000705 }
706 assert(tok->decoding_buffer == NULL);
707 tok->decoding_buffer = utf8; /* CAUTION */
708 return str;
709}
710
711#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712
713/* Set up tokenizer for string */
714
715struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000716PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000717{
718 struct tok_state *tok = tok_new();
719 if (tok == NULL)
720 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000721 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000722 if (str == NULL) {
723 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000724 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000725 }
726
Martin v. Löwis95292d62002-12-11 14:04:59 +0000727 /* XXX: constify members. */
728 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000729 return tok;
730}
731
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000732struct tok_state *
733PyTokenizer_FromUTF8(const char *str)
734{
735 struct tok_state *tok = tok_new();
736 if (tok == NULL)
737 return NULL;
738 tok->decoding_state = STATE_RAW;
739 tok->read_coding_spec = 1;
740 tok->enc = NULL;
741 tok->str = str;
742 tok->encoding = (char *)PyMem_MALLOC(6);
743 if (!tok->encoding) {
744 PyTokenizer_Free(tok);
745 return NULL;
746 }
747 strcpy(tok->encoding, "utf-8");
748
749 /* XXX: constify members. */
750 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
751 return tok;
752}
753
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000755/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000756
757struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000758PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759{
760 struct tok_state *tok = tok_new();
761 if (tok == NULL)
762 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000763 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000764 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000765 return NULL;
766 }
767 tok->cur = tok->inp = tok->buf;
768 tok->end = tok->buf + BUFSIZ;
769 tok->fp = fp;
770 tok->prompt = ps1;
771 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000772 if (enc != NULL) {
773 /* Must copy encoding declaration since it
774 gets copied into the parse tree. */
775 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
776 if (!tok->encoding) {
777 PyTokenizer_Free(tok);
778 return NULL;
779 }
780 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000781 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000782 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783 return tok;
784}
785
786
787/* Free a tok_state structure */
788
789void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000790PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000791{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000792 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000793 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000794#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000795 Py_XDECREF(tok->decoding_readline);
796 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000797#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000798 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000799 PyMem_FREE(tok->buf);
800 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000801}
802
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000803/* Get next char, updating state; error code goes into tok->done */
804
805static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000806tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000808 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000809 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000810 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000811 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000812 if (tok->done != E_OK)
813 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000814 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000815 char *end = strchr(tok->inp, '\n');
816 if (end != NULL)
817 end++;
818 else {
819 end = strchr(tok->inp, '\0');
820 if (end == tok->inp) {
821 tok->done = E_EOF;
822 return EOF;
823 }
824 }
825 if (tok->start == NULL)
826 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000827 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000828 tok->lineno++;
829 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000830 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000831 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000832 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000833 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000834#ifndef PGEN
835 if (tok->encoding && newtok && *newtok) {
836 /* Recode to UTF-8 */
837 Py_ssize_t buflen;
838 const char* buf;
839 PyObject *u = translate_into_utf8(newtok, tok->encoding);
840 PyMem_FREE(newtok);
841 if (!u) {
842 tok->done = E_DECODE;
843 return EOF;
844 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000845 buflen = PyBytes_GET_SIZE(u);
846 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000847 if (!buf) {
848 Py_DECREF(u);
849 tok->done = E_DECODE;
850 return EOF;
851 }
852 newtok = PyMem_MALLOC(buflen+1);
853 strcpy(newtok, buf);
854 Py_DECREF(u);
855 }
856#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857 if (tok->nextprompt != NULL)
858 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000859 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000861 else if (*newtok == '\0') {
862 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000863 tok->done = E_EOF;
864 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000865 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000866 size_t start = tok->start - tok->buf;
867 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000868 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000869 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000870 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000871 tok->lineno++;
872 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000873 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000874 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000875 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000876 tok->done = E_NOMEM;
877 return EOF;
878 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000879 tok->buf = buf;
880 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000881 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000882 strcpy(tok->buf + oldlen, newtok);
883 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000884 tok->inp = tok->buf + newlen;
885 tok->end = tok->inp + 1;
886 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000887 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000888 else {
889 tok->lineno++;
890 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000891 PyMem_FREE(tok->buf);
892 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000893 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000894 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000895 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000896 tok->inp = strchr(tok->buf, '\0');
897 tok->end = tok->inp + 1;
898 }
899 }
900 else {
901 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000902 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000903 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000904 if (tok->start == NULL) {
905 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000906 tok->buf = (char *)
907 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000908 if (tok->buf == NULL) {
909 tok->done = E_NOMEM;
910 return EOF;
911 }
912 tok->end = tok->buf + BUFSIZ;
913 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000914 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
915 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000916 tok->done = E_EOF;
917 done = 1;
918 }
919 else {
920 tok->done = E_OK;
921 tok->inp = strchr(tok->buf, '\0');
922 done = tok->inp[-1] == '\n';
923 }
924 }
925 else {
926 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000927 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000928 tok->done = E_EOF;
929 done = 1;
930 }
931 else
932 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000933 }
934 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000935 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000936 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000937 Py_ssize_t curstart = tok->start == NULL ? -1 :
938 tok->start - tok->buf;
939 Py_ssize_t curvalid = tok->inp - tok->buf;
940 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000941 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000942 newbuf = (char *)PyMem_REALLOC(newbuf,
943 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000944 if (newbuf == NULL) {
945 tok->done = E_NOMEM;
946 tok->cur = tok->inp;
947 return EOF;
948 }
949 tok->buf = newbuf;
950 tok->inp = tok->buf + curvalid;
951 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000952 tok->start = curstart < 0 ? NULL :
953 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000954 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000955 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000956 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000957 /* Break out early on decoding
958 errors, as tok->buf will be NULL
959 */
960 if (tok->decoding_erred)
961 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000962 /* Last line does not end in \n,
963 fake one */
964 strcpy(tok->inp, "\n");
965 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000966 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000967 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000968 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000969 if (tok->buf != NULL) {
970 tok->cur = tok->buf + cur;
971 tok->line_start = tok->cur;
972 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000973 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000974 pt = tok->inp - 2;
975 if (pt >= tok->buf && *pt == '\r') {
976 *pt++ = '\n';
977 *pt = '\0';
978 tok->inp = pt;
979 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000980 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000981 }
982 if (tok->done != E_OK) {
983 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000984 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000985 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000986 return EOF;
987 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000988 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000989 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000990}
991
992
993/* Back-up one character */
994
995static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000996tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000997{
998 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000999 if (--tok->cur < tok->buf)
Benjamin Peterson8f6713f2009-11-13 02:29:35 +00001000 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001001 if (*tok->cur != c)
1002 *tok->cur = c;
1003 }
1004}
1005
1006
1007/* Return the token corresponding to a single character */
1008
1009int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001010PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001011{
1012 switch (c) {
1013 case '(': return LPAR;
1014 case ')': return RPAR;
1015 case '[': return LSQB;
1016 case ']': return RSQB;
1017 case ':': return COLON;
1018 case ',': return COMMA;
1019 case ';': return SEMI;
1020 case '+': return PLUS;
1021 case '-': return MINUS;
1022 case '*': return STAR;
1023 case '/': return SLASH;
1024 case '|': return VBAR;
1025 case '&': return AMPER;
1026 case '<': return LESS;
1027 case '>': return GREATER;
1028 case '=': return EQUAL;
1029 case '.': return DOT;
1030 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001031 case '{': return LBRACE;
1032 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001033 case '^': return CIRCUMFLEX;
1034 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001035 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001036 default: return OP;
1037 }
1038}
1039
1040
Guido van Rossumfbab9051991-10-20 20:25:03 +00001041int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001042PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001043{
1044 switch (c1) {
1045 case '=':
1046 switch (c2) {
1047 case '=': return EQEQUAL;
1048 }
1049 break;
1050 case '!':
1051 switch (c2) {
1052 case '=': return NOTEQUAL;
1053 }
1054 break;
1055 case '<':
1056 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001057 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001058 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001059 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001060 }
1061 break;
1062 case '>':
1063 switch (c2) {
1064 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001065 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001066 }
1067 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001068 case '+':
1069 switch (c2) {
1070 case '=': return PLUSEQUAL;
1071 }
1072 break;
1073 case '-':
1074 switch (c2) {
1075 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001076 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001077 }
1078 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001079 case '*':
1080 switch (c2) {
1081 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001082 case '=': return STAREQUAL;
1083 }
1084 break;
1085 case '/':
1086 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001087 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001088 case '=': return SLASHEQUAL;
1089 }
1090 break;
1091 case '|':
1092 switch (c2) {
1093 case '=': return VBAREQUAL;
1094 }
1095 break;
1096 case '%':
1097 switch (c2) {
1098 case '=': return PERCENTEQUAL;
1099 }
1100 break;
1101 case '&':
1102 switch (c2) {
1103 case '=': return AMPEREQUAL;
1104 }
1105 break;
1106 case '^':
1107 switch (c2) {
1108 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001109 }
1110 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001111 }
1112 return OP;
1113}
1114
Thomas Wouters434d0822000-08-24 20:11:32 +00001115int
1116PyToken_ThreeChars(int c1, int c2, int c3)
1117{
1118 switch (c1) {
1119 case '<':
1120 switch (c2) {
1121 case '<':
1122 switch (c3) {
1123 case '=':
1124 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001125 }
1126 break;
1127 }
1128 break;
1129 case '>':
1130 switch (c2) {
1131 case '>':
1132 switch (c3) {
1133 case '=':
1134 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001135 }
1136 break;
1137 }
1138 break;
1139 case '*':
1140 switch (c2) {
1141 case '*':
1142 switch (c3) {
1143 case '=':
1144 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001145 }
1146 break;
1147 }
1148 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001149 case '/':
1150 switch (c2) {
1151 case '/':
1152 switch (c3) {
1153 case '=':
1154 return DOUBLESLASHEQUAL;
1155 }
1156 break;
1157 }
1158 break;
Georg Brandldde00282007-03-18 19:01:53 +00001159 case '.':
1160 switch (c2) {
1161 case '.':
1162 switch (c3) {
1163 case '.':
1164 return ELLIPSIS;
1165 }
1166 break;
1167 }
1168 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001169 }
1170 return OP;
1171}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001172
Guido van Rossum926f13a1998-04-09 21:38:06 +00001173static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001174indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001175{
1176 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001177 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001178 tok->cur = tok->inp;
1179 return 1;
1180 }
1181 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001182 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1183 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001184 tok->altwarning = 0;
1185 }
1186 return 0;
1187}
1188
Martin v. Löwis47383402007-08-15 07:32:56 +00001189#ifdef PGEN
Victor Stinnerffbc2f62010-03-21 21:48:45 +00001190#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001191#else
1192/* Verify that the identifier follows PEP 3131. */
1193static int
Victor Stinnerffbc2f62010-03-21 21:48:45 +00001194verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001195{
Guido van Rossume3e37012007-08-29 18:54:41 +00001196 PyObject *s;
1197 int result;
Victor Stinnerffbc2f62010-03-21 21:48:45 +00001198 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Guido van Rossume3e37012007-08-29 18:54:41 +00001199 if (s == NULL) {
Victor Stinnerffbc2f62010-03-21 21:48:45 +00001200 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1201 PyErr_Clear();
1202 tok->done = E_IDENTIFIER;
1203 } else {
1204 tok->done = E_ERROR;
1205 }
Guido van Rossume3e37012007-08-29 18:54:41 +00001206 return 0;
1207 }
1208 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001209 Py_DECREF(s);
Victor Stinnerffbc2f62010-03-21 21:48:45 +00001210 if (result == 0)
1211 tok->done = E_IDENTIFIER;
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 return result;
1213}
1214#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001215
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001216/* Get next token, after space stripping etc. */
1217
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001218static int
1219tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001220{
1221 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001222 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001223
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001224 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001225 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001226 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001227 blankline = 0;
1228
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001229 /* Get indentation level */
1230 if (tok->atbol) {
1231 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001232 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001233 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001234 for (;;) {
1235 c = tok_nextc(tok);
1236 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001237 col++, altcol++;
1238 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001239 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001240 altcol = (altcol/tok->alttabsize + 1)
1241 * tok->alttabsize;
1242 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001243 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001244 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001245 else
1246 break;
1247 }
1248 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001249 if (c == '#' || c == '\n') {
1250 /* Lines with only whitespace and/or comments
1251 shouldn't affect the indentation and are
1252 not passed to the parser as NEWLINE tokens,
1253 except *totally* empty lines in interactive
1254 mode, which signal the end of a command group. */
1255 if (col == 0 && c == '\n' && tok->prompt != NULL)
1256 blankline = 0; /* Let it through */
1257 else
1258 blankline = 1; /* Ignore completely */
1259 /* We can't jump back right here since we still
1260 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001262 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001263 if (col == tok->indstack[tok->indent]) {
1264 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001265 if (altcol != tok->altindstack[tok->indent]) {
1266 if (indenterror(tok))
1267 return ERRORTOKEN;
1268 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001270 else if (col > tok->indstack[tok->indent]) {
1271 /* Indent -- always one */
1272 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001273 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001274 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001275 return ERRORTOKEN;
1276 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001277 if (altcol <= tok->altindstack[tok->indent]) {
1278 if (indenterror(tok))
1279 return ERRORTOKEN;
1280 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001281 tok->pendin++;
1282 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001283 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001285 else /* col < tok->indstack[tok->indent] */ {
1286 /* Dedent -- any number, must be consistent */
1287 while (tok->indent > 0 &&
1288 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001289 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001290 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001291 }
1292 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001293 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001294 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001295 return ERRORTOKEN;
1296 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001297 if (altcol != tok->altindstack[tok->indent]) {
1298 if (indenterror(tok))
1299 return ERRORTOKEN;
1300 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001301 }
1302 }
1303 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001304
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001305 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001306
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001307 /* Return pending indents/dedents */
1308 if (tok->pendin != 0) {
1309 if (tok->pendin < 0) {
1310 tok->pendin++;
1311 return DEDENT;
1312 }
1313 else {
1314 tok->pendin--;
1315 return INDENT;
1316 }
1317 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001318
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001319 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001320 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001321 /* Skip spaces */
1322 do {
1323 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001324 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001325
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001327 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001328
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001329 /* Skip comment */
1330 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001331 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001332 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001333
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001335 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001336 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001337 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001338
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001339 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001340 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001341 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001342 /* Process b"", r"" and br"" */
1343 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001344 c = tok_nextc(tok);
1345 if (c == '"' || c == '\'')
1346 goto letter_quote;
1347 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001348 if (c == 'r' || c == 'R') {
1349 c = tok_nextc(tok);
1350 if (c == '"' || c == '\'')
1351 goto letter_quote;
1352 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001353 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001354 if (c >= 128)
1355 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001356 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001357 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001359 if (nonascii &&
Victor Stinnerffbc2f62010-03-21 21:48:45 +00001360 !verify_identifier(tok)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001361 tok->done = E_IDENTIFIER;
1362 return ERRORTOKEN;
1363 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001364 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365 *p_end = tok->cur;
1366 return NAME;
1367 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001368
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 /* Newline */
1370 if (c == '\n') {
1371 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001372 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001373 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001374 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001375 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001376 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001377 return NEWLINE;
1378 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001379
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001380 /* Period or number starting with period? */
1381 if (c == '.') {
1382 c = tok_nextc(tok);
1383 if (isdigit(c)) {
1384 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001385 } else if (c == '.') {
1386 c = tok_nextc(tok);
1387 if (c == '.') {
1388 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001389 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001390 return ELLIPSIS;
1391 } else {
1392 tok_backup(tok, c);
1393 }
1394 tok_backup(tok, '.');
1395 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001396 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001397 }
Georg Brandldde00282007-03-18 19:01:53 +00001398 *p_start = tok->start;
1399 *p_end = tok->cur;
1400 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001401 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001402
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001403 /* Number */
1404 if (isdigit(c)) {
1405 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001406 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001407 c = tok_nextc(tok);
1408 if (c == '.')
1409 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001410#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001411 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001412 goto imaginary;
1413#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001414 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001415
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001416 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001417 c = tok_nextc(tok);
1418 if (!isxdigit(c)) {
1419 tok->done = E_TOKEN;
1420 tok_backup(tok, c);
1421 return ERRORTOKEN;
1422 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423 do {
1424 c = tok_nextc(tok);
1425 } while (isxdigit(c));
1426 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001427 else if (c == 'o' || c == 'O') {
1428 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001429 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001430 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001431 tok->done = E_TOKEN;
1432 tok_backup(tok, c);
1433 return ERRORTOKEN;
1434 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001435 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001436 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001437 } while ('0' <= c && c < '8');
1438 }
1439 else if (c == 'b' || c == 'B') {
1440 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001441 c = tok_nextc(tok);
1442 if (c != '0' && c != '1') {
1443 tok->done = E_TOKEN;
1444 tok_backup(tok, c);
1445 return ERRORTOKEN;
1446 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001447 do {
1448 c = tok_nextc(tok);
1449 } while (c == '0' || c == '1');
1450 }
1451 else {
1452 int nonzero = 0;
1453 /* maybe old-style octal; c is first char of it */
1454 /* in any case, allow '0' as a literal */
1455 while (c == '0')
1456 c = tok_nextc(tok);
1457 while (isdigit(c)) {
1458 nonzero = 1;
1459 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001460 }
1461 if (c == '.')
1462 goto fraction;
1463 else if (c == 'e' || c == 'E')
1464 goto exponent;
1465#ifndef WITHOUT_COMPLEX
1466 else if (c == 'j' || c == 'J')
1467 goto imaginary;
1468#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001469 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001470 tok->done = E_TOKEN;
1471 tok_backup(tok, c);
1472 return ERRORTOKEN;
1473 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 }
1475 }
1476 else {
1477 /* Decimal */
1478 do {
1479 c = tok_nextc(tok);
1480 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001481 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001482 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001483 if (c == '.') {
1484 fraction:
1485 /* Fraction */
1486 do {
1487 c = tok_nextc(tok);
1488 } while (isdigit(c));
1489 }
1490 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001491 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001492 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001493 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001494 if (c == '+' || c == '-')
1495 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001496 if (!isdigit(c)) {
1497 tok->done = E_TOKEN;
1498 tok_backup(tok, c);
1499 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001500 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001501 do {
1502 c = tok_nextc(tok);
1503 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001504 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001505#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001506 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001507 /* Imaginary part */
1508 imaginary:
1509 c = tok_nextc(tok);
1510#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001511 }
1512 }
1513 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001514 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001515 *p_end = tok->cur;
1516 return NUMBER;
1517 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001518
1519 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001520 /* String */
1521 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001522 int quote = c;
1523 int quote_size = 1; /* 1 or 3 */
1524 int end_quote_size = 0;
1525
1526 /* Find the quote size and start of string */
1527 c = tok_nextc(tok);
1528 if (c == quote) {
1529 c = tok_nextc(tok);
1530 if (c == quote)
1531 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001532 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001533 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001534 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001535 if (c != quote)
1536 tok_backup(tok, c);
1537
1538 /* Get rest of string */
1539 while (end_quote_size != quote_size) {
1540 c = tok_nextc(tok);
1541 if (c == EOF) {
1542 if (quote_size == 3)
1543 tok->done = E_EOFS;
1544 else
1545 tok->done = E_EOLS;
1546 tok->cur = tok->inp;
1547 return ERRORTOKEN;
1548 }
1549 if (quote_size == 1 && c == '\n') {
1550 tok->done = E_EOLS;
1551 tok->cur = tok->inp;
1552 return ERRORTOKEN;
1553 }
1554 if (c == quote)
1555 end_quote_size += 1;
1556 else {
1557 end_quote_size = 0;
1558 if (c == '\\')
1559 c = tok_nextc(tok); /* skip escaped char */
1560 }
1561 }
1562
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001563 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001564 *p_end = tok->cur;
1565 return STRING;
1566 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001567
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001568 /* Line continuation */
1569 if (c == '\\') {
1570 c = tok_nextc(tok);
1571 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001572 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001573 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001574 return ERRORTOKEN;
1575 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001576 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001577 goto again; /* Read next line */
1578 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001579
Guido van Rossumfbab9051991-10-20 20:25:03 +00001580 /* Check for two-character token */
1581 {
1582 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001583 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001584 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001585 int c3 = tok_nextc(tok);
1586 int token3 = PyToken_ThreeChars(c, c2, c3);
1587 if (token3 != OP) {
1588 token = token3;
1589 } else {
1590 tok_backup(tok, c3);
1591 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001592 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001593 *p_end = tok->cur;
1594 return token;
1595 }
1596 tok_backup(tok, c2);
1597 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001598
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001599 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001600 switch (c) {
1601 case '(':
1602 case '[':
1603 case '{':
1604 tok->level++;
1605 break;
1606 case ')':
1607 case ']':
1608 case '}':
1609 tok->level--;
1610 break;
1611 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001612
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001613 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001614 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001615 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001616 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001617}
1618
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001619int
1620PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1621{
1622 int result = tok_get(tok, p_start, p_end);
1623 if (tok->decoding_erred) {
1624 result = ERRORTOKEN;
1625 tok->done = E_DECODE;
1626 }
1627 return result;
1628}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001629
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001630/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001631
1632 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001633 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001634 should be assumed to be PyUnicode_GetDefaultEncoding()).
1635
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001636 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1637 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001638*/
1639char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001640PyTokenizer_FindEncoding(int fd)
1641{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001642 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001643 FILE *fp;
1644 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001645
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001646 fd = dup(fd);
1647 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001648 return NULL;
1649 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001650 fp = fdopen(fd, "r");
1651 if (fp == NULL) {
1652 return NULL;
1653 }
1654 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1655 if (tok == NULL) {
1656 fclose(fp);
1657 return NULL;
1658 }
1659 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001660 PyTokenizer_Get(tok, &p_start, &p_end);
1661 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001662 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001663 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001664 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001665 if (encoding)
1666 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001667 }
1668 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001669 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001670}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001671
Guido van Rossum408027e1996-12-30 16:17:54 +00001672#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001673
1674void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001675tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001676{
Guido van Rossum86bea461997-04-29 21:03:06 +00001677 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001678 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1679 printf("(%.*s)", (int)(end - start), start);
1680}
1681
1682#endif