blob: 77fec7400857428cb688a3b7c602f542f4aee9e3 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, '_', or any value
   >= 128 (the test is deliberately permissive for non-ASCII values, hence
   "potential").  C is evaluated several times, so it must not have side
   effects; it is parenthesized so that expression arguments parse as
   intended. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit, '_',
   or any value >= 128 (deliberately permissive for non-ASCII values).
   C is evaluated several times, so it must not have side effects; it is
   parenthesized so that expression arguments parse as intended. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* PyOS_Readline returns:
     - a malloc'ed string including the trailing \n;
     - an empty malloc'ed string for EOF;
     - NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* basic token kinds */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* trailing pseudo-entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000122 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000130 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000131 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000136 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000139 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
/* PGEN build: no decoding, just return a fresh copy of STR
   (NULL if the copy cannot be allocated).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);
    return new_string(str, length);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the common spellings of utf-8 and latin-1 onto a canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   them and turning '_' into '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when S is a recognized spelling (optionally followed by
   "-suffix"); otherwise returns S itself, so callers can detect "no
   change" by pointer comparison.  The returned literals must not be
   freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* plain char may be signed; tolower() is undefined for
               negative values other than EOF, so go through
               unsigned char */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000227 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
246
247 begin = t;
Benjamin Peterson4893abc2010-04-03 23:10:01 +0000248 while (Py_ISALNUM(t[0]) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000249 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 t++;
251
252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000257 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 int set_readline(struct tok_state *, const char *))
274{
Tim Peters17db21f2002-09-03 15:39:58 +0000275 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000281 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000285 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000286 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
Victor Stinner6aa278e2010-03-03 00:18:49 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000323 tok->decoding_state = STATE_RAW;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000324 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000325 return 1;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000331 return 1;
332 }
Victor Stinner6aa278e2010-03-03 00:18:49 +0000333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
Victor Stinner6aa278e2010-03-03 00:18:49 +0000343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000352 tok->decoding_state = STATE_NORMAL;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
364 } else {
Victor Stinner6aa278e2010-03-03 00:18:49 +0000365 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000366 return 1;
367 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000368 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000371 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372 return 1;
373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000387 reached): see tok_nextc and its calls to decoding_fgets.
388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
400
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000409 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000410 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000411 if (PyUnicode_CheckExact(bufobj))
412 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000414 if (buf == NULL) {
415 goto error;
416 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000417 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000418 else
419 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000420 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000421 if (buf == NULL) {
422 goto error;
423 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000424 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000425 }
426
427 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000428 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000429 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000431 buflen-size);
432 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000433 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000434 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000435 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436 else
437 tok->decoding_buffer = NULL;
438
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000441 if (buflen == 0) /* EOF */
442 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000443 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000447 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465
Christian Heimes819b8bf2008-01-03 23:05:47 +0000466 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000467 if (io == NULL)
468 goto cleanup;
469
Brett Cannon8a9583e2008-09-04 05:04:25 +0000470 if (tok->filename)
471 stream = PyObject_CallMethod(io, "open", "ssis",
472 tok->filename, "r", -1, enc);
473 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000474 stream = PyObject_CallMethod(io, "open", "isisOOO",
475 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000476 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000477 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000478
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000479 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000480 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000482
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000483 /* The file has been reopened; parsing will restart from
484 * the beginning of the file, we have to reset the line number.
485 * But this function has been called from inside tok_nextc() which
486 * will increment lineno before it returns. So we set it -1 so that
487 * the next call to tok_nextc() will start with tok->lineno == 0.
488 */
489 tok->lineno = -1;
490
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000491 cleanup:
492 Py_XDECREF(stream);
493 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000494 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495}
496
497/* Fetch the next byte from TOK. */
498
499static int fp_getc(struct tok_state *tok) {
500 return getc(tok->fp);
501}
502
503/* Unfetch the last byte back into TOK. */
504
505static void fp_ungetc(int c, struct tok_state *tok) {
506 ungetc(c, tok->fp);
507}
508
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   S must be NUL-terminated.  Bytes are examined strictly left to right
   and the scan stops at the first byte that does not fit, so a sequence
   truncated by the terminating NUL never causes a read beyond it.

   Lead bytes C0, C1 and F5..FF are rejected, and the second byte is
   range-checked after E0, ED, F0 and F4, so overlong forms, surrogates
   and code points above U+10FFFF are not accepted. */
static int valid_utf8(const unsigned char* s)
{
    unsigned char lead = s[0];
    /* allowed range for the second byte; narrowed for some lead bytes */
    unsigned char lo = 0x80, hi = 0xBF;
    int length, i;

    if (lead < 0x80)
        /* single-byte code */
        return 1;
    if (lead < 0xC2)
        /* following byte, or overlong two-byte lead (C0, C1) */
        return 0;
    if (lead < 0xE0) {
        length = 2;
    }
    else if (lead < 0xF0) {
        length = 3;
        if (lead == 0xE0)
            lo = 0xA0;          /* below that would be overlong */
        else if (lead == 0xED)
            hi = 0x9F;          /* above that would be a surrogate */
    }
    else if (lead < 0xF5) {
        length = 4;
        if (lead == 0xF0)
            lo = 0x90;          /* below that would be overlong */
        else if (lead == 0xF4)
            hi = 0x8F;          /* above that would exceed U+10FFFF */
    }
    else
        return 0;

    if (s[1] < lo || s[1] > hi)
        return 0;
    for (i = 2; i < length; i++)
        if (s[i] < 0x80 || s[i] > 0xBF)
            return 0;
    return length;
}
536
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537/* Read a line of input from TOK. Determine encoding
538 if necessary. */
539
540static char *
541decoding_fgets(char *s, int size, struct tok_state *tok)
542{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000543 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000544 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000545 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000546 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 /* We already have a codec associated with
548 this input. */
549 line = fp_readl(s, size, tok);
550 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000551 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000553 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000554 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 break;
556 } else {
557 /* We have not yet determined the encoding.
558 If an encoding is found, use the file-pointer
559 reader functions from now on. */
560 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
561 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000562 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000564 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
566 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
567 return error_ret(tok);
568 }
569 }
570#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000571 /* The default encoding is UTF-8, so make sure we don't have any
572 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000573 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000575 int length;
576 for (c = (unsigned char *)line; *c; c += length)
577 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578 badchar = *c;
579 break;
580 }
581 }
582 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000583 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000584 /* Need to add 1 to the line number, since this line
585 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000586 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000587 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000588 "in file %.200s on line %i, "
589 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000590 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000591 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000592 PyErr_SetString(PyExc_SyntaxError, buf);
593 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594 }
595#endif
596 return line;
597}
598
599static int
600decoding_feof(struct tok_state *tok)
601{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000602 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 return feof(tok->fp);
604 } else {
605 PyObject* buf = tok->decoding_buffer;
606 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000607 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 if (buf == NULL) {
609 error_ret(tok);
610 return 1;
611 } else {
612 tok->decoding_buffer = buf;
613 }
614 }
615 return PyObject_Length(buf) == 0;
616 }
617}
618
619/* Fetch a byte from TOK, using the string buffer. */
620
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000621static int
622buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000623 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000624}
625
626/* Unfetch a byte from TOK, using the string buffer. */
627
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000628static void
629buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000631 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632}
633
634/* Set the readline function for TOK to ENC. For the string-based
635 tokenizer, this means to just record the encoding. */
636
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000637static int
638buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 tok->enc = enc;
640 return 1;
641}
642
643/* Return a UTF-8 encoding Python string object from the
644 C byte string STR, which is encoded with ENC. */
645
646static PyObject *
647translate_into_utf8(const char* str, const char* enc) {
648 PyObject *utf8;
649 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
650 if (buf == NULL)
651 return NULL;
652 utf8 = PyUnicode_AsUTF8String(buf);
653 Py_DECREF(buf);
654 return utf8;
655}
656
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000657
658static char *
659translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000660 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000661 char *buf, *current;
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000662 char c = '\0';
663 buf = PyMem_MALLOC(needed_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000664 if (buf == NULL) {
665 tok->done = E_NOMEM;
666 return NULL;
667 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000668 for (current = buf; *s; s++, current++) {
669 c = *s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000670 if (skip_next_lf) {
671 skip_next_lf = 0;
672 if (c == '\n') {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000673 c = *++s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000674 if (!c)
675 break;
676 }
677 }
678 if (c == '\r') {
679 skip_next_lf = 1;
680 c = '\n';
681 }
682 *current = c;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000683 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000684 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000685 there isn't one already. */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000686 if (exec_input && c != '\n') {
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000687 *current = '\n';
688 current++;
689 }
690 *current = '\0';
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000691 final_length = current - buf + 1;
692 if (final_length < needed_length && final_length)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000693 /* should never fail */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000694 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000695 return buf;
696}
697
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000698/* Decode a byte string STR for use as the buffer of TOK.
699 Look for encoding declarations inside STR, and record them
700 inside TOK. */
701
702static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000703decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000704{
705 PyObject* utf8 = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000706 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000707 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000708 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000709 int lineno = 0;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000710 tok->input = str = translate_newlines(input, single, tok);
711 if (str == NULL)
712 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000713 tok->enc = NULL;
714 tok->str = str;
715 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000717 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000718 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719 if (tok->enc != NULL) {
720 utf8 = translate_into_utf8(str, tok->enc);
721 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000722 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000723 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000724 }
725 for (s = str;; s++) {
726 if (*s == '\0') break;
727 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000728 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000729 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000730 lineno++;
731 if (lineno == 2) break;
732 }
733 }
734 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000735 /* need to check line 1 and 2 separately since check_coding_spec
736 assumes a single line as input */
737 if (newl[0]) {
738 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
739 return error_ret(tok);
740 if (tok->enc == NULL && newl[1]) {
741 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
742 tok, buf_setreadl))
743 return error_ret(tok);
744 }
745 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000746 if (tok->enc != NULL) {
747 assert(utf8 == NULL);
748 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson0289b152009-06-28 17:22:03 +0000749 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000750 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000751 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000752 }
753 assert(tok->decoding_buffer == NULL);
754 tok->decoding_buffer = utf8; /* CAUTION */
755 return str;
756}
757
758#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759
760/* Set up tokenizer for string */
761
762struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000763PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764{
765 struct tok_state *tok = tok_new();
766 if (tok == NULL)
767 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000768 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000769 if (str == NULL) {
770 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000771 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000772 }
773
Martin v. Löwis95292d62002-12-11 14:04:59 +0000774 /* XXX: constify members. */
775 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 return tok;
777}
778
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000779struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000780PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000781{
782 struct tok_state *tok = tok_new();
783 if (tok == NULL)
784 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000785#ifndef PGEN
786 tok->input = str = translate_newlines(str, exec_input, tok);
787#endif
788 if (str == NULL) {
789 PyTokenizer_Free(tok);
790 return NULL;
791 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000792 tok->decoding_state = STATE_RAW;
793 tok->read_coding_spec = 1;
794 tok->enc = NULL;
795 tok->str = str;
796 tok->encoding = (char *)PyMem_MALLOC(6);
797 if (!tok->encoding) {
798 PyTokenizer_Free(tok);
799 return NULL;
800 }
801 strcpy(tok->encoding, "utf-8");
802
803 /* XXX: constify members. */
804 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
805 return tok;
806}
807
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000808/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809
810struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000811PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000812{
813 struct tok_state *tok = tok_new();
814 if (tok == NULL)
815 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000816 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000817 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818 return NULL;
819 }
820 tok->cur = tok->inp = tok->buf;
821 tok->end = tok->buf + BUFSIZ;
822 tok->fp = fp;
823 tok->prompt = ps1;
824 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000825 if (enc != NULL) {
826 /* Must copy encoding declaration since it
827 gets copied into the parse tree. */
828 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
829 if (!tok->encoding) {
830 PyTokenizer_Free(tok);
831 return NULL;
832 }
833 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000834 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000835 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000836 return tok;
837}
838
839
840/* Free a tok_state structure */
841
842void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000843PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000844{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000845 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000846 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000847#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000848 Py_XDECREF(tok->decoding_readline);
849 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000850#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000851 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000852 PyMem_FREE(tok->buf);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000853 if (tok->input)
854 PyMem_FREE((char *)tok->input);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000855 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856}
857
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858/* Get next char, updating state; error code goes into tok->done */
859
860static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000861tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000863 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000864 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000865 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000866 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000867 if (tok->done != E_OK)
868 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000869 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000870 char *end = strchr(tok->inp, '\n');
871 if (end != NULL)
872 end++;
873 else {
874 end = strchr(tok->inp, '\0');
875 if (end == tok->inp) {
876 tok->done = E_EOF;
877 return EOF;
878 }
879 }
880 if (tok->start == NULL)
881 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000882 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000883 tok->lineno++;
884 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000885 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000886 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000887 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000888 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000889#ifndef PGEN
890 if (tok->encoding && newtok && *newtok) {
891 /* Recode to UTF-8 */
892 Py_ssize_t buflen;
893 const char* buf;
894 PyObject *u = translate_into_utf8(newtok, tok->encoding);
895 PyMem_FREE(newtok);
896 if (!u) {
897 tok->done = E_DECODE;
898 return EOF;
899 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000900 buflen = PyBytes_GET_SIZE(u);
901 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000902 if (!buf) {
903 Py_DECREF(u);
904 tok->done = E_DECODE;
905 return EOF;
906 }
907 newtok = PyMem_MALLOC(buflen+1);
908 strcpy(newtok, buf);
909 Py_DECREF(u);
910 }
911#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000912 if (tok->nextprompt != NULL)
913 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000914 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000915 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000916 else if (*newtok == '\0') {
917 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000918 tok->done = E_EOF;
919 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000920 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000921 size_t start = tok->start - tok->buf;
922 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000923 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000924 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000925 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000926 tok->lineno++;
927 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000928 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000929 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000930 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000931 tok->done = E_NOMEM;
932 return EOF;
933 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000934 tok->buf = buf;
935 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000936 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000937 strcpy(tok->buf + oldlen, newtok);
938 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000939 tok->inp = tok->buf + newlen;
940 tok->end = tok->inp + 1;
941 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000942 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000943 else {
944 tok->lineno++;
945 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000946 PyMem_FREE(tok->buf);
947 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000948 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000949 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000950 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000951 tok->inp = strchr(tok->buf, '\0');
952 tok->end = tok->inp + 1;
953 }
954 }
955 else {
956 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000957 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000958 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000959 if (tok->start == NULL) {
960 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000961 tok->buf = (char *)
962 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000963 if (tok->buf == NULL) {
964 tok->done = E_NOMEM;
965 return EOF;
966 }
967 tok->end = tok->buf + BUFSIZ;
968 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000969 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
970 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000971 tok->done = E_EOF;
972 done = 1;
973 }
974 else {
975 tok->done = E_OK;
976 tok->inp = strchr(tok->buf, '\0');
977 done = tok->inp[-1] == '\n';
978 }
979 }
980 else {
981 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000982 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000983 tok->done = E_EOF;
984 done = 1;
985 }
986 else
987 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000988 }
989 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000990 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000991 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000992 Py_ssize_t curstart = tok->start == NULL ? -1 :
993 tok->start - tok->buf;
994 Py_ssize_t curvalid = tok->inp - tok->buf;
995 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000996 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000997 newbuf = (char *)PyMem_REALLOC(newbuf,
998 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000999 if (newbuf == NULL) {
1000 tok->done = E_NOMEM;
1001 tok->cur = tok->inp;
1002 return EOF;
1003 }
1004 tok->buf = newbuf;
1005 tok->inp = tok->buf + curvalid;
1006 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001007 tok->start = curstart < 0 ? NULL :
1008 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001009 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001010 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001011 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +00001012 /* Break out early on decoding
1013 errors, as tok->buf will be NULL
1014 */
1015 if (tok->decoding_erred)
1016 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001017 /* Last line does not end in \n,
1018 fake one */
1019 strcpy(tok->inp, "\n");
1020 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001021 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001022 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001023 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001024 if (tok->buf != NULL) {
1025 tok->cur = tok->buf + cur;
1026 tok->line_start = tok->cur;
1027 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +00001028 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001029 pt = tok->inp - 2;
1030 if (pt >= tok->buf && *pt == '\r') {
1031 *pt++ = '\n';
1032 *pt = '\0';
1033 tok->inp = pt;
1034 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001035 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001036 }
1037 if (tok->done != E_OK) {
1038 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001039 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001040 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041 return EOF;
1042 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001043 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001044 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001045}
1046
1047
1048/* Back-up one character */
1049
1050static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001051tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001052{
1053 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001054 if (--tok->cur < tok->buf)
Benjamin Petersona0dfa822009-11-13 02:25:08 +00001055 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001056 if (*tok->cur != c)
1057 *tok->cur = c;
1058 }
1059}
1060
1061
1062/* Return the token corresponding to a single character */
1063
1064int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001065PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001066{
1067 switch (c) {
1068 case '(': return LPAR;
1069 case ')': return RPAR;
1070 case '[': return LSQB;
1071 case ']': return RSQB;
1072 case ':': return COLON;
1073 case ',': return COMMA;
1074 case ';': return SEMI;
1075 case '+': return PLUS;
1076 case '-': return MINUS;
1077 case '*': return STAR;
1078 case '/': return SLASH;
1079 case '|': return VBAR;
1080 case '&': return AMPER;
1081 case '<': return LESS;
1082 case '>': return GREATER;
1083 case '=': return EQUAL;
1084 case '.': return DOT;
1085 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001086 case '{': return LBRACE;
1087 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001088 case '^': return CIRCUMFLEX;
1089 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001090 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091 default: return OP;
1092 }
1093}
1094
1095
Guido van Rossumfbab9051991-10-20 20:25:03 +00001096int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001097PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001098{
1099 switch (c1) {
1100 case '=':
1101 switch (c2) {
1102 case '=': return EQEQUAL;
1103 }
1104 break;
1105 case '!':
1106 switch (c2) {
1107 case '=': return NOTEQUAL;
1108 }
1109 break;
1110 case '<':
1111 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001112 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001113 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001114 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001115 }
1116 break;
1117 case '>':
1118 switch (c2) {
1119 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001120 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001121 }
1122 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001123 case '+':
1124 switch (c2) {
1125 case '=': return PLUSEQUAL;
1126 }
1127 break;
1128 case '-':
1129 switch (c2) {
1130 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001131 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001132 }
1133 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001134 case '*':
1135 switch (c2) {
1136 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001137 case '=': return STAREQUAL;
1138 }
1139 break;
1140 case '/':
1141 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001142 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001143 case '=': return SLASHEQUAL;
1144 }
1145 break;
1146 case '|':
1147 switch (c2) {
1148 case '=': return VBAREQUAL;
1149 }
1150 break;
1151 case '%':
1152 switch (c2) {
1153 case '=': return PERCENTEQUAL;
1154 }
1155 break;
1156 case '&':
1157 switch (c2) {
1158 case '=': return AMPEREQUAL;
1159 }
1160 break;
1161 case '^':
1162 switch (c2) {
1163 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001164 }
1165 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001166 }
1167 return OP;
1168}
1169
Thomas Wouters434d0822000-08-24 20:11:32 +00001170int
1171PyToken_ThreeChars(int c1, int c2, int c3)
1172{
1173 switch (c1) {
1174 case '<':
1175 switch (c2) {
1176 case '<':
1177 switch (c3) {
1178 case '=':
1179 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001180 }
1181 break;
1182 }
1183 break;
1184 case '>':
1185 switch (c2) {
1186 case '>':
1187 switch (c3) {
1188 case '=':
1189 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001190 }
1191 break;
1192 }
1193 break;
1194 case '*':
1195 switch (c2) {
1196 case '*':
1197 switch (c3) {
1198 case '=':
1199 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001200 }
1201 break;
1202 }
1203 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001204 case '/':
1205 switch (c2) {
1206 case '/':
1207 switch (c3) {
1208 case '=':
1209 return DOUBLESLASHEQUAL;
1210 }
1211 break;
1212 }
1213 break;
Georg Brandldde00282007-03-18 19:01:53 +00001214 case '.':
1215 switch (c2) {
1216 case '.':
1217 switch (c3) {
1218 case '.':
1219 return ELLIPSIS;
1220 }
1221 break;
1222 }
1223 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001224 }
1225 return OP;
1226}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001227
Guido van Rossum926f13a1998-04-09 21:38:06 +00001228static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001229indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001230{
1231 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001232 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001233 tok->cur = tok->inp;
1234 return 1;
1235 }
1236 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001237 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1238 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001239 tok->altwarning = 0;
1240 }
1241 return 0;
1242}
1243
Martin v. Löwis47383402007-08-15 07:32:56 +00001244#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001245#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001246#else
1247/* Verify that the identifier follows PEP 3131. */
1248static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001249verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001250{
Guido van Rossume3e37012007-08-29 18:54:41 +00001251 PyObject *s;
1252 int result;
Victor Stinner52f6dd72010-03-12 14:45:56 +00001253 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Guido van Rossume3e37012007-08-29 18:54:41 +00001254 if (s == NULL) {
Victor Stinner52f6dd72010-03-12 14:45:56 +00001255 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1256 PyErr_Clear();
1257 tok->done = E_IDENTIFIER;
1258 } else {
1259 tok->done = E_ERROR;
1260 }
Guido van Rossume3e37012007-08-29 18:54:41 +00001261 return 0;
1262 }
1263 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001264 Py_DECREF(s);
Victor Stinner52f6dd72010-03-12 14:45:56 +00001265 if (result == 0)
1266 tok->done = E_IDENTIFIER;
Martin v. Löwis47383402007-08-15 07:32:56 +00001267 return result;
1268}
1269#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001270
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001271/* Get next token, after space stripping etc. */
1272
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001273static int
1274tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001275{
1276 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001277 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001278
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001279 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001280 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001281 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001282 blankline = 0;
1283
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 /* Get indentation level */
1285 if (tok->atbol) {
1286 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001287 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 for (;;) {
1290 c = tok_nextc(tok);
1291 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001292 col++, altcol++;
1293 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001295 altcol = (altcol/tok->alttabsize + 1)
1296 * tok->alttabsize;
1297 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001298 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001299 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300 else
1301 break;
1302 }
1303 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001304 if (c == '#' || c == '\n') {
1305 /* Lines with only whitespace and/or comments
1306 shouldn't affect the indentation and are
1307 not passed to the parser as NEWLINE tokens,
1308 except *totally* empty lines in interactive
1309 mode, which signal the end of a command group. */
1310 if (col == 0 && c == '\n' && tok->prompt != NULL)
1311 blankline = 0; /* Let it through */
1312 else
1313 blankline = 1; /* Ignore completely */
1314 /* We can't jump back right here since we still
1315 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001317 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001318 if (col == tok->indstack[tok->indent]) {
1319 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001320 if (altcol != tok->altindstack[tok->indent]) {
1321 if (indenterror(tok))
1322 return ERRORTOKEN;
1323 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001324 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001325 else if (col > tok->indstack[tok->indent]) {
1326 /* Indent -- always one */
1327 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001328 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001329 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001330 return ERRORTOKEN;
1331 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001332 if (altcol <= tok->altindstack[tok->indent]) {
1333 if (indenterror(tok))
1334 return ERRORTOKEN;
1335 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001336 tok->pendin++;
1337 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001338 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001339 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001340 else /* col < tok->indstack[tok->indent] */ {
1341 /* Dedent -- any number, must be consistent */
1342 while (tok->indent > 0 &&
1343 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001344 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001345 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001346 }
1347 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001348 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001349 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001350 return ERRORTOKEN;
1351 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001352 if (altcol != tok->altindstack[tok->indent]) {
1353 if (indenterror(tok))
1354 return ERRORTOKEN;
1355 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001356 }
1357 }
1358 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001359
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001360 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001361
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362 /* Return pending indents/dedents */
1363 if (tok->pendin != 0) {
1364 if (tok->pendin < 0) {
1365 tok->pendin++;
1366 return DEDENT;
1367 }
1368 else {
1369 tok->pendin--;
1370 return INDENT;
1371 }
1372 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001373
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001374 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001375 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001376 /* Skip spaces */
1377 do {
1378 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001379 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001380
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001381 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001382 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001383
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001384 /* Skip comment */
1385 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001386 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001387 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001388
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001389 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001390 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001392 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001393
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001394 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001395 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001396 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001397 /* Process b"", r"" and br"" */
1398 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001399 c = tok_nextc(tok);
1400 if (c == '"' || c == '\'')
1401 goto letter_quote;
1402 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001403 if (c == 'r' || c == 'R') {
1404 c = tok_nextc(tok);
1405 if (c == '"' || c == '\'')
1406 goto letter_quote;
1407 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001408 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001409 if (c >= 128)
1410 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001411 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001412 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001414 if (nonascii &&
Victor Stinner52f6dd72010-03-12 14:45:56 +00001415 !verify_identifier(tok)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001416 tok->done = E_IDENTIFIER;
1417 return ERRORTOKEN;
1418 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001419 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001420 *p_end = tok->cur;
1421 return NAME;
1422 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001423
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001424 /* Newline */
1425 if (c == '\n') {
1426 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001427 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001428 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001429 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001430 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001431 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001432 return NEWLINE;
1433 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001434
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001435 /* Period or number starting with period? */
1436 if (c == '.') {
1437 c = tok_nextc(tok);
1438 if (isdigit(c)) {
1439 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001440 } else if (c == '.') {
1441 c = tok_nextc(tok);
1442 if (c == '.') {
1443 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001444 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001445 return ELLIPSIS;
1446 } else {
1447 tok_backup(tok, c);
1448 }
1449 tok_backup(tok, '.');
1450 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001451 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001452 }
Georg Brandldde00282007-03-18 19:01:53 +00001453 *p_start = tok->start;
1454 *p_end = tok->cur;
1455 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001456 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001457
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001458 /* Number */
1459 if (isdigit(c)) {
1460 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001461 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001462 c = tok_nextc(tok);
1463 if (c == '.')
1464 goto fraction;
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001465 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001466 goto imaginary;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001467 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001468
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001469 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001470 c = tok_nextc(tok);
1471 if (!isxdigit(c)) {
1472 tok->done = E_TOKEN;
1473 tok_backup(tok, c);
1474 return ERRORTOKEN;
1475 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001476 do {
1477 c = tok_nextc(tok);
1478 } while (isxdigit(c));
1479 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001480 else if (c == 'o' || c == 'O') {
1481 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001482 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001483 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001484 tok->done = E_TOKEN;
1485 tok_backup(tok, c);
1486 return ERRORTOKEN;
1487 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001488 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001489 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001490 } while ('0' <= c && c < '8');
1491 }
1492 else if (c == 'b' || c == 'B') {
1493 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001494 c = tok_nextc(tok);
1495 if (c != '0' && c != '1') {
1496 tok->done = E_TOKEN;
1497 tok_backup(tok, c);
1498 return ERRORTOKEN;
1499 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001500 do {
1501 c = tok_nextc(tok);
1502 } while (c == '0' || c == '1');
1503 }
1504 else {
1505 int nonzero = 0;
1506 /* maybe old-style octal; c is first char of it */
1507 /* in any case, allow '0' as a literal */
1508 while (c == '0')
1509 c = tok_nextc(tok);
1510 while (isdigit(c)) {
1511 nonzero = 1;
1512 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001513 }
1514 if (c == '.')
1515 goto fraction;
1516 else if (c == 'e' || c == 'E')
1517 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001518 else if (c == 'j' || c == 'J')
1519 goto imaginary;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001520 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001521 tok->done = E_TOKEN;
1522 tok_backup(tok, c);
1523 return ERRORTOKEN;
1524 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001525 }
1526 }
1527 else {
1528 /* Decimal */
1529 do {
1530 c = tok_nextc(tok);
1531 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001532 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001533 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001534 if (c == '.') {
1535 fraction:
1536 /* Fraction */
1537 do {
1538 c = tok_nextc(tok);
1539 } while (isdigit(c));
1540 }
1541 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001542 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001543 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001544 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001545 if (c == '+' || c == '-')
1546 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001547 if (!isdigit(c)) {
1548 tok->done = E_TOKEN;
1549 tok_backup(tok, c);
1550 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001551 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001552 do {
1553 c = tok_nextc(tok);
1554 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001555 }
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001556 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001557 /* Imaginary part */
1558 imaginary:
1559 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001560 }
1561 }
1562 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001563 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001564 *p_end = tok->cur;
1565 return NUMBER;
1566 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001567
1568 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001569 /* String */
1570 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001571 int quote = c;
1572 int quote_size = 1; /* 1 or 3 */
1573 int end_quote_size = 0;
1574
1575 /* Find the quote size and start of string */
1576 c = tok_nextc(tok);
1577 if (c == quote) {
1578 c = tok_nextc(tok);
1579 if (c == quote)
1580 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001581 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001582 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001583 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001584 if (c != quote)
1585 tok_backup(tok, c);
1586
1587 /* Get rest of string */
1588 while (end_quote_size != quote_size) {
1589 c = tok_nextc(tok);
1590 if (c == EOF) {
1591 if (quote_size == 3)
1592 tok->done = E_EOFS;
1593 else
1594 tok->done = E_EOLS;
1595 tok->cur = tok->inp;
1596 return ERRORTOKEN;
1597 }
1598 if (quote_size == 1 && c == '\n') {
1599 tok->done = E_EOLS;
1600 tok->cur = tok->inp;
1601 return ERRORTOKEN;
1602 }
1603 if (c == quote)
1604 end_quote_size += 1;
1605 else {
1606 end_quote_size = 0;
1607 if (c == '\\')
1608 c = tok_nextc(tok); /* skip escaped char */
1609 }
1610 }
1611
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001612 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001613 *p_end = tok->cur;
1614 return STRING;
1615 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001616
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001617 /* Line continuation */
1618 if (c == '\\') {
1619 c = tok_nextc(tok);
1620 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001621 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001622 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001623 return ERRORTOKEN;
1624 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001625 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001626 goto again; /* Read next line */
1627 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001628
Guido van Rossumfbab9051991-10-20 20:25:03 +00001629 /* Check for two-character token */
1630 {
1631 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001632 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001633 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001634 int c3 = tok_nextc(tok);
1635 int token3 = PyToken_ThreeChars(c, c2, c3);
1636 if (token3 != OP) {
1637 token = token3;
1638 } else {
1639 tok_backup(tok, c3);
1640 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001641 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001642 *p_end = tok->cur;
1643 return token;
1644 }
1645 tok_backup(tok, c2);
1646 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001647
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001648 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001649 switch (c) {
1650 case '(':
1651 case '[':
1652 case '{':
1653 tok->level++;
1654 break;
1655 case ')':
1656 case ']':
1657 case '}':
1658 tok->level--;
1659 break;
1660 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001661
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001662 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001663 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001664 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001665 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001666}
1667
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001668int
1669PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1670{
1671 int result = tok_get(tok, p_start, p_end);
1672 if (tok->decoding_erred) {
1673 result = ERRORTOKEN;
1674 tok->done = E_DECODE;
1675 }
1676 return result;
1677}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001678
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001679/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001680
1681 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001682 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001683 should be assumed to be PyUnicode_GetDefaultEncoding()).
1684
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001685 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1686 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001687*/
1688char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001689PyTokenizer_FindEncoding(int fd)
1690{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001691 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001692 FILE *fp;
1693 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001694
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001695 fd = dup(fd);
1696 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001697 return NULL;
1698 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001699 fp = fdopen(fd, "r");
1700 if (fp == NULL) {
1701 return NULL;
1702 }
1703 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1704 if (tok == NULL) {
1705 fclose(fp);
1706 return NULL;
1707 }
1708 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001709 PyTokenizer_Get(tok, &p_start, &p_end);
1710 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001711 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001712 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001713 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001714 if (encoding)
1715 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001716 }
1717 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001718 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001719}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001720
Guido van Rossum408027e1996-12-30 16:17:54 +00001721#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001722
1723void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001724tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001725{
Guido van Rossum86bea461997-04-29 21:03:06 +00001726 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001727 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1728 printf("(%.*s)", (int)(end - start), start);
1729}
1730
1731#endif