blob: aef081de342812598a035b4e2d35592da8829228 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True when C may begin an identifier: an ASCII letter, an underscore,
   or any value of 128 and above.  Values >= 128 are only "potential"
   identifier characters; this macro does not validate them further.
   C is parenthesized so that any expression can be passed, but it is
   still evaluated several times: do not pass an expression with side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True when C may continue an identifier: an ASCII letter or digit, an
   underscore, or any value of 128 and above (not validated further here).
   C is parenthesized so that any expression can be passed, but it is
   still evaluated several times: do not pass an expression with side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names.  The position of each name in the table is significant:
   see the note near the end of the table. */

char *_PyParser_TokenNames[] = {
    /* Markers, atoms and layout tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* Comparisons, bitwise operators, shifts and power */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
    "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* Later additions */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",

    /* This table must match the #defines in token.h! */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000122 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000130 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000131 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000136 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000139 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000174static char *
175decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176{
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000177 return new_string(str, strlen(str));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the spellings of utf-8 and latin-1 onto one canonical name.

   At most the first 12 characters of S are examined, lower-cased and
   with '_' replaced by '-'.  Returns the literal "utf-8" or "iso-8859-1"
   when that prefix is a known spelling (or a known spelling followed by
   '-'); otherwise returns S itself, unchanged.  Callers tell the two
   cases apart by comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: tolower() is undefined for negative
           values, which a plain char holds for bytes >= 0x80. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000227 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
246
247 begin = t;
Benjamin Peterson4893abc2010-04-03 23:10:01 +0000248 while (Py_ISALNUM(t[0]) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000249 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 t++;
251
252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000257 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 int set_readline(struct tok_state *, const char *))
274{
Tim Peters17db21f2002-09-03 15:39:58 +0000275 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000281 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000285 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000286 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
Victor Stinner6aa278e2010-03-03 00:18:49 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000323 tok->decoding_state = STATE_RAW;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000324 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000325 return 1;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000331 return 1;
332 }
Victor Stinner6aa278e2010-03-03 00:18:49 +0000333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
Victor Stinner6aa278e2010-03-03 00:18:49 +0000343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000352 tok->decoding_state = STATE_NORMAL;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
364 } else {
Victor Stinner6aa278e2010-03-03 00:18:49 +0000365 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000366 return 1;
367 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000368 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000371 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372 return 1;
373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000387 reached): see tok_nextc and its calls to decoding_fgets.
388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
400
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000409 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000410 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000411 if (PyUnicode_CheckExact(bufobj))
412 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000414 if (buf == NULL) {
415 goto error;
416 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000417 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000418 else
419 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000420 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000421 if (buf == NULL) {
422 goto error;
423 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000424 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000425 }
426
427 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000428 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000429 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000431 buflen-size);
432 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000433 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000434 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000435 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436 else
437 tok->decoding_buffer = NULL;
438
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000441 if (buflen == 0) /* EOF */
442 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000443 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000447 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465
Christian Heimes819b8bf2008-01-03 23:05:47 +0000466 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000467 if (io == NULL)
468 goto cleanup;
469
Brett Cannon8a9583e2008-09-04 05:04:25 +0000470 if (tok->filename)
471 stream = PyObject_CallMethod(io, "open", "ssis",
472 tok->filename, "r", -1, enc);
473 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000474 stream = PyObject_CallMethod(io, "open", "isisOOO",
475 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000476 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000477 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000478
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000479 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000480 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000482
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000483 /* The file has been reopened; parsing will restart from
484 * the beginning of the file, we have to reset the line number.
485 * But this function has been called from inside tok_nextc() which
486 * will increment lineno before it returns. So we set it -1 so that
487 * the next call to tok_nextc() will start with tok->lineno == 0.
488 */
489 tok->lineno = -1;
490
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000491 cleanup:
492 Py_XDECREF(stream);
493 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000494 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495}
496
497/* Fetch the next byte from TOK. */
498
499static int fp_getc(struct tok_state *tok) {
500 return getc(tok->fp);
501}
502
503/* Unfetch the last byte back into TOK. */
504
505static void fp_ungetc(int c, struct tok_state *tok) {
506 ungetc(c, tok->fp);
507}
508
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the structure is checked: a lead byte
   announcing 1 to 3 continuation bytes, each in the range 0x80..0xBF.
   Overlong forms and the numeric range of the code point are not
   verified. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;

    /* Walk the continuation bytes front to back.  A NUL terminator is
       not a continuation byte, so a truncated sequence is rejected at
       the terminator instead of reading the bytes that lie beyond it. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
536
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537/* Read a line of input from TOK. Determine encoding
538 if necessary. */
539
540static char *
541decoding_fgets(char *s, int size, struct tok_state *tok)
542{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000543 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000544 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000545 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000546 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 /* We already have a codec associated with
548 this input. */
549 line = fp_readl(s, size, tok);
550 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000551 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000553 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000554 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 break;
556 } else {
557 /* We have not yet determined the encoding.
558 If an encoding is found, use the file-pointer
559 reader functions from now on. */
560 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
561 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000562 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000564 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
566 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
567 return error_ret(tok);
568 }
569 }
570#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000571 /* The default encoding is UTF-8, so make sure we don't have any
572 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000573 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000575 int length;
576 for (c = (unsigned char *)line; *c; c += length)
577 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578 badchar = *c;
579 break;
580 }
581 }
582 if (badchar) {
Martin v. Löwis725bb232002-08-05 01:49:16 +0000583 /* Need to add 1 to the line number, since this line
584 has not been counted, yet. */
Victor Stinner003a5e72010-04-28 17:06:46 +0000585 PyErr_Format(PyExc_SyntaxError,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000586 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000587 "in file %.200s on line %i, "
588 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000589 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000590 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000591 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000592 }
593#endif
594 return line;
595}
596
597static int
598decoding_feof(struct tok_state *tok)
599{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000600 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601 return feof(tok->fp);
602 } else {
603 PyObject* buf = tok->decoding_buffer;
604 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000605 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000606 if (buf == NULL) {
607 error_ret(tok);
608 return 1;
609 } else {
610 tok->decoding_buffer = buf;
611 }
612 }
613 return PyObject_Length(buf) == 0;
614 }
615}
616
617/* Fetch a byte from TOK, using the string buffer. */
618
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000619static int
620buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000621 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622}
623
624/* Unfetch a byte from TOK, using the string buffer. */
625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000626static void
627buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000628 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000629 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630}
631
632/* Set the readline function for TOK to ENC. For the string-based
633 tokenizer, this means to just record the encoding. */
634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000635static int
636buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000637 tok->enc = enc;
638 return 1;
639}
640
641/* Return a UTF-8 encoding Python string object from the
642 C byte string STR, which is encoded with ENC. */
643
644static PyObject *
645translate_into_utf8(const char* str, const char* enc) {
646 PyObject *utf8;
647 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
648 if (buf == NULL)
649 return NULL;
650 utf8 = PyUnicode_AsUTF8String(buf);
651 Py_DECREF(buf);
652 return utf8;
653}
654
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000655
656static char *
657translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000658 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000659 char *buf, *current;
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000660 char c = '\0';
661 buf = PyMem_MALLOC(needed_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000662 if (buf == NULL) {
663 tok->done = E_NOMEM;
664 return NULL;
665 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000666 for (current = buf; *s; s++, current++) {
667 c = *s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000668 if (skip_next_lf) {
669 skip_next_lf = 0;
670 if (c == '\n') {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000671 c = *++s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000672 if (!c)
673 break;
674 }
675 }
676 if (c == '\r') {
677 skip_next_lf = 1;
678 c = '\n';
679 }
680 *current = c;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000681 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000682 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000683 there isn't one already. */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000684 if (exec_input && c != '\n') {
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000685 *current = '\n';
686 current++;
687 }
688 *current = '\0';
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000689 final_length = current - buf + 1;
690 if (final_length < needed_length && final_length)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000691 /* should never fail */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000692 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000693 return buf;
694}
695
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000696/* Decode a byte string STR for use as the buffer of TOK.
697 Look for encoding declarations inside STR, and record them
698 inside TOK. */
699
700static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000701decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000702{
703 PyObject* utf8 = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000704 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000705 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000706 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000707 int lineno = 0;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000708 tok->input = str = translate_newlines(input, single, tok);
709 if (str == NULL)
710 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000711 tok->enc = NULL;
712 tok->str = str;
713 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000714 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000715 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000716 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000717 if (tok->enc != NULL) {
718 utf8 = translate_into_utf8(str, tok->enc);
719 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000720 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000721 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000722 }
723 for (s = str;; s++) {
724 if (*s == '\0') break;
725 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000726 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000727 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000728 lineno++;
729 if (lineno == 2) break;
730 }
731 }
732 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000733 /* need to check line 1 and 2 separately since check_coding_spec
734 assumes a single line as input */
735 if (newl[0]) {
736 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
737 return error_ret(tok);
738 if (tok->enc == NULL && newl[1]) {
739 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
740 tok, buf_setreadl))
741 return error_ret(tok);
742 }
743 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000744 if (tok->enc != NULL) {
745 assert(utf8 == NULL);
746 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson0289b152009-06-28 17:22:03 +0000747 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000748 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000749 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000750 }
751 assert(tok->decoding_buffer == NULL);
752 tok->decoding_buffer = utf8; /* CAUTION */
753 return str;
754}
755
756#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000757
758/* Set up tokenizer for string */
759
760struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000761PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000762{
763 struct tok_state *tok = tok_new();
764 if (tok == NULL)
765 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000766 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000767 if (str == NULL) {
768 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000769 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000770 }
771
Martin v. Löwis95292d62002-12-11 14:04:59 +0000772 /* XXX: constify members. */
773 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774 return tok;
775}
776
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000777struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000778PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000779{
780 struct tok_state *tok = tok_new();
781 if (tok == NULL)
782 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000783#ifndef PGEN
784 tok->input = str = translate_newlines(str, exec_input, tok);
785#endif
786 if (str == NULL) {
787 PyTokenizer_Free(tok);
788 return NULL;
789 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000790 tok->decoding_state = STATE_RAW;
791 tok->read_coding_spec = 1;
792 tok->enc = NULL;
793 tok->str = str;
794 tok->encoding = (char *)PyMem_MALLOC(6);
795 if (!tok->encoding) {
796 PyTokenizer_Free(tok);
797 return NULL;
798 }
799 strcpy(tok->encoding, "utf-8");
800
801 /* XXX: constify members. */
802 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
803 return tok;
804}
805
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000806/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807
808struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000809PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000810{
811 struct tok_state *tok = tok_new();
812 if (tok == NULL)
813 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000814 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000815 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000816 return NULL;
817 }
818 tok->cur = tok->inp = tok->buf;
819 tok->end = tok->buf + BUFSIZ;
820 tok->fp = fp;
821 tok->prompt = ps1;
822 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000823 if (enc != NULL) {
824 /* Must copy encoding declaration since it
825 gets copied into the parse tree. */
826 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
827 if (!tok->encoding) {
828 PyTokenizer_Free(tok);
829 return NULL;
830 }
831 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000832 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000833 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000834 return tok;
835}
836
837
838/* Free a tok_state structure */
839
840void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000841PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000842{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000843 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000844 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000845#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000846 Py_XDECREF(tok->decoding_readline);
847 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000848#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000849 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000850 PyMem_FREE(tok->buf);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000851 if (tok->input)
852 PyMem_FREE((char *)tok->input);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000853 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854}
855
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856/* Get next char, updating state; error code goes into tok->done */
857
858static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000859tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000860{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000862 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000863 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000864 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000865 if (tok->done != E_OK)
866 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000867 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000868 char *end = strchr(tok->inp, '\n');
869 if (end != NULL)
870 end++;
871 else {
872 end = strchr(tok->inp, '\0');
873 if (end == tok->inp) {
874 tok->done = E_EOF;
875 return EOF;
876 }
877 }
878 if (tok->start == NULL)
879 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000880 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000881 tok->lineno++;
882 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000883 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000884 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000885 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000886 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000887#ifndef PGEN
888 if (tok->encoding && newtok && *newtok) {
889 /* Recode to UTF-8 */
890 Py_ssize_t buflen;
891 const char* buf;
892 PyObject *u = translate_into_utf8(newtok, tok->encoding);
893 PyMem_FREE(newtok);
894 if (!u) {
895 tok->done = E_DECODE;
896 return EOF;
897 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000898 buflen = PyBytes_GET_SIZE(u);
899 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000900 if (!buf) {
901 Py_DECREF(u);
902 tok->done = E_DECODE;
903 return EOF;
904 }
905 newtok = PyMem_MALLOC(buflen+1);
906 strcpy(newtok, buf);
907 Py_DECREF(u);
908 }
909#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000910 if (tok->nextprompt != NULL)
911 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000912 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000913 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000914 else if (*newtok == '\0') {
915 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000916 tok->done = E_EOF;
917 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000918 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000919 size_t start = tok->start - tok->buf;
920 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000921 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000922 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000923 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000924 tok->lineno++;
925 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000926 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000927 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000928 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000929 tok->done = E_NOMEM;
930 return EOF;
931 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000932 tok->buf = buf;
933 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000934 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000935 strcpy(tok->buf + oldlen, newtok);
936 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000937 tok->inp = tok->buf + newlen;
938 tok->end = tok->inp + 1;
939 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000940 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000941 else {
942 tok->lineno++;
943 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000944 PyMem_FREE(tok->buf);
945 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000946 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000947 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000948 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000949 tok->inp = strchr(tok->buf, '\0');
950 tok->end = tok->inp + 1;
951 }
952 }
953 else {
954 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000955 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000956 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000957 if (tok->start == NULL) {
958 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000959 tok->buf = (char *)
960 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000961 if (tok->buf == NULL) {
962 tok->done = E_NOMEM;
963 return EOF;
964 }
965 tok->end = tok->buf + BUFSIZ;
966 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000967 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
968 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000969 tok->done = E_EOF;
970 done = 1;
971 }
972 else {
973 tok->done = E_OK;
974 tok->inp = strchr(tok->buf, '\0');
975 done = tok->inp[-1] == '\n';
976 }
977 }
978 else {
979 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000980 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000981 tok->done = E_EOF;
982 done = 1;
983 }
984 else
985 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000986 }
987 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000988 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000989 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000990 Py_ssize_t curstart = tok->start == NULL ? -1 :
991 tok->start - tok->buf;
992 Py_ssize_t curvalid = tok->inp - tok->buf;
993 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000994 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000995 newbuf = (char *)PyMem_REALLOC(newbuf,
996 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000997 if (newbuf == NULL) {
998 tok->done = E_NOMEM;
999 tok->cur = tok->inp;
1000 return EOF;
1001 }
1002 tok->buf = newbuf;
1003 tok->inp = tok->buf + curvalid;
1004 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001005 tok->start = curstart < 0 ? NULL :
1006 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001007 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001008 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001009 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +00001010 /* Break out early on decoding
1011 errors, as tok->buf will be NULL
1012 */
1013 if (tok->decoding_erred)
1014 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001015 /* Last line does not end in \n,
1016 fake one */
1017 strcpy(tok->inp, "\n");
1018 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001019 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001020 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001021 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001022 if (tok->buf != NULL) {
1023 tok->cur = tok->buf + cur;
1024 tok->line_start = tok->cur;
1025 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +00001026 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001027 pt = tok->inp - 2;
1028 if (pt >= tok->buf && *pt == '\r') {
1029 *pt++ = '\n';
1030 *pt = '\0';
1031 tok->inp = pt;
1032 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001033 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001034 }
1035 if (tok->done != E_OK) {
1036 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001037 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001038 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001039 return EOF;
1040 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001042 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001043}
1044
1045
1046/* Back-up one character */
1047
1048static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001049tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001050{
1051 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001052 if (--tok->cur < tok->buf)
Benjamin Petersona0dfa822009-11-13 02:25:08 +00001053 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001054 if (*tok->cur != c)
1055 *tok->cur = c;
1056 }
1057}
1058
1059
1060/* Return the token corresponding to a single character */
1061
1062int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001063PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001064{
1065 switch (c) {
1066 case '(': return LPAR;
1067 case ')': return RPAR;
1068 case '[': return LSQB;
1069 case ']': return RSQB;
1070 case ':': return COLON;
1071 case ',': return COMMA;
1072 case ';': return SEMI;
1073 case '+': return PLUS;
1074 case '-': return MINUS;
1075 case '*': return STAR;
1076 case '/': return SLASH;
1077 case '|': return VBAR;
1078 case '&': return AMPER;
1079 case '<': return LESS;
1080 case '>': return GREATER;
1081 case '=': return EQUAL;
1082 case '.': return DOT;
1083 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001084 case '{': return LBRACE;
1085 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001086 case '^': return CIRCUMFLEX;
1087 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001088 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001089 default: return OP;
1090 }
1091}
1092
1093
Guido van Rossumfbab9051991-10-20 20:25:03 +00001094int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001095PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001096{
1097 switch (c1) {
1098 case '=':
1099 switch (c2) {
1100 case '=': return EQEQUAL;
1101 }
1102 break;
1103 case '!':
1104 switch (c2) {
1105 case '=': return NOTEQUAL;
1106 }
1107 break;
1108 case '<':
1109 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001110 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001111 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001112 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001113 }
1114 break;
1115 case '>':
1116 switch (c2) {
1117 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001118 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001119 }
1120 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001121 case '+':
1122 switch (c2) {
1123 case '=': return PLUSEQUAL;
1124 }
1125 break;
1126 case '-':
1127 switch (c2) {
1128 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001129 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001130 }
1131 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001132 case '*':
1133 switch (c2) {
1134 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001135 case '=': return STAREQUAL;
1136 }
1137 break;
1138 case '/':
1139 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001140 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001141 case '=': return SLASHEQUAL;
1142 }
1143 break;
1144 case '|':
1145 switch (c2) {
1146 case '=': return VBAREQUAL;
1147 }
1148 break;
1149 case '%':
1150 switch (c2) {
1151 case '=': return PERCENTEQUAL;
1152 }
1153 break;
1154 case '&':
1155 switch (c2) {
1156 case '=': return AMPEREQUAL;
1157 }
1158 break;
1159 case '^':
1160 switch (c2) {
1161 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001162 }
1163 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001164 }
1165 return OP;
1166}
1167
Thomas Wouters434d0822000-08-24 20:11:32 +00001168int
1169PyToken_ThreeChars(int c1, int c2, int c3)
1170{
1171 switch (c1) {
1172 case '<':
1173 switch (c2) {
1174 case '<':
1175 switch (c3) {
1176 case '=':
1177 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001178 }
1179 break;
1180 }
1181 break;
1182 case '>':
1183 switch (c2) {
1184 case '>':
1185 switch (c3) {
1186 case '=':
1187 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001188 }
1189 break;
1190 }
1191 break;
1192 case '*':
1193 switch (c2) {
1194 case '*':
1195 switch (c3) {
1196 case '=':
1197 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001198 }
1199 break;
1200 }
1201 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001202 case '/':
1203 switch (c2) {
1204 case '/':
1205 switch (c3) {
1206 case '=':
1207 return DOUBLESLASHEQUAL;
1208 }
1209 break;
1210 }
1211 break;
Georg Brandldde00282007-03-18 19:01:53 +00001212 case '.':
1213 switch (c2) {
1214 case '.':
1215 switch (c3) {
1216 case '.':
1217 return ELLIPSIS;
1218 }
1219 break;
1220 }
1221 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001222 }
1223 return OP;
1224}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001225
Guido van Rossum926f13a1998-04-09 21:38:06 +00001226static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001227indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001228{
1229 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001230 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001231 tok->cur = tok->inp;
1232 return 1;
1233 }
1234 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001235 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1236 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001237 tok->altwarning = 0;
1238 }
1239 return 0;
1240}
1241
Martin v. Löwis47383402007-08-15 07:32:56 +00001242#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001243#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001244#else
1245/* Verify that the identifier follows PEP 3131. */
1246static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001247verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001248{
Guido van Rossume3e37012007-08-29 18:54:41 +00001249 PyObject *s;
1250 int result;
Victor Stinner52f6dd72010-03-12 14:45:56 +00001251 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Guido van Rossume3e37012007-08-29 18:54:41 +00001252 if (s == NULL) {
Victor Stinner52f6dd72010-03-12 14:45:56 +00001253 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1254 PyErr_Clear();
1255 tok->done = E_IDENTIFIER;
1256 } else {
1257 tok->done = E_ERROR;
1258 }
Guido van Rossume3e37012007-08-29 18:54:41 +00001259 return 0;
1260 }
1261 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001262 Py_DECREF(s);
Victor Stinner52f6dd72010-03-12 14:45:56 +00001263 if (result == 0)
1264 tok->done = E_IDENTIFIER;
Martin v. Löwis47383402007-08-15 07:32:56 +00001265 return result;
1266}
1267#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001268
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269/* Get next token, after space stripping etc. */
1270
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001271static int
1272tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273{
1274 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001275 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001276
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001277 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001278 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001279 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001280 blankline = 0;
1281
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 /* Get indentation level */
1283 if (tok->atbol) {
1284 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001285 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001286 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 for (;;) {
1288 c = tok_nextc(tok);
1289 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001290 col++, altcol++;
1291 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001292 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001293 altcol = (altcol/tok->alttabsize + 1)
1294 * tok->alttabsize;
1295 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001296 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001297 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 else
1299 break;
1300 }
1301 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001302 if (c == '#' || c == '\n') {
1303 /* Lines with only whitespace and/or comments
1304 shouldn't affect the indentation and are
1305 not passed to the parser as NEWLINE tokens,
1306 except *totally* empty lines in interactive
1307 mode, which signal the end of a command group. */
1308 if (col == 0 && c == '\n' && tok->prompt != NULL)
1309 blankline = 0; /* Let it through */
1310 else
1311 blankline = 1; /* Ignore completely */
1312 /* We can't jump back right here since we still
1313 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001314 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001315 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001316 if (col == tok->indstack[tok->indent]) {
1317 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001318 if (altcol != tok->altindstack[tok->indent]) {
1319 if (indenterror(tok))
1320 return ERRORTOKEN;
1321 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001322 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001323 else if (col > tok->indstack[tok->indent]) {
1324 /* Indent -- always one */
1325 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001326 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001327 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001328 return ERRORTOKEN;
1329 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001330 if (altcol <= tok->altindstack[tok->indent]) {
1331 if (indenterror(tok))
1332 return ERRORTOKEN;
1333 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001334 tok->pendin++;
1335 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001336 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001337 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001338 else /* col < tok->indstack[tok->indent] */ {
1339 /* Dedent -- any number, must be consistent */
1340 while (tok->indent > 0 &&
1341 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001342 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001343 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001344 }
1345 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001346 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001347 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001348 return ERRORTOKEN;
1349 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001350 if (altcol != tok->altindstack[tok->indent]) {
1351 if (indenterror(tok))
1352 return ERRORTOKEN;
1353 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001354 }
1355 }
1356 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001357
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001358 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001359
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360 /* Return pending indents/dedents */
1361 if (tok->pendin != 0) {
1362 if (tok->pendin < 0) {
1363 tok->pendin++;
1364 return DEDENT;
1365 }
1366 else {
1367 tok->pendin--;
1368 return INDENT;
1369 }
1370 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001371
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001372 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001373 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001374 /* Skip spaces */
1375 do {
1376 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001377 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001378
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001379 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001380 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001381
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001382 /* Skip comment */
1383 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001384 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001385 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001386
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001387 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001388 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001389 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001390 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001391
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001392 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001393 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001394 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001395 /* Process b"", r"" and br"" */
1396 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001397 c = tok_nextc(tok);
1398 if (c == '"' || c == '\'')
1399 goto letter_quote;
1400 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001401 if (c == 'r' || c == 'R') {
1402 c = tok_nextc(tok);
1403 if (c == '"' || c == '\'')
1404 goto letter_quote;
1405 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001406 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001407 if (c >= 128)
1408 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001409 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001410 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001411 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001412 if (nonascii &&
Victor Stinner52f6dd72010-03-12 14:45:56 +00001413 !verify_identifier(tok)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001414 tok->done = E_IDENTIFIER;
1415 return ERRORTOKEN;
1416 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001417 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 *p_end = tok->cur;
1419 return NAME;
1420 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001421
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 /* Newline */
1423 if (c == '\n') {
1424 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001425 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001426 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001427 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001428 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001429 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001430 return NEWLINE;
1431 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001432
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001433 /* Period or number starting with period? */
1434 if (c == '.') {
1435 c = tok_nextc(tok);
1436 if (isdigit(c)) {
1437 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001438 } else if (c == '.') {
1439 c = tok_nextc(tok);
1440 if (c == '.') {
1441 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001442 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001443 return ELLIPSIS;
1444 } else {
1445 tok_backup(tok, c);
1446 }
1447 tok_backup(tok, '.');
1448 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001449 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001450 }
Georg Brandldde00282007-03-18 19:01:53 +00001451 *p_start = tok->start;
1452 *p_end = tok->cur;
1453 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001454 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001455
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001456 /* Number */
1457 if (isdigit(c)) {
1458 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001459 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001460 c = tok_nextc(tok);
1461 if (c == '.')
1462 goto fraction;
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001463 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001464 goto imaginary;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001465 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001466
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001467 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001468 c = tok_nextc(tok);
1469 if (!isxdigit(c)) {
1470 tok->done = E_TOKEN;
1471 tok_backup(tok, c);
1472 return ERRORTOKEN;
1473 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001474 do {
1475 c = tok_nextc(tok);
1476 } while (isxdigit(c));
1477 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001478 else if (c == 'o' || c == 'O') {
1479 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001480 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001481 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001482 tok->done = E_TOKEN;
1483 tok_backup(tok, c);
1484 return ERRORTOKEN;
1485 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001486 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001487 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001488 } while ('0' <= c && c < '8');
1489 }
1490 else if (c == 'b' || c == 'B') {
1491 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001492 c = tok_nextc(tok);
1493 if (c != '0' && c != '1') {
1494 tok->done = E_TOKEN;
1495 tok_backup(tok, c);
1496 return ERRORTOKEN;
1497 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001498 do {
1499 c = tok_nextc(tok);
1500 } while (c == '0' || c == '1');
1501 }
1502 else {
1503 int nonzero = 0;
1504 /* maybe old-style octal; c is first char of it */
1505 /* in any case, allow '0' as a literal */
1506 while (c == '0')
1507 c = tok_nextc(tok);
1508 while (isdigit(c)) {
1509 nonzero = 1;
1510 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001511 }
1512 if (c == '.')
1513 goto fraction;
1514 else if (c == 'e' || c == 'E')
1515 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001516 else if (c == 'j' || c == 'J')
1517 goto imaginary;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001518 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001519 tok->done = E_TOKEN;
1520 tok_backup(tok, c);
1521 return ERRORTOKEN;
1522 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001523 }
1524 }
1525 else {
1526 /* Decimal */
1527 do {
1528 c = tok_nextc(tok);
1529 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001530 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001531 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001532 if (c == '.') {
1533 fraction:
1534 /* Fraction */
1535 do {
1536 c = tok_nextc(tok);
1537 } while (isdigit(c));
1538 }
1539 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001540 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001541 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001542 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001543 if (c == '+' || c == '-')
1544 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001545 if (!isdigit(c)) {
1546 tok->done = E_TOKEN;
1547 tok_backup(tok, c);
1548 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001549 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001550 do {
1551 c = tok_nextc(tok);
1552 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001553 }
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001554 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001555 /* Imaginary part */
1556 imaginary:
1557 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001558 }
1559 }
1560 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001561 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001562 *p_end = tok->cur;
1563 return NUMBER;
1564 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001565
1566 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001567 /* String */
1568 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001569 int quote = c;
1570 int quote_size = 1; /* 1 or 3 */
1571 int end_quote_size = 0;
1572
1573 /* Find the quote size and start of string */
1574 c = tok_nextc(tok);
1575 if (c == quote) {
1576 c = tok_nextc(tok);
1577 if (c == quote)
1578 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001579 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001580 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001581 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001582 if (c != quote)
1583 tok_backup(tok, c);
1584
1585 /* Get rest of string */
1586 while (end_quote_size != quote_size) {
1587 c = tok_nextc(tok);
1588 if (c == EOF) {
1589 if (quote_size == 3)
1590 tok->done = E_EOFS;
1591 else
1592 tok->done = E_EOLS;
1593 tok->cur = tok->inp;
1594 return ERRORTOKEN;
1595 }
1596 if (quote_size == 1 && c == '\n') {
1597 tok->done = E_EOLS;
1598 tok->cur = tok->inp;
1599 return ERRORTOKEN;
1600 }
1601 if (c == quote)
1602 end_quote_size += 1;
1603 else {
1604 end_quote_size = 0;
1605 if (c == '\\')
1606 c = tok_nextc(tok); /* skip escaped char */
1607 }
1608 }
1609
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001610 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001611 *p_end = tok->cur;
1612 return STRING;
1613 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001614
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001615 /* Line continuation */
1616 if (c == '\\') {
1617 c = tok_nextc(tok);
1618 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001619 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001620 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001621 return ERRORTOKEN;
1622 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001623 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001624 goto again; /* Read next line */
1625 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001626
Guido van Rossumfbab9051991-10-20 20:25:03 +00001627 /* Check for two-character token */
1628 {
1629 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001630 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001631 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001632 int c3 = tok_nextc(tok);
1633 int token3 = PyToken_ThreeChars(c, c2, c3);
1634 if (token3 != OP) {
1635 token = token3;
1636 } else {
1637 tok_backup(tok, c3);
1638 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001639 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001640 *p_end = tok->cur;
1641 return token;
1642 }
1643 tok_backup(tok, c2);
1644 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001645
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001646 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001647 switch (c) {
1648 case '(':
1649 case '[':
1650 case '{':
1651 tok->level++;
1652 break;
1653 case ')':
1654 case ']':
1655 case '}':
1656 tok->level--;
1657 break;
1658 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001659
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001660 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001661 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001662 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001663 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001664}
1665
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001666int
1667PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1668{
1669 int result = tok_get(tok, p_start, p_end);
1670 if (tok->decoding_erred) {
1671 result = ERRORTOKEN;
1672 tok->done = E_DECODE;
1673 }
1674 return result;
1675}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001676
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001677/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001678
1679 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001680 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001681 should be assumed to be PyUnicode_GetDefaultEncoding()).
1682
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001683 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1684 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001685*/
1686char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001687PyTokenizer_FindEncoding(int fd)
1688{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001689 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001690 FILE *fp;
1691 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001692
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001693 fd = dup(fd);
1694 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001695 return NULL;
1696 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001697 fp = fdopen(fd, "r");
1698 if (fp == NULL) {
1699 return NULL;
1700 }
1701 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1702 if (tok == NULL) {
1703 fclose(fp);
1704 return NULL;
1705 }
1706 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001707 PyTokenizer_Get(tok, &p_start, &p_end);
1708 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001709 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001710 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001711 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001712 if (encoding)
1713 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001714 }
1715 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001716 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001717}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001718
#ifdef Py_DEBUG

/* Debugging aid: print a token to stdout.

   The token's name is looked up in _PyParser_TokenNames using type as
   the index.  For NAME, NUMBER, STRING and OP tokens the characters
   from start up to (not including) end are appended in parentheses.
   No newline is written. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);

    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        /* These token kinds carry text worth showing. */
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif