blob: 50b0ff193d3c9cf577bc8e5f875b7cb73cbead58 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero when C could be the first character of an identifier: an ASCII
   letter, an underscore, or any value >= 128.
   The argument is fully parenthesized so that any expression may be passed,
   but it is still evaluated several times: do not pass an expression with
   side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero when C could appear inside an identifier: an ASCII letter or
   digit, an underscore, or any value >= 128.
   The argument is fully parenthesized so that any expression may be passed,
   but it is still evaluated several times: do not pass an expression with
   side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* Line reader, declared here for use by this file.  Its result is:
     - a malloc'ed string including the trailing \n;
     - an empty malloc'ed string for EOF;
     - NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names.
   The order of the entries is significant:
   this table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* markers, atoms and layout */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* remaining operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* later additions */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",

    /* trailing pseudo-entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000122 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000130 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000131 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000136 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000139 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000174static char *
175decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176{
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000177 return new_string(str, strlen(str));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   Only the first 12 characters of S are examined, lower-cased and with
   '_' replaced by '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   a known spelling (optionally followed by "-<suffix>") is recognized;
   otherwise returns S itself, so the caller can compare the result with
   S to find out whether a replacement was made. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behavior. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000227 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
246
247 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000248 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000249 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 t++;
251
252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000257 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 int set_readline(struct tok_state *, const char *))
274{
Tim Peters17db21f2002-09-03 15:39:58 +0000275 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000281 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000285 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000286 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
Victor Stinner6aa278e2010-03-03 00:18:49 +0000321 int ch1, ch2, ch3;
322 ch1 = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000323 tok->decoding_state = STATE_RAW;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000324 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000325 return 1;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000326 } else if (ch1 == 0xEF) {
327 ch2 = get_char(tok);
328 if (ch2 != 0xBB) {
329 unget_char(ch2, tok);
330 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000331 return 1;
332 }
Victor Stinner6aa278e2010-03-03 00:18:49 +0000333 ch3 = get_char(tok);
334 if (ch3 != 0xBF) {
335 unget_char(ch3, tok);
336 unget_char(ch2, tok);
337 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000338 return 1;
339 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340#if 0
341 /* Disable support for UTF-16 BOMs until a decision
342 is made whether this needs to be supported. */
Victor Stinner6aa278e2010-03-03 00:18:49 +0000343 } else if (ch1 == 0xFE) {
344 ch2 = get_char(tok);
345 if (ch2 != 0xFF) {
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
348 return 1;
349 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000350 if (!set_readline(tok, "utf-16-be"))
351 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000352 tok->decoding_state = STATE_NORMAL;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000353 } else if (ch1 == 0xFF) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFE) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000360 if (!set_readline(tok, "utf-16-le"))
361 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000362 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#endif
364 } else {
Victor Stinner6aa278e2010-03-03 00:18:49 +0000365 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000366 return 1;
367 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000368 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000371 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372 return 1;
373}
374
375/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000376 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 On entry, tok->decoding_buffer will be one of:
379 1) NULL: need to call tok->decoding_readline to get a new line
380 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
381 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000382 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000383 (in the s buffer) to copy entire contents of the line read
384 by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000387 reached): see tok_nextc and its calls to decoding_fgets.
388*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000389
390static char *
391fp_readl(char *s, int size, struct tok_state *tok)
392{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000393 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000394 const char *buf;
395 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000396
397 /* Ask for one less byte so we can terminate it */
398 assert(size > 0);
399 size--;
400
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000401 if (tok->decoding_buffer) {
402 bufobj = tok->decoding_buffer;
403 Py_INCREF(bufobj);
404 }
405 else
406 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000407 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000409 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000410 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000411 if (PyUnicode_CheckExact(bufobj))
412 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000413 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000414 if (buf == NULL) {
415 goto error;
416 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000417 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000418 else
419 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000420 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000421 if (buf == NULL) {
422 goto error;
423 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000424 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000425 }
426
427 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000428 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000429 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000430 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000431 buflen-size);
432 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000433 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000434 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000435 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000436 else
437 tok->decoding_buffer = NULL;
438
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000439 memcpy(s, buf, buflen);
440 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000441 if (buflen == 0) /* EOF */
442 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000443 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000444 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000445
446error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000447 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000448 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000449}
450
451/* Set the readline function for TOK to a StreamReader's
452 readline function. The StreamReader is named ENC.
453
454 This function is called from check_bom and check_coding_spec.
455
456 ENC is usually identical to the future value of tok->encoding,
457 except for the (currently unsupported) case of UTF-16.
458
459 Return 1 on success, 0 on failure. */
460
461static int
462fp_setreadl(struct tok_state *tok, const char* enc)
463{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000464 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000465
Christian Heimes819b8bf2008-01-03 23:05:47 +0000466 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000467 if (io == NULL)
468 goto cleanup;
469
Brett Cannon8a9583e2008-09-04 05:04:25 +0000470 if (tok->filename)
471 stream = PyObject_CallMethod(io, "open", "ssis",
472 tok->filename, "r", -1, enc);
473 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000474 stream = PyObject_CallMethod(io, "open", "isisOOO",
475 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000476 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000477 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000478
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000479 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000480 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000481 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000482
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000483 /* The file has been reopened; parsing will restart from
484 * the beginning of the file, we have to reset the line number.
485 * But this function has been called from inside tok_nextc() which
486 * will increment lineno before it returns. So we set it -1 so that
487 * the next call to tok_nextc() will start with tok->lineno == 0.
488 */
489 tok->lineno = -1;
490
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000491 cleanup:
492 Py_XDECREF(stream);
493 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000494 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000495}
496
497/* Fetch the next byte from TOK. */
498
499static int fp_getc(struct tok_state *tok) {
500 return getc(tok->fp);
501}
502
503/* Unfetch the last byte back into TOK. */
504
505static void fp_ungetc(int c, struct tok_state *tok) {
506 ungetc(c, tok->fp);
507}
508
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   S must point into a NUL-terminated buffer.
   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.
   Only the lead-byte class and the 10xxxxxx shape of the continuation
   bytes are checked; overlong forms and out-of-range code points are
   not rejected here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Walk the continuation bytes front to back: a NUL terminator is not
       a continuation byte, so a sequence cut short by the end of the
       string is rejected before any byte past the terminator is read. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
536
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537/* Read a line of input from TOK. Determine encoding
538 if necessary. */
539
540static char *
541decoding_fgets(char *s, int size, struct tok_state *tok)
542{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000543 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000544 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000545 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000546 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 /* We already have a codec associated with
548 this input. */
549 line = fp_readl(s, size, tok);
550 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000551 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000552 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000553 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000554 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000555 break;
556 } else {
557 /* We have not yet determined the encoding.
558 If an encoding is found, use the file-pointer
559 reader functions from now on. */
560 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
561 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000562 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000564 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
566 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
567 return error_ret(tok);
568 }
569 }
570#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000571 /* The default encoding is UTF-8, so make sure we don't have any
572 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000573 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000574 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000575 int length;
576 for (c = (unsigned char *)line; *c; c += length)
577 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000578 badchar = *c;
579 break;
580 }
581 }
582 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000583 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000584 /* Need to add 1 to the line number, since this line
585 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000586 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000587 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000588 "in file %.200s on line %i, "
589 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000590 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000591 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000592 PyErr_SetString(PyExc_SyntaxError, buf);
593 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000594 }
595#endif
596 return line;
597}
598
599static int
600decoding_feof(struct tok_state *tok)
601{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000602 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 return feof(tok->fp);
604 } else {
605 PyObject* buf = tok->decoding_buffer;
606 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000607 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 if (buf == NULL) {
609 error_ret(tok);
610 return 1;
611 } else {
612 tok->decoding_buffer = buf;
613 }
614 }
615 return PyObject_Length(buf) == 0;
616 }
617}
618
619/* Fetch a byte from TOK, using the string buffer. */
620
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000621static int
622buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000623 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000624}
625
626/* Unfetch a byte from TOK, using the string buffer. */
627
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000628static void
629buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000630 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000631 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000632}
633
634/* Set the readline function for TOK to ENC. For the string-based
635 tokenizer, this means to just record the encoding. */
636
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000637static int
638buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000639 tok->enc = enc;
640 return 1;
641}
642
643/* Return a UTF-8 encoding Python string object from the
644 C byte string STR, which is encoded with ENC. */
645
646static PyObject *
647translate_into_utf8(const char* str, const char* enc) {
648 PyObject *utf8;
649 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
650 if (buf == NULL)
651 return NULL;
652 utf8 = PyUnicode_AsUTF8String(buf);
653 Py_DECREF(buf);
654 return utf8;
655}
656
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000657
658static char *
659translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000660 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000661 char *buf, *current;
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000662 char c = '\0';
663 buf = PyMem_MALLOC(needed_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000664 if (buf == NULL) {
665 tok->done = E_NOMEM;
666 return NULL;
667 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000668 for (current = buf; *s; s++, current++) {
669 c = *s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000670 if (skip_next_lf) {
671 skip_next_lf = 0;
672 if (c == '\n') {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000673 c = *++s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000674 if (!c)
675 break;
676 }
677 }
678 if (c == '\r') {
679 skip_next_lf = 1;
680 c = '\n';
681 }
682 *current = c;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000683 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000684 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000685 there isn't one already. */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000686 if (exec_input && c != '\n') {
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000687 *current = '\n';
688 current++;
689 }
690 *current = '\0';
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000691 final_length = current - buf + 1;
692 if (final_length < needed_length && final_length)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000693 /* should never fail */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000694 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000695 return buf;
696}
697
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000698/* Decode a byte string STR for use as the buffer of TOK.
699 Look for encoding declarations inside STR, and record them
700 inside TOK. */
701
702static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000703decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000704{
705 PyObject* utf8 = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000706 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000707 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000708 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000709 int lineno = 0;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000710 tok->input = str = translate_newlines(input, single, tok);
711 if (str == NULL)
712 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000713 tok->enc = NULL;
714 tok->str = str;
715 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000717 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000718 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719 if (tok->enc != NULL) {
720 utf8 = translate_into_utf8(str, tok->enc);
721 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000722 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000723 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000724 }
725 for (s = str;; s++) {
726 if (*s == '\0') break;
727 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000728 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000729 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000730 lineno++;
731 if (lineno == 2) break;
732 }
733 }
734 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000735 /* need to check line 1 and 2 separately since check_coding_spec
736 assumes a single line as input */
737 if (newl[0]) {
738 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
739 return error_ret(tok);
740 if (tok->enc == NULL && newl[1]) {
741 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
742 tok, buf_setreadl))
743 return error_ret(tok);
744 }
745 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000746 if (tok->enc != NULL) {
747 assert(utf8 == NULL);
748 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson0289b152009-06-28 17:22:03 +0000749 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000750 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000751 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000752 }
753 assert(tok->decoding_buffer == NULL);
754 tok->decoding_buffer = utf8; /* CAUTION */
755 return str;
756}
757
758#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759
760/* Set up tokenizer for string */
761
762struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000763PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764{
765 struct tok_state *tok = tok_new();
766 if (tok == NULL)
767 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000768 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000769 if (str == NULL) {
770 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000771 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000772 }
773
Martin v. Löwis95292d62002-12-11 14:04:59 +0000774 /* XXX: constify members. */
775 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000776 return tok;
777}
778
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000779struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000780PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000781{
782 struct tok_state *tok = tok_new();
783 if (tok == NULL)
784 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000785#ifndef PGEN
786 tok->input = str = translate_newlines(str, exec_input, tok);
787#endif
788 if (str == NULL) {
789 PyTokenizer_Free(tok);
790 return NULL;
791 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000792 tok->decoding_state = STATE_RAW;
793 tok->read_coding_spec = 1;
794 tok->enc = NULL;
795 tok->str = str;
796 tok->encoding = (char *)PyMem_MALLOC(6);
797 if (!tok->encoding) {
798 PyTokenizer_Free(tok);
799 return NULL;
800 }
801 strcpy(tok->encoding, "utf-8");
802
803 /* XXX: constify members. */
804 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
805 return tok;
806}
807
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000808/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809
810struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000811PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000812{
813 struct tok_state *tok = tok_new();
814 if (tok == NULL)
815 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000816 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000817 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000818 return NULL;
819 }
820 tok->cur = tok->inp = tok->buf;
821 tok->end = tok->buf + BUFSIZ;
822 tok->fp = fp;
823 tok->prompt = ps1;
824 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000825 if (enc != NULL) {
826 /* Must copy encoding declaration since it
827 gets copied into the parse tree. */
828 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
829 if (!tok->encoding) {
830 PyTokenizer_Free(tok);
831 return NULL;
832 }
833 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000834 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000835 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000836 return tok;
837}
838
839
840/* Free a tok_state structure */
841
842void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000843PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000844{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000845 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000846 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000847#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000848 Py_XDECREF(tok->decoding_readline);
849 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000850#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000851 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000852 PyMem_FREE(tok->buf);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000853 if (tok->input)
854 PyMem_FREE((char *)tok->input);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000855 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000856}
857
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858/* Get next char, updating state; error code goes into tok->done */
859
860static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000861tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000862{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000863 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000864 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000865 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000866 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000867 if (tok->done != E_OK)
868 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000869 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000870 char *end = strchr(tok->inp, '\n');
871 if (end != NULL)
872 end++;
873 else {
874 end = strchr(tok->inp, '\0');
875 if (end == tok->inp) {
876 tok->done = E_EOF;
877 return EOF;
878 }
879 }
880 if (tok->start == NULL)
881 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000882 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000883 tok->lineno++;
884 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000885 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000886 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000887 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000888 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000889#ifndef PGEN
890 if (tok->encoding && newtok && *newtok) {
891 /* Recode to UTF-8 */
892 Py_ssize_t buflen;
893 const char* buf;
894 PyObject *u = translate_into_utf8(newtok, tok->encoding);
895 PyMem_FREE(newtok);
896 if (!u) {
897 tok->done = E_DECODE;
898 return EOF;
899 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000900 buflen = PyBytes_GET_SIZE(u);
901 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000902 if (!buf) {
903 Py_DECREF(u);
904 tok->done = E_DECODE;
905 return EOF;
906 }
907 newtok = PyMem_MALLOC(buflen+1);
908 strcpy(newtok, buf);
909 Py_DECREF(u);
910 }
911#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000912 if (tok->nextprompt != NULL)
913 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000914 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000915 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000916 else if (*newtok == '\0') {
917 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000918 tok->done = E_EOF;
919 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000920 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000921 size_t start = tok->start - tok->buf;
922 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000923 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000924 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000925 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000926 tok->lineno++;
927 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000928 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000929 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000930 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000931 tok->done = E_NOMEM;
932 return EOF;
933 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000934 tok->buf = buf;
935 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000936 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000937 strcpy(tok->buf + oldlen, newtok);
938 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000939 tok->inp = tok->buf + newlen;
940 tok->end = tok->inp + 1;
941 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000942 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000943 else {
944 tok->lineno++;
945 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000946 PyMem_FREE(tok->buf);
947 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000948 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000949 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000950 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000951 tok->inp = strchr(tok->buf, '\0');
952 tok->end = tok->inp + 1;
953 }
954 }
955 else {
956 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000957 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000958 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000959 if (tok->start == NULL) {
960 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000961 tok->buf = (char *)
962 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000963 if (tok->buf == NULL) {
964 tok->done = E_NOMEM;
965 return EOF;
966 }
967 tok->end = tok->buf + BUFSIZ;
968 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000969 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
970 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000971 tok->done = E_EOF;
972 done = 1;
973 }
974 else {
975 tok->done = E_OK;
976 tok->inp = strchr(tok->buf, '\0');
977 done = tok->inp[-1] == '\n';
978 }
979 }
980 else {
981 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000982 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000983 tok->done = E_EOF;
984 done = 1;
985 }
986 else
987 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000988 }
989 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000990 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000991 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000992 Py_ssize_t curstart = tok->start == NULL ? -1 :
993 tok->start - tok->buf;
994 Py_ssize_t curvalid = tok->inp - tok->buf;
995 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000996 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000997 newbuf = (char *)PyMem_REALLOC(newbuf,
998 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000999 if (newbuf == NULL) {
1000 tok->done = E_NOMEM;
1001 tok->cur = tok->inp;
1002 return EOF;
1003 }
1004 tok->buf = newbuf;
1005 tok->inp = tok->buf + curvalid;
1006 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001007 tok->start = curstart < 0 ? NULL :
1008 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001009 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001010 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001011 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +00001012 /* Break out early on decoding
1013 errors, as tok->buf will be NULL
1014 */
1015 if (tok->decoding_erred)
1016 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001017 /* Last line does not end in \n,
1018 fake one */
1019 strcpy(tok->inp, "\n");
1020 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001021 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001022 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001023 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001024 if (tok->buf != NULL) {
1025 tok->cur = tok->buf + cur;
1026 tok->line_start = tok->cur;
1027 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +00001028 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001029 pt = tok->inp - 2;
1030 if (pt >= tok->buf && *pt == '\r') {
1031 *pt++ = '\n';
1032 *pt = '\0';
1033 tok->inp = pt;
1034 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001035 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001036 }
1037 if (tok->done != E_OK) {
1038 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001039 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001040 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041 return EOF;
1042 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001043 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001044 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001045}
1046
1047
1048/* Back-up one character */
1049
1050static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001051tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001052{
1053 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001054 if (--tok->cur < tok->buf)
Benjamin Petersona0dfa822009-11-13 02:25:08 +00001055 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001056 if (*tok->cur != c)
1057 *tok->cur = c;
1058 }
1059}
1060
1061
1062/* Return the token corresponding to a single character */
1063
1064int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001065PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001066{
1067 switch (c) {
1068 case '(': return LPAR;
1069 case ')': return RPAR;
1070 case '[': return LSQB;
1071 case ']': return RSQB;
1072 case ':': return COLON;
1073 case ',': return COMMA;
1074 case ';': return SEMI;
1075 case '+': return PLUS;
1076 case '-': return MINUS;
1077 case '*': return STAR;
1078 case '/': return SLASH;
1079 case '|': return VBAR;
1080 case '&': return AMPER;
1081 case '<': return LESS;
1082 case '>': return GREATER;
1083 case '=': return EQUAL;
1084 case '.': return DOT;
1085 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001086 case '{': return LBRACE;
1087 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001088 case '^': return CIRCUMFLEX;
1089 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001090 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001091 default: return OP;
1092 }
1093}
1094
1095
Guido van Rossumfbab9051991-10-20 20:25:03 +00001096int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001097PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001098{
1099 switch (c1) {
1100 case '=':
1101 switch (c2) {
1102 case '=': return EQEQUAL;
1103 }
1104 break;
1105 case '!':
1106 switch (c2) {
1107 case '=': return NOTEQUAL;
1108 }
1109 break;
1110 case '<':
1111 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001112 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001113 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001114 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001115 }
1116 break;
1117 case '>':
1118 switch (c2) {
1119 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001120 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001121 }
1122 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001123 case '+':
1124 switch (c2) {
1125 case '=': return PLUSEQUAL;
1126 }
1127 break;
1128 case '-':
1129 switch (c2) {
1130 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001131 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001132 }
1133 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001134 case '*':
1135 switch (c2) {
1136 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001137 case '=': return STAREQUAL;
1138 }
1139 break;
1140 case '/':
1141 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001142 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001143 case '=': return SLASHEQUAL;
1144 }
1145 break;
1146 case '|':
1147 switch (c2) {
1148 case '=': return VBAREQUAL;
1149 }
1150 break;
1151 case '%':
1152 switch (c2) {
1153 case '=': return PERCENTEQUAL;
1154 }
1155 break;
1156 case '&':
1157 switch (c2) {
1158 case '=': return AMPEREQUAL;
1159 }
1160 break;
1161 case '^':
1162 switch (c2) {
1163 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001164 }
1165 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001166 }
1167 return OP;
1168}
1169
Thomas Wouters434d0822000-08-24 20:11:32 +00001170int
1171PyToken_ThreeChars(int c1, int c2, int c3)
1172{
1173 switch (c1) {
1174 case '<':
1175 switch (c2) {
1176 case '<':
1177 switch (c3) {
1178 case '=':
1179 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001180 }
1181 break;
1182 }
1183 break;
1184 case '>':
1185 switch (c2) {
1186 case '>':
1187 switch (c3) {
1188 case '=':
1189 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001190 }
1191 break;
1192 }
1193 break;
1194 case '*':
1195 switch (c2) {
1196 case '*':
1197 switch (c3) {
1198 case '=':
1199 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001200 }
1201 break;
1202 }
1203 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001204 case '/':
1205 switch (c2) {
1206 case '/':
1207 switch (c3) {
1208 case '=':
1209 return DOUBLESLASHEQUAL;
1210 }
1211 break;
1212 }
1213 break;
Georg Brandldde00282007-03-18 19:01:53 +00001214 case '.':
1215 switch (c2) {
1216 case '.':
1217 switch (c3) {
1218 case '.':
1219 return ELLIPSIS;
1220 }
1221 break;
1222 }
1223 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001224 }
1225 return OP;
1226}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001227
Guido van Rossum926f13a1998-04-09 21:38:06 +00001228static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001229indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001230{
1231 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001232 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001233 tok->cur = tok->inp;
1234 return 1;
1235 }
1236 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001237 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1238 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001239 tok->altwarning = 0;
1240 }
1241 return 0;
1242}
1243
Martin v. Löwis47383402007-08-15 07:32:56 +00001244#ifdef PGEN
1245#define verify_identifier(s,e) 1
1246#else
1247/* Verify that the identifier follows PEP 3131. */
1248static int
1249verify_identifier(char *start, char *end)
1250{
Guido van Rossume3e37012007-08-29 18:54:41 +00001251 PyObject *s;
1252 int result;
1253 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1254 if (s == NULL) {
1255 PyErr_Clear();
1256 return 0;
1257 }
1258 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001259 Py_DECREF(s);
1260 return result;
1261}
1262#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001263
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264/* Get next token, after space stripping etc. */
1265
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001266static int
1267tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001268{
1269 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001270 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001271
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001272 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001273 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001274 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001275 blankline = 0;
1276
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 /* Get indentation level */
1278 if (tok->atbol) {
1279 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001280 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 for (;;) {
1283 c = tok_nextc(tok);
1284 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001285 col++, altcol++;
1286 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001288 altcol = (altcol/tok->alttabsize + 1)
1289 * tok->alttabsize;
1290 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001291 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001292 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001293 else
1294 break;
1295 }
1296 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001297 if (c == '#' || c == '\n') {
1298 /* Lines with only whitespace and/or comments
1299 shouldn't affect the indentation and are
1300 not passed to the parser as NEWLINE tokens,
1301 except *totally* empty lines in interactive
1302 mode, which signal the end of a command group. */
1303 if (col == 0 && c == '\n' && tok->prompt != NULL)
1304 blankline = 0; /* Let it through */
1305 else
1306 blankline = 1; /* Ignore completely */
1307 /* We can't jump back right here since we still
1308 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001310 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001311 if (col == tok->indstack[tok->indent]) {
1312 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001313 if (altcol != tok->altindstack[tok->indent]) {
1314 if (indenterror(tok))
1315 return ERRORTOKEN;
1316 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001317 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001318 else if (col > tok->indstack[tok->indent]) {
1319 /* Indent -- always one */
1320 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001321 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001322 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001323 return ERRORTOKEN;
1324 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001325 if (altcol <= tok->altindstack[tok->indent]) {
1326 if (indenterror(tok))
1327 return ERRORTOKEN;
1328 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001329 tok->pendin++;
1330 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001331 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001332 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001333 else /* col < tok->indstack[tok->indent] */ {
1334 /* Dedent -- any number, must be consistent */
1335 while (tok->indent > 0 &&
1336 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001337 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001338 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001339 }
1340 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001341 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001342 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001343 return ERRORTOKEN;
1344 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001345 if (altcol != tok->altindstack[tok->indent]) {
1346 if (indenterror(tok))
1347 return ERRORTOKEN;
1348 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001349 }
1350 }
1351 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001352
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001353 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001354
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001355 /* Return pending indents/dedents */
1356 if (tok->pendin != 0) {
1357 if (tok->pendin < 0) {
1358 tok->pendin++;
1359 return DEDENT;
1360 }
1361 else {
1362 tok->pendin--;
1363 return INDENT;
1364 }
1365 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001366
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001368 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 /* Skip spaces */
1370 do {
1371 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001372 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001373
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001374 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001375 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001376
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001377 /* Skip comment */
1378 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001379 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001381
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001382 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001383 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001384 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001385 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001386
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001387 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001388 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001389 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001390 /* Process b"", r"" and br"" */
1391 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001392 c = tok_nextc(tok);
1393 if (c == '"' || c == '\'')
1394 goto letter_quote;
1395 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001396 if (c == 'r' || c == 'R') {
1397 c = tok_nextc(tok);
1398 if (c == '"' || c == '\'')
1399 goto letter_quote;
1400 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001401 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001402 if (c >= 128)
1403 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001405 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001406 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001407 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001408 !verify_identifier(tok->start, tok->cur)) {
1409 tok->done = E_IDENTIFIER;
1410 return ERRORTOKEN;
1411 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001412 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 *p_end = tok->cur;
1414 return NAME;
1415 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001416
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001417 /* Newline */
1418 if (c == '\n') {
1419 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001420 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001421 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001422 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001424 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001425 return NEWLINE;
1426 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001427
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001428 /* Period or number starting with period? */
1429 if (c == '.') {
1430 c = tok_nextc(tok);
1431 if (isdigit(c)) {
1432 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001433 } else if (c == '.') {
1434 c = tok_nextc(tok);
1435 if (c == '.') {
1436 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001437 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001438 return ELLIPSIS;
1439 } else {
1440 tok_backup(tok, c);
1441 }
1442 tok_backup(tok, '.');
1443 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001444 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001445 }
Georg Brandldde00282007-03-18 19:01:53 +00001446 *p_start = tok->start;
1447 *p_end = tok->cur;
1448 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001449 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001450
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001451 /* Number */
1452 if (isdigit(c)) {
1453 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001454 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 c = tok_nextc(tok);
1456 if (c == '.')
1457 goto fraction;
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001458 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001459 goto imaginary;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001460 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001461
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001462 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001463 c = tok_nextc(tok);
1464 if (!isxdigit(c)) {
1465 tok->done = E_TOKEN;
1466 tok_backup(tok, c);
1467 return ERRORTOKEN;
1468 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001469 do {
1470 c = tok_nextc(tok);
1471 } while (isxdigit(c));
1472 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001473 else if (c == 'o' || c == 'O') {
1474 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001475 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001476 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001477 tok->done = E_TOKEN;
1478 tok_backup(tok, c);
1479 return ERRORTOKEN;
1480 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001481 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001482 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001483 } while ('0' <= c && c < '8');
1484 }
1485 else if (c == 'b' || c == 'B') {
1486 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001487 c = tok_nextc(tok);
1488 if (c != '0' && c != '1') {
1489 tok->done = E_TOKEN;
1490 tok_backup(tok, c);
1491 return ERRORTOKEN;
1492 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001493 do {
1494 c = tok_nextc(tok);
1495 } while (c == '0' || c == '1');
1496 }
1497 else {
1498 int nonzero = 0;
1499 /* maybe old-style octal; c is first char of it */
1500 /* in any case, allow '0' as a literal */
1501 while (c == '0')
1502 c = tok_nextc(tok);
1503 while (isdigit(c)) {
1504 nonzero = 1;
1505 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001506 }
1507 if (c == '.')
1508 goto fraction;
1509 else if (c == 'e' || c == 'E')
1510 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001511 else if (c == 'j' || c == 'J')
1512 goto imaginary;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001513 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001514 tok->done = E_TOKEN;
1515 tok_backup(tok, c);
1516 return ERRORTOKEN;
1517 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001518 }
1519 }
1520 else {
1521 /* Decimal */
1522 do {
1523 c = tok_nextc(tok);
1524 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001525 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001526 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001527 if (c == '.') {
1528 fraction:
1529 /* Fraction */
1530 do {
1531 c = tok_nextc(tok);
1532 } while (isdigit(c));
1533 }
1534 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001535 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001536 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001537 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001538 if (c == '+' || c == '-')
1539 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001540 if (!isdigit(c)) {
1541 tok->done = E_TOKEN;
1542 tok_backup(tok, c);
1543 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001544 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001545 do {
1546 c = tok_nextc(tok);
1547 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001548 }
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001549 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001550 /* Imaginary part */
1551 imaginary:
1552 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001553 }
1554 }
1555 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001556 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001557 *p_end = tok->cur;
1558 return NUMBER;
1559 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001560
1561 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001562 /* String */
1563 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001564 int quote = c;
1565 int quote_size = 1; /* 1 or 3 */
1566 int end_quote_size = 0;
1567
1568 /* Find the quote size and start of string */
1569 c = tok_nextc(tok);
1570 if (c == quote) {
1571 c = tok_nextc(tok);
1572 if (c == quote)
1573 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001574 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001575 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001576 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001577 if (c != quote)
1578 tok_backup(tok, c);
1579
1580 /* Get rest of string */
1581 while (end_quote_size != quote_size) {
1582 c = tok_nextc(tok);
1583 if (c == EOF) {
1584 if (quote_size == 3)
1585 tok->done = E_EOFS;
1586 else
1587 tok->done = E_EOLS;
1588 tok->cur = tok->inp;
1589 return ERRORTOKEN;
1590 }
1591 if (quote_size == 1 && c == '\n') {
1592 tok->done = E_EOLS;
1593 tok->cur = tok->inp;
1594 return ERRORTOKEN;
1595 }
1596 if (c == quote)
1597 end_quote_size += 1;
1598 else {
1599 end_quote_size = 0;
1600 if (c == '\\')
1601 c = tok_nextc(tok); /* skip escaped char */
1602 }
1603 }
1604
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001605 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001606 *p_end = tok->cur;
1607 return STRING;
1608 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001609
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001610 /* Line continuation */
1611 if (c == '\\') {
1612 c = tok_nextc(tok);
1613 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001614 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001615 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001616 return ERRORTOKEN;
1617 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001618 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001619 goto again; /* Read next line */
1620 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001621
Guido van Rossumfbab9051991-10-20 20:25:03 +00001622 /* Check for two-character token */
1623 {
1624 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001625 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001626 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001627 int c3 = tok_nextc(tok);
1628 int token3 = PyToken_ThreeChars(c, c2, c3);
1629 if (token3 != OP) {
1630 token = token3;
1631 } else {
1632 tok_backup(tok, c3);
1633 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001634 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001635 *p_end = tok->cur;
1636 return token;
1637 }
1638 tok_backup(tok, c2);
1639 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001640
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001641 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001642 switch (c) {
1643 case '(':
1644 case '[':
1645 case '{':
1646 tok->level++;
1647 break;
1648 case ')':
1649 case ']':
1650 case '}':
1651 tok->level--;
1652 break;
1653 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001654
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001655 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001656 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001657 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001658 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001659}
1660
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001661int
1662PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1663{
1664 int result = tok_get(tok, p_start, p_end);
1665 if (tok->decoding_erred) {
1666 result = ERRORTOKEN;
1667 tok->done = E_DECODE;
1668 }
1669 return result;
1670}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001671
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001672/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001673
1674 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001675 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001676 should be assumed to be PyUnicode_GetDefaultEncoding()).
1677
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001678 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1679 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001680*/
1681char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001682PyTokenizer_FindEncoding(int fd)
1683{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001684 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001685 FILE *fp;
1686 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001687
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001688 fd = dup(fd);
1689 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001690 return NULL;
1691 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001692 fp = fdopen(fd, "r");
1693 if (fp == NULL) {
1694 return NULL;
1695 }
1696 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1697 if (tok == NULL) {
1698 fclose(fp);
1699 return NULL;
1700 }
1701 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001702 PyTokenizer_Get(tok, &p_start, &p_end);
1703 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001704 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001705 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001706 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001707 if (encoding)
1708 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001709 }
1710 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001711 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001712}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001713
Guido van Rossum408027e1996-12-30 16:17:54 +00001714#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001715
1716void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001717tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001718{
Guido van Rossum86bea461997-04-29 21:03:06 +00001719 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001720 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1721 printf("(%.*s)", (int)(end - start), start);
1722}
1723
1724#endif