blob: 5a1c268378cdfeb99261d1c14a726e16a6d5cdf6 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Character classification used while scanning identifiers.

   Both macros accept any integer-valued expression.  ASCII letters and
   '_' may start an identifier; digits are additionally allowed after the
   first character.  Every value >= 128 is accepted by both macros.

   The argument is parenthesized at each use so that expression arguments
   (e.g. a conditional expression) are classified correctly.  It is still
   evaluated several times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* PyOS_Readline returns a malloc'ed string including the trailing \n,
   an empty malloc'ed string for EOF, or NULL if interrupted. */
extern char *PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, char *prompt);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined later in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* 0 - 6 */
    "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE", "INDENT", "DEDENT",
    /* 7 - 13 */
    "LPAR", "RPAR", "LSQB", "RSQB", "COLON", "COMMA", "SEMI",
    /* 14 - 26 */
    "PLUS", "MINUS", "STAR", "SLASH", "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "LBRACE", "RBRACE",
    /* 27 - 35 */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* 36 - 46 */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
    "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47 - 51 */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT", "RARROW", "ELLIPSIS",
    /* 52 - 54 */
    "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000122 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000130 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000131 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000136 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000139 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000174static char *
175decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176{
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000177 return new_string(str, strlen(str));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the static string "utf-8" or
   "iso-8859-1" when S names (or starts with "<name>-" for) one of those
   encodings, otherwise returns S itself.  The result is therefore either
   S or a string literal; it is never newly allocated. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative plain char to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        buf[i] = (c == '_') ? '-' : (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000227 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
246
247 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000248 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000249 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 t++;
251
252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000257 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 int set_readline(struct tok_state *, const char *))
274{
Tim Peters17db21f2002-09-03 15:39:58 +0000275 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000281 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000285 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000286 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
321 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000322 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 if (ch == EOF) {
324 return 1;
325 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000326 ch = get_char(tok);
327 if (ch != 0xBB) {
328 unget_char(ch, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
333 ch = get_char(tok);
334 if (ch != 0xBF) {
335 unget_char(ch, tok);
336 unget_char(0xBB, tok);
337 unget_char(0xEF, tok);
338 /* any token beginning with '\xEF' is a bad token */
339 return 1;
340 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000341#if 0
342 /* Disable support for UTF-16 BOMs until a decision
343 is made whether this needs to be supported. */
344 } else if (ch == 0xFE) {
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000345 ch = get_char(tok);
346 if (ch != 0xFF)
347 goto NON_BOM;
348 if (!set_readline(tok, "utf-16-be"))
349 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000350 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351 } else if (ch == 0xFF) {
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000352 ch = get_char(tok);
353 if (ch != 0xFE)
354 goto NON_BOM;
355 if (!set_readline(tok, "utf-16-le"))
356 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000357 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000358#endif
359 } else {
360 unget_char(ch, tok);
361 return 1;
362 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000363 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000366 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000367 return 1;
368}
369
370/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000371 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000373 On entry, tok->decoding_buffer will be one of:
374 1) NULL: need to call tok->decoding_readline to get a new line
375 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
376 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000377 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 (in the s buffer) to copy entire contents of the line read
379 by tok->decoding_readline. tok->decoding_buffer has the overflow.
380 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000382 reached): see tok_nextc and its calls to decoding_fgets.
383*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384
385static char *
386fp_readl(char *s, int size, struct tok_state *tok)
387{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000388 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000389 const char *buf;
390 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000391
392 /* Ask for one less byte so we can terminate it */
393 assert(size > 0);
394 size--;
395
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000396 if (tok->decoding_buffer) {
397 bufobj = tok->decoding_buffer;
398 Py_INCREF(bufobj);
399 }
400 else
401 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000402 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
403 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000404 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000405 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000406 if (PyUnicode_CheckExact(bufobj))
407 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000408 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000409 if (buf == NULL) {
410 goto error;
411 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000412 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000413 else
414 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000415 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000416 if (buf == NULL) {
417 goto error;
418 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000419 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000420 }
421
422 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000423 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000424 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000425 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000426 buflen-size);
427 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000428 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000429 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000430 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000431 else
432 tok->decoding_buffer = NULL;
433
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000434 memcpy(s, buf, buflen);
435 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000436 if (buflen == 0) /* EOF */
437 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000439 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000440
441error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000442 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000443 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444}
445
446/* Set the readline function for TOK to a StreamReader's
447 readline function. The StreamReader is named ENC.
448
449 This function is called from check_bom and check_coding_spec.
450
451 ENC is usually identical to the future value of tok->encoding,
452 except for the (currently unsupported) case of UTF-16.
453
454 Return 1 on success, 0 on failure. */
455
456static int
457fp_setreadl(struct tok_state *tok, const char* enc)
458{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000459 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460
Christian Heimes819b8bf2008-01-03 23:05:47 +0000461 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000462 if (io == NULL)
463 goto cleanup;
464
Brett Cannon8a9583e2008-09-04 05:04:25 +0000465 if (tok->filename)
466 stream = PyObject_CallMethod(io, "open", "ssis",
467 tok->filename, "r", -1, enc);
468 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000469 stream = PyObject_CallMethod(io, "open", "isisOOO",
470 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000471 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000472 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000473
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000474 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000475 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000477
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000478 /* The file has been reopened; parsing will restart from
479 * the beginning of the file, we have to reset the line number.
480 * But this function has been called from inside tok_nextc() which
481 * will increment lineno before it returns. So we set it -1 so that
482 * the next call to tok_nextc() will start with tok->lineno == 0.
483 */
484 tok->lineno = -1;
485
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000486 cleanup:
487 Py_XDECREF(stream);
488 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000489 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000490}
491
492/* Fetch the next byte from TOK. */
493
494static int fp_getc(struct tok_state *tok) {
495 return getc(tok->fp);
496}
497
498/* Unfetch the last byte back into TOK. */
499
500static void fp_ungetc(int c, struct tok_state *tok) {
501 ungetc(c, tok->fp);
502}
503
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence if yes,
   0 if not.

   Only the lead byte and the 10xxxxxx pattern of the continuation bytes
   are checked; overlong forms and code point ranges are not validated.

   S must be NUL-terminated.  Continuation bytes are examined front to
   back so that a sequence truncated by the terminator is rejected at the
   NUL, without reading past the end of the string. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;

    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
531
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532/* Read a line of input from TOK. Determine encoding
533 if necessary. */
534
535static char *
536decoding_fgets(char *s, int size, struct tok_state *tok)
537{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000538 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000539 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000540 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000541 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542 /* We already have a codec associated with
543 this input. */
544 line = fp_readl(s, size, tok);
545 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000546 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000548 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000550 break;
551 } else {
552 /* We have not yet determined the encoding.
553 If an encoding is found, use the file-pointer
554 reader functions from now on. */
555 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
556 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000557 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000559 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
561 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
562 return error_ret(tok);
563 }
564 }
565#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000566 /* The default encoding is UTF-8, so make sure we don't have any
567 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000568 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000569 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000570 int length;
571 for (c = (unsigned char *)line; *c; c += length)
572 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573 badchar = *c;
574 break;
575 }
576 }
577 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000578 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000579 /* Need to add 1 to the line number, since this line
580 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000581 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000582 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000583 "in file %.200s on line %i, "
584 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000585 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000586 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000587 PyErr_SetString(PyExc_SyntaxError, buf);
588 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589 }
590#endif
591 return line;
592}
593
594static int
595decoding_feof(struct tok_state *tok)
596{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000597 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598 return feof(tok->fp);
599 } else {
600 PyObject* buf = tok->decoding_buffer;
601 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000602 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 if (buf == NULL) {
604 error_ret(tok);
605 return 1;
606 } else {
607 tok->decoding_buffer = buf;
608 }
609 }
610 return PyObject_Length(buf) == 0;
611 }
612}
613
614/* Fetch a byte from TOK, using the string buffer. */
615
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000616static int
617buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000618 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619}
620
621/* Unfetch a byte from TOK, using the string buffer. */
622
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000623static void
624buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000626 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627}
628
629/* Set the readline function for TOK to ENC. For the string-based
630 tokenizer, this means to just record the encoding. */
631
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000632static int
633buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000634 tok->enc = enc;
635 return 1;
636}
637
638/* Return a UTF-8 encoding Python string object from the
639 C byte string STR, which is encoded with ENC. */
640
641static PyObject *
642translate_into_utf8(const char* str, const char* enc) {
643 PyObject *utf8;
644 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
645 if (buf == NULL)
646 return NULL;
647 utf8 = PyUnicode_AsUTF8String(buf);
648 Py_DECREF(buf);
649 return utf8;
650}
651
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000652
653static char *
654translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000655 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000656 char *buf, *current;
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000657 char c = '\0';
658 buf = PyMem_MALLOC(needed_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000659 if (buf == NULL) {
660 tok->done = E_NOMEM;
661 return NULL;
662 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000663 for (current = buf; *s; s++, current++) {
664 c = *s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000665 if (skip_next_lf) {
666 skip_next_lf = 0;
667 if (c == '\n') {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000668 c = *++s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000669 if (!c)
670 break;
671 }
672 }
673 if (c == '\r') {
674 skip_next_lf = 1;
675 c = '\n';
676 }
677 *current = c;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000678 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000679 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000680 there isn't one already. */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000681 if (exec_input && c != '\n') {
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000682 *current = '\n';
683 current++;
684 }
685 *current = '\0';
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000686 final_length = current - buf + 1;
687 if (final_length < needed_length && final_length)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000688 /* should never fail */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000689 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000690 return buf;
691}
692
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000693/* Decode a byte string STR for use as the buffer of TOK.
694 Look for encoding declarations inside STR, and record them
695 inside TOK. */
696
697static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000698decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699{
700 PyObject* utf8 = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000701 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000702 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000703 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000704 int lineno = 0;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000705 tok->input = str = translate_newlines(input, single, tok);
706 if (str == NULL)
707 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000708 tok->enc = NULL;
709 tok->str = str;
710 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000711 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000712 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000713 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000714 if (tok->enc != NULL) {
715 utf8 = translate_into_utf8(str, tok->enc);
716 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000717 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000718 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719 }
720 for (s = str;; s++) {
721 if (*s == '\0') break;
722 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000723 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000724 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000725 lineno++;
726 if (lineno == 2) break;
727 }
728 }
729 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000730 /* need to check line 1 and 2 separately since check_coding_spec
731 assumes a single line as input */
732 if (newl[0]) {
733 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
734 return error_ret(tok);
735 if (tok->enc == NULL && newl[1]) {
736 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
737 tok, buf_setreadl))
738 return error_ret(tok);
739 }
740 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000741 if (tok->enc != NULL) {
742 assert(utf8 == NULL);
743 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson0289b152009-06-28 17:22:03 +0000744 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000745 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000746 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000747 }
748 assert(tok->decoding_buffer == NULL);
749 tok->decoding_buffer = utf8; /* CAUTION */
750 return str;
751}
752
753#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754
755/* Set up tokenizer for string */
756
757struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000758PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759{
760 struct tok_state *tok = tok_new();
761 if (tok == NULL)
762 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000763 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000764 if (str == NULL) {
765 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000766 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000767 }
768
Martin v. Löwis95292d62002-12-11 14:04:59 +0000769 /* XXX: constify members. */
770 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000771 return tok;
772}
773
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000774struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000775PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000776{
777 struct tok_state *tok = tok_new();
778 if (tok == NULL)
779 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000780#ifndef PGEN
781 tok->input = str = translate_newlines(str, exec_input, tok);
782#endif
783 if (str == NULL) {
784 PyTokenizer_Free(tok);
785 return NULL;
786 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000787 tok->decoding_state = STATE_RAW;
788 tok->read_coding_spec = 1;
789 tok->enc = NULL;
790 tok->str = str;
791 tok->encoding = (char *)PyMem_MALLOC(6);
792 if (!tok->encoding) {
793 PyTokenizer_Free(tok);
794 return NULL;
795 }
796 strcpy(tok->encoding, "utf-8");
797
798 /* XXX: constify members. */
799 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
800 return tok;
801}
802
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000803/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000804
805struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000806PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000807{
808 struct tok_state *tok = tok_new();
809 if (tok == NULL)
810 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000811 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000812 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813 return NULL;
814 }
815 tok->cur = tok->inp = tok->buf;
816 tok->end = tok->buf + BUFSIZ;
817 tok->fp = fp;
818 tok->prompt = ps1;
819 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000820 if (enc != NULL) {
821 /* Must copy encoding declaration since it
822 gets copied into the parse tree. */
823 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
824 if (!tok->encoding) {
825 PyTokenizer_Free(tok);
826 return NULL;
827 }
828 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000829 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000830 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000831 return tok;
832}
833
834
835/* Free a tok_state structure */
836
837void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000838PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000839{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000840 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000841 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000842#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000843 Py_XDECREF(tok->decoding_readline);
844 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000845#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000847 PyMem_FREE(tok->buf);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000848 if (tok->input)
849 PyMem_FREE((char *)tok->input);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000850 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000851}
852
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000853/* Get next char, updating state; error code goes into tok->done */
854
855static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000856tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000857{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000859 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000860 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000861 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000862 if (tok->done != E_OK)
863 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000864 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000865 char *end = strchr(tok->inp, '\n');
866 if (end != NULL)
867 end++;
868 else {
869 end = strchr(tok->inp, '\0');
870 if (end == tok->inp) {
871 tok->done = E_EOF;
872 return EOF;
873 }
874 }
875 if (tok->start == NULL)
876 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000877 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000878 tok->lineno++;
879 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000880 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000881 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000882 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000883 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000884#ifndef PGEN
885 if (tok->encoding && newtok && *newtok) {
886 /* Recode to UTF-8 */
887 Py_ssize_t buflen;
888 const char* buf;
889 PyObject *u = translate_into_utf8(newtok, tok->encoding);
890 PyMem_FREE(newtok);
891 if (!u) {
892 tok->done = E_DECODE;
893 return EOF;
894 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000895 buflen = PyBytes_GET_SIZE(u);
896 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000897 if (!buf) {
898 Py_DECREF(u);
899 tok->done = E_DECODE;
900 return EOF;
901 }
902 newtok = PyMem_MALLOC(buflen+1);
903 strcpy(newtok, buf);
904 Py_DECREF(u);
905 }
906#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000907 if (tok->nextprompt != NULL)
908 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000909 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000910 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000911 else if (*newtok == '\0') {
912 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000913 tok->done = E_EOF;
914 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000915 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000916 size_t start = tok->start - tok->buf;
917 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000918 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000919 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000920 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000921 tok->lineno++;
922 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000923 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000924 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000925 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000926 tok->done = E_NOMEM;
927 return EOF;
928 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000929 tok->buf = buf;
930 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000931 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000932 strcpy(tok->buf + oldlen, newtok);
933 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000934 tok->inp = tok->buf + newlen;
935 tok->end = tok->inp + 1;
936 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000937 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000938 else {
939 tok->lineno++;
940 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000941 PyMem_FREE(tok->buf);
942 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000943 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000944 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000945 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000946 tok->inp = strchr(tok->buf, '\0');
947 tok->end = tok->inp + 1;
948 }
949 }
950 else {
951 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000953 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000954 if (tok->start == NULL) {
955 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000956 tok->buf = (char *)
957 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000958 if (tok->buf == NULL) {
959 tok->done = E_NOMEM;
960 return EOF;
961 }
962 tok->end = tok->buf + BUFSIZ;
963 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000964 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
965 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000966 tok->done = E_EOF;
967 done = 1;
968 }
969 else {
970 tok->done = E_OK;
971 tok->inp = strchr(tok->buf, '\0');
972 done = tok->inp[-1] == '\n';
973 }
974 }
975 else {
976 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000977 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000978 tok->done = E_EOF;
979 done = 1;
980 }
981 else
982 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000983 }
984 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000985 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000986 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000987 Py_ssize_t curstart = tok->start == NULL ? -1 :
988 tok->start - tok->buf;
989 Py_ssize_t curvalid = tok->inp - tok->buf;
990 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000991 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000992 newbuf = (char *)PyMem_REALLOC(newbuf,
993 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000994 if (newbuf == NULL) {
995 tok->done = E_NOMEM;
996 tok->cur = tok->inp;
997 return EOF;
998 }
999 tok->buf = newbuf;
1000 tok->inp = tok->buf + curvalid;
1001 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001002 tok->start = curstart < 0 ? NULL :
1003 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001004 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001005 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001006 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +00001007 /* Break out early on decoding
1008 errors, as tok->buf will be NULL
1009 */
1010 if (tok->decoding_erred)
1011 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001012 /* Last line does not end in \n,
1013 fake one */
1014 strcpy(tok->inp, "\n");
1015 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001016 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001017 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001018 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001019 if (tok->buf != NULL) {
1020 tok->cur = tok->buf + cur;
1021 tok->line_start = tok->cur;
1022 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +00001023 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001024 pt = tok->inp - 2;
1025 if (pt >= tok->buf && *pt == '\r') {
1026 *pt++ = '\n';
1027 *pt = '\0';
1028 tok->inp = pt;
1029 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001030 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001031 }
1032 if (tok->done != E_OK) {
1033 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001034 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001035 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001036 return EOF;
1037 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001038 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001039 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001040}
1041
1042
1043/* Back-up one character */
1044
1045static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001046tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001047{
1048 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001049 if (--tok->cur < tok->buf)
Benjamin Petersona0dfa822009-11-13 02:25:08 +00001050 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001051 if (*tok->cur != c)
1052 *tok->cur = c;
1053 }
1054}
1055
1056
1057/* Return the token corresponding to a single character */
1058
1059int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001060PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001061{
1062 switch (c) {
1063 case '(': return LPAR;
1064 case ')': return RPAR;
1065 case '[': return LSQB;
1066 case ']': return RSQB;
1067 case ':': return COLON;
1068 case ',': return COMMA;
1069 case ';': return SEMI;
1070 case '+': return PLUS;
1071 case '-': return MINUS;
1072 case '*': return STAR;
1073 case '/': return SLASH;
1074 case '|': return VBAR;
1075 case '&': return AMPER;
1076 case '<': return LESS;
1077 case '>': return GREATER;
1078 case '=': return EQUAL;
1079 case '.': return DOT;
1080 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001081 case '{': return LBRACE;
1082 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001083 case '^': return CIRCUMFLEX;
1084 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001085 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001086 default: return OP;
1087 }
1088}
1089
1090
Guido van Rossumfbab9051991-10-20 20:25:03 +00001091int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001092PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001093{
1094 switch (c1) {
1095 case '=':
1096 switch (c2) {
1097 case '=': return EQEQUAL;
1098 }
1099 break;
1100 case '!':
1101 switch (c2) {
1102 case '=': return NOTEQUAL;
1103 }
1104 break;
1105 case '<':
1106 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001107 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001108 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001109 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001110 }
1111 break;
1112 case '>':
1113 switch (c2) {
1114 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001115 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001116 }
1117 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001118 case '+':
1119 switch (c2) {
1120 case '=': return PLUSEQUAL;
1121 }
1122 break;
1123 case '-':
1124 switch (c2) {
1125 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001126 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001127 }
1128 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001129 case '*':
1130 switch (c2) {
1131 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001132 case '=': return STAREQUAL;
1133 }
1134 break;
1135 case '/':
1136 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001137 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001138 case '=': return SLASHEQUAL;
1139 }
1140 break;
1141 case '|':
1142 switch (c2) {
1143 case '=': return VBAREQUAL;
1144 }
1145 break;
1146 case '%':
1147 switch (c2) {
1148 case '=': return PERCENTEQUAL;
1149 }
1150 break;
1151 case '&':
1152 switch (c2) {
1153 case '=': return AMPEREQUAL;
1154 }
1155 break;
1156 case '^':
1157 switch (c2) {
1158 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001159 }
1160 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001161 }
1162 return OP;
1163}
1164
Thomas Wouters434d0822000-08-24 20:11:32 +00001165int
1166PyToken_ThreeChars(int c1, int c2, int c3)
1167{
1168 switch (c1) {
1169 case '<':
1170 switch (c2) {
1171 case '<':
1172 switch (c3) {
1173 case '=':
1174 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001175 }
1176 break;
1177 }
1178 break;
1179 case '>':
1180 switch (c2) {
1181 case '>':
1182 switch (c3) {
1183 case '=':
1184 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001185 }
1186 break;
1187 }
1188 break;
1189 case '*':
1190 switch (c2) {
1191 case '*':
1192 switch (c3) {
1193 case '=':
1194 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001195 }
1196 break;
1197 }
1198 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001199 case '/':
1200 switch (c2) {
1201 case '/':
1202 switch (c3) {
1203 case '=':
1204 return DOUBLESLASHEQUAL;
1205 }
1206 break;
1207 }
1208 break;
Georg Brandldde00282007-03-18 19:01:53 +00001209 case '.':
1210 switch (c2) {
1211 case '.':
1212 switch (c3) {
1213 case '.':
1214 return ELLIPSIS;
1215 }
1216 break;
1217 }
1218 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001219 }
1220 return OP;
1221}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001222
Guido van Rossum926f13a1998-04-09 21:38:06 +00001223static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001224indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001225{
1226 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001227 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001228 tok->cur = tok->inp;
1229 return 1;
1230 }
1231 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001232 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1233 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001234 tok->altwarning = 0;
1235 }
1236 return 0;
1237}
1238
Martin v. Löwis47383402007-08-15 07:32:56 +00001239#ifdef PGEN
1240#define verify_identifier(s,e) 1
1241#else
1242/* Verify that the identifier follows PEP 3131. */
1243static int
1244verify_identifier(char *start, char *end)
1245{
Guido van Rossume3e37012007-08-29 18:54:41 +00001246 PyObject *s;
1247 int result;
1248 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1249 if (s == NULL) {
1250 PyErr_Clear();
1251 return 0;
1252 }
1253 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001254 Py_DECREF(s);
1255 return result;
1256}
1257#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001258
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001259/* Get next token, after space stripping etc. */
1260
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001261static int
1262tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001263{
1264 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001265 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001266
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001267 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001268 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001269 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001270 blankline = 0;
1271
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001272 /* Get indentation level */
1273 if (tok->atbol) {
1274 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001275 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001276 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 for (;;) {
1278 c = tok_nextc(tok);
1279 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001280 col++, altcol++;
1281 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001283 altcol = (altcol/tok->alttabsize + 1)
1284 * tok->alttabsize;
1285 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001286 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001287 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001288 else
1289 break;
1290 }
1291 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001292 if (c == '#' || c == '\n') {
1293 /* Lines with only whitespace and/or comments
1294 shouldn't affect the indentation and are
1295 not passed to the parser as NEWLINE tokens,
1296 except *totally* empty lines in interactive
1297 mode, which signal the end of a command group. */
1298 if (col == 0 && c == '\n' && tok->prompt != NULL)
1299 blankline = 0; /* Let it through */
1300 else
1301 blankline = 1; /* Ignore completely */
1302 /* We can't jump back right here since we still
1303 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001304 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001305 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001306 if (col == tok->indstack[tok->indent]) {
1307 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001308 if (altcol != tok->altindstack[tok->indent]) {
1309 if (indenterror(tok))
1310 return ERRORTOKEN;
1311 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001312 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001313 else if (col > tok->indstack[tok->indent]) {
1314 /* Indent -- always one */
1315 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001316 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001317 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001318 return ERRORTOKEN;
1319 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001320 if (altcol <= tok->altindstack[tok->indent]) {
1321 if (indenterror(tok))
1322 return ERRORTOKEN;
1323 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001324 tok->pendin++;
1325 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001326 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001327 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001328 else /* col < tok->indstack[tok->indent] */ {
1329 /* Dedent -- any number, must be consistent */
1330 while (tok->indent > 0 &&
1331 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001332 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001333 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001334 }
1335 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001336 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001337 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001338 return ERRORTOKEN;
1339 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001340 if (altcol != tok->altindstack[tok->indent]) {
1341 if (indenterror(tok))
1342 return ERRORTOKEN;
1343 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001344 }
1345 }
1346 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001347
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001348 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001349
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001350 /* Return pending indents/dedents */
1351 if (tok->pendin != 0) {
1352 if (tok->pendin < 0) {
1353 tok->pendin++;
1354 return DEDENT;
1355 }
1356 else {
1357 tok->pendin--;
1358 return INDENT;
1359 }
1360 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001361
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001363 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 /* Skip spaces */
1365 do {
1366 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001367 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001368
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001369 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001370 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001371
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001372 /* Skip comment */
1373 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001374 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001375 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001376
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001377 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001378 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001379 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001380 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001381
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001382 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001383 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001384 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001385 /* Process b"", r"" and br"" */
1386 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001387 c = tok_nextc(tok);
1388 if (c == '"' || c == '\'')
1389 goto letter_quote;
1390 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001391 if (c == 'r' || c == 'R') {
1392 c = tok_nextc(tok);
1393 if (c == '"' || c == '\'')
1394 goto letter_quote;
1395 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001396 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001397 if (c >= 128)
1398 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001399 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001400 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001402 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001403 !verify_identifier(tok->start, tok->cur)) {
1404 tok->done = E_IDENTIFIER;
1405 return ERRORTOKEN;
1406 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001407 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001408 *p_end = tok->cur;
1409 return NAME;
1410 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001411
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001412 /* Newline */
1413 if (c == '\n') {
1414 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001415 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001416 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001417 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001419 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001420 return NEWLINE;
1421 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001422
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001423 /* Period or number starting with period? */
1424 if (c == '.') {
1425 c = tok_nextc(tok);
1426 if (isdigit(c)) {
1427 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001428 } else if (c == '.') {
1429 c = tok_nextc(tok);
1430 if (c == '.') {
1431 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001432 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001433 return ELLIPSIS;
1434 } else {
1435 tok_backup(tok, c);
1436 }
1437 tok_backup(tok, '.');
1438 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001439 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001440 }
Georg Brandldde00282007-03-18 19:01:53 +00001441 *p_start = tok->start;
1442 *p_end = tok->cur;
1443 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001444 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001445
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001446 /* Number */
1447 if (isdigit(c)) {
1448 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001449 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001450 c = tok_nextc(tok);
1451 if (c == '.')
1452 goto fraction;
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001453 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001454 goto imaginary;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001456
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001457 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001458 c = tok_nextc(tok);
1459 if (!isxdigit(c)) {
1460 tok->done = E_TOKEN;
1461 tok_backup(tok, c);
1462 return ERRORTOKEN;
1463 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001464 do {
1465 c = tok_nextc(tok);
1466 } while (isxdigit(c));
1467 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001468 else if (c == 'o' || c == 'O') {
1469 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001470 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001471 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001472 tok->done = E_TOKEN;
1473 tok_backup(tok, c);
1474 return ERRORTOKEN;
1475 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001476 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001477 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001478 } while ('0' <= c && c < '8');
1479 }
1480 else if (c == 'b' || c == 'B') {
1481 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001482 c = tok_nextc(tok);
1483 if (c != '0' && c != '1') {
1484 tok->done = E_TOKEN;
1485 tok_backup(tok, c);
1486 return ERRORTOKEN;
1487 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001488 do {
1489 c = tok_nextc(tok);
1490 } while (c == '0' || c == '1');
1491 }
1492 else {
1493 int nonzero = 0;
1494 /* maybe old-style octal; c is first char of it */
1495 /* in any case, allow '0' as a literal */
1496 while (c == '0')
1497 c = tok_nextc(tok);
1498 while (isdigit(c)) {
1499 nonzero = 1;
1500 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001501 }
1502 if (c == '.')
1503 goto fraction;
1504 else if (c == 'e' || c == 'E')
1505 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001506 else if (c == 'j' || c == 'J')
1507 goto imaginary;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001508 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001509 tok->done = E_TOKEN;
1510 tok_backup(tok, c);
1511 return ERRORTOKEN;
1512 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001513 }
1514 }
1515 else {
1516 /* Decimal */
1517 do {
1518 c = tok_nextc(tok);
1519 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001520 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001521 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001522 if (c == '.') {
1523 fraction:
1524 /* Fraction */
1525 do {
1526 c = tok_nextc(tok);
1527 } while (isdigit(c));
1528 }
1529 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001530 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001531 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001532 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001533 if (c == '+' || c == '-')
1534 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001535 if (!isdigit(c)) {
1536 tok->done = E_TOKEN;
1537 tok_backup(tok, c);
1538 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001539 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001540 do {
1541 c = tok_nextc(tok);
1542 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001543 }
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001544 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001545 /* Imaginary part */
1546 imaginary:
1547 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001548 }
1549 }
1550 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001551 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001552 *p_end = tok->cur;
1553 return NUMBER;
1554 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001555
1556 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001557 /* String */
1558 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001559 int quote = c;
1560 int quote_size = 1; /* 1 or 3 */
1561 int end_quote_size = 0;
1562
1563 /* Find the quote size and start of string */
1564 c = tok_nextc(tok);
1565 if (c == quote) {
1566 c = tok_nextc(tok);
1567 if (c == quote)
1568 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001569 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001570 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001571 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001572 if (c != quote)
1573 tok_backup(tok, c);
1574
1575 /* Get rest of string */
1576 while (end_quote_size != quote_size) {
1577 c = tok_nextc(tok);
1578 if (c == EOF) {
1579 if (quote_size == 3)
1580 tok->done = E_EOFS;
1581 else
1582 tok->done = E_EOLS;
1583 tok->cur = tok->inp;
1584 return ERRORTOKEN;
1585 }
1586 if (quote_size == 1 && c == '\n') {
1587 tok->done = E_EOLS;
1588 tok->cur = tok->inp;
1589 return ERRORTOKEN;
1590 }
1591 if (c == quote)
1592 end_quote_size += 1;
1593 else {
1594 end_quote_size = 0;
1595 if (c == '\\')
1596 c = tok_nextc(tok); /* skip escaped char */
1597 }
1598 }
1599
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001600 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001601 *p_end = tok->cur;
1602 return STRING;
1603 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001604
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001605 /* Line continuation */
1606 if (c == '\\') {
1607 c = tok_nextc(tok);
1608 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001609 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001610 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001611 return ERRORTOKEN;
1612 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001613 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001614 goto again; /* Read next line */
1615 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001616
Guido van Rossumfbab9051991-10-20 20:25:03 +00001617 /* Check for two-character token */
1618 {
1619 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001620 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001621 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001622 int c3 = tok_nextc(tok);
1623 int token3 = PyToken_ThreeChars(c, c2, c3);
1624 if (token3 != OP) {
1625 token = token3;
1626 } else {
1627 tok_backup(tok, c3);
1628 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001629 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001630 *p_end = tok->cur;
1631 return token;
1632 }
1633 tok_backup(tok, c2);
1634 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001635
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001636 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001637 switch (c) {
1638 case '(':
1639 case '[':
1640 case '{':
1641 tok->level++;
1642 break;
1643 case ')':
1644 case ']':
1645 case '}':
1646 tok->level--;
1647 break;
1648 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001649
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001650 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001651 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001652 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001653 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001654}
1655
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001656int
1657PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1658{
1659 int result = tok_get(tok, p_start, p_end);
1660 if (tok->decoding_erred) {
1661 result = ERRORTOKEN;
1662 tok->done = E_DECODE;
1663 }
1664 return result;
1665}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001666
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001667/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001668
1669 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001670 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001671 should be assumed to be PyUnicode_GetDefaultEncoding()).
1672
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001673 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1674 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001675*/
1676char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001677PyTokenizer_FindEncoding(int fd)
1678{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001679 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001680 FILE *fp;
1681 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001682
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001683 fd = dup(fd);
1684 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001685 return NULL;
1686 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001687 fp = fdopen(fd, "r");
1688 if (fp == NULL) {
1689 return NULL;
1690 }
1691 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1692 if (tok == NULL) {
1693 fclose(fp);
1694 return NULL;
1695 }
1696 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001697 PyTokenizer_Get(tok, &p_start, &p_end);
1698 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001699 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001700 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001701 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001702 if (encoding)
1703 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001704 }
1705 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001706 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001707}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001708
Guido van Rossum408027e1996-12-30 16:17:54 +00001709#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001710
1711void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001712tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001713{
Guido van Rossum86bea461997-04-29 21:03:06 +00001714 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001715 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1716 printf("(%.*s)", (int)(end - start), start);
1717}
1718
1719#endif