blob: f3ef1cbc6c1249c632fc928a898659757334f3ce [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Nonzero if C may begin an identifier: an ASCII letter, '_', or any
   value >= 128 (i.e. outside the ASCII range).
   The argument is fully parenthesized so that expression arguments
   expand correctly, but it is still evaluated several times: do not
   pass an expression with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Nonzero if C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128 (i.e. outside the ASCII range).
   The argument is fully parenthesized so that expression arguments
   expand correctly, but it is still evaluated several times: do not
   pass an expression with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* PyOS_Readline returns a malloc'ed string including the trailing \n,
   an empty malloc'ed string for EOF, or NULL if interrupted. */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Tab stops are fixed at 8 columns.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined further down in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
/* Printable name of every token type, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* one-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons, bitwise operators and shifts */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",

    "AT", "RARROW", "ELLIPSIS",

    /* pseudo-tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000122 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000130 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000131 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000136 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000139 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
/* PGEN build: no decoding is done; return a freshly allocated copy of
   STR (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t nbytes = strlen(str);

    return new_string(str, nbytes);
}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are examined, lower-cased, with
   '_' treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   S is one of the recognized spellings (optionally followed by a
   "-suffix"); otherwise returns S itself, so a caller can detect "no
   change" by comparing pointers.  The literals must not be freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behavior. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
221
222/* Return the coding spec in S, or NULL if none is found. */
223
224static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000225get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000226{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000227 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000228 /* Coding spec must be in a comment, and that comment must be
229 * the only statement on the source code line. */
230 for (i = 0; i < size - 6; i++) {
231 if (s[i] == '#')
232 break;
233 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 return NULL;
235 }
236 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000237 const char* t = s + i;
238 if (strncmp(t, "coding", 6) == 0) {
239 const char* begin = NULL;
240 t += 6;
241 if (t[0] != ':' && t[0] != '=')
242 continue;
243 do {
244 t++;
245 } while (t[0] == '\x20' || t[0] == '\t');
246
247 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000248 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000249 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000250 t++;
251
252 if (begin < t) {
253 char* r = new_string(begin, t - begin);
254 char* q = get_normal_name(r);
255 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000257 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000258 }
259 return r;
260 }
261 }
262 }
263 return NULL;
264}
265
266/* Check whether the line contains a coding spec. If it does,
267 invoke the set_readline function for the new encoding.
268 This function receives the tok_state and the new encoding.
269 Return 1 on success, 0 on failure. */
270
271static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 int set_readline(struct tok_state *, const char *))
274{
Tim Peters17db21f2002-09-03 15:39:58 +0000275 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000277
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000278 if (tok->cont_line)
279 /* It's a continuation line, so it can't be a coding spec. */
280 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000281 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282 if (cs != NULL) {
283 tok->read_coding_spec = 1;
284 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000285 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000286 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000287 tok->encoding = cs;
288 } else {
289 r = set_readline(tok, cs);
290 if (r) {
291 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000292 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000293 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000294 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000296 }
297 } else { /* then, compare cs with BOM */
298 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 }
301 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000302 if (!r) {
303 cs = tok->encoding;
304 if (!cs)
305 cs = "with BOM";
306 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000308 return r;
309}
310
311/* See whether the file starts with a BOM. If it does,
312 invoke the set_readline function with the new encoding.
313 Return 1 on success, 0 on failure. */
314
315static int
316check_bom(int get_char(struct tok_state *),
317 void unget_char(int, struct tok_state *),
318 int set_readline(struct tok_state *, const char *),
319 struct tok_state *tok)
320{
321 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000322 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000323 if (ch == EOF) {
324 return 1;
325 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000326 ch = get_char(tok);
327 if (ch != 0xBB) {
328 unget_char(ch, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
333 ch = get_char(tok);
334 if (ch != 0xBF) {
335 unget_char(ch, tok);
336 unget_char(0xBB, tok);
337 unget_char(0xEF, tok);
338 /* any token beginning with '\xEF' is a bad token */
339 return 1;
340 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000341#if 0
342 /* Disable support for UTF-16 BOMs until a decision
343 is made whether this needs to be supported. */
344 } else if (ch == 0xFE) {
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000345 ch = get_char(tok);
346 if (ch != 0xFF)
347 goto NON_BOM;
348 if (!set_readline(tok, "utf-16-be"))
349 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000350 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351 } else if (ch == 0xFF) {
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000352 ch = get_char(tok);
353 if (ch != 0xFE)
354 goto NON_BOM;
355 if (!set_readline(tok, "utf-16-le"))
356 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000357 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000358#endif
359 } else {
360 unget_char(ch, tok);
361 return 1;
362 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000363 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000365 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000366 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000367 return 1;
368}
369
370/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000371 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000373 On entry, tok->decoding_buffer will be one of:
374 1) NULL: need to call tok->decoding_readline to get a new line
375 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
376 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000377 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000378 (in the s buffer) to copy entire contents of the line read
379 by tok->decoding_readline. tok->decoding_buffer has the overflow.
380 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000382 reached): see tok_nextc and its calls to decoding_fgets.
383*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000384
385static char *
386fp_readl(char *s, int size, struct tok_state *tok)
387{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000388 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000389 const char *buf;
390 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000391
392 /* Ask for one less byte so we can terminate it */
393 assert(size > 0);
394 size--;
395
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000396 if (tok->decoding_buffer) {
397 bufobj = tok->decoding_buffer;
398 Py_INCREF(bufobj);
399 }
400 else
401 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000402 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
403 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000404 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000405 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000406 if (PyUnicode_CheckExact(bufobj))
407 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000408 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000409 if (buf == NULL) {
410 goto error;
411 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000412 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000413 else
414 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000415 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000416 if (buf == NULL) {
417 goto error;
418 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000419 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000420 }
421
422 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000423 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000424 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000425 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000426 buflen-size);
427 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000428 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000429 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000430 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000431 else
432 tok->decoding_buffer = NULL;
433
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000434 memcpy(s, buf, buflen);
435 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000436 if (buflen == 0) /* EOF */
437 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000438 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000439 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000440
441error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000442 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000443 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000444}
445
446/* Set the readline function for TOK to a StreamReader's
447 readline function. The StreamReader is named ENC.
448
449 This function is called from check_bom and check_coding_spec.
450
451 ENC is usually identical to the future value of tok->encoding,
452 except for the (currently unsupported) case of UTF-16.
453
454 Return 1 on success, 0 on failure. */
455
456static int
457fp_setreadl(struct tok_state *tok, const char* enc)
458{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000459 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000460
Christian Heimes819b8bf2008-01-03 23:05:47 +0000461 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000462 if (io == NULL)
463 goto cleanup;
464
Brett Cannon8a9583e2008-09-04 05:04:25 +0000465 if (tok->filename)
466 stream = PyObject_CallMethod(io, "open", "ssis",
467 tok->filename, "r", -1, enc);
468 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000469 stream = PyObject_CallMethod(io, "open", "isisOOO",
470 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000471 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000472 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000473
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000474 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000475 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000476 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000477
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000478 /* The file has been reopened; parsing will restart from
479 * the beginning of the file, we have to reset the line number.
480 * But this function has been called from inside tok_nextc() which
481 * will increment lineno before it returns. So we set it -1 so that
482 * the next call to tok_nextc() will start with tok->lineno == 0.
483 */
484 tok->lineno = -1;
485
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000486 cleanup:
487 Py_XDECREF(stream);
488 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000489 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000490}
491
492/* Fetch the next byte from TOK. */
493
494static int fp_getc(struct tok_state *tok) {
495 return getc(tok->fp);
496}
497
498/* Unfetch the last byte back into TOK. */
499
500static void fp_ungetc(int c, struct tok_state *tok) {
501 ungetc(c, tok->fp);
502}
503
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence
   (1 to 4) if yes, 0 if not.

   S must point into a NUL-terminated buffer.  Continuation bytes are
   examined front to back, so a sequence cut short by the terminating
   NUL is rejected at the NUL and nothing beyond it is read.

   Lead bytes that can never occur in UTF-8 are rejected: 0x80-0xBF
   (continuation bytes), 0xC0-0xC1 (would only encode ASCII in two
   bytes) and 0xF5-0xFF (beyond U+10FFFF).  Continuation bytes are only
   checked for the 10xxxxxx pattern; finer range checks are not done
   here. */
static int valid_utf8(const unsigned char* s)
{
    int length;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC2)
        /* continuation byte, or overlong two-byte lead */
        return 0;
    if (*s < 0xE0)
        length = 2;
    else if (*s < 0xF0)
        length = 3;
    else if (*s < 0xF5)
        length = 4;
    else
        return 0;
    for (i = 1; i < length; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return length;
}
531
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532/* Read a line of input from TOK. Determine encoding
533 if necessary. */
534
535static char *
536decoding_fgets(char *s, int size, struct tok_state *tok)
537{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000538 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000539 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000540 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000541 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000542 /* We already have a codec associated with
543 this input. */
544 line = fp_readl(s, size, tok);
545 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000546 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000548 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000549 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000550 break;
551 } else {
552 /* We have not yet determined the encoding.
553 If an encoding is found, use the file-pointer
554 reader functions from now on. */
555 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
556 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000557 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000558 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000559 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000560 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
561 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
562 return error_ret(tok);
563 }
564 }
565#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000566 /* The default encoding is UTF-8, so make sure we don't have any
567 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000568 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000569 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000570 int length;
571 for (c = (unsigned char *)line; *c; c += length)
572 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573 badchar = *c;
574 break;
575 }
576 }
577 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000578 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000579 /* Need to add 1 to the line number, since this line
580 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000581 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000582 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000583 "in file %.200s on line %i, "
584 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000585 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000586 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000587 PyErr_SetString(PyExc_SyntaxError, buf);
588 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000589 }
590#endif
591 return line;
592}
593
594static int
595decoding_feof(struct tok_state *tok)
596{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000597 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000598 return feof(tok->fp);
599 } else {
600 PyObject* buf = tok->decoding_buffer;
601 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000602 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 if (buf == NULL) {
604 error_ret(tok);
605 return 1;
606 } else {
607 tok->decoding_buffer = buf;
608 }
609 }
610 return PyObject_Length(buf) == 0;
611 }
612}
613
614/* Fetch a byte from TOK, using the string buffer. */
615
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000616static int
617buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000618 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619}
620
621/* Unfetch a byte from TOK, using the string buffer. */
622
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000623static void
624buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000625 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000626 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000627}
628
629/* Set the readline function for TOK to ENC. For the string-based
630 tokenizer, this means to just record the encoding. */
631
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000632static int
633buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000634 tok->enc = enc;
635 return 1;
636}
637
638/* Return a UTF-8 encoding Python string object from the
639 C byte string STR, which is encoded with ENC. */
640
641static PyObject *
642translate_into_utf8(const char* str, const char* enc) {
643 PyObject *utf8;
644 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
645 if (buf == NULL)
646 return NULL;
647 utf8 = PyUnicode_AsUTF8String(buf);
648 Py_DECREF(buf);
649 return utf8;
650}
651
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000652
653static char *
654translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
655 int skip_next_lf = 0, length = strlen(s), final_length;
656 char *buf, *current;
657 char c;
658 buf = PyMem_MALLOC(length + 2);
659 if (buf == NULL) {
660 tok->done = E_NOMEM;
661 return NULL;
662 }
663 for (current = buf; (c = *s++);) {
664 if (skip_next_lf) {
665 skip_next_lf = 0;
666 if (c == '\n') {
667 c = *s;
668 s++;
669 if (!c)
670 break;
671 }
672 }
673 if (c == '\r') {
674 skip_next_lf = 1;
675 c = '\n';
676 }
677 *current = c;
678 current++;
679 }
680 /* If this is exec input, add a newline to the end of the file if
681 there isn't one already. */
682 if (exec_input && *current != '\n') {
683 *current = '\n';
684 current++;
685 }
686 *current = '\0';
687 final_length = current - buf;
688 if (final_length < length && final_length)
689 /* should never fail */
690 buf = PyMem_REALLOC(buf, final_length + 1);
691 return buf;
692}
693
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000694/* Decode a byte string STR for use as the buffer of TOK.
695 Look for encoding declarations inside STR, and record them
696 inside TOK. */
697
698static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000699decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000700{
701 PyObject* utf8 = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000702 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000703 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000704 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000705 int lineno = 0;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000706 tok->input = str = translate_newlines(input, single, tok);
707 if (str == NULL)
708 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000709 tok->enc = NULL;
710 tok->str = str;
711 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000712 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000713 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000714 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000715 if (tok->enc != NULL) {
716 utf8 = translate_into_utf8(str, tok->enc);
717 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000718 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000719 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000720 }
721 for (s = str;; s++) {
722 if (*s == '\0') break;
723 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000724 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000725 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000726 lineno++;
727 if (lineno == 2) break;
728 }
729 }
730 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000731 /* need to check line 1 and 2 separately since check_coding_spec
732 assumes a single line as input */
733 if (newl[0]) {
734 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
735 return error_ret(tok);
736 if (tok->enc == NULL && newl[1]) {
737 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
738 tok, buf_setreadl))
739 return error_ret(tok);
740 }
741 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000742 if (tok->enc != NULL) {
743 assert(utf8 == NULL);
744 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson0289b152009-06-28 17:22:03 +0000745 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000746 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000747 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000748 }
749 assert(tok->decoding_buffer == NULL);
750 tok->decoding_buffer = utf8; /* CAUTION */
751 return str;
752}
753
754#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755
756/* Set up tokenizer for string */
757
758struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000759PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760{
761 struct tok_state *tok = tok_new();
762 if (tok == NULL)
763 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000764 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000765 if (str == NULL) {
766 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000767 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000768 }
769
Martin v. Löwis95292d62002-12-11 14:04:59 +0000770 /* XXX: constify members. */
771 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000772 return tok;
773}
774
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000775struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000776PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000777{
778 struct tok_state *tok = tok_new();
779 if (tok == NULL)
780 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000781#ifndef PGEN
782 tok->input = str = translate_newlines(str, exec_input, tok);
783#endif
784 if (str == NULL) {
785 PyTokenizer_Free(tok);
786 return NULL;
787 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000788 tok->decoding_state = STATE_RAW;
789 tok->read_coding_spec = 1;
790 tok->enc = NULL;
791 tok->str = str;
792 tok->encoding = (char *)PyMem_MALLOC(6);
793 if (!tok->encoding) {
794 PyTokenizer_Free(tok);
795 return NULL;
796 }
797 strcpy(tok->encoding, "utf-8");
798
799 /* XXX: constify members. */
800 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
801 return tok;
802}
803
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000804/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000805
806struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000807PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000808{
809 struct tok_state *tok = tok_new();
810 if (tok == NULL)
811 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000812 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000813 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000814 return NULL;
815 }
816 tok->cur = tok->inp = tok->buf;
817 tok->end = tok->buf + BUFSIZ;
818 tok->fp = fp;
819 tok->prompt = ps1;
820 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000821 if (enc != NULL) {
822 /* Must copy encoding declaration since it
823 gets copied into the parse tree. */
824 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
825 if (!tok->encoding) {
826 PyTokenizer_Free(tok);
827 return NULL;
828 }
829 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000830 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000831 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000832 return tok;
833}
834
835
836/* Free a tok_state structure */
837
838void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000839PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000840{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000841 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000842 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000843#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000844 Py_XDECREF(tok->decoding_readline);
845 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000846#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000847 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000848 PyMem_FREE(tok->buf);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000849 if (tok->input)
850 PyMem_FREE((char *)tok->input);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000851 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000852}
853
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854/* Get next char, updating state; error code goes into tok->done */
855
856static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000857tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000858{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000859 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000860 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000861 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000862 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000863 if (tok->done != E_OK)
864 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000865 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000866 char *end = strchr(tok->inp, '\n');
867 if (end != NULL)
868 end++;
869 else {
870 end = strchr(tok->inp, '\0');
871 if (end == tok->inp) {
872 tok->done = E_EOF;
873 return EOF;
874 }
875 }
876 if (tok->start == NULL)
877 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000878 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000879 tok->lineno++;
880 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000881 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000882 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000883 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000884 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000885#ifndef PGEN
886 if (tok->encoding && newtok && *newtok) {
887 /* Recode to UTF-8 */
888 Py_ssize_t buflen;
889 const char* buf;
890 PyObject *u = translate_into_utf8(newtok, tok->encoding);
891 PyMem_FREE(newtok);
892 if (!u) {
893 tok->done = E_DECODE;
894 return EOF;
895 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000896 buflen = PyBytes_GET_SIZE(u);
897 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000898 if (!buf) {
899 Py_DECREF(u);
900 tok->done = E_DECODE;
901 return EOF;
902 }
903 newtok = PyMem_MALLOC(buflen+1);
904 strcpy(newtok, buf);
905 Py_DECREF(u);
906 }
907#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000908 if (tok->nextprompt != NULL)
909 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000910 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000911 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000912 else if (*newtok == '\0') {
913 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000914 tok->done = E_EOF;
915 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000916 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000917 size_t start = tok->start - tok->buf;
918 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000919 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000920 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000921 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000922 tok->lineno++;
923 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000924 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000925 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000926 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000927 tok->done = E_NOMEM;
928 return EOF;
929 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000930 tok->buf = buf;
931 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000932 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000933 strcpy(tok->buf + oldlen, newtok);
934 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000935 tok->inp = tok->buf + newlen;
936 tok->end = tok->inp + 1;
937 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000938 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000939 else {
940 tok->lineno++;
941 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000942 PyMem_FREE(tok->buf);
943 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000944 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000945 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000946 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000947 tok->inp = strchr(tok->buf, '\0');
948 tok->end = tok->inp + 1;
949 }
950 }
951 else {
952 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000953 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000954 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000955 if (tok->start == NULL) {
956 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000957 tok->buf = (char *)
958 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000959 if (tok->buf == NULL) {
960 tok->done = E_NOMEM;
961 return EOF;
962 }
963 tok->end = tok->buf + BUFSIZ;
964 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000965 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
966 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000967 tok->done = E_EOF;
968 done = 1;
969 }
970 else {
971 tok->done = E_OK;
972 tok->inp = strchr(tok->buf, '\0');
973 done = tok->inp[-1] == '\n';
974 }
975 }
976 else {
977 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000978 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000979 tok->done = E_EOF;
980 done = 1;
981 }
982 else
983 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000984 }
985 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000986 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000987 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000988 Py_ssize_t curstart = tok->start == NULL ? -1 :
989 tok->start - tok->buf;
990 Py_ssize_t curvalid = tok->inp - tok->buf;
991 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000992 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000993 newbuf = (char *)PyMem_REALLOC(newbuf,
994 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000995 if (newbuf == NULL) {
996 tok->done = E_NOMEM;
997 tok->cur = tok->inp;
998 return EOF;
999 }
1000 tok->buf = newbuf;
1001 tok->inp = tok->buf + curvalid;
1002 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001003 tok->start = curstart < 0 ? NULL :
1004 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001005 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001006 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001007 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +00001008 /* Break out early on decoding
1009 errors, as tok->buf will be NULL
1010 */
1011 if (tok->decoding_erred)
1012 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001013 /* Last line does not end in \n,
1014 fake one */
1015 strcpy(tok->inp, "\n");
1016 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001017 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001018 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001019 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001020 if (tok->buf != NULL) {
1021 tok->cur = tok->buf + cur;
1022 tok->line_start = tok->cur;
1023 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +00001024 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001025 pt = tok->inp - 2;
1026 if (pt >= tok->buf && *pt == '\r') {
1027 *pt++ = '\n';
1028 *pt = '\0';
1029 tok->inp = pt;
1030 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001031 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001032 }
1033 if (tok->done != E_OK) {
1034 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001035 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001036 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001037 return EOF;
1038 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001039 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001040 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001041}
1042
1043
1044/* Back-up one character */
1045
1046static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001047tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001048{
1049 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001050 if (--tok->cur < tok->buf)
Benjamin Petersona0dfa822009-11-13 02:25:08 +00001051 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001052 if (*tok->cur != c)
1053 *tok->cur = c;
1054 }
1055}
1056
1057
1058/* Return the token corresponding to a single character */
1059
1060int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001061PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001062{
1063 switch (c) {
1064 case '(': return LPAR;
1065 case ')': return RPAR;
1066 case '[': return LSQB;
1067 case ']': return RSQB;
1068 case ':': return COLON;
1069 case ',': return COMMA;
1070 case ';': return SEMI;
1071 case '+': return PLUS;
1072 case '-': return MINUS;
1073 case '*': return STAR;
1074 case '/': return SLASH;
1075 case '|': return VBAR;
1076 case '&': return AMPER;
1077 case '<': return LESS;
1078 case '>': return GREATER;
1079 case '=': return EQUAL;
1080 case '.': return DOT;
1081 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001082 case '{': return LBRACE;
1083 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001084 case '^': return CIRCUMFLEX;
1085 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001086 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001087 default: return OP;
1088 }
1089}
1090
1091
Guido van Rossumfbab9051991-10-20 20:25:03 +00001092int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001093PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001094{
1095 switch (c1) {
1096 case '=':
1097 switch (c2) {
1098 case '=': return EQEQUAL;
1099 }
1100 break;
1101 case '!':
1102 switch (c2) {
1103 case '=': return NOTEQUAL;
1104 }
1105 break;
1106 case '<':
1107 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001108 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001109 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001110 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001111 }
1112 break;
1113 case '>':
1114 switch (c2) {
1115 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001116 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001117 }
1118 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001119 case '+':
1120 switch (c2) {
1121 case '=': return PLUSEQUAL;
1122 }
1123 break;
1124 case '-':
1125 switch (c2) {
1126 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001127 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001128 }
1129 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001130 case '*':
1131 switch (c2) {
1132 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001133 case '=': return STAREQUAL;
1134 }
1135 break;
1136 case '/':
1137 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001138 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001139 case '=': return SLASHEQUAL;
1140 }
1141 break;
1142 case '|':
1143 switch (c2) {
1144 case '=': return VBAREQUAL;
1145 }
1146 break;
1147 case '%':
1148 switch (c2) {
1149 case '=': return PERCENTEQUAL;
1150 }
1151 break;
1152 case '&':
1153 switch (c2) {
1154 case '=': return AMPEREQUAL;
1155 }
1156 break;
1157 case '^':
1158 switch (c2) {
1159 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001160 }
1161 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001162 }
1163 return OP;
1164}
1165
Thomas Wouters434d0822000-08-24 20:11:32 +00001166int
1167PyToken_ThreeChars(int c1, int c2, int c3)
1168{
1169 switch (c1) {
1170 case '<':
1171 switch (c2) {
1172 case '<':
1173 switch (c3) {
1174 case '=':
1175 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001176 }
1177 break;
1178 }
1179 break;
1180 case '>':
1181 switch (c2) {
1182 case '>':
1183 switch (c3) {
1184 case '=':
1185 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001186 }
1187 break;
1188 }
1189 break;
1190 case '*':
1191 switch (c2) {
1192 case '*':
1193 switch (c3) {
1194 case '=':
1195 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001196 }
1197 break;
1198 }
1199 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001200 case '/':
1201 switch (c2) {
1202 case '/':
1203 switch (c3) {
1204 case '=':
1205 return DOUBLESLASHEQUAL;
1206 }
1207 break;
1208 }
1209 break;
Georg Brandldde00282007-03-18 19:01:53 +00001210 case '.':
1211 switch (c2) {
1212 case '.':
1213 switch (c3) {
1214 case '.':
1215 return ELLIPSIS;
1216 }
1217 break;
1218 }
1219 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001220 }
1221 return OP;
1222}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001223
Guido van Rossum926f13a1998-04-09 21:38:06 +00001224static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001225indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001226{
1227 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001228 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001229 tok->cur = tok->inp;
1230 return 1;
1231 }
1232 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001233 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1234 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001235 tok->altwarning = 0;
1236 }
1237 return 0;
1238}
1239
Martin v. Löwis47383402007-08-15 07:32:56 +00001240#ifdef PGEN
1241#define verify_identifier(s,e) 1
1242#else
1243/* Verify that the identifier follows PEP 3131. */
1244static int
1245verify_identifier(char *start, char *end)
1246{
Guido van Rossume3e37012007-08-29 18:54:41 +00001247 PyObject *s;
1248 int result;
1249 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1250 if (s == NULL) {
1251 PyErr_Clear();
1252 return 0;
1253 }
1254 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001255 Py_DECREF(s);
1256 return result;
1257}
1258#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001259
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001260/* Get next token, after space stripping etc. */
1261
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001262static int
1263tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001264{
1265 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001266 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001267
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001268 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001269 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001270 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001271 blankline = 0;
1272
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001273 /* Get indentation level */
1274 if (tok->atbol) {
1275 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001276 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001278 for (;;) {
1279 c = tok_nextc(tok);
1280 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001281 col++, altcol++;
1282 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001283 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001284 altcol = (altcol/tok->alttabsize + 1)
1285 * tok->alttabsize;
1286 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001287 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001288 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001289 else
1290 break;
1291 }
1292 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001293 if (c == '#' || c == '\n') {
1294 /* Lines with only whitespace and/or comments
1295 shouldn't affect the indentation and are
1296 not passed to the parser as NEWLINE tokens,
1297 except *totally* empty lines in interactive
1298 mode, which signal the end of a command group. */
1299 if (col == 0 && c == '\n' && tok->prompt != NULL)
1300 blankline = 0; /* Let it through */
1301 else
1302 blankline = 1; /* Ignore completely */
1303 /* We can't jump back right here since we still
1304 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001305 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001306 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001307 if (col == tok->indstack[tok->indent]) {
1308 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001309 if (altcol != tok->altindstack[tok->indent]) {
1310 if (indenterror(tok))
1311 return ERRORTOKEN;
1312 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001314 else if (col > tok->indstack[tok->indent]) {
1315 /* Indent -- always one */
1316 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001317 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001318 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001319 return ERRORTOKEN;
1320 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001321 if (altcol <= tok->altindstack[tok->indent]) {
1322 if (indenterror(tok))
1323 return ERRORTOKEN;
1324 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001325 tok->pendin++;
1326 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001327 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001328 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001329 else /* col < tok->indstack[tok->indent] */ {
1330 /* Dedent -- any number, must be consistent */
1331 while (tok->indent > 0 &&
1332 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001333 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001334 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001335 }
1336 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001337 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001338 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001339 return ERRORTOKEN;
1340 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001341 if (altcol != tok->altindstack[tok->indent]) {
1342 if (indenterror(tok))
1343 return ERRORTOKEN;
1344 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001345 }
1346 }
1347 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001348
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001349 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001350
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 /* Return pending indents/dedents */
1352 if (tok->pendin != 0) {
1353 if (tok->pendin < 0) {
1354 tok->pendin++;
1355 return DEDENT;
1356 }
1357 else {
1358 tok->pendin--;
1359 return INDENT;
1360 }
1361 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001362
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001363 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001364 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001365 /* Skip spaces */
1366 do {
1367 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001368 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001369
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001370 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001371 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001372
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001373 /* Skip comment */
1374 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001375 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001376 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001377
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001378 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001379 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001381 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001382
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001383 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001384 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001385 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001386 /* Process b"", r"" and br"" */
1387 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001388 c = tok_nextc(tok);
1389 if (c == '"' || c == '\'')
1390 goto letter_quote;
1391 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001392 if (c == 'r' || c == 'R') {
1393 c = tok_nextc(tok);
1394 if (c == '"' || c == '\'')
1395 goto letter_quote;
1396 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001397 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001398 if (c >= 128)
1399 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001400 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001401 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001402 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001403 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001404 !verify_identifier(tok->start, tok->cur)) {
1405 tok->done = E_IDENTIFIER;
1406 return ERRORTOKEN;
1407 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001408 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001409 *p_end = tok->cur;
1410 return NAME;
1411 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001413 /* Newline */
1414 if (c == '\n') {
1415 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001416 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001417 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001418 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001419 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001420 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 return NEWLINE;
1422 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001423
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001424 /* Period or number starting with period? */
1425 if (c == '.') {
1426 c = tok_nextc(tok);
1427 if (isdigit(c)) {
1428 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001429 } else if (c == '.') {
1430 c = tok_nextc(tok);
1431 if (c == '.') {
1432 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001433 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001434 return ELLIPSIS;
1435 } else {
1436 tok_backup(tok, c);
1437 }
1438 tok_backup(tok, '.');
1439 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001440 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001441 }
Georg Brandldde00282007-03-18 19:01:53 +00001442 *p_start = tok->start;
1443 *p_end = tok->cur;
1444 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001445 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001446
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001447 /* Number */
1448 if (isdigit(c)) {
1449 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001450 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001451 c = tok_nextc(tok);
1452 if (c == '.')
1453 goto fraction;
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001454 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001455 goto imaginary;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001456 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001457
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001458 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001459 c = tok_nextc(tok);
1460 if (!isxdigit(c)) {
1461 tok->done = E_TOKEN;
1462 tok_backup(tok, c);
1463 return ERRORTOKEN;
1464 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001465 do {
1466 c = tok_nextc(tok);
1467 } while (isxdigit(c));
1468 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001469 else if (c == 'o' || c == 'O') {
1470 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001471 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001472 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001473 tok->done = E_TOKEN;
1474 tok_backup(tok, c);
1475 return ERRORTOKEN;
1476 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001477 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001479 } while ('0' <= c && c < '8');
1480 }
1481 else if (c == 'b' || c == 'B') {
1482 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001483 c = tok_nextc(tok);
1484 if (c != '0' && c != '1') {
1485 tok->done = E_TOKEN;
1486 tok_backup(tok, c);
1487 return ERRORTOKEN;
1488 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001489 do {
1490 c = tok_nextc(tok);
1491 } while (c == '0' || c == '1');
1492 }
1493 else {
1494 int nonzero = 0;
1495 /* maybe old-style octal; c is first char of it */
1496 /* in any case, allow '0' as a literal */
1497 while (c == '0')
1498 c = tok_nextc(tok);
1499 while (isdigit(c)) {
1500 nonzero = 1;
1501 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001502 }
1503 if (c == '.')
1504 goto fraction;
1505 else if (c == 'e' || c == 'E')
1506 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001507 else if (c == 'j' || c == 'J')
1508 goto imaginary;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001509 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001510 tok->done = E_TOKEN;
1511 tok_backup(tok, c);
1512 return ERRORTOKEN;
1513 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001514 }
1515 }
1516 else {
1517 /* Decimal */
1518 do {
1519 c = tok_nextc(tok);
1520 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001521 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001522 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001523 if (c == '.') {
1524 fraction:
1525 /* Fraction */
1526 do {
1527 c = tok_nextc(tok);
1528 } while (isdigit(c));
1529 }
1530 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001531 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001532 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001533 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001534 if (c == '+' || c == '-')
1535 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001536 if (!isdigit(c)) {
1537 tok->done = E_TOKEN;
1538 tok_backup(tok, c);
1539 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001540 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001541 do {
1542 c = tok_nextc(tok);
1543 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001544 }
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001545 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001546 /* Imaginary part */
1547 imaginary:
1548 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001549 }
1550 }
1551 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001552 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001553 *p_end = tok->cur;
1554 return NUMBER;
1555 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001556
1557 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001558 /* String */
1559 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001560 int quote = c;
1561 int quote_size = 1; /* 1 or 3 */
1562 int end_quote_size = 0;
1563
1564 /* Find the quote size and start of string */
1565 c = tok_nextc(tok);
1566 if (c == quote) {
1567 c = tok_nextc(tok);
1568 if (c == quote)
1569 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001570 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001571 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001572 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001573 if (c != quote)
1574 tok_backup(tok, c);
1575
1576 /* Get rest of string */
1577 while (end_quote_size != quote_size) {
1578 c = tok_nextc(tok);
1579 if (c == EOF) {
1580 if (quote_size == 3)
1581 tok->done = E_EOFS;
1582 else
1583 tok->done = E_EOLS;
1584 tok->cur = tok->inp;
1585 return ERRORTOKEN;
1586 }
1587 if (quote_size == 1 && c == '\n') {
1588 tok->done = E_EOLS;
1589 tok->cur = tok->inp;
1590 return ERRORTOKEN;
1591 }
1592 if (c == quote)
1593 end_quote_size += 1;
1594 else {
1595 end_quote_size = 0;
1596 if (c == '\\')
1597 c = tok_nextc(tok); /* skip escaped char */
1598 }
1599 }
1600
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001601 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001602 *p_end = tok->cur;
1603 return STRING;
1604 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001605
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001606 /* Line continuation */
1607 if (c == '\\') {
1608 c = tok_nextc(tok);
1609 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001610 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001611 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001612 return ERRORTOKEN;
1613 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001614 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001615 goto again; /* Read next line */
1616 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001617
Guido van Rossumfbab9051991-10-20 20:25:03 +00001618 /* Check for two-character token */
1619 {
1620 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001621 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001622 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001623 int c3 = tok_nextc(tok);
1624 int token3 = PyToken_ThreeChars(c, c2, c3);
1625 if (token3 != OP) {
1626 token = token3;
1627 } else {
1628 tok_backup(tok, c3);
1629 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001630 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001631 *p_end = tok->cur;
1632 return token;
1633 }
1634 tok_backup(tok, c2);
1635 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001636
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001637 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001638 switch (c) {
1639 case '(':
1640 case '[':
1641 case '{':
1642 tok->level++;
1643 break;
1644 case ')':
1645 case ']':
1646 case '}':
1647 tok->level--;
1648 break;
1649 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001650
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001651 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001652 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001653 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001654 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001655}
1656
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001657int
1658PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1659{
1660 int result = tok_get(tok, p_start, p_end);
1661 if (tok->decoding_erred) {
1662 result = ERRORTOKEN;
1663 tok->done = E_DECODE;
1664 }
1665 return result;
1666}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001667
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001668/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001669
1670 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001671 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001672 should be assumed to be PyUnicode_GetDefaultEncoding()).
1673
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001674 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1675 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001676*/
1677char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001678PyTokenizer_FindEncoding(int fd)
1679{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001680 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001681 FILE *fp;
1682 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001683
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001684 fd = dup(fd);
1685 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001686 return NULL;
1687 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001688 fp = fdopen(fd, "r");
1689 if (fp == NULL) {
1690 return NULL;
1691 }
1692 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1693 if (tok == NULL) {
1694 fclose(fp);
1695 return NULL;
1696 }
1697 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001698 PyTokenizer_Get(tok, &p_start, &p_end);
1699 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001700 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001701 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001702 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001703 if (encoding)
1704 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001705 }
1706 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001707 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001708}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001709
Guido van Rossum408027e1996-12-30 16:17:54 +00001710#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001711
1712void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001713tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001714{
Guido van Rossum86bea461997-04-29 21:03:06 +00001715 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001716 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1717 printf("(%.*s)", (int)(end - start), start);
1718}
1719
1720#endif