blob: e637cb37e5b1c2e4806ccfe9232edd1e395302b1 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, '_', or any value
   >= 128 (non-ASCII bytes are accepted here without further checking).
   The argument is parenthesized so that an expression can be passed
   safely; note that it is still evaluated more than once. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit,
   '_', or any value >= 128.  The argument is parenthesized so that an
   expression can be passed safely; note that it is still evaluated more
   than once. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE",
    /*  5 */ "INDENT", "DEDENT", "LPAR", "RPAR", "LSQB",
    /* 10 */ "RSQB", "COLON", "COMMA", "SEMI", "PLUS",
    /* 15 */ "MINUS", "STAR", "SLASH", "VBAR", "AMPER",
    /* 20 */ "LESS", "GREATER", "EQUAL", "DOT", "PERCENT",
    /* 25 */ "LBRACE", "RBRACE", "EQEQUAL", "NOTEQUAL", "LESSEQUAL",
    /* 30 */ "GREATEREQUAL", "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    /* 35 */ "DOUBLESTAR", "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    /* 40 */ "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    /* 44 */ "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47 */ "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT", "RARROW", "ELLIPSIS",
    /* 52 */ "OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000122 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000130 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000131 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000136 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000139 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000174static char *
175decode_str(const char *str, int exec_input, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000176{
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000177 return new_string(str, strlen(str));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000178}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185 tok->decoding_erred = 1;
186 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000187 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000188 tok->buf = NULL;
189 return NULL; /* as if it were EOF */
190}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192
/* Map the spellings of utf-8 and latin-1 to a canonical name.

   At most the first 12 characters of S are examined, after lower-casing
   them and replacing '_' with '-'.  Returns the literal "utf-8" or
   "iso-8859-1" when one of the known spellings (optionally followed by
   a "-suffix") is recognized; otherwise returns S itself, so callers can
   compare the result with S to see whether a replacement happened. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
215
216/* Return the coding spec in S, or NULL if none is found. */
217
218static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000219get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000220{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000221 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000222 /* Coding spec must be in a comment, and that comment must be
223 * the only statement on the source code line. */
224 for (i = 0; i < size - 6; i++) {
225 if (s[i] == '#')
226 break;
227 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
228 return NULL;
229 }
230 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000231 const char* t = s + i;
232 if (strncmp(t, "coding", 6) == 0) {
233 const char* begin = NULL;
234 t += 6;
235 if (t[0] != ':' && t[0] != '=')
236 continue;
237 do {
238 t++;
239 } while (t[0] == '\x20' || t[0] == '\t');
240
241 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000242 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000243 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000244 t++;
245
246 if (begin < t) {
247 char* r = new_string(begin, t - begin);
248 char* q = get_normal_name(r);
249 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000250 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000251 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000252 }
253 return r;
254 }
255 }
256 }
257 return NULL;
258}
259
260/* Check whether the line contains a coding spec. If it does,
261 invoke the set_readline function for the new encoding.
262 This function receives the tok_state and the new encoding.
263 Return 1 on success, 0 on failure. */
264
265static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000266check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 int set_readline(struct tok_state *, const char *))
268{
Tim Peters17db21f2002-09-03 15:39:58 +0000269 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000270 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000271
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000272 if (tok->cont_line)
273 /* It's a continuation line, so it can't be a coding spec. */
274 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000275 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000276 if (cs != NULL) {
277 tok->read_coding_spec = 1;
278 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000279 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000280 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000281 tok->encoding = cs;
282 } else {
283 r = set_readline(tok, cs);
284 if (r) {
285 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000286 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000287 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000288 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000290 }
291 } else { /* then, compare cs with BOM */
292 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000294 }
295 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000296 if (!r) {
297 cs = tok->encoding;
298 if (!cs)
299 cs = "with BOM";
300 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000302 return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306 invoke the set_readline function with the new encoding.
307 Return 1 on success, 0 on failure. */
308
309static int
310check_bom(int get_char(struct tok_state *),
311 void unget_char(int, struct tok_state *),
312 int set_readline(struct tok_state *, const char *),
313 struct tok_state *tok)
314{
315 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000316 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000317 if (ch == EOF) {
318 return 1;
319 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000320 ch = get_char(tok);
321 if (ch != 0xBB) {
322 unget_char(ch, tok);
323 unget_char(0xEF, tok);
324 /* any token beginning with '\xEF' is a bad token */
325 return 1;
326 }
327 ch = get_char(tok);
328 if (ch != 0xBF) {
329 unget_char(ch, tok);
330 unget_char(0xBB, tok);
331 unget_char(0xEF, tok);
332 /* any token beginning with '\xEF' is a bad token */
333 return 1;
334 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000335#if 0
336 /* Disable support for UTF-16 BOMs until a decision
337 is made whether this needs to be supported. */
338 } else if (ch == 0xFE) {
339 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
340 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000341 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000342 } else if (ch == 0xFF) {
343 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
344 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000345 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000346#endif
347 } else {
348 unget_char(ch, tok);
349 return 1;
350 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000351 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000352 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000353 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000354 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000355 return 1;
356}
357
358/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000359 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000361 On entry, tok->decoding_buffer will be one of:
362 1) NULL: need to call tok->decoding_readline to get a new line
363 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
364 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000365 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000366 (in the s buffer) to copy entire contents of the line read
367 by tok->decoding_readline. tok->decoding_buffer has the overflow.
368 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000370 reached): see tok_nextc and its calls to decoding_fgets.
371*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000372
373static char *
374fp_readl(char *s, int size, struct tok_state *tok)
375{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000376 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000377 const char *buf;
378 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000379
380 /* Ask for one less byte so we can terminate it */
381 assert(size > 0);
382 size--;
383
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000384 if (tok->decoding_buffer) {
385 bufobj = tok->decoding_buffer;
386 Py_INCREF(bufobj);
387 }
388 else
389 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000390 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
391 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000392 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000393 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000394 if (PyUnicode_CheckExact(bufobj))
395 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000396 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000397 if (buf == NULL) {
398 goto error;
399 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000400 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000401 else
402 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000403 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000404 if (buf == NULL) {
405 goto error;
406 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000407 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000408 }
409
410 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000411 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000412 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000413 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000414 buflen-size);
415 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000416 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000417 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000418 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000419 else
420 tok->decoding_buffer = NULL;
421
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000422 memcpy(s, buf, buflen);
423 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000424 if (buflen == 0) /* EOF */
425 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000426 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000427 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000428
429error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000430 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000431 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000432}
433
434/* Set the readline function for TOK to a StreamReader's
435 readline function. The StreamReader is named ENC.
436
437 This function is called from check_bom and check_coding_spec.
438
439 ENC is usually identical to the future value of tok->encoding,
440 except for the (currently unsupported) case of UTF-16.
441
442 Return 1 on success, 0 on failure. */
443
444static int
445fp_setreadl(struct tok_state *tok, const char* enc)
446{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000447 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000448
Christian Heimes819b8bf2008-01-03 23:05:47 +0000449 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000450 if (io == NULL)
451 goto cleanup;
452
Brett Cannon8a9583e2008-09-04 05:04:25 +0000453 if (tok->filename)
454 stream = PyObject_CallMethod(io, "open", "ssis",
455 tok->filename, "r", -1, enc);
456 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000457 stream = PyObject_CallMethod(io, "open", "isisOOO",
458 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000459 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000460 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000461
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000462 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000463 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000465
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000466 /* The file has been reopened; parsing will restart from
467 * the beginning of the file, we have to reset the line number.
468 * But this function has been called from inside tok_nextc() which
469 * will increment lineno before it returns. So we set it -1 so that
470 * the next call to tok_nextc() will start with tok->lineno == 0.
471 */
472 tok->lineno = -1;
473
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000474 cleanup:
475 Py_XDECREF(stream);
476 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000477 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000478}
479
480/* Fetch the next byte from TOK. */
481
482static int fp_getc(struct tok_state *tok) {
483 return getc(tok->fp);
484}
485
486/* Unfetch the last byte back into TOK. */
487
488static void fp_ungetc(int c, struct tok_state *tok) {
489 ungetc(c, tok->fp);
490}
491
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   s must be NUL-terminated.  Continuation bytes are examined in forward
   order, so a sequence cut short by the terminator is rejected at the
   NUL and nothing beyond it is read.  Overlong forms and code-point
   ranges are not checked here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
519
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000520/* Read a line of input from TOK. Determine encoding
521 if necessary. */
522
523static char *
524decoding_fgets(char *s, int size, struct tok_state *tok)
525{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000526 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000527 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000528 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000529 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000530 /* We already have a codec associated with
531 this input. */
532 line = fp_readl(s, size, tok);
533 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000534 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000535 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000536 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000537 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000538 break;
539 } else {
540 /* We have not yet determined the encoding.
541 If an encoding is found, use the file-pointer
542 reader functions from now on. */
543 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
544 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000545 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000546 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000547 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000548 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
549 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
550 return error_ret(tok);
551 }
552 }
553#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000554 /* The default encoding is UTF-8, so make sure we don't have any
555 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000556 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000557 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000558 int length;
559 for (c = (unsigned char *)line; *c; c += length)
560 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000561 badchar = *c;
562 break;
563 }
564 }
565 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000566 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000567 /* Need to add 1 to the line number, since this line
568 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000569 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000570 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000571 "in file %.200s on line %i, "
572 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000573 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000574 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000575 PyErr_SetString(PyExc_SyntaxError, buf);
576 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577 }
578#endif
579 return line;
580}
581
582static int
583decoding_feof(struct tok_state *tok)
584{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000585 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000586 return feof(tok->fp);
587 } else {
588 PyObject* buf = tok->decoding_buffer;
589 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000590 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000591 if (buf == NULL) {
592 error_ret(tok);
593 return 1;
594 } else {
595 tok->decoding_buffer = buf;
596 }
597 }
598 return PyObject_Length(buf) == 0;
599 }
600}
601
602/* Fetch a byte from TOK, using the string buffer. */
603
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000604static int
605buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000606 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000607}
608
609/* Unfetch a byte from TOK, using the string buffer. */
610
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000611static void
612buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000613 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000614 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000615}
616
617/* Set the readline function for TOK to ENC. For the string-based
618 tokenizer, this means to just record the encoding. */
619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000620static int
621buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000622 tok->enc = enc;
623 return 1;
624}
625
626/* Return a UTF-8 encoding Python string object from the
627 C byte string STR, which is encoded with ENC. */
628
629static PyObject *
630translate_into_utf8(const char* str, const char* enc) {
631 PyObject *utf8;
632 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
633 if (buf == NULL)
634 return NULL;
635 utf8 = PyUnicode_AsUTF8String(buf);
636 Py_DECREF(buf);
637 return utf8;
638}
639
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000640
641static char *
642translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
643 int skip_next_lf = 0, length = strlen(s), final_length;
644 char *buf, *current;
645 char c;
646 buf = PyMem_MALLOC(length + 2);
647 if (buf == NULL) {
648 tok->done = E_NOMEM;
649 return NULL;
650 }
651 for (current = buf; (c = *s++);) {
652 if (skip_next_lf) {
653 skip_next_lf = 0;
654 if (c == '\n') {
655 c = *s;
656 s++;
657 if (!c)
658 break;
659 }
660 }
661 if (c == '\r') {
662 skip_next_lf = 1;
663 c = '\n';
664 }
665 *current = c;
666 current++;
667 }
668 /* If this is exec input, add a newline to the end of the file if
669 there isn't one already. */
670 if (exec_input && *current != '\n') {
671 *current = '\n';
672 current++;
673 }
674 *current = '\0';
675 final_length = current - buf;
676 if (final_length < length && final_length)
677 /* should never fail */
678 buf = PyMem_REALLOC(buf, final_length + 1);
679 return buf;
680}
681
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000682/* Decode a byte string STR for use as the buffer of TOK.
683 Look for encoding declarations inside STR, and record them
684 inside TOK. */
685
686static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000687decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000688{
689 PyObject* utf8 = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000690 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000691 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000692 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000693 int lineno = 0;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000694 tok->input = str = translate_newlines(input, single, tok);
695 if (str == NULL)
696 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000697 tok->enc = NULL;
698 tok->str = str;
699 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000700 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000701 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000702 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000703 if (tok->enc != NULL) {
704 utf8 = translate_into_utf8(str, tok->enc);
705 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000706 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000707 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000708 }
709 for (s = str;; s++) {
710 if (*s == '\0') break;
711 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000712 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000713 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000714 lineno++;
715 if (lineno == 2) break;
716 }
717 }
718 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000719 /* need to check line 1 and 2 separately since check_coding_spec
720 assumes a single line as input */
721 if (newl[0]) {
722 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
723 return error_ret(tok);
724 if (tok->enc == NULL && newl[1]) {
725 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
726 tok, buf_setreadl))
727 return error_ret(tok);
728 }
729 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000730 if (tok->enc != NULL) {
731 assert(utf8 == NULL);
732 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson0289b152009-06-28 17:22:03 +0000733 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000734 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000735 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000736 }
737 assert(tok->decoding_buffer == NULL);
738 tok->decoding_buffer = utf8; /* CAUTION */
739 return str;
740}
741
742#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000743
744/* Set up tokenizer for string */
745
746struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000747PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000748{
749 struct tok_state *tok = tok_new();
750 if (tok == NULL)
751 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000752 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000753 if (str == NULL) {
754 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000755 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000756 }
757
Martin v. Löwis95292d62002-12-11 14:04:59 +0000758 /* XXX: constify members. */
759 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760 return tok;
761}
762
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000763struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000764PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000765{
766 struct tok_state *tok = tok_new();
767 if (tok == NULL)
768 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000769#ifndef PGEN
770 tok->input = str = translate_newlines(str, exec_input, tok);
771#endif
772 if (str == NULL) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000776 tok->decoding_state = STATE_RAW;
777 tok->read_coding_spec = 1;
778 tok->enc = NULL;
779 tok->str = str;
780 tok->encoding = (char *)PyMem_MALLOC(6);
781 if (!tok->encoding) {
782 PyTokenizer_Free(tok);
783 return NULL;
784 }
785 strcpy(tok->encoding, "utf-8");
786
787 /* XXX: constify members. */
788 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
789 return tok;
790}
791
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000792/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000793
794struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000795PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000796{
797 struct tok_state *tok = tok_new();
798 if (tok == NULL)
799 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000800 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000801 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802 return NULL;
803 }
804 tok->cur = tok->inp = tok->buf;
805 tok->end = tok->buf + BUFSIZ;
806 tok->fp = fp;
807 tok->prompt = ps1;
808 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000809 if (enc != NULL) {
810 /* Must copy encoding declaration since it
811 gets copied into the parse tree. */
812 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
813 if (!tok->encoding) {
814 PyTokenizer_Free(tok);
815 return NULL;
816 }
817 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000818 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000819 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000820 return tok;
821}
822
823
824/* Free a tok_state structure */
825
826void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000827PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000828{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000829 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000830 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000831#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000832 Py_XDECREF(tok->decoding_readline);
833 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000834#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000835 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000836 PyMem_FREE(tok->buf);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000837 if (tok->input)
838 PyMem_FREE((char *)tok->input);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000839 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000840}
841
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000842/* Get next char, updating state; error code goes into tok->done */
843
844static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000845tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000847 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000848 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000849 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000850 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000851 if (tok->done != E_OK)
852 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000853 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000854 char *end = strchr(tok->inp, '\n');
855 if (end != NULL)
856 end++;
857 else {
858 end = strchr(tok->inp, '\0');
859 if (end == tok->inp) {
860 tok->done = E_EOF;
861 return EOF;
862 }
863 }
864 if (tok->start == NULL)
865 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000866 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000867 tok->lineno++;
868 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000869 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000870 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000871 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000872 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000873#ifndef PGEN
874 if (tok->encoding && newtok && *newtok) {
875 /* Recode to UTF-8 */
876 Py_ssize_t buflen;
877 const char* buf;
878 PyObject *u = translate_into_utf8(newtok, tok->encoding);
879 PyMem_FREE(newtok);
880 if (!u) {
881 tok->done = E_DECODE;
882 return EOF;
883 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000884 buflen = PyBytes_GET_SIZE(u);
885 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000886 if (!buf) {
887 Py_DECREF(u);
888 tok->done = E_DECODE;
889 return EOF;
890 }
891 newtok = PyMem_MALLOC(buflen+1);
892 strcpy(newtok, buf);
893 Py_DECREF(u);
894 }
895#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896 if (tok->nextprompt != NULL)
897 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000898 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000899 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000900 else if (*newtok == '\0') {
901 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000902 tok->done = E_EOF;
903 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000904 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000905 size_t start = tok->start - tok->buf;
906 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000907 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000908 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000909 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000910 tok->lineno++;
911 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000912 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000913 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000914 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000915 tok->done = E_NOMEM;
916 return EOF;
917 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000918 tok->buf = buf;
919 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000920 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000921 strcpy(tok->buf + oldlen, newtok);
922 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000923 tok->inp = tok->buf + newlen;
924 tok->end = tok->inp + 1;
925 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000926 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000927 else {
928 tok->lineno++;
929 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000930 PyMem_FREE(tok->buf);
931 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000932 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000933 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000934 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000935 tok->inp = strchr(tok->buf, '\0');
936 tok->end = tok->inp + 1;
937 }
938 }
939 else {
940 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000941 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000942 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000943 if (tok->start == NULL) {
944 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000945 tok->buf = (char *)
946 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000947 if (tok->buf == NULL) {
948 tok->done = E_NOMEM;
949 return EOF;
950 }
951 tok->end = tok->buf + BUFSIZ;
952 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000953 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
954 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000955 tok->done = E_EOF;
956 done = 1;
957 }
958 else {
959 tok->done = E_OK;
960 tok->inp = strchr(tok->buf, '\0');
961 done = tok->inp[-1] == '\n';
962 }
963 }
964 else {
965 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000966 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000967 tok->done = E_EOF;
968 done = 1;
969 }
970 else
971 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000972 }
973 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000974 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000975 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000976 Py_ssize_t curstart = tok->start == NULL ? -1 :
977 tok->start - tok->buf;
978 Py_ssize_t curvalid = tok->inp - tok->buf;
979 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000980 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000981 newbuf = (char *)PyMem_REALLOC(newbuf,
982 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000983 if (newbuf == NULL) {
984 tok->done = E_NOMEM;
985 tok->cur = tok->inp;
986 return EOF;
987 }
988 tok->buf = newbuf;
989 tok->inp = tok->buf + curvalid;
990 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000991 tok->start = curstart < 0 ? NULL :
992 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000993 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000994 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000995 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000996 /* Break out early on decoding
997 errors, as tok->buf will be NULL
998 */
999 if (tok->decoding_erred)
1000 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001001 /* Last line does not end in \n,
1002 fake one */
1003 strcpy(tok->inp, "\n");
1004 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001005 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001006 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001007 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001008 if (tok->buf != NULL) {
1009 tok->cur = tok->buf + cur;
1010 tok->line_start = tok->cur;
1011 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +00001012 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001013 pt = tok->inp - 2;
1014 if (pt >= tok->buf && *pt == '\r') {
1015 *pt++ = '\n';
1016 *pt = '\0';
1017 tok->inp = pt;
1018 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001019 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001020 }
1021 if (tok->done != E_OK) {
1022 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001023 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001024 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001025 return EOF;
1026 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001027 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001028 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001029}
1030
1031
1032/* Back-up one character */
1033
1034static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001035tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001036{
1037 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001038 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +00001039 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001040 if (*tok->cur != c)
1041 *tok->cur = c;
1042 }
1043}
1044
1045
1046/* Return the token corresponding to a single character */
1047
1048int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001049PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001050{
1051 switch (c) {
1052 case '(': return LPAR;
1053 case ')': return RPAR;
1054 case '[': return LSQB;
1055 case ']': return RSQB;
1056 case ':': return COLON;
1057 case ',': return COMMA;
1058 case ';': return SEMI;
1059 case '+': return PLUS;
1060 case '-': return MINUS;
1061 case '*': return STAR;
1062 case '/': return SLASH;
1063 case '|': return VBAR;
1064 case '&': return AMPER;
1065 case '<': return LESS;
1066 case '>': return GREATER;
1067 case '=': return EQUAL;
1068 case '.': return DOT;
1069 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001070 case '{': return LBRACE;
1071 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001072 case '^': return CIRCUMFLEX;
1073 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001074 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001075 default: return OP;
1076 }
1077}
1078
1079
Guido van Rossumfbab9051991-10-20 20:25:03 +00001080int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001081PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001082{
1083 switch (c1) {
1084 case '=':
1085 switch (c2) {
1086 case '=': return EQEQUAL;
1087 }
1088 break;
1089 case '!':
1090 switch (c2) {
1091 case '=': return NOTEQUAL;
1092 }
1093 break;
1094 case '<':
1095 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001096 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001097 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001098 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001099 }
1100 break;
1101 case '>':
1102 switch (c2) {
1103 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001104 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001105 }
1106 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001107 case '+':
1108 switch (c2) {
1109 case '=': return PLUSEQUAL;
1110 }
1111 break;
1112 case '-':
1113 switch (c2) {
1114 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001115 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001116 }
1117 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001118 case '*':
1119 switch (c2) {
1120 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001121 case '=': return STAREQUAL;
1122 }
1123 break;
1124 case '/':
1125 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001126 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001127 case '=': return SLASHEQUAL;
1128 }
1129 break;
1130 case '|':
1131 switch (c2) {
1132 case '=': return VBAREQUAL;
1133 }
1134 break;
1135 case '%':
1136 switch (c2) {
1137 case '=': return PERCENTEQUAL;
1138 }
1139 break;
1140 case '&':
1141 switch (c2) {
1142 case '=': return AMPEREQUAL;
1143 }
1144 break;
1145 case '^':
1146 switch (c2) {
1147 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001148 }
1149 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001150 }
1151 return OP;
1152}
1153
Thomas Wouters434d0822000-08-24 20:11:32 +00001154int
1155PyToken_ThreeChars(int c1, int c2, int c3)
1156{
1157 switch (c1) {
1158 case '<':
1159 switch (c2) {
1160 case '<':
1161 switch (c3) {
1162 case '=':
1163 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001164 }
1165 break;
1166 }
1167 break;
1168 case '>':
1169 switch (c2) {
1170 case '>':
1171 switch (c3) {
1172 case '=':
1173 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001174 }
1175 break;
1176 }
1177 break;
1178 case '*':
1179 switch (c2) {
1180 case '*':
1181 switch (c3) {
1182 case '=':
1183 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001184 }
1185 break;
1186 }
1187 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001188 case '/':
1189 switch (c2) {
1190 case '/':
1191 switch (c3) {
1192 case '=':
1193 return DOUBLESLASHEQUAL;
1194 }
1195 break;
1196 }
1197 break;
Georg Brandldde00282007-03-18 19:01:53 +00001198 case '.':
1199 switch (c2) {
1200 case '.':
1201 switch (c3) {
1202 case '.':
1203 return ELLIPSIS;
1204 }
1205 break;
1206 }
1207 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001208 }
1209 return OP;
1210}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001211
Guido van Rossum926f13a1998-04-09 21:38:06 +00001212static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001213indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001214{
1215 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001216 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001217 tok->cur = tok->inp;
1218 return 1;
1219 }
1220 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001221 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1222 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001223 tok->altwarning = 0;
1224 }
1225 return 0;
1226}
1227
Martin v. Löwis47383402007-08-15 07:32:56 +00001228#ifdef PGEN
1229#define verify_identifier(s,e) 1
1230#else
1231/* Verify that the identifier follows PEP 3131. */
1232static int
1233verify_identifier(char *start, char *end)
1234{
Guido van Rossume3e37012007-08-29 18:54:41 +00001235 PyObject *s;
1236 int result;
1237 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1238 if (s == NULL) {
1239 PyErr_Clear();
1240 return 0;
1241 }
1242 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001243 Py_DECREF(s);
1244 return result;
1245}
1246#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001247
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001248/* Get next token, after space stripping etc. */
1249
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001250static int
1251tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001252{
1253 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001254 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001255
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001256 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001257 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001258 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001259 blankline = 0;
1260
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001261 /* Get indentation level */
1262 if (tok->atbol) {
1263 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001264 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001265 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001266 for (;;) {
1267 c = tok_nextc(tok);
1268 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001269 col++, altcol++;
1270 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001271 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001272 altcol = (altcol/tok->alttabsize + 1)
1273 * tok->alttabsize;
1274 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001275 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001276 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001277 else
1278 break;
1279 }
1280 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001281 if (c == '#' || c == '\n') {
1282 /* Lines with only whitespace and/or comments
1283 shouldn't affect the indentation and are
1284 not passed to the parser as NEWLINE tokens,
1285 except *totally* empty lines in interactive
1286 mode, which signal the end of a command group. */
1287 if (col == 0 && c == '\n' && tok->prompt != NULL)
1288 blankline = 0; /* Let it through */
1289 else
1290 blankline = 1; /* Ignore completely */
1291 /* We can't jump back right here since we still
1292 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001293 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001294 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001295 if (col == tok->indstack[tok->indent]) {
1296 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001297 if (altcol != tok->altindstack[tok->indent]) {
1298 if (indenterror(tok))
1299 return ERRORTOKEN;
1300 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001301 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001302 else if (col > tok->indstack[tok->indent]) {
1303 /* Indent -- always one */
1304 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001305 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001306 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001307 return ERRORTOKEN;
1308 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001309 if (altcol <= tok->altindstack[tok->indent]) {
1310 if (indenterror(tok))
1311 return ERRORTOKEN;
1312 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001313 tok->pendin++;
1314 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001315 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001317 else /* col < tok->indstack[tok->indent] */ {
1318 /* Dedent -- any number, must be consistent */
1319 while (tok->indent > 0 &&
1320 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001321 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001322 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001323 }
1324 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001325 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001326 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001327 return ERRORTOKEN;
1328 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001329 if (altcol != tok->altindstack[tok->indent]) {
1330 if (indenterror(tok))
1331 return ERRORTOKEN;
1332 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001333 }
1334 }
1335 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001336
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001337 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001338
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001339 /* Return pending indents/dedents */
1340 if (tok->pendin != 0) {
1341 if (tok->pendin < 0) {
1342 tok->pendin++;
1343 return DEDENT;
1344 }
1345 else {
1346 tok->pendin--;
1347 return INDENT;
1348 }
1349 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001350
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001352 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001353 /* Skip spaces */
1354 do {
1355 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001356 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001357
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001359 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001360
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001361 /* Skip comment */
1362 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001363 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001365
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001366 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001367 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001368 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001369 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001370
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001372 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001373 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001374 /* Process b"", r"" and br"" */
1375 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001376 c = tok_nextc(tok);
1377 if (c == '"' || c == '\'')
1378 goto letter_quote;
1379 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001380 if (c == 'r' || c == 'R') {
1381 c = tok_nextc(tok);
1382 if (c == '"' || c == '\'')
1383 goto letter_quote;
1384 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001385 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001386 if (c >= 128)
1387 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001388 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001389 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001390 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001391 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001392 !verify_identifier(tok->start, tok->cur)) {
1393 tok->done = E_IDENTIFIER;
1394 return ERRORTOKEN;
1395 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001396 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397 *p_end = tok->cur;
1398 return NAME;
1399 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001400
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 /* Newline */
1402 if (c == '\n') {
1403 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001404 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001405 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001406 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001407 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001408 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001409 return NEWLINE;
1410 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001411
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001412 /* Period or number starting with period? */
1413 if (c == '.') {
1414 c = tok_nextc(tok);
1415 if (isdigit(c)) {
1416 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001417 } else if (c == '.') {
1418 c = tok_nextc(tok);
1419 if (c == '.') {
1420 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001421 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001422 return ELLIPSIS;
1423 } else {
1424 tok_backup(tok, c);
1425 }
1426 tok_backup(tok, '.');
1427 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001428 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001429 }
Georg Brandldde00282007-03-18 19:01:53 +00001430 *p_start = tok->start;
1431 *p_end = tok->cur;
1432 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001433 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001434
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001435 /* Number */
1436 if (isdigit(c)) {
1437 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001438 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001439 c = tok_nextc(tok);
1440 if (c == '.')
1441 goto fraction;
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001442 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001443 goto imaginary;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001444 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001445
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001446 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001447 c = tok_nextc(tok);
1448 if (!isxdigit(c)) {
1449 tok->done = E_TOKEN;
1450 tok_backup(tok, c);
1451 return ERRORTOKEN;
1452 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001453 do {
1454 c = tok_nextc(tok);
1455 } while (isxdigit(c));
1456 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001457 else if (c == 'o' || c == 'O') {
1458 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001459 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001460 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001461 tok->done = E_TOKEN;
1462 tok_backup(tok, c);
1463 return ERRORTOKEN;
1464 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001465 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001466 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001467 } while ('0' <= c && c < '8');
1468 }
1469 else if (c == 'b' || c == 'B') {
1470 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001471 c = tok_nextc(tok);
1472 if (c != '0' && c != '1') {
1473 tok->done = E_TOKEN;
1474 tok_backup(tok, c);
1475 return ERRORTOKEN;
1476 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001477 do {
1478 c = tok_nextc(tok);
1479 } while (c == '0' || c == '1');
1480 }
1481 else {
1482 int nonzero = 0;
1483 /* maybe old-style octal; c is first char of it */
1484 /* in any case, allow '0' as a literal */
1485 while (c == '0')
1486 c = tok_nextc(tok);
1487 while (isdigit(c)) {
1488 nonzero = 1;
1489 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001490 }
1491 if (c == '.')
1492 goto fraction;
1493 else if (c == 'e' || c == 'E')
1494 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001495 else if (c == 'j' || c == 'J')
1496 goto imaginary;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001497 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001498 tok->done = E_TOKEN;
1499 tok_backup(tok, c);
1500 return ERRORTOKEN;
1501 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001502 }
1503 }
1504 else {
1505 /* Decimal */
1506 do {
1507 c = tok_nextc(tok);
1508 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001509 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001510 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001511 if (c == '.') {
1512 fraction:
1513 /* Fraction */
1514 do {
1515 c = tok_nextc(tok);
1516 } while (isdigit(c));
1517 }
1518 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001519 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001520 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001522 if (c == '+' || c == '-')
1523 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001524 if (!isdigit(c)) {
1525 tok->done = E_TOKEN;
1526 tok_backup(tok, c);
1527 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001528 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001529 do {
1530 c = tok_nextc(tok);
1531 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001532 }
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001533 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001534 /* Imaginary part */
1535 imaginary:
1536 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001537 }
1538 }
1539 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001540 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001541 *p_end = tok->cur;
1542 return NUMBER;
1543 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001544
1545 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001546 /* String */
1547 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001548 int quote = c;
1549 int quote_size = 1; /* 1 or 3 */
1550 int end_quote_size = 0;
1551
1552 /* Find the quote size and start of string */
1553 c = tok_nextc(tok);
1554 if (c == quote) {
1555 c = tok_nextc(tok);
1556 if (c == quote)
1557 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001558 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001559 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001560 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001561 if (c != quote)
1562 tok_backup(tok, c);
1563
1564 /* Get rest of string */
1565 while (end_quote_size != quote_size) {
1566 c = tok_nextc(tok);
1567 if (c == EOF) {
1568 if (quote_size == 3)
1569 tok->done = E_EOFS;
1570 else
1571 tok->done = E_EOLS;
1572 tok->cur = tok->inp;
1573 return ERRORTOKEN;
1574 }
1575 if (quote_size == 1 && c == '\n') {
1576 tok->done = E_EOLS;
1577 tok->cur = tok->inp;
1578 return ERRORTOKEN;
1579 }
1580 if (c == quote)
1581 end_quote_size += 1;
1582 else {
1583 end_quote_size = 0;
1584 if (c == '\\')
1585 c = tok_nextc(tok); /* skip escaped char */
1586 }
1587 }
1588
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001589 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001590 *p_end = tok->cur;
1591 return STRING;
1592 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001593
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001594 /* Line continuation */
1595 if (c == '\\') {
1596 c = tok_nextc(tok);
1597 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001598 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001599 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001600 return ERRORTOKEN;
1601 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001602 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001603 goto again; /* Read next line */
1604 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001605
Guido van Rossumfbab9051991-10-20 20:25:03 +00001606 /* Check for two-character token */
1607 {
1608 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001609 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001610 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001611 int c3 = tok_nextc(tok);
1612 int token3 = PyToken_ThreeChars(c, c2, c3);
1613 if (token3 != OP) {
1614 token = token3;
1615 } else {
1616 tok_backup(tok, c3);
1617 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001618 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001619 *p_end = tok->cur;
1620 return token;
1621 }
1622 tok_backup(tok, c2);
1623 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001624
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001625 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001626 switch (c) {
1627 case '(':
1628 case '[':
1629 case '{':
1630 tok->level++;
1631 break;
1632 case ')':
1633 case ']':
1634 case '}':
1635 tok->level--;
1636 break;
1637 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001638
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001639 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001640 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001641 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001642 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001643}
1644
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001645int
1646PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1647{
1648 int result = tok_get(tok, p_start, p_end);
1649 if (tok->decoding_erred) {
1650 result = ERRORTOKEN;
1651 tok->done = E_DECODE;
1652 }
1653 return result;
1654}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001655
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001656/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001657
1658 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001659 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001660 should be assumed to be PyUnicode_GetDefaultEncoding()).
1661
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001662 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1663 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001664*/
1665char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001666PyTokenizer_FindEncoding(int fd)
1667{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001668 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001669 FILE *fp;
1670 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001671
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001672 fd = dup(fd);
1673 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001674 return NULL;
1675 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001676 fp = fdopen(fd, "r");
1677 if (fp == NULL) {
1678 return NULL;
1679 }
1680 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1681 if (tok == NULL) {
1682 fclose(fp);
1683 return NULL;
1684 }
1685 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001686 PyTokenizer_Get(tok, &p_start, &p_end);
1687 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001688 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001689 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001690 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001691 if (encoding)
1692 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001693 }
1694 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001695 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001696}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001697
Guido van Rossum408027e1996-12-30 16:17:54 +00001698#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001699
1700void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001701tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001702{
Guido van Rossum86bea461997-04-29 21:03:06 +00001703 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001704 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1705 printf("(%.*s)", (int)(end - start), start);
1706}
1707
1708#endif