blob: be2940cd67ff9e555598d4a82b51d21e9a22cb39 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Non-zero when C may begin an identifier: an ASCII letter, '_', or any
   value of 128 or above (i.e. outside the ASCII range).
   The argument is parenthesised at every use so that an expression such
   as `x | 0` is classified correctly; it is still evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* Non-zero when C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value of 128 or above (i.e. outside the ASCII range).
   The argument is parenthesised at every use so that an expression such
   as `x | 0` is classified correctly; it is still evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Printable names of the token types, indexed by token number. */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* More operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000122 tok->input = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000130 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000131 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000132 tok->altwarning = 1;
133 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000134 tok->alttabsize = 1;
135 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000136 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000137 tok->decoding_erred = 0;
138 tok->read_coding_spec = 0;
Brett Cannonda780432008-10-17 03:38:50 +0000139 tok->enc = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000140 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000141 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000142#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000146 return tok;
147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152 char* result = (char *)PyMem_MALLOC(len + 1);
153 if (result != NULL) {
154 memcpy(result, s, len);
155 result[len] = '\0';
156 }
157 return result;
158}
159
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165 return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171 return feof(tok->fp);
172}
173
/* PGEN build: no decoding is done; return a freshly allocated copy of
   STR (NULL if the allocation fails).  EXEC_INPUT and TOK are unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t nbytes = strlen(str);

    return new_string(str, nbytes);
}
179
180#else /* PGEN */
181
/* Locale-independent replacement for isalnum(): returns 1 only for the
   ASCII letters and digits and 0 for everything else, so that the
   current locale cannot influence tokenization. */

static int
ascii_isalnum(int c)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'A' && c <= 'Z')
        return 1;
    return c >= 'a' && c <= 'z';
}
191
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192static char *
193error_ret(struct tok_state *tok) /* XXX */
194{
195 tok->decoding_erred = 1;
196 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000197 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000198 tok->buf = NULL;
199 return NULL; /* as if it were EOF */
200}
201
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000202
/* Map the spellings of UTF-8 and Latin-1 that are recognised here onto
   one canonical name each.

   Only the first 12 characters of S are examined.  They are lower-cased
   and '_' is replaced by '-' before comparing.  Returns the string
   literal "utf-8" or "iso-8859-1" when S (or S followed by "-<suffix>")
   names one of those encodings; otherwise returns S itself, unchanged.
   Callers can therefore tell whether a normalisation happened by
   comparing the result with S.

   Case folding is done by hand for 'A'..'Z' only: tolower() depends on
   the current locale and is undefined for negative char values, and the
   tokenizer must behave the same in every locale. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else if (c >= 'A' && c <= 'Z')
            buf[i] = (char)(c - 'A' + 'a');
        else
            buf[i] = (char)c;
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
231
232/* Return the coding spec in S, or NULL if none is found. */
233
234static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000235get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000236{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000238 /* Coding spec must be in a comment, and that comment must be
239 * the only statement on the source code line. */
240 for (i = 0; i < size - 6; i++) {
241 if (s[i] == '#')
242 break;
243 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
244 return NULL;
245 }
246 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000247 const char* t = s + i;
248 if (strncmp(t, "coding", 6) == 0) {
249 const char* begin = NULL;
250 t += 6;
251 if (t[0] != ':' && t[0] != '=')
252 continue;
253 do {
254 t++;
255 } while (t[0] == '\x20' || t[0] == '\t');
256
257 begin = t;
Benjamin Peterson8daa49e2010-04-03 22:55:48 +0000258 while (ascii_isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000259 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000260 t++;
261
262 if (begin < t) {
263 char* r = new_string(begin, t - begin);
264 char* q = get_normal_name(r);
265 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000267 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000268 }
269 return r;
270 }
271 }
272 }
273 return NULL;
274}
275
276/* Check whether the line contains a coding spec. If it does,
277 invoke the set_readline function for the new encoding.
278 This function receives the tok_state and the new encoding.
279 Return 1 on success, 0 on failure. */
280
281static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000282check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000283 int set_readline(struct tok_state *, const char *))
284{
Tim Peters17db21f2002-09-03 15:39:58 +0000285 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000286 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000287
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000288 if (tok->cont_line)
289 /* It's a continuation line, so it can't be a coding spec. */
290 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000291 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 if (cs != NULL) {
293 tok->read_coding_spec = 1;
294 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000295 assert(tok->decoding_state == STATE_RAW);
Brett Cannonda780432008-10-17 03:38:50 +0000296 if (strcmp(cs, "utf-8") == 0) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000297 tok->encoding = cs;
298 } else {
299 r = set_readline(tok, cs);
300 if (r) {
301 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000302 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000303 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000304 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000306 }
307 } else { /* then, compare cs with BOM */
308 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000309 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000310 }
311 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000312 if (!r) {
313 cs = tok->encoding;
314 if (!cs)
315 cs = "with BOM";
316 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
317 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000318 return r;
319}
320
321/* See whether the file starts with a BOM. If it does,
322 invoke the set_readline function with the new encoding.
323 Return 1 on success, 0 on failure. */
324
325static int
326check_bom(int get_char(struct tok_state *),
327 void unget_char(int, struct tok_state *),
328 int set_readline(struct tok_state *, const char *),
329 struct tok_state *tok)
330{
Victor Stinner6aa278e2010-03-03 00:18:49 +0000331 int ch1, ch2, ch3;
332 ch1 = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000333 tok->decoding_state = STATE_RAW;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000334 if (ch1 == EOF) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000335 return 1;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000336 } else if (ch1 == 0xEF) {
337 ch2 = get_char(tok);
338 if (ch2 != 0xBB) {
339 unget_char(ch2, tok);
340 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000341 return 1;
342 }
Victor Stinner6aa278e2010-03-03 00:18:49 +0000343 ch3 = get_char(tok);
344 if (ch3 != 0xBF) {
345 unget_char(ch3, tok);
346 unget_char(ch2, tok);
347 unget_char(ch1, tok);
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000348 return 1;
349 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000350#if 0
351 /* Disable support for UTF-16 BOMs until a decision
352 is made whether this needs to be supported. */
Victor Stinner6aa278e2010-03-03 00:18:49 +0000353 } else if (ch1 == 0xFE) {
354 ch2 = get_char(tok);
355 if (ch2 != 0xFF) {
356 unget_char(ch2, tok);
357 unget_char(ch1, tok);
358 return 1;
359 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000360 if (!set_readline(tok, "utf-16-be"))
361 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000362 tok->decoding_state = STATE_NORMAL;
Victor Stinner6aa278e2010-03-03 00:18:49 +0000363 } else if (ch1 == 0xFF) {
364 ch2 = get_char(tok);
365 if (ch2 != 0xFE) {
366 unget_char(ch2, tok);
367 unget_char(ch1, tok);
368 return 1;
369 }
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000370 if (!set_readline(tok, "utf-16-le"))
371 return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000372 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000373#endif
374 } else {
Victor Stinner6aa278e2010-03-03 00:18:49 +0000375 unget_char(ch1, tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000376 return 1;
377 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000378 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000380 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000381 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000382 return 1;
383}
384
385/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000386 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000388 On entry, tok->decoding_buffer will be one of:
389 1) NULL: need to call tok->decoding_readline to get a new line
390 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
391 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000392 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000393 (in the s buffer) to copy entire contents of the line read
394 by tok->decoding_readline. tok->decoding_buffer has the overflow.
395 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000396 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000397 reached): see tok_nextc and its calls to decoding_fgets.
398*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000399
400static char *
401fp_readl(char *s, int size, struct tok_state *tok)
402{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000403 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000404 const char *buf;
405 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000406
407 /* Ask for one less byte so we can terminate it */
408 assert(size > 0);
409 size--;
410
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000411 if (tok->decoding_buffer) {
412 bufobj = tok->decoding_buffer;
413 Py_INCREF(bufobj);
414 }
415 else
416 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000417 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
418 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000419 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000420 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000421 if (PyUnicode_CheckExact(bufobj))
422 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000423 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000424 if (buf == NULL) {
425 goto error;
426 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000427 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000428 else
429 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000430 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000431 if (buf == NULL) {
432 goto error;
433 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000434 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000435 }
436
437 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000438 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000439 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000440 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000441 buflen-size);
442 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000443 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000444 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000445 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000446 else
447 tok->decoding_buffer = NULL;
448
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000449 memcpy(s, buf, buflen);
450 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000451 if (buflen == 0) /* EOF */
452 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000453 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000454 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000455
456error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000457 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000458 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459}
460
461/* Set the readline function for TOK to a StreamReader's
462 readline function. The StreamReader is named ENC.
463
464 This function is called from check_bom and check_coding_spec.
465
466 ENC is usually identical to the future value of tok->encoding,
467 except for the (currently unsupported) case of UTF-16.
468
469 Return 1 on success, 0 on failure. */
470
471static int
472fp_setreadl(struct tok_state *tok, const char* enc)
473{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000474 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000475
Christian Heimes819b8bf2008-01-03 23:05:47 +0000476 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000477 if (io == NULL)
478 goto cleanup;
479
Brett Cannon8a9583e2008-09-04 05:04:25 +0000480 if (tok->filename)
481 stream = PyObject_CallMethod(io, "open", "ssis",
482 tok->filename, "r", -1, enc);
483 else
Kristján Valur Jónsson19288c22008-12-18 17:15:54 +0000484 stream = PyObject_CallMethod(io, "open", "isisOOO",
485 fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000486 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000487 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000488
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000489 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000490 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000491 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000492
Amaury Forgeot d'Arccf8016a2008-10-09 23:37:48 +0000493 /* The file has been reopened; parsing will restart from
494 * the beginning of the file, we have to reset the line number.
495 * But this function has been called from inside tok_nextc() which
496 * will increment lineno before it returns. So we set it -1 so that
497 * the next call to tok_nextc() will start with tok->lineno == 0.
498 */
499 tok->lineno = -1;
500
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000501 cleanup:
502 Py_XDECREF(stream);
503 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000504 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000505}
506
507/* Fetch the next byte from TOK. */
508
509static int fp_getc(struct tok_state *tok) {
510 return getc(tok->fp);
511}
512
513/* Unfetch the last byte back into TOK. */
514
515static void fp_ungetc(int c, struct tok_state *tok) {
516 ungetc(c, tok->fp);
517}
518
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the structure is checked: a lead
   byte in 0xC0..0xF7 followed by the right number of continuation
   bytes (0x80..0xBF).  Continuation bytes are examined front to back
   and the scan stops at the first byte that does not fit, so a
   sequence cut short by the terminating NUL never causes a read past
   the end of the string. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
546
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547/* Read a line of input from TOK. Determine encoding
548 if necessary. */
549
550static char *
551decoding_fgets(char *s, int size, struct tok_state *tok)
552{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000553 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000554 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000555 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000556 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000557 /* We already have a codec associated with
558 this input. */
559 line = fp_readl(s, size, tok);
560 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000561 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000562 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000563 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000564 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000565 break;
566 } else {
567 /* We have not yet determined the encoding.
568 If an encoding is found, use the file-pointer
569 reader functions from now on. */
570 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
571 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000572 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000573 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000574 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000575 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
576 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
577 return error_ret(tok);
578 }
579 }
580#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000581 /* The default encoding is UTF-8, so make sure we don't have any
582 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000583 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000584 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000585 int length;
586 for (c = (unsigned char *)line; *c; c += length)
587 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000588 badchar = *c;
589 break;
590 }
591 }
592 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000593 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000594 /* Need to add 1 to the line number, since this line
595 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000596 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000597 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000598 "in file %.200s on line %i, "
599 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000600 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000601 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000602 PyErr_SetString(PyExc_SyntaxError, buf);
603 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000604 }
605#endif
606 return line;
607}
608
609static int
610decoding_feof(struct tok_state *tok)
611{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000612 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000613 return feof(tok->fp);
614 } else {
615 PyObject* buf = tok->decoding_buffer;
616 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000617 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000618 if (buf == NULL) {
619 error_ret(tok);
620 return 1;
621 } else {
622 tok->decoding_buffer = buf;
623 }
624 }
625 return PyObject_Length(buf) == 0;
626 }
627}
628
629/* Fetch a byte from TOK, using the string buffer. */
630
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000631static int
632buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000633 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000634}
635
636/* Unfetch a byte from TOK, using the string buffer. */
637
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000638static void
639buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000641 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000642}
643
644/* Set the readline function for TOK to ENC. For the string-based
645 tokenizer, this means to just record the encoding. */
646
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000647static int
648buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000649 tok->enc = enc;
650 return 1;
651}
652
653/* Return a UTF-8 encoding Python string object from the
654 C byte string STR, which is encoded with ENC. */
655
656static PyObject *
657translate_into_utf8(const char* str, const char* enc) {
658 PyObject *utf8;
659 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
660 if (buf == NULL)
661 return NULL;
662 utf8 = PyUnicode_AsUTF8String(buf);
663 Py_DECREF(buf);
664 return utf8;
665}
666
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000667
668static char *
669translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000670 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000671 char *buf, *current;
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000672 char c = '\0';
673 buf = PyMem_MALLOC(needed_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000674 if (buf == NULL) {
675 tok->done = E_NOMEM;
676 return NULL;
677 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000678 for (current = buf; *s; s++, current++) {
679 c = *s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000680 if (skip_next_lf) {
681 skip_next_lf = 0;
682 if (c == '\n') {
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000683 c = *++s;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000684 if (!c)
685 break;
686 }
687 }
688 if (c == '\r') {
689 skip_next_lf = 1;
690 c = '\n';
691 }
692 *current = c;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000693 }
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000694 /* If this is exec input, add a newline to the end of the string if
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000695 there isn't one already. */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000696 if (exec_input && c != '\n') {
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000697 *current = '\n';
698 current++;
699 }
700 *current = '\0';
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000701 final_length = current - buf + 1;
702 if (final_length < needed_length && final_length)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000703 /* should never fail */
Benjamin Peterson8f326b22009-12-13 02:10:36 +0000704 buf = PyMem_REALLOC(buf, final_length);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000705 return buf;
706}
707
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000708/* Decode a byte string STR for use as the buffer of TOK.
709 Look for encoding declarations inside STR, and record them
710 inside TOK. */
711
712static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000713decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000714{
715 PyObject* utf8 = NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000716 const char *str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000717 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000718 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000719 int lineno = 0;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000720 tok->input = str = translate_newlines(input, single, tok);
721 if (str == NULL)
722 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000723 tok->enc = NULL;
724 tok->str = str;
725 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000726 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000727 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000728 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000729 if (tok->enc != NULL) {
730 utf8 = translate_into_utf8(str, tok->enc);
731 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000732 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000733 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000734 }
735 for (s = str;; s++) {
736 if (*s == '\0') break;
737 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000738 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000739 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000740 lineno++;
741 if (lineno == 2) break;
742 }
743 }
744 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000745 /* need to check line 1 and 2 separately since check_coding_spec
746 assumes a single line as input */
747 if (newl[0]) {
748 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
749 return error_ret(tok);
750 if (tok->enc == NULL && newl[1]) {
751 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
752 tok, buf_setreadl))
753 return error_ret(tok);
754 }
755 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000756 if (tok->enc != NULL) {
757 assert(utf8 == NULL);
758 utf8 = translate_into_utf8(str, tok->enc);
Benjamin Peterson0289b152009-06-28 17:22:03 +0000759 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000760 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000761 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000762 }
763 assert(tok->decoding_buffer == NULL);
764 tok->decoding_buffer = utf8; /* CAUTION */
765 return str;
766}
767
768#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000769
770/* Set up tokenizer for string */
771
772struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000773PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000774{
775 struct tok_state *tok = tok_new();
776 if (tok == NULL)
777 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000778 str = (char *)decode_str(str, exec_input, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000779 if (str == NULL) {
780 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000781 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000782 }
783
Martin v. Löwis95292d62002-12-11 14:04:59 +0000784 /* XXX: constify members. */
785 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000786 return tok;
787}
788
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000789struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000790PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000791{
792 struct tok_state *tok = tok_new();
793 if (tok == NULL)
794 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000795#ifndef PGEN
796 tok->input = str = translate_newlines(str, exec_input, tok);
797#endif
798 if (str == NULL) {
799 PyTokenizer_Free(tok);
800 return NULL;
801 }
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000802 tok->decoding_state = STATE_RAW;
803 tok->read_coding_spec = 1;
804 tok->enc = NULL;
805 tok->str = str;
806 tok->encoding = (char *)PyMem_MALLOC(6);
807 if (!tok->encoding) {
808 PyTokenizer_Free(tok);
809 return NULL;
810 }
811 strcpy(tok->encoding, "utf-8");
812
813 /* XXX: constify members. */
814 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
815 return tok;
816}
817
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000818/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000819
820struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000821PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000822{
823 struct tok_state *tok = tok_new();
824 if (tok == NULL)
825 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000826 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000827 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000828 return NULL;
829 }
830 tok->cur = tok->inp = tok->buf;
831 tok->end = tok->buf + BUFSIZ;
832 tok->fp = fp;
833 tok->prompt = ps1;
834 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000835 if (enc != NULL) {
836 /* Must copy encoding declaration since it
837 gets copied into the parse tree. */
838 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
839 if (!tok->encoding) {
840 PyTokenizer_Free(tok);
841 return NULL;
842 }
843 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000844 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000845 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000846 return tok;
847}
848
849
850/* Free a tok_state structure */
851
852void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000853PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000854{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000855 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000856 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000857#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000858 Py_XDECREF(tok->decoding_readline);
859 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000860#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000861 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000862 PyMem_FREE(tok->buf);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000863 if (tok->input)
864 PyMem_FREE((char *)tok->input);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000865 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000866}
867
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000868/* Get next char, updating state; error code goes into tok->done */
869
870static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000871tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000872{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000873 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000874 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000875 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000876 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000877 if (tok->done != E_OK)
878 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000879 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000880 char *end = strchr(tok->inp, '\n');
881 if (end != NULL)
882 end++;
883 else {
884 end = strchr(tok->inp, '\0');
885 if (end == tok->inp) {
886 tok->done = E_EOF;
887 return EOF;
888 }
889 }
890 if (tok->start == NULL)
891 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000892 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000893 tok->lineno++;
894 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000895 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000896 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000897 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000898 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000899#ifndef PGEN
900 if (tok->encoding && newtok && *newtok) {
901 /* Recode to UTF-8 */
902 Py_ssize_t buflen;
903 const char* buf;
904 PyObject *u = translate_into_utf8(newtok, tok->encoding);
905 PyMem_FREE(newtok);
906 if (!u) {
907 tok->done = E_DECODE;
908 return EOF;
909 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000910 buflen = PyBytes_GET_SIZE(u);
911 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000912 if (!buf) {
913 Py_DECREF(u);
914 tok->done = E_DECODE;
915 return EOF;
916 }
917 newtok = PyMem_MALLOC(buflen+1);
918 strcpy(newtok, buf);
919 Py_DECREF(u);
920 }
921#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000922 if (tok->nextprompt != NULL)
923 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000924 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000925 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000926 else if (*newtok == '\0') {
927 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000928 tok->done = E_EOF;
929 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000930 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000931 size_t start = tok->start - tok->buf;
932 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000933 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000934 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000935 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000936 tok->lineno++;
937 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000938 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000939 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000940 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000941 tok->done = E_NOMEM;
942 return EOF;
943 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000944 tok->buf = buf;
945 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000946 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000947 strcpy(tok->buf + oldlen, newtok);
948 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000949 tok->inp = tok->buf + newlen;
950 tok->end = tok->inp + 1;
951 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000952 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000953 else {
954 tok->lineno++;
955 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000956 PyMem_FREE(tok->buf);
957 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000958 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000959 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000960 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000961 tok->inp = strchr(tok->buf, '\0');
962 tok->end = tok->inp + 1;
963 }
964 }
965 else {
966 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000967 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000968 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000969 if (tok->start == NULL) {
970 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000971 tok->buf = (char *)
972 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000973 if (tok->buf == NULL) {
974 tok->done = E_NOMEM;
975 return EOF;
976 }
977 tok->end = tok->buf + BUFSIZ;
978 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000979 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
980 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000981 tok->done = E_EOF;
982 done = 1;
983 }
984 else {
985 tok->done = E_OK;
986 tok->inp = strchr(tok->buf, '\0');
987 done = tok->inp[-1] == '\n';
988 }
989 }
990 else {
991 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000992 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000993 tok->done = E_EOF;
994 done = 1;
995 }
996 else
997 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000998 }
999 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001000 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001001 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00001002 Py_ssize_t curstart = tok->start == NULL ? -1 :
1003 tok->start - tok->buf;
1004 Py_ssize_t curvalid = tok->inp - tok->buf;
1005 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001006 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001007 newbuf = (char *)PyMem_REALLOC(newbuf,
1008 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001009 if (newbuf == NULL) {
1010 tok->done = E_NOMEM;
1011 tok->cur = tok->inp;
1012 return EOF;
1013 }
1014 tok->buf = newbuf;
1015 tok->inp = tok->buf + curvalid;
1016 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001017 tok->start = curstart < 0 ? NULL :
1018 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001019 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001020 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001021 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +00001022 /* Break out early on decoding
1023 errors, as tok->buf will be NULL
1024 */
1025 if (tok->decoding_erred)
1026 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001027 /* Last line does not end in \n,
1028 fake one */
1029 strcpy(tok->inp, "\n");
1030 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001031 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001032 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001033 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001034 if (tok->buf != NULL) {
1035 tok->cur = tok->buf + cur;
1036 tok->line_start = tok->cur;
1037 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +00001038 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001039 pt = tok->inp - 2;
1040 if (pt >= tok->buf && *pt == '\r') {
1041 *pt++ = '\n';
1042 *pt = '\0';
1043 tok->inp = pt;
1044 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +00001045 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001046 }
1047 if (tok->done != E_OK) {
1048 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +00001049 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001050 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001051 return EOF;
1052 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001053 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001054 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001055}
1056
1057
1058/* Back-up one character */
1059
1060static void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001061tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001062{
1063 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +00001064 if (--tok->cur < tok->buf)
Benjamin Petersona0dfa822009-11-13 02:25:08 +00001065 Py_FatalError("tok_backup: beginning of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001066 if (*tok->cur != c)
1067 *tok->cur = c;
1068 }
1069}
1070
1071
1072/* Return the token corresponding to a single character */
1073
1074int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001075PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001076{
1077 switch (c) {
1078 case '(': return LPAR;
1079 case ')': return RPAR;
1080 case '[': return LSQB;
1081 case ']': return RSQB;
1082 case ':': return COLON;
1083 case ',': return COMMA;
1084 case ';': return SEMI;
1085 case '+': return PLUS;
1086 case '-': return MINUS;
1087 case '*': return STAR;
1088 case '/': return SLASH;
1089 case '|': return VBAR;
1090 case '&': return AMPER;
1091 case '<': return LESS;
1092 case '>': return GREATER;
1093 case '=': return EQUAL;
1094 case '.': return DOT;
1095 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001096 case '{': return LBRACE;
1097 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001098 case '^': return CIRCUMFLEX;
1099 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +00001100 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001101 default: return OP;
1102 }
1103}
1104
1105
Guido van Rossumfbab9051991-10-20 20:25:03 +00001106int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001107PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001108{
1109 switch (c1) {
1110 case '=':
1111 switch (c2) {
1112 case '=': return EQEQUAL;
1113 }
1114 break;
1115 case '!':
1116 switch (c2) {
1117 case '=': return NOTEQUAL;
1118 }
1119 break;
1120 case '<':
1121 switch (c2) {
Brett Cannone3944a52009-04-01 05:08:41 +00001122 case '>': return NOTEQUAL;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001123 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001124 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001125 }
1126 break;
1127 case '>':
1128 switch (c2) {
1129 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001130 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001131 }
1132 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001133 case '+':
1134 switch (c2) {
1135 case '=': return PLUSEQUAL;
1136 }
1137 break;
1138 case '-':
1139 switch (c2) {
1140 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001141 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001142 }
1143 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001144 case '*':
1145 switch (c2) {
1146 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001147 case '=': return STAREQUAL;
1148 }
1149 break;
1150 case '/':
1151 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001152 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001153 case '=': return SLASHEQUAL;
1154 }
1155 break;
1156 case '|':
1157 switch (c2) {
1158 case '=': return VBAREQUAL;
1159 }
1160 break;
1161 case '%':
1162 switch (c2) {
1163 case '=': return PERCENTEQUAL;
1164 }
1165 break;
1166 case '&':
1167 switch (c2) {
1168 case '=': return AMPEREQUAL;
1169 }
1170 break;
1171 case '^':
1172 switch (c2) {
1173 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001174 }
1175 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001176 }
1177 return OP;
1178}
1179
Thomas Wouters434d0822000-08-24 20:11:32 +00001180int
1181PyToken_ThreeChars(int c1, int c2, int c3)
1182{
1183 switch (c1) {
1184 case '<':
1185 switch (c2) {
1186 case '<':
1187 switch (c3) {
1188 case '=':
1189 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001190 }
1191 break;
1192 }
1193 break;
1194 case '>':
1195 switch (c2) {
1196 case '>':
1197 switch (c3) {
1198 case '=':
1199 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001200 }
1201 break;
1202 }
1203 break;
1204 case '*':
1205 switch (c2) {
1206 case '*':
1207 switch (c3) {
1208 case '=':
1209 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001210 }
1211 break;
1212 }
1213 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001214 case '/':
1215 switch (c2) {
1216 case '/':
1217 switch (c3) {
1218 case '=':
1219 return DOUBLESLASHEQUAL;
1220 }
1221 break;
1222 }
1223 break;
Georg Brandldde00282007-03-18 19:01:53 +00001224 case '.':
1225 switch (c2) {
1226 case '.':
1227 switch (c3) {
1228 case '.':
1229 return ELLIPSIS;
1230 }
1231 break;
1232 }
1233 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001234 }
1235 return OP;
1236}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001237
Guido van Rossum926f13a1998-04-09 21:38:06 +00001238static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001239indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001240{
1241 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001242 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001243 tok->cur = tok->inp;
1244 return 1;
1245 }
1246 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001247 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1248 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001249 tok->altwarning = 0;
1250 }
1251 return 0;
1252}
1253
Martin v. Löwis47383402007-08-15 07:32:56 +00001254#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001255#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001256#else
1257/* Verify that the identifier follows PEP 3131. */
1258static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001259verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001260{
Guido van Rossume3e37012007-08-29 18:54:41 +00001261 PyObject *s;
1262 int result;
Victor Stinner52f6dd72010-03-12 14:45:56 +00001263 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Guido van Rossume3e37012007-08-29 18:54:41 +00001264 if (s == NULL) {
Victor Stinner52f6dd72010-03-12 14:45:56 +00001265 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1266 PyErr_Clear();
1267 tok->done = E_IDENTIFIER;
1268 } else {
1269 tok->done = E_ERROR;
1270 }
Guido van Rossume3e37012007-08-29 18:54:41 +00001271 return 0;
1272 }
1273 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001274 Py_DECREF(s);
Victor Stinner52f6dd72010-03-12 14:45:56 +00001275 if (result == 0)
1276 tok->done = E_IDENTIFIER;
Martin v. Löwis47383402007-08-15 07:32:56 +00001277 return result;
1278}
1279#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001280
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001281/* Get next token, after space stripping etc. */
1282
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001283static int
1284tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001285{
1286 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001287 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001288
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001289 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001290 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001291 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001292 blankline = 0;
1293
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001294 /* Get indentation level */
1295 if (tok->atbol) {
1296 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001297 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001298 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001299 for (;;) {
1300 c = tok_nextc(tok);
1301 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001302 col++, altcol++;
1303 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001304 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001305 altcol = (altcol/tok->alttabsize + 1)
1306 * tok->alttabsize;
1307 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001308 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001309 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001310 else
1311 break;
1312 }
1313 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001314 if (c == '#' || c == '\n') {
1315 /* Lines with only whitespace and/or comments
1316 shouldn't affect the indentation and are
1317 not passed to the parser as NEWLINE tokens,
1318 except *totally* empty lines in interactive
1319 mode, which signal the end of a command group. */
1320 if (col == 0 && c == '\n' && tok->prompt != NULL)
1321 blankline = 0; /* Let it through */
1322 else
1323 blankline = 1; /* Ignore completely */
1324 /* We can't jump back right here since we still
1325 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001326 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001327 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001328 if (col == tok->indstack[tok->indent]) {
1329 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001330 if (altcol != tok->altindstack[tok->indent]) {
1331 if (indenterror(tok))
1332 return ERRORTOKEN;
1333 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001334 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001335 else if (col > tok->indstack[tok->indent]) {
1336 /* Indent -- always one */
1337 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001338 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001339 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001340 return ERRORTOKEN;
1341 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001342 if (altcol <= tok->altindstack[tok->indent]) {
1343 if (indenterror(tok))
1344 return ERRORTOKEN;
1345 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001346 tok->pendin++;
1347 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001348 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001349 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001350 else /* col < tok->indstack[tok->indent] */ {
1351 /* Dedent -- any number, must be consistent */
1352 while (tok->indent > 0 &&
1353 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001354 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001355 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001356 }
1357 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001358 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001359 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001360 return ERRORTOKEN;
1361 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001362 if (altcol != tok->altindstack[tok->indent]) {
1363 if (indenterror(tok))
1364 return ERRORTOKEN;
1365 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001366 }
1367 }
1368 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001369
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001370 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001371
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001372 /* Return pending indents/dedents */
1373 if (tok->pendin != 0) {
1374 if (tok->pendin < 0) {
1375 tok->pendin++;
1376 return DEDENT;
1377 }
1378 else {
1379 tok->pendin--;
1380 return INDENT;
1381 }
1382 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001383
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001384 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001385 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001386 /* Skip spaces */
1387 do {
1388 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001389 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001390
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001391 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001392 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001393
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001394 /* Skip comment */
1395 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001396 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001397 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001398
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001399 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001400 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001401 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001402 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001403
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001404 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001405 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001406 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001407 /* Process b"", r"" and br"" */
1408 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001409 c = tok_nextc(tok);
1410 if (c == '"' || c == '\'')
1411 goto letter_quote;
1412 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001413 if (c == 'r' || c == 'R') {
1414 c = tok_nextc(tok);
1415 if (c == '"' || c == '\'')
1416 goto letter_quote;
1417 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001418 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001419 if (c >= 128)
1420 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001421 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001422 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001423 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001424 if (nonascii &&
Victor Stinner52f6dd72010-03-12 14:45:56 +00001425 !verify_identifier(tok)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001426 tok->done = E_IDENTIFIER;
1427 return ERRORTOKEN;
1428 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001429 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001430 *p_end = tok->cur;
1431 return NAME;
1432 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001433
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001434 /* Newline */
1435 if (c == '\n') {
1436 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001437 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001438 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001439 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001440 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001441 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001442 return NEWLINE;
1443 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001444
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001445 /* Period or number starting with period? */
1446 if (c == '.') {
1447 c = tok_nextc(tok);
1448 if (isdigit(c)) {
1449 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001450 } else if (c == '.') {
1451 c = tok_nextc(tok);
1452 if (c == '.') {
1453 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001454 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001455 return ELLIPSIS;
1456 } else {
1457 tok_backup(tok, c);
1458 }
1459 tok_backup(tok, '.');
1460 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001461 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001462 }
Georg Brandldde00282007-03-18 19:01:53 +00001463 *p_start = tok->start;
1464 *p_end = tok->cur;
1465 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001466 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001467
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001468 /* Number */
1469 if (isdigit(c)) {
1470 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001471 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001472 c = tok_nextc(tok);
1473 if (c == '.')
1474 goto fraction;
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001475 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001476 goto imaginary;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001477 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001478
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001479 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001480 c = tok_nextc(tok);
1481 if (!isxdigit(c)) {
1482 tok->done = E_TOKEN;
1483 tok_backup(tok, c);
1484 return ERRORTOKEN;
1485 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001486 do {
1487 c = tok_nextc(tok);
1488 } while (isxdigit(c));
1489 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001490 else if (c == 'o' || c == 'O') {
1491 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001492 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001493 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001494 tok->done = E_TOKEN;
1495 tok_backup(tok, c);
1496 return ERRORTOKEN;
1497 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001498 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001499 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001500 } while ('0' <= c && c < '8');
1501 }
1502 else if (c == 'b' || c == 'B') {
1503 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001504 c = tok_nextc(tok);
1505 if (c != '0' && c != '1') {
1506 tok->done = E_TOKEN;
1507 tok_backup(tok, c);
1508 return ERRORTOKEN;
1509 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001510 do {
1511 c = tok_nextc(tok);
1512 } while (c == '0' || c == '1');
1513 }
1514 else {
1515 int nonzero = 0;
1516 /* maybe old-style octal; c is first char of it */
1517 /* in any case, allow '0' as a literal */
1518 while (c == '0')
1519 c = tok_nextc(tok);
1520 while (isdigit(c)) {
1521 nonzero = 1;
1522 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001523 }
1524 if (c == '.')
1525 goto fraction;
1526 else if (c == 'e' || c == 'E')
1527 goto exponent;
Tim Petersd507dab2001-08-30 20:51:59 +00001528 else if (c == 'j' || c == 'J')
1529 goto imaginary;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001530 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001531 tok->done = E_TOKEN;
1532 tok_backup(tok, c);
1533 return ERRORTOKEN;
1534 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001535 }
1536 }
1537 else {
1538 /* Decimal */
1539 do {
1540 c = tok_nextc(tok);
1541 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001542 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001543 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001544 if (c == '.') {
1545 fraction:
1546 /* Fraction */
1547 do {
1548 c = tok_nextc(tok);
1549 } while (isdigit(c));
1550 }
1551 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001552 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001553 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001554 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001555 if (c == '+' || c == '-')
1556 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001557 if (!isdigit(c)) {
1558 tok->done = E_TOKEN;
1559 tok_backup(tok, c);
1560 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001561 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001562 do {
1563 c = tok_nextc(tok);
1564 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001565 }
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001566 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001567 /* Imaginary part */
1568 imaginary:
1569 c = tok_nextc(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001570 }
1571 }
1572 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001573 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001574 *p_end = tok->cur;
1575 return NUMBER;
1576 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001577
1578 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001579 /* String */
1580 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001581 int quote = c;
1582 int quote_size = 1; /* 1 or 3 */
1583 int end_quote_size = 0;
1584
1585 /* Find the quote size and start of string */
1586 c = tok_nextc(tok);
1587 if (c == quote) {
1588 c = tok_nextc(tok);
1589 if (c == quote)
1590 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001591 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001592 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001593 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001594 if (c != quote)
1595 tok_backup(tok, c);
1596
1597 /* Get rest of string */
1598 while (end_quote_size != quote_size) {
1599 c = tok_nextc(tok);
1600 if (c == EOF) {
1601 if (quote_size == 3)
1602 tok->done = E_EOFS;
1603 else
1604 tok->done = E_EOLS;
1605 tok->cur = tok->inp;
1606 return ERRORTOKEN;
1607 }
1608 if (quote_size == 1 && c == '\n') {
1609 tok->done = E_EOLS;
1610 tok->cur = tok->inp;
1611 return ERRORTOKEN;
1612 }
1613 if (c == quote)
1614 end_quote_size += 1;
1615 else {
1616 end_quote_size = 0;
1617 if (c == '\\')
1618 c = tok_nextc(tok); /* skip escaped char */
1619 }
1620 }
1621
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001622 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001623 *p_end = tok->cur;
1624 return STRING;
1625 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001626
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001627 /* Line continuation */
1628 if (c == '\\') {
1629 c = tok_nextc(tok);
1630 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001631 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001632 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001633 return ERRORTOKEN;
1634 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001635 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001636 goto again; /* Read next line */
1637 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001638
Guido van Rossumfbab9051991-10-20 20:25:03 +00001639 /* Check for two-character token */
1640 {
1641 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001642 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001643 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001644 int c3 = tok_nextc(tok);
1645 int token3 = PyToken_ThreeChars(c, c2, c3);
1646 if (token3 != OP) {
1647 token = token3;
1648 } else {
1649 tok_backup(tok, c3);
1650 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001651 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001652 *p_end = tok->cur;
1653 return token;
1654 }
1655 tok_backup(tok, c2);
1656 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001657
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001658 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001659 switch (c) {
1660 case '(':
1661 case '[':
1662 case '{':
1663 tok->level++;
1664 break;
1665 case ')':
1666 case ']':
1667 case '}':
1668 tok->level--;
1669 break;
1670 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001671
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001672 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001673 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001674 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001675 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001676}
1677
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001678int
1679PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1680{
1681 int result = tok_get(tok, p_start, p_end);
1682 if (tok->decoding_erred) {
1683 result = ERRORTOKEN;
1684 tok->done = E_DECODE;
1685 }
1686 return result;
1687}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001688
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001689/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001690
1691 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001692 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001693 should be assumed to be PyUnicode_GetDefaultEncoding()).
1694
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001695 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1696 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001697*/
1698char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001699PyTokenizer_FindEncoding(int fd)
1700{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001701 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001702 FILE *fp;
1703 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001704
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001705 fd = dup(fd);
1706 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001707 return NULL;
1708 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001709 fp = fdopen(fd, "r");
1710 if (fp == NULL) {
1711 return NULL;
1712 }
1713 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1714 if (tok == NULL) {
1715 fclose(fp);
1716 return NULL;
1717 }
1718 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001719 PyTokenizer_Get(tok, &p_start, &p_end);
1720 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001721 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001722 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001723 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Amaury Forgeot d'Arc1b933ed2008-09-04 22:34:09 +00001724 if (encoding)
1725 strcpy(encoding, tok->encoding);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001726 }
1727 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001728 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001729}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001730
Guido van Rossum408027e1996-12-30 16:17:54 +00001731#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001732
1733void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001734tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001735{
Guido van Rossum86bea461997-04-29 21:03:06 +00001736 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001737 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1738 printf("(%.*s)", (int)(end - start), start);
1739}
1740
1741#endif