blob: e4cf8e4cc5524ff184193f996bd1f2af21ad839b [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128 (accepted here without further checking).
   C is fully parenthesized so that arbitrary expressions can be passed,
   but it is evaluated several times: do not pass an argument with side
   effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if C may appear inside an identifier: an ASCII letter or digit,
   an underscore, or any value >= 128 (accepted here without further
   checking).
   C is fully parenthesized so that arbitrary expressions can be passed,
   but it is evaluated several times: do not pass an argument with side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
/* Defined outside this file.
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward declarations of helpers defined later in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names.
   The order of the entries in this table must match the #defines
   in token.h! */

char *_PyParser_TokenNames[] = {
    /* end marker, atoms and layout tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* remaining unary/binary operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* later additions */
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",

    /* trailing pseudo-entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->tabsize = TABSIZE;
123 tok->indent = 0;
124 tok->indstack[0] = 0;
125 tok->atbol = 1;
126 tok->pendin = 0;
127 tok->prompt = tok->nextprompt = NULL;
128 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000129 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000130 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000135 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000138 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000139 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000140#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141 tok->decoding_readline = NULL;
142 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000143#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000144 return tok;
145}
146
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000147#ifdef PGEN
148
149static char *
150decoding_fgets(char *s, int size, struct tok_state *tok)
151{
152 return fgets(s, size, tok->fp);
153}
154
155static int
156decoding_feof(struct tok_state *tok)
157{
158 return feof(tok->fp);
159}
160
/* PGEN build: strings are used as-is, TOK is not consulted. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;

    return unchanged;
}
166
167#else /* PGEN */
168
169static char *
170error_ret(struct tok_state *tok) /* XXX */
171{
172 tok->decoding_erred = 1;
173 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000174 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175 tok->buf = NULL;
176 return NULL; /* as if it were EOF */
177}
178
179static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000180new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000181{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000182 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183 if (result != NULL) {
184 memcpy(result, s, len);
185 result[len] = '\0';
186 }
187 return result;
188}
189
/* Map the spellings of utf-8 and latin-1 recognized below to one
   canonical name.

   At most the first 12 characters of S are examined, lower-cased and
   with '_' turned into '-'.  Returns the string constant "utf-8" or
   "iso-8859-1" when the result is one of the recognized spellings
   (optionally followed by "-<suffix>"); otherwise returns S itself,
   unmodified.  Callers tell the two cases apart by comparing the result
   pointer with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Convert through unsigned char: handing a negative char value
           to tolower() is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
212
213/* Return the coding spec in S, or NULL if none is found. */
214
215static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000216get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000217{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000218 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000219 /* Coding spec must be in a comment, and that comment must be
220 * the only statement on the source code line. */
221 for (i = 0; i < size - 6; i++) {
222 if (s[i] == '#')
223 break;
224 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
225 return NULL;
226 }
227 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228 const char* t = s + i;
229 if (strncmp(t, "coding", 6) == 0) {
230 const char* begin = NULL;
231 t += 6;
232 if (t[0] != ':' && t[0] != '=')
233 continue;
234 do {
235 t++;
236 } while (t[0] == '\x20' || t[0] == '\t');
237
238 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000239 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000240 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241 t++;
242
243 if (begin < t) {
244 char* r = new_string(begin, t - begin);
245 char* q = get_normal_name(r);
246 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000248 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249 }
250 return r;
251 }
252 }
253 }
254 return NULL;
255}
256
257/* Check whether the line contains a coding spec. If it does,
258 invoke the set_readline function for the new encoding.
259 This function receives the tok_state and the new encoding.
260 Return 1 on success, 0 on failure. */
261
262static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000263check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264 int set_readline(struct tok_state *, const char *))
265{
Tim Peters17db21f2002-09-03 15:39:58 +0000266 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000268
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000269 if (tok->cont_line)
270 /* It's a continuation line, so it can't be a coding spec. */
271 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000272 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 if (cs != NULL) {
274 tok->read_coding_spec = 1;
275 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000276 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000277 if (strcmp(cs, "utf-8") == 0 ||
278 strcmp(cs, "iso-8859-1") == 0) {
279 tok->encoding = cs;
280 } else {
281 r = set_readline(tok, cs);
282 if (r) {
283 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000286 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000291 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 }
293 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 return r;
301}
302
303/* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
306
307static int
308check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
312{
313 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000314 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000318 ch = get_char(tok);
319 if (ch != 0xBB) {
320 unget_char(ch, tok);
321 unget_char(0xEF, tok);
322 /* any token beginning with '\xEF' is a bad token */
323 return 1;
324 }
325 ch = get_char(tok);
326 if (ch != 0xBF) {
327 unget_char(ch, tok);
328 unget_char(0xBB, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333#if 0
334 /* Disable support for UTF-16 BOMs until a decision
335 is made whether this needs to be supported. */
336 } else if (ch == 0xFE) {
337 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
338 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000339 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340 } else if (ch == 0xFF) {
341 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
342 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000343 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344#endif
345 } else {
346 unget_char(ch, tok);
347 return 1;
348 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000349 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000350 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000352 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000353 return 1;
354}
355
356/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000357 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000359 On entry, tok->decoding_buffer will be one of:
360 1) NULL: need to call tok->decoding_readline to get a new line
361 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
362 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000363 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000364 (in the s buffer) to copy entire contents of the line read
365 by tok->decoding_readline. tok->decoding_buffer has the overflow.
366 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000368 reached): see tok_nextc and its calls to decoding_fgets.
369*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370
371static char *
372fp_readl(char *s, int size, struct tok_state *tok)
373{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000374 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000375 const char *buf;
376 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000377
378 /* Ask for one less byte so we can terminate it */
379 assert(size > 0);
380 size--;
381
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000382 if (tok->decoding_buffer) {
383 bufobj = tok->decoding_buffer;
384 Py_INCREF(bufobj);
385 }
386 else
387 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000388 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
389 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000390 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000391 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000392 if (PyUnicode_CheckExact(bufobj))
393 {
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +0000394 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000395 if (buf == NULL) {
396 goto error;
397 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000398 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000399 else
400 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000401 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000402 if (buf == NULL) {
403 goto error;
404 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000405 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000406 }
407
408 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000409 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000410 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000411 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000412 buflen-size);
413 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000414 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000415 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000416 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000417 else
418 tok->decoding_buffer = NULL;
419
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000420 memcpy(s, buf, buflen);
421 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000422 if (buflen == 0) /* EOF */
423 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000424 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000425 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426
427error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000428 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000429 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430}
431
432/* Set the readline function for TOK to a StreamReader's
433 readline function. The StreamReader is named ENC.
434
435 This function is called from check_bom and check_coding_spec.
436
437 ENC is usually identical to the future value of tok->encoding,
438 except for the (currently unsupported) case of UTF-16.
439
440 Return 1 on success, 0 on failure. */
441
442static int
443fp_setreadl(struct tok_state *tok, const char* enc)
444{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000445 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000446
Christian Heimes819b8bf2008-01-03 23:05:47 +0000447 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000448 if (io == NULL)
449 goto cleanup;
450
Brett Cannon8a9583e2008-09-04 05:04:25 +0000451 if (tok->filename)
452 stream = PyObject_CallMethod(io, "open", "ssis",
453 tok->filename, "r", -1, enc);
454 else
455 stream = PyObject_CallMethod(io, "open", "isis",
456 fileno(tok->fp), "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000457 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000458 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000459
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000460 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000461 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000462 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000463
464 cleanup:
465 Py_XDECREF(stream);
466 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000467 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000468}
469
470/* Fetch the next byte from TOK. */
471
472static int fp_getc(struct tok_state *tok) {
473 return getc(tok->fp);
474}
475
476/* Unfetch the last byte back into TOK. */
477
478static void fp_ungetc(int c, struct tok_state *tok) {
479 ungetc(c, tok->fp);
480}
481
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence: a lead byte followed by the right number of continuation
   bytes (0x80..0xBF).  Return the number of bytes forming the sequence
   if yes, 0 if not.

   S must point into a NUL-terminated string.  Continuation bytes are
   examined front to back so that the scan stops at the terminating NUL
   of a truncated sequence instead of reading beyond it.  Overlong
   encodings and out-of-range code points are not diagnosed here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
509
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000510/* Read a line of input from TOK. Determine encoding
511 if necessary. */
512
513static char *
514decoding_fgets(char *s, int size, struct tok_state *tok)
515{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000516 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000517 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000518 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000519 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000520 /* We already have a codec associated with
521 this input. */
522 line = fp_readl(s, size, tok);
523 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000524 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000525 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000526 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000528 break;
529 } else {
530 /* We have not yet determined the encoding.
531 If an encoding is found, use the file-pointer
532 reader functions from now on. */
533 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
534 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000535 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000536 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000537 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000538 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
539 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
540 return error_ret(tok);
541 }
542 }
543#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000544 /* The default encoding is UTF-8, so make sure we don't have any
545 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000546 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000548 int length;
549 for (c = (unsigned char *)line; *c; c += length)
550 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000551 badchar = *c;
552 break;
553 }
554 }
555 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000556 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000557 /* Need to add 1 to the line number, since this line
558 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000559 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000560 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000561 "in file %.200s on line %i, "
562 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000563 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000564 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000565 PyErr_SetString(PyExc_SyntaxError, buf);
566 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000567 }
568#endif
569 return line;
570}
571
572static int
573decoding_feof(struct tok_state *tok)
574{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000575 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000576 return feof(tok->fp);
577 } else {
578 PyObject* buf = tok->decoding_buffer;
579 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000580 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000581 if (buf == NULL) {
582 error_ret(tok);
583 return 1;
584 } else {
585 tok->decoding_buffer = buf;
586 }
587 }
588 return PyObject_Length(buf) == 0;
589 }
590}
591
592/* Fetch a byte from TOK, using the string buffer. */
593
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000594static int
595buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000596 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000597}
598
599/* Unfetch a byte from TOK, using the string buffer. */
600
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000601static void
602buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000603 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000604 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000605}
606
607/* Set the readline function for TOK to ENC. For the string-based
608 tokenizer, this means to just record the encoding. */
609
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000610static int
611buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000612 tok->enc = enc;
613 return 1;
614}
615
616/* Return a UTF-8 encoding Python string object from the
617 C byte string STR, which is encoded with ENC. */
618
619static PyObject *
620translate_into_utf8(const char* str, const char* enc) {
621 PyObject *utf8;
622 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
623 if (buf == NULL)
624 return NULL;
625 utf8 = PyUnicode_AsUTF8String(buf);
626 Py_DECREF(buf);
627 return utf8;
628}
629
630/* Decode a byte string STR for use as the buffer of TOK.
631 Look for encoding declarations inside STR, and record them
632 inside TOK. */
633
634static const char *
635decode_str(const char *str, struct tok_state *tok)
636{
637 PyObject* utf8 = NULL;
638 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000639 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000640 int lineno = 0;
641 tok->enc = NULL;
642 tok->str = str;
643 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000644 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000645 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000646 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000647 if (tok->enc != NULL) {
648 utf8 = translate_into_utf8(str, tok->enc);
649 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000650 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000651 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000652 }
653 for (s = str;; s++) {
654 if (*s == '\0') break;
655 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000656 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000657 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000658 lineno++;
659 if (lineno == 2) break;
660 }
661 }
662 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000663 /* need to check line 1 and 2 separately since check_coding_spec
664 assumes a single line as input */
665 if (newl[0]) {
666 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
667 return error_ret(tok);
668 if (tok->enc == NULL && newl[1]) {
669 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
670 tok, buf_setreadl))
671 return error_ret(tok);
672 }
673 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000674 if (tok->enc != NULL) {
675 assert(utf8 == NULL);
676 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000677 if (utf8 == NULL) {
678 PyErr_Format(PyExc_SyntaxError,
679 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000680 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000681 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000682 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000683 }
684 assert(tok->decoding_buffer == NULL);
685 tok->decoding_buffer = utf8; /* CAUTION */
686 return str;
687}
688
689#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000690
691/* Set up tokenizer for string */
692
693struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000694PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000695{
696 struct tok_state *tok = tok_new();
697 if (tok == NULL)
698 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000699 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000700 if (str == NULL) {
701 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000702 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000703 }
704
Martin v. Löwis95292d62002-12-11 14:04:59 +0000705 /* XXX: constify members. */
706 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000707 return tok;
708}
709
710
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000711/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000712
713struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000714PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000715{
716 struct tok_state *tok = tok_new();
717 if (tok == NULL)
718 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000719 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000720 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000721 return NULL;
722 }
723 tok->cur = tok->inp = tok->buf;
724 tok->end = tok->buf + BUFSIZ;
725 tok->fp = fp;
726 tok->prompt = ps1;
727 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000728 if (enc != NULL) {
729 /* Must copy encoding declaration since it
730 gets copied into the parse tree. */
731 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
732 if (!tok->encoding) {
733 PyTokenizer_Free(tok);
734 return NULL;
735 }
736 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000737 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000738 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000739 return tok;
740}
741
742
743/* Free a tok_state structure */
744
745void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000746PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000747{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000748 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000749 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000750#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000751 Py_XDECREF(tok->decoding_readline);
752 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000753#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000754 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000755 PyMem_FREE(tok->buf);
756 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000757}
758
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759/* Get next char, updating state; error code goes into tok->done */
760
761static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000762tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000763{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000764 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000765 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000766 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000767 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000768 if (tok->done != E_OK)
769 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000770 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000771 char *end = strchr(tok->inp, '\n');
772 if (end != NULL)
773 end++;
774 else {
775 end = strchr(tok->inp, '\0');
776 if (end == tok->inp) {
777 tok->done = E_EOF;
778 return EOF;
779 }
780 }
781 if (tok->start == NULL)
782 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000783 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000784 tok->lineno++;
785 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000786 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000787 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000788 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000789 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000790#ifndef PGEN
791 if (tok->encoding && newtok && *newtok) {
792 /* Recode to UTF-8 */
793 Py_ssize_t buflen;
794 const char* buf;
795 PyObject *u = translate_into_utf8(newtok, tok->encoding);
796 PyMem_FREE(newtok);
797 if (!u) {
798 tok->done = E_DECODE;
799 return EOF;
800 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000801 buflen = PyBytes_GET_SIZE(u);
802 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000803 if (!buf) {
804 Py_DECREF(u);
805 tok->done = E_DECODE;
806 return EOF;
807 }
808 newtok = PyMem_MALLOC(buflen+1);
809 strcpy(newtok, buf);
810 Py_DECREF(u);
811 }
812#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000813 if (tok->nextprompt != NULL)
814 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000815 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000816 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000817 else if (*newtok == '\0') {
818 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000819 tok->done = E_EOF;
820 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000821 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000822 size_t start = tok->start - tok->buf;
823 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000824 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000825 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000826 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000827 tok->lineno++;
828 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000829 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000830 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000831 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000832 tok->done = E_NOMEM;
833 return EOF;
834 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000835 tok->buf = buf;
836 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000837 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000838 strcpy(tok->buf + oldlen, newtok);
839 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000840 tok->inp = tok->buf + newlen;
841 tok->end = tok->inp + 1;
842 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000843 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000844 else {
845 tok->lineno++;
846 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000847 PyMem_FREE(tok->buf);
848 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000849 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000850 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000851 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000852 tok->inp = strchr(tok->buf, '\0');
853 tok->end = tok->inp + 1;
854 }
855 }
856 else {
857 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000858 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000859 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 if (tok->start == NULL) {
861 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000862 tok->buf = (char *)
863 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000864 if (tok->buf == NULL) {
865 tok->done = E_NOMEM;
866 return EOF;
867 }
868 tok->end = tok->buf + BUFSIZ;
869 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000870 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
871 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000872 tok->done = E_EOF;
873 done = 1;
874 }
875 else {
876 tok->done = E_OK;
877 tok->inp = strchr(tok->buf, '\0');
878 done = tok->inp[-1] == '\n';
879 }
880 }
881 else {
882 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000883 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000884 tok->done = E_EOF;
885 done = 1;
886 }
887 else
888 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000889 }
890 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000891 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000892 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000893 Py_ssize_t curstart = tok->start == NULL ? -1 :
894 tok->start - tok->buf;
895 Py_ssize_t curvalid = tok->inp - tok->buf;
896 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000897 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000898 newbuf = (char *)PyMem_REALLOC(newbuf,
899 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000900 if (newbuf == NULL) {
901 tok->done = E_NOMEM;
902 tok->cur = tok->inp;
903 return EOF;
904 }
905 tok->buf = newbuf;
906 tok->inp = tok->buf + curvalid;
907 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000908 tok->start = curstart < 0 ? NULL :
909 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000910 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000911 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000912 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000913 /* Break out early on decoding
914 errors, as tok->buf will be NULL
915 */
916 if (tok->decoding_erred)
917 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000918 /* Last line does not end in \n,
919 fake one */
920 strcpy(tok->inp, "\n");
921 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000922 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000923 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000924 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000925 if (tok->buf != NULL) {
926 tok->cur = tok->buf + cur;
927 tok->line_start = tok->cur;
928 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000929 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000930 pt = tok->inp - 2;
931 if (pt >= tok->buf && *pt == '\r') {
932 *pt++ = '\n';
933 *pt = '\0';
934 tok->inp = pt;
935 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000936 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000937 }
938 if (tok->done != E_OK) {
939 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000940 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000941 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000942 return EOF;
943 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000944 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000945 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000946}
947
948
949/* Back-up one character */
950
951static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000952tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953{
954 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000955 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000956 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000957 if (*tok->cur != c)
958 *tok->cur = c;
959 }
960}
961
962
963/* Return the token corresponding to a single character */
964
965int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000966PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000967{
968 switch (c) {
969 case '(': return LPAR;
970 case ')': return RPAR;
971 case '[': return LSQB;
972 case ']': return RSQB;
973 case ':': return COLON;
974 case ',': return COMMA;
975 case ';': return SEMI;
976 case '+': return PLUS;
977 case '-': return MINUS;
978 case '*': return STAR;
979 case '/': return SLASH;
980 case '|': return VBAR;
981 case '&': return AMPER;
982 case '<': return LESS;
983 case '>': return GREATER;
984 case '=': return EQUAL;
985 case '.': return DOT;
986 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000987 case '{': return LBRACE;
988 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000989 case '^': return CIRCUMFLEX;
990 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000991 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000992 default: return OP;
993 }
994}
995
996
Guido van Rossumfbab9051991-10-20 20:25:03 +0000997int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000998PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000999{
1000 switch (c1) {
1001 case '=':
1002 switch (c2) {
1003 case '=': return EQEQUAL;
1004 }
1005 break;
1006 case '!':
1007 switch (c2) {
1008 case '=': return NOTEQUAL;
1009 }
1010 break;
1011 case '<':
1012 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +00001013 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001014 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001015 }
1016 break;
1017 case '>':
1018 switch (c2) {
1019 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001020 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001021 }
1022 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001023 case '+':
1024 switch (c2) {
1025 case '=': return PLUSEQUAL;
1026 }
1027 break;
1028 case '-':
1029 switch (c2) {
1030 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001031 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001032 }
1033 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001034 case '*':
1035 switch (c2) {
1036 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001037 case '=': return STAREQUAL;
1038 }
1039 break;
1040 case '/':
1041 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001042 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001043 case '=': return SLASHEQUAL;
1044 }
1045 break;
1046 case '|':
1047 switch (c2) {
1048 case '=': return VBAREQUAL;
1049 }
1050 break;
1051 case '%':
1052 switch (c2) {
1053 case '=': return PERCENTEQUAL;
1054 }
1055 break;
1056 case '&':
1057 switch (c2) {
1058 case '=': return AMPEREQUAL;
1059 }
1060 break;
1061 case '^':
1062 switch (c2) {
1063 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001064 }
1065 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001066 }
1067 return OP;
1068}
1069
Thomas Wouters434d0822000-08-24 20:11:32 +00001070int
1071PyToken_ThreeChars(int c1, int c2, int c3)
1072{
1073 switch (c1) {
1074 case '<':
1075 switch (c2) {
1076 case '<':
1077 switch (c3) {
1078 case '=':
1079 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001080 }
1081 break;
1082 }
1083 break;
1084 case '>':
1085 switch (c2) {
1086 case '>':
1087 switch (c3) {
1088 case '=':
1089 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001090 }
1091 break;
1092 }
1093 break;
1094 case '*':
1095 switch (c2) {
1096 case '*':
1097 switch (c3) {
1098 case '=':
1099 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001100 }
1101 break;
1102 }
1103 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001104 case '/':
1105 switch (c2) {
1106 case '/':
1107 switch (c3) {
1108 case '=':
1109 return DOUBLESLASHEQUAL;
1110 }
1111 break;
1112 }
1113 break;
Georg Brandldde00282007-03-18 19:01:53 +00001114 case '.':
1115 switch (c2) {
1116 case '.':
1117 switch (c3) {
1118 case '.':
1119 return ELLIPSIS;
1120 }
1121 break;
1122 }
1123 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001124 }
1125 return OP;
1126}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001127
Guido van Rossum926f13a1998-04-09 21:38:06 +00001128static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001129indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001130{
1131 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001132 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001133 tok->cur = tok->inp;
1134 return 1;
1135 }
1136 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001137 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1138 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001139 tok->altwarning = 0;
1140 }
1141 return 0;
1142}
1143
Martin v. Löwis47383402007-08-15 07:32:56 +00001144#ifdef PGEN
1145#define verify_identifier(s,e) 1
1146#else
1147/* Verify that the identifier follows PEP 3131. */
1148static int
1149verify_identifier(char *start, char *end)
1150{
Guido van Rossume3e37012007-08-29 18:54:41 +00001151 PyObject *s;
1152 int result;
1153 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1154 if (s == NULL) {
1155 PyErr_Clear();
1156 return 0;
1157 }
1158 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001159 Py_DECREF(s);
1160 return result;
1161}
1162#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001163
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001164/* Get next token, after space stripping etc. */
1165
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001166static int
1167tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001168{
1169 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001170 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001171
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001172 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001173 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001174 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001175 blankline = 0;
1176
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001177 /* Get indentation level */
1178 if (tok->atbol) {
1179 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001180 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001181 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001182 for (;;) {
1183 c = tok_nextc(tok);
1184 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001185 col++, altcol++;
1186 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001187 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001188 altcol = (altcol/tok->alttabsize + 1)
1189 * tok->alttabsize;
1190 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001191 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001192 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001193 else
1194 break;
1195 }
1196 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001197 if (c == '#' || c == '\n') {
1198 /* Lines with only whitespace and/or comments
1199 shouldn't affect the indentation and are
1200 not passed to the parser as NEWLINE tokens,
1201 except *totally* empty lines in interactive
1202 mode, which signal the end of a command group. */
1203 if (col == 0 && c == '\n' && tok->prompt != NULL)
1204 blankline = 0; /* Let it through */
1205 else
1206 blankline = 1; /* Ignore completely */
1207 /* We can't jump back right here since we still
1208 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001209 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001210 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001211 if (col == tok->indstack[tok->indent]) {
1212 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001213 if (altcol != tok->altindstack[tok->indent]) {
1214 if (indenterror(tok))
1215 return ERRORTOKEN;
1216 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001217 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001218 else if (col > tok->indstack[tok->indent]) {
1219 /* Indent -- always one */
1220 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001221 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001222 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001223 return ERRORTOKEN;
1224 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001225 if (altcol <= tok->altindstack[tok->indent]) {
1226 if (indenterror(tok))
1227 return ERRORTOKEN;
1228 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001229 tok->pendin++;
1230 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001231 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001232 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001233 else /* col < tok->indstack[tok->indent] */ {
1234 /* Dedent -- any number, must be consistent */
1235 while (tok->indent > 0 &&
1236 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001237 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001238 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001239 }
1240 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001241 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001242 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001243 return ERRORTOKEN;
1244 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001245 if (altcol != tok->altindstack[tok->indent]) {
1246 if (indenterror(tok))
1247 return ERRORTOKEN;
1248 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001249 }
1250 }
1251 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001252
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001253 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001254
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001255 /* Return pending indents/dedents */
1256 if (tok->pendin != 0) {
1257 if (tok->pendin < 0) {
1258 tok->pendin++;
1259 return DEDENT;
1260 }
1261 else {
1262 tok->pendin--;
1263 return INDENT;
1264 }
1265 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001266
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001267 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001268 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001269 /* Skip spaces */
1270 do {
1271 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001272 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001273
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001274 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001275 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001276
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001277 /* Skip comment */
1278 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001279 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001281
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001282 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001283 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001284 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001285 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001286
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001287 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001288 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001289 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001290 /* Process b"", r"" and br"" */
1291 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001292 c = tok_nextc(tok);
1293 if (c == '"' || c == '\'')
1294 goto letter_quote;
1295 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001296 if (c == 'r' || c == 'R') {
1297 c = tok_nextc(tok);
1298 if (c == '"' || c == '\'')
1299 goto letter_quote;
1300 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001301 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001302 if (c >= 128)
1303 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001304 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001305 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001306 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001307 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001308 !verify_identifier(tok->start, tok->cur)) {
1309 tok->done = E_IDENTIFIER;
1310 return ERRORTOKEN;
1311 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001312 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 *p_end = tok->cur;
1314 return NAME;
1315 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001316
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001317 /* Newline */
1318 if (c == '\n') {
1319 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001320 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001321 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001322 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001323 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001324 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001325 return NEWLINE;
1326 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001327
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001328 /* Period or number starting with period? */
1329 if (c == '.') {
1330 c = tok_nextc(tok);
1331 if (isdigit(c)) {
1332 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001333 } else if (c == '.') {
1334 c = tok_nextc(tok);
1335 if (c == '.') {
1336 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001337 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001338 return ELLIPSIS;
1339 } else {
1340 tok_backup(tok, c);
1341 }
1342 tok_backup(tok, '.');
1343 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001344 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001345 }
Georg Brandldde00282007-03-18 19:01:53 +00001346 *p_start = tok->start;
1347 *p_end = tok->cur;
1348 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001349 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001350
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 /* Number */
1352 if (isdigit(c)) {
1353 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001354 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001355 c = tok_nextc(tok);
1356 if (c == '.')
1357 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001358#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001359 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001360 goto imaginary;
1361#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001362 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001363
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001364 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001365 c = tok_nextc(tok);
1366 if (!isxdigit(c)) {
1367 tok->done = E_TOKEN;
1368 tok_backup(tok, c);
1369 return ERRORTOKEN;
1370 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001371 do {
1372 c = tok_nextc(tok);
1373 } while (isxdigit(c));
1374 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001375 else if (c == 'o' || c == 'O') {
1376 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001377 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001378 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001379 tok->done = E_TOKEN;
1380 tok_backup(tok, c);
1381 return ERRORTOKEN;
1382 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001383 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001384 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001385 } while ('0' <= c && c < '8');
1386 }
1387 else if (c == 'b' || c == 'B') {
1388 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001389 c = tok_nextc(tok);
1390 if (c != '0' && c != '1') {
1391 tok->done = E_TOKEN;
1392 tok_backup(tok, c);
1393 return ERRORTOKEN;
1394 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001395 do {
1396 c = tok_nextc(tok);
1397 } while (c == '0' || c == '1');
1398 }
1399 else {
1400 int nonzero = 0;
1401 /* maybe old-style octal; c is first char of it */
1402 /* in any case, allow '0' as a literal */
1403 while (c == '0')
1404 c = tok_nextc(tok);
1405 while (isdigit(c)) {
1406 nonzero = 1;
1407 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001408 }
1409 if (c == '.')
1410 goto fraction;
1411 else if (c == 'e' || c == 'E')
1412 goto exponent;
1413#ifndef WITHOUT_COMPLEX
1414 else if (c == 'j' || c == 'J')
1415 goto imaginary;
1416#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001417 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001418 tok->done = E_TOKEN;
1419 tok_backup(tok, c);
1420 return ERRORTOKEN;
1421 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001422 }
1423 }
1424 else {
1425 /* Decimal */
1426 do {
1427 c = tok_nextc(tok);
1428 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001429 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001430 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001431 if (c == '.') {
1432 fraction:
1433 /* Fraction */
1434 do {
1435 c = tok_nextc(tok);
1436 } while (isdigit(c));
1437 }
1438 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001439 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001440 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001441 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001442 if (c == '+' || c == '-')
1443 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001444 if (!isdigit(c)) {
1445 tok->done = E_TOKEN;
1446 tok_backup(tok, c);
1447 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001448 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001449 do {
1450 c = tok_nextc(tok);
1451 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001452 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001453#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001454 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001455 /* Imaginary part */
1456 imaginary:
1457 c = tok_nextc(tok);
1458#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001459 }
1460 }
1461 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001462 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001463 *p_end = tok->cur;
1464 return NUMBER;
1465 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001466
1467 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001468 /* String */
1469 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001470 int quote = c;
1471 int quote_size = 1; /* 1 or 3 */
1472 int end_quote_size = 0;
1473
1474 /* Find the quote size and start of string */
1475 c = tok_nextc(tok);
1476 if (c == quote) {
1477 c = tok_nextc(tok);
1478 if (c == quote)
1479 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001480 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001481 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001482 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001483 if (c != quote)
1484 tok_backup(tok, c);
1485
1486 /* Get rest of string */
1487 while (end_quote_size != quote_size) {
1488 c = tok_nextc(tok);
1489 if (c == EOF) {
1490 if (quote_size == 3)
1491 tok->done = E_EOFS;
1492 else
1493 tok->done = E_EOLS;
1494 tok->cur = tok->inp;
1495 return ERRORTOKEN;
1496 }
1497 if (quote_size == 1 && c == '\n') {
1498 tok->done = E_EOLS;
1499 tok->cur = tok->inp;
1500 return ERRORTOKEN;
1501 }
1502 if (c == quote)
1503 end_quote_size += 1;
1504 else {
1505 end_quote_size = 0;
1506 if (c == '\\')
1507 c = tok_nextc(tok); /* skip escaped char */
1508 }
1509 }
1510
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001511 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001512 *p_end = tok->cur;
1513 return STRING;
1514 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001515
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001516 /* Line continuation */
1517 if (c == '\\') {
1518 c = tok_nextc(tok);
1519 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001520 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001521 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001522 return ERRORTOKEN;
1523 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001524 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001525 goto again; /* Read next line */
1526 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001527
Guido van Rossumfbab9051991-10-20 20:25:03 +00001528 /* Check for two-character token */
1529 {
1530 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001531 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001532 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001533 int c3 = tok_nextc(tok);
1534 int token3 = PyToken_ThreeChars(c, c2, c3);
1535 if (token3 != OP) {
1536 token = token3;
1537 } else {
1538 tok_backup(tok, c3);
1539 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001540 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001541 *p_end = tok->cur;
1542 return token;
1543 }
1544 tok_backup(tok, c2);
1545 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001546
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001547 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001548 switch (c) {
1549 case '(':
1550 case '[':
1551 case '{':
1552 tok->level++;
1553 break;
1554 case ')':
1555 case ']':
1556 case '}':
1557 tok->level--;
1558 break;
1559 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001560
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001561 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001562 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001563 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001564 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001565}
1566
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001567int
1568PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1569{
1570 int result = tok_get(tok, p_start, p_end);
1571 if (tok->decoding_erred) {
1572 result = ERRORTOKEN;
1573 tok->done = E_DECODE;
1574 }
1575 return result;
1576}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001577
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001578/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001579
1580 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001581 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001582 should be assumed to be PyUnicode_GetDefaultEncoding()).
1583
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001584 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1585 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001586*/
1587char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001588PyTokenizer_FindEncoding(int fd)
1589{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001590 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001591 FILE *fp;
1592 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001593
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001594 fd = dup(fd);
1595 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001596 return NULL;
1597 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001598 fp = fdopen(fd, "r");
1599 if (fp == NULL) {
1600 return NULL;
1601 }
1602 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1603 if (tok == NULL) {
1604 fclose(fp);
1605 return NULL;
1606 }
1607 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001608 PyTokenizer_Get(tok, &p_start, &p_end);
1609 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001610 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001611 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001612 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001613 strcpy(encoding, tok->encoding);
1614 }
1615 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001616 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001617}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001618
Guido van Rossum408027e1996-12-30 16:17:54 +00001619#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001620
1621void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001622tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001623{
Guido van Rossum86bea461997-04-29 21:03:06 +00001624 printf("%s", _PyParser_TokenNames[type]);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001625 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1626 printf("(%.*s)", (int)(end - start), start);
1627}
1628
1629#endif