blob: 487405f20ede90e0236b3efd747a2e669e0f9ad2 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* Byte classifiers for identifiers.  Besides ASCII letters and '_'
   (plus digits for the "char" variant), any value >= 128 is accepted as
   potentially belonging to an identifier.

   The argument is parenthesized everywhere so that an arbitrary
   expression (e.g. a conditional expression) can be passed safely.
   It is still evaluated more than once, so do not pass an expression
   with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Martin v. Löwis566f6af2002-10-26 14:39:10 +000034extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
/* Token names, indexed by token number. */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",

    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",

    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

    /* More operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",

    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000117 if (tok == NULL)
118 return NULL;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->tabsize = TABSIZE;
123 tok->indent = 0;
124 tok->indstack[0] = 0;
125 tok->atbol = 1;
126 tok->pendin = 0;
127 tok->prompt = tok->nextprompt = NULL;
128 tok->lineno = 0;
Guido van Rossuma849b831993-05-12 11:35:44 +0000129 tok->level = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000130 tok->filename = NULL;
Thomas Wouters6caa07b2006-04-14 11:33:28 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
Guido van Rossum926f13a1998-04-09 21:38:06 +0000133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000135 tok->decoding_state = STATE_INIT;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000138 tok->encoding = NULL;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000139 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000140#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000141 tok->decoding_readline = NULL;
142 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000143#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000144 return tok;
145}
146
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000147#ifdef PGEN
148
149static char *
150decoding_fgets(char *s, int size, struct tok_state *tok)
151{
152 return fgets(s, size, tok->fp);
153}
154
155static int
156decoding_feof(struct tok_state *tok)
157{
158 return feof(tok->fp);
159}
160
/* PGEN build: the input string is used as is; TOK is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    const char *unchanged = str;
    return unchanged;
}
166
167#else /* PGEN */
168
169static char *
170error_ret(struct tok_state *tok) /* XXX */
171{
172 tok->decoding_erred = 1;
173 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000174 PyMem_FREE(tok->buf);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000175 tok->buf = NULL;
176 return NULL; /* as if it were EOF */
177}
178
179static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000180new_string(const char *s, Py_ssize_t len)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000181{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000182 char* result = (char *)PyMem_MALLOC(len + 1);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000183 if (result != NULL) {
184 memcpy(result, s, len);
185 result[len] = '\0';
186 }
187 return result;
188}
189
/* Map the common spellings of utf-8 and latin-1 to a canonical name.

   Only the first 12 bytes of S are examined; they are lower-cased and
   '_' is turned into '-' before the comparison.

   Returns the literal "utf-8" or "iso-8859-1" when S names (or starts
   with a "<name>-" variant of) one of those encodings; otherwise returns
   S itself, so callers can tell "unchanged" by pointer comparison.
   The returned literals must not be modified or freed. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative value (a byte
           >= 0x80 where char is signed) to tolower() is undefined. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
212
213/* Return the coding spec in S, or NULL if none is found. */
214
215static char *
Martin v. Löwis18e16552006-02-15 17:27:45 +0000216get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000217{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000218 Py_ssize_t i;
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000219 /* Coding spec must be in a comment, and that comment must be
220 * the only statement on the source code line. */
221 for (i = 0; i < size - 6; i++) {
222 if (s[i] == '#')
223 break;
224 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
225 return NULL;
226 }
227 for (; i < size - 6; i++) { /* XXX inefficient search */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228 const char* t = s + i;
229 if (strncmp(t, "coding", 6) == 0) {
230 const char* begin = NULL;
231 t += 6;
232 if (t[0] != ':' && t[0] != '=')
233 continue;
234 do {
235 t++;
236 } while (t[0] == '\x20' || t[0] == '\t');
237
238 begin = t;
Neal Norwitz30b5c5d2005-12-19 06:05:18 +0000239 while (isalnum(Py_CHARMASK(t[0])) ||
Neal Norwitze08e1bc2002-11-02 20:43:25 +0000240 t[0] == '-' || t[0] == '_' || t[0] == '.')
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000241 t++;
242
243 if (begin < t) {
244 char* r = new_string(begin, t - begin);
245 char* q = get_normal_name(r);
246 if (r != q) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247 PyMem_FREE(r);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000248 r = new_string(q, strlen(q));
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249 }
250 return r;
251 }
252 }
253 }
254 return NULL;
255}
256
257/* Check whether the line contains a coding spec. If it does,
258 invoke the set_readline function for the new encoding.
259 This function receives the tok_state and the new encoding.
260 Return 1 on success, 0 on failure. */
261
262static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000263check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000264 int set_readline(struct tok_state *, const char *))
265{
Tim Peters17db21f2002-09-03 15:39:58 +0000266 char * cs;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000267 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000268
Martin v. Löwisf62a89b2002-09-03 11:52:44 +0000269 if (tok->cont_line)
270 /* It's a continuation line, so it can't be a coding spec. */
271 return 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000272 cs = get_coding_spec(line, size);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000273 if (cs != NULL) {
274 tok->read_coding_spec = 1;
275 if (tok->encoding == NULL) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000276 assert(tok->decoding_state == STATE_RAW);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000277 if (strcmp(cs, "utf-8") == 0 ||
278 strcmp(cs, "iso-8859-1") == 0) {
279 tok->encoding = cs;
280 } else {
281 r = set_readline(tok, cs);
282 if (r) {
283 tok->encoding = cs;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000284 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000285 }
Neal Norwitzc0d5faa2005-10-21 06:05:33 +0000286 else
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000291 PyMem_FREE(cs);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000292 }
293 }
Neal Norwitzdb83eb32005-12-18 05:29:30 +0000294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000300 return r;
301}
302
303/* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
306
307static int
308check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
312{
313 int ch = get_char(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000314 tok->decoding_state = STATE_RAW;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000318 ch = get_char(tok);
319 if (ch != 0xBB) {
320 unget_char(ch, tok);
321 unget_char(0xEF, tok);
322 /* any token beginning with '\xEF' is a bad token */
323 return 1;
324 }
325 ch = get_char(tok);
326 if (ch != 0xBF) {
327 unget_char(ch, tok);
328 unget_char(0xBB, tok);
329 unget_char(0xEF, tok);
330 /* any token beginning with '\xEF' is a bad token */
331 return 1;
332 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000333#if 0
334 /* Disable support for UTF-16 BOMs until a decision
335 is made whether this needs to be supported. */
336 } else if (ch == 0xFE) {
337 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
338 if (!set_readline(tok, "utf-16-be")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000339 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000340 } else if (ch == 0xFF) {
341 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
342 if (!set_readline(tok, "utf-16-le")) return 0;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000343 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000344#endif
345 } else {
346 unget_char(ch, tok);
347 return 1;
348 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000349 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000350 PyMem_FREE(tok->encoding);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000351 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
Amaury Forgeot d'Arcaf593462007-11-22 20:53:01 +0000352 /* No need to set_readline: input is already utf-8 */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000353 return 1;
354}
355
356/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000357 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000359 On entry, tok->decoding_buffer will be one of:
360 1) NULL: need to call tok->decoding_readline to get a new line
361 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
362 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000363 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000364 (in the s buffer) to copy entire contents of the line read
365 by tok->decoding_readline. tok->decoding_buffer has the overflow.
366 In this case, fp_readl is called in a loop (with an expanded buffer)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 until the buffer ends with a '\n' (or until the end of the file is
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000368 reached): see tok_nextc and its calls to decoding_fgets.
369*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000370
371static char *
372fp_readl(char *s, int size, struct tok_state *tok)
373{
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000374 PyObject* bufobj;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000375 const char *buf;
376 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000377
378 /* Ask for one less byte so we can terminate it */
379 assert(size > 0);
380 size--;
381
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000382 if (tok->decoding_buffer) {
383 bufobj = tok->decoding_buffer;
384 Py_INCREF(bufobj);
385 }
386 else
387 {
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000388 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
389 if (bufobj == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000390 goto error;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000391 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000392 if (PyUnicode_CheckExact(bufobj))
393 {
394 buf = PyUnicode_AsStringAndSize(bufobj, &buflen);
395 if (buf == NULL) {
396 goto error;
397 }
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000398 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000399 else
400 {
Christian Heimes9c4756e2008-05-26 13:22:05 +0000401 buf = PyByteArray_AsString(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000402 if (buf == NULL) {
403 goto error;
404 }
Christian Heimes9c4756e2008-05-26 13:22:05 +0000405 buflen = PyByteArray_GET_SIZE(bufobj);
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000406 }
407
408 Py_XDECREF(tok->decoding_buffer);
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000409 if (buflen > size) {
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000410 /* Too many chars, the rest goes into tok->decoding_buffer */
Christian Heimes9c4756e2008-05-26 13:22:05 +0000411 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000412 buflen-size);
413 if (tok->decoding_buffer == NULL)
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000414 goto error;
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000415 buflen = size;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000416 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000417 else
418 tok->decoding_buffer = NULL;
419
Guido van Rossumccf4f0f2007-05-09 23:38:34 +0000420 memcpy(s, buf, buflen);
421 s[buflen] = '\0';
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000422 if (buflen == 0) /* EOF */
423 s = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000424 Py_DECREF(bufobj);
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000425 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000426
427error:
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000428 Py_XDECREF(bufobj);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000429 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000430}
431
432/* Set the readline function for TOK to a StreamReader's
433 readline function. The StreamReader is named ENC.
434
435 This function is called from check_bom and check_coding_spec.
436
437 ENC is usually identical to the future value of tok->encoding,
438 except for the (currently unsupported) case of UTF-16.
439
440 Return 1 on success, 0 on failure. */
441
442static int
443fp_setreadl(struct tok_state *tok, const char* enc)
444{
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000445 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000446
Christian Heimes819b8bf2008-01-03 23:05:47 +0000447 io = PyImport_ImportModuleNoBlock("io");
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000448 if (io == NULL)
449 goto cleanup;
450
451 stream = PyObject_CallMethod(io, "open", "ssis",
452 tok->filename, "r", -1, enc);
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000453 if (stream == NULL)
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000454 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000455
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000456 Py_XDECREF(tok->decoding_readline);
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000457 readline = PyObject_GetAttrString(stream, "readline");
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000458 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000459
460 cleanup:
461 Py_XDECREF(stream);
462 Py_XDECREF(io);
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000463 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000464}
465
466/* Fetch the next byte from TOK. */
467
468static int fp_getc(struct tok_state *tok) {
469 return getc(tok->fp);
470}
471
472/* Unfetch the last byte back into TOK. */
473
474static void fp_ungetc(int c, struct tok_state *tok) {
475 ungetc(c, tok->fp);
476}
477
/* Check whether the bytes at s start a structurally valid UTF-8
   sequence (lead byte followed by the right number of continuation
   bytes in the range 0x80..0xBF).  Return the number of bytes forming
   the sequence if yes, 0 if not.

   s must be NUL-terminated.  Continuation bytes are examined in order,
   so the scan stops at the terminator of a truncated sequence and never
   reads beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        /* A NUL terminator fails this test too (it is < 0x80). */
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
505
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000506/* Read a line of input from TOK. Determine encoding
507 if necessary. */
508
509static char *
510decoding_fgets(char *s, int size, struct tok_state *tok)
511{
Martin v. Löwis2863c102002-08-07 15:18:57 +0000512 char *line = NULL;
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000513 int badchar = 0;
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000514 for (;;) {
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000515 if (tok->decoding_state == STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000516 /* We already have a codec associated with
517 this input. */
518 line = fp_readl(s, size, tok);
519 break;
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000520 } else if (tok->decoding_state == STATE_RAW) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000521 /* We want a 'raw' read. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000522 line = Py_UniversalNewlineFgets(s, size,
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000523 tok->fp, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000524 break;
525 } else {
526 /* We have not yet determined the encoding.
527 If an encoding is found, use the file-pointer
528 reader functions from now on. */
529 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
530 return error_ret(tok);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000531 assert(tok->decoding_state != STATE_INIT);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000532 }
Martin v. Löwiscd280fb2002-08-04 18:28:44 +0000533 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000534 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
535 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
536 return error_ret(tok);
537 }
538 }
539#ifndef PGEN
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000540 /* The default encoding is UTF-8, so make sure we don't have any
541 non-UTF-8 sequences in it. */
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000542 if (line && !tok->encoding) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000543 unsigned char *c;
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000544 int length;
545 for (c = (unsigned char *)line; *c; c += length)
546 if (!(length = valid_utf8(c))) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000547 badchar = *c;
548 break;
549 }
550 }
551 if (badchar) {
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000552 char buf[500];
Martin v. Löwis725bb232002-08-05 01:49:16 +0000553 /* Need to add 1 to the line number, since this line
554 has not been counted, yet. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000555 sprintf(buf,
Martin v. Löwis447d33e2007-07-29 18:10:01 +0000556 "Non-UTF-8 code starting with '\\x%.2x' "
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000557 "in file %.200s on line %i, "
558 "but no encoding declared; "
Guido van Rossum21b731f2007-08-30 00:10:46 +0000559 "see http://python.org/dev/peps/pep-0263/ for details",
Marc-André Lemburg1fb14002003-02-17 18:31:57 +0000560 badchar, tok->filename, tok->lineno + 1);
Martin v. Löwis6cba2562006-02-28 22:41:29 +0000561 PyErr_SetString(PyExc_SyntaxError, buf);
562 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563 }
564#endif
565 return line;
566}
567
568static int
569decoding_feof(struct tok_state *tok)
570{
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000571 if (tok->decoding_state != STATE_NORMAL) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000572 return feof(tok->fp);
573 } else {
574 PyObject* buf = tok->decoding_buffer;
575 if (buf == NULL) {
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000576 buf = PyObject_CallObject(tok->decoding_readline, NULL);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000577 if (buf == NULL) {
578 error_ret(tok);
579 return 1;
580 } else {
581 tok->decoding_buffer = buf;
582 }
583 }
584 return PyObject_Length(buf) == 0;
585 }
586}
587
588/* Fetch a byte from TOK, using the string buffer. */
589
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000590static int
591buf_getc(struct tok_state *tok) {
Just van Rossumf032f862003-02-09 20:38:48 +0000592 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000593}
594
595/* Unfetch a byte from TOK, using the string buffer. */
596
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000597static void
598buf_ungetc(int c, struct tok_state *tok) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000599 tok->str--;
Just van Rossumf032f862003-02-09 20:38:48 +0000600 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000601}
602
603/* Set the readline function for TOK to ENC. For the string-based
604 tokenizer, this means to just record the encoding. */
605
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000606static int
607buf_setreadl(struct tok_state *tok, const char* enc) {
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000608 tok->enc = enc;
609 return 1;
610}
611
612/* Return a UTF-8 encoding Python string object from the
613 C byte string STR, which is encoded with ENC. */
614
615static PyObject *
616translate_into_utf8(const char* str, const char* enc) {
617 PyObject *utf8;
618 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
619 if (buf == NULL)
620 return NULL;
621 utf8 = PyUnicode_AsUTF8String(buf);
622 Py_DECREF(buf);
623 return utf8;
624}
625
626/* Decode a byte string STR for use as the buffer of TOK.
627 Look for encoding declarations inside STR, and record them
628 inside TOK. */
629
630static const char *
631decode_str(const char *str, struct tok_state *tok)
632{
633 PyObject* utf8 = NULL;
634 const char *s;
Christian Heimes1af737c2008-01-23 08:24:23 +0000635 const char *newl[2] = {NULL, NULL};
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000636 int lineno = 0;
637 tok->enc = NULL;
638 tok->str = str;
639 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000640 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641 str = tok->str; /* string after BOM if any */
Tim Peters2c3f9c62002-08-04 17:58:34 +0000642 assert(str);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000643 if (tok->enc != NULL) {
644 utf8 = translate_into_utf8(str, tok->enc);
645 if (utf8 == NULL)
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000646 return error_ret(tok);
Christian Heimes72b710a2008-05-26 13:28:38 +0000647 str = PyBytes_AsString(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648 }
649 for (s = str;; s++) {
650 if (*s == '\0') break;
651 else if (*s == '\n') {
Christian Heimes412dc9c2008-01-27 18:55:54 +0000652 assert(lineno < 2);
Georg Brandl86def6c2008-01-21 20:36:10 +0000653 newl[lineno] = s;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000654 lineno++;
655 if (lineno == 2) break;
656 }
657 }
658 tok->enc = NULL;
Georg Brandl86def6c2008-01-21 20:36:10 +0000659 /* need to check line 1 and 2 separately since check_coding_spec
660 assumes a single line as input */
661 if (newl[0]) {
662 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
663 return error_ret(tok);
664 if (tok->enc == NULL && newl[1]) {
665 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
666 tok, buf_setreadl))
667 return error_ret(tok);
668 }
669 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000670 if (tok->enc != NULL) {
671 assert(utf8 == NULL);
672 utf8 = translate_into_utf8(str, tok->enc);
Neal Norwitz40d37812005-10-02 01:48:49 +0000673 if (utf8 == NULL) {
674 PyErr_Format(PyExc_SyntaxError,
675 "unknown encoding: %s", tok->enc);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000676 return error_ret(tok);
Neal Norwitz40d37812005-10-02 01:48:49 +0000677 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000678 str = PyBytes_AS_STRING(utf8);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000679 }
680 assert(tok->decoding_buffer == NULL);
681 tok->decoding_buffer = utf8; /* CAUTION */
682 return str;
683}
684
685#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000686
687/* Set up tokenizer for string */
688
689struct tok_state *
Martin v. Löwis95292d62002-12-11 14:04:59 +0000690PyTokenizer_FromString(const char *str)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000691{
692 struct tok_state *tok = tok_new();
693 if (tok == NULL)
694 return NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000695 str = (char *)decode_str(str, tok);
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000696 if (str == NULL) {
697 PyTokenizer_Free(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000698 return NULL;
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000699 }
700
Martin v. Löwis95292d62002-12-11 14:04:59 +0000701 /* XXX: constify members. */
702 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000703 return tok;
704}
705
706
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000707/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000708
709struct tok_state *
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000710PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000711{
712 struct tok_state *tok = tok_new();
713 if (tok == NULL)
714 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000715 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000716 PyTokenizer_Free(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000717 return NULL;
718 }
719 tok->cur = tok->inp = tok->buf;
720 tok->end = tok->buf + BUFSIZ;
721 tok->fp = fp;
722 tok->prompt = ps1;
723 tok->nextprompt = ps2;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000724 if (enc != NULL) {
725 /* Must copy encoding declaration since it
726 gets copied into the parse tree. */
727 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
728 if (!tok->encoding) {
729 PyTokenizer_Free(tok);
730 return NULL;
731 }
732 strcpy(tok->encoding, enc);
Neil Schemenauer3f993c32007-09-21 20:50:26 +0000733 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000734 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000735 return tok;
736}
737
738
739/* Free a tok_state structure */
740
741void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000742PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000743{
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000744 if (tok->encoding != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000745 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000746#ifndef PGEN
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000747 Py_XDECREF(tok->decoding_readline);
748 Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000749#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000750 if (tok->fp != NULL && tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000751 PyMem_FREE(tok->buf);
752 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000753}
754
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000755/* Get next char, updating state; error code goes into tok->done */
756
757static int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000758tok_nextc(register struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000759{
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000760 for (;;) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000761 if (tok->cur != tok->inp) {
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000762 return Py_CHARMASK(*tok->cur++); /* Fast path */
Guido van Rossum1a817c01994-09-19 08:06:25 +0000763 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000764 if (tok->done != E_OK)
765 return EOF;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000766 if (tok->fp == NULL) {
Guido van Rossum1a817c01994-09-19 08:06:25 +0000767 char *end = strchr(tok->inp, '\n');
768 if (end != NULL)
769 end++;
770 else {
771 end = strchr(tok->inp, '\0');
772 if (end == tok->inp) {
773 tok->done = E_EOF;
774 return EOF;
775 }
776 }
777 if (tok->start == NULL)
778 tok->buf = tok->cur;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000779 tok->line_start = tok->cur;
Guido van Rossum1a817c01994-09-19 08:06:25 +0000780 tok->lineno++;
781 tok->inp = end;
Guido van Rossumcf57d8b1998-01-19 22:07:46 +0000782 return Py_CHARMASK(*tok->cur++);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000783 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000784 if (tok->prompt != NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000785 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000786#ifndef PGEN
787 if (tok->encoding && newtok && *newtok) {
788 /* Recode to UTF-8 */
789 Py_ssize_t buflen;
790 const char* buf;
791 PyObject *u = translate_into_utf8(newtok, tok->encoding);
792 PyMem_FREE(newtok);
793 if (!u) {
794 tok->done = E_DECODE;
795 return EOF;
796 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000797 buflen = PyBytes_GET_SIZE(u);
798 buf = PyBytes_AS_STRING(u);
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000799 if (!buf) {
800 Py_DECREF(u);
801 tok->done = E_DECODE;
802 return EOF;
803 }
804 newtok = PyMem_MALLOC(buflen+1);
805 strcpy(newtok, buf);
806 Py_DECREF(u);
807 }
808#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000809 if (tok->nextprompt != NULL)
810 tok->prompt = tok->nextprompt;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000811 if (newtok == NULL)
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000812 tok->done = E_INTR;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000813 else if (*newtok == '\0') {
814 PyMem_FREE(newtok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000815 tok->done = E_EOF;
816 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000817 else if (tok->start != NULL) {
Guido van Rossum6da34342000-06-28 22:00:02 +0000818 size_t start = tok->start - tok->buf;
819 size_t oldlen = tok->cur - tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000820 size_t newlen = oldlen + strlen(newtok);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000821 char *buf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000822 buf = (char *)PyMem_REALLOC(buf, newlen+1);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000823 tok->lineno++;
824 if (buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000825 PyMem_FREE(tok->buf);
Guido van Rossum588633d1994-12-30 15:46:02 +0000826 tok->buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000827 PyMem_FREE(newtok);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000828 tok->done = E_NOMEM;
829 return EOF;
830 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000831 tok->buf = buf;
832 tok->cur = tok->buf + oldlen;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000833 tok->line_start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000834 strcpy(tok->buf + oldlen, newtok);
835 PyMem_FREE(newtok);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000836 tok->inp = tok->buf + newlen;
837 tok->end = tok->inp + 1;
838 tok->start = tok->buf + start;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000839 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000840 else {
841 tok->lineno++;
842 if (tok->buf != NULL)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000843 PyMem_FREE(tok->buf);
844 tok->buf = newtok;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000845 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000846 tok->cur = tok->buf;
Martin v. Löwis49c5da12006-03-01 22:49:05 +0000847 tok->line_start = tok->buf;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000848 tok->inp = strchr(tok->buf, '\0');
849 tok->end = tok->inp + 1;
850 }
851 }
852 else {
853 int done = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000854 Py_ssize_t cur = 0;
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000855 char *pt;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000856 if (tok->start == NULL) {
857 if (tok->buf == NULL) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000858 tok->buf = (char *)
859 PyMem_MALLOC(BUFSIZ);
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000860 if (tok->buf == NULL) {
861 tok->done = E_NOMEM;
862 return EOF;
863 }
864 tok->end = tok->buf + BUFSIZ;
865 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000866 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
867 tok) == NULL) {
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000868 tok->done = E_EOF;
869 done = 1;
870 }
871 else {
872 tok->done = E_OK;
873 tok->inp = strchr(tok->buf, '\0');
874 done = tok->inp[-1] == '\n';
875 }
876 }
877 else {
878 cur = tok->cur - tok->buf;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000879 if (decoding_feof(tok)) {
Guido van Rossum78c05351995-01-17 16:12:13 +0000880 tok->done = E_EOF;
881 done = 1;
882 }
883 else
884 tok->done = E_OK;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000885 }
886 tok->lineno++;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000887 /* Read until '\n' or EOF */
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000888 while (!done) {
Martin v. Löwis18e16552006-02-15 17:27:45 +0000889 Py_ssize_t curstart = tok->start == NULL ? -1 :
890 tok->start - tok->buf;
891 Py_ssize_t curvalid = tok->inp - tok->buf;
892 Py_ssize_t newsize = curvalid + BUFSIZ;
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000893 char *newbuf = tok->buf;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000894 newbuf = (char *)PyMem_REALLOC(newbuf,
895 newsize);
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000896 if (newbuf == NULL) {
897 tok->done = E_NOMEM;
898 tok->cur = tok->inp;
899 return EOF;
900 }
901 tok->buf = newbuf;
902 tok->inp = tok->buf + curvalid;
903 tok->end = tok->buf + newsize;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000904 tok->start = curstart < 0 ? NULL :
905 tok->buf + curstart;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000906 if (decoding_fgets(tok->inp,
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000907 (int)(tok->end - tok->inp),
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000908 tok) == NULL) {
Thomas Wouters7eaf2aa2006-03-02 20:41:27 +0000909 /* Break out early on decoding
910 errors, as tok->buf will be NULL
911 */
912 if (tok->decoding_erred)
913 return EOF;
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000914 /* Last line does not end in \n,
915 fake one */
916 strcpy(tok->inp, "\n");
917 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000918 tok->inp = strchr(tok->inp, '\0');
Guido van Rossumf4b1a641994-08-29 12:43:07 +0000919 done = tok->inp[-1] == '\n';
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000920 }
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000921 if (tok->buf != NULL) {
922 tok->cur = tok->buf + cur;
923 tok->line_start = tok->cur;
924 /* replace "\r\n" with "\n" */
Thomas Wouters89f507f2006-12-13 04:49:30 +0000925 /* For Mac leave the \r, giving a syntax error */
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000926 pt = tok->inp - 2;
927 if (pt >= tok->buf && *pt == '\r') {
928 *pt++ = '\n';
929 *pt = '\0';
930 tok->inp = pt;
931 }
Guido van Rossum2e96eb91995-06-14 18:26:02 +0000932 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000933 }
934 if (tok->done != E_OK) {
935 if (tok->prompt != NULL)
Guido van Rossum6e73bf41998-08-25 18:13:04 +0000936 PySys_WriteStderr("\n");
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000937 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000938 return EOF;
939 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000940 }
Guido van Rossum6ac258d1993-05-12 08:24:20 +0000941 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000942}
943
944
945/* Back-up one character */
946
947static void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000948tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000949{
950 if (c != EOF) {
Guido van Rossum588633d1994-12-30 15:46:02 +0000951 if (--tok->cur < tok->buf)
Guido van Rossum86bea461997-04-29 21:03:06 +0000952 Py_FatalError("tok_backup: begin of buffer");
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000953 if (*tok->cur != c)
954 *tok->cur = c;
955 }
956}
957
958
959/* Return the token corresponding to a single character */
960
961int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000962PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000963{
964 switch (c) {
965 case '(': return LPAR;
966 case ')': return RPAR;
967 case '[': return LSQB;
968 case ']': return RSQB;
969 case ':': return COLON;
970 case ',': return COMMA;
971 case ';': return SEMI;
972 case '+': return PLUS;
973 case '-': return MINUS;
974 case '*': return STAR;
975 case '/': return SLASH;
976 case '|': return VBAR;
977 case '&': return AMPER;
978 case '<': return LESS;
979 case '>': return GREATER;
980 case '=': return EQUAL;
981 case '.': return DOT;
982 case '%': return PERCENT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000983 case '{': return LBRACE;
984 case '}': return RBRACE;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +0000985 case '^': return CIRCUMFLEX;
986 case '~': return TILDE;
Anthony Baxterc2a5a632004-08-02 06:10:11 +0000987 case '@': return AT;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000988 default: return OP;
989 }
990}
991
992
Guido van Rossumfbab9051991-10-20 20:25:03 +0000993int
Thomas Wouters23c9e002000-07-22 19:20:54 +0000994PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +0000995{
996 switch (c1) {
997 case '=':
998 switch (c2) {
999 case '=': return EQEQUAL;
1000 }
1001 break;
1002 case '!':
1003 switch (c2) {
1004 case '=': return NOTEQUAL;
1005 }
1006 break;
1007 case '<':
1008 switch (c2) {
Guido van Rossumfbab9051991-10-20 20:25:03 +00001009 case '=': return LESSEQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001010 case '<': return LEFTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001011 }
1012 break;
1013 case '>':
1014 switch (c2) {
1015 case '=': return GREATEREQUAL;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001016 case '>': return RIGHTSHIFT;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001017 }
1018 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001019 case '+':
1020 switch (c2) {
1021 case '=': return PLUSEQUAL;
1022 }
1023 break;
1024 case '-':
1025 switch (c2) {
1026 case '=': return MINEQUAL;
Neal Norwitzc1505362006-12-28 06:47:50 +00001027 case '>': return RARROW;
Thomas Wouters434d0822000-08-24 20:11:32 +00001028 }
1029 break;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001030 case '*':
1031 switch (c2) {
1032 case '*': return DOUBLESTAR;
Thomas Wouters434d0822000-08-24 20:11:32 +00001033 case '=': return STAREQUAL;
1034 }
1035 break;
1036 case '/':
1037 switch (c2) {
Guido van Rossum4668b002001-08-08 05:00:18 +00001038 case '/': return DOUBLESLASH;
Thomas Wouters434d0822000-08-24 20:11:32 +00001039 case '=': return SLASHEQUAL;
1040 }
1041 break;
1042 case '|':
1043 switch (c2) {
1044 case '=': return VBAREQUAL;
1045 }
1046 break;
1047 case '%':
1048 switch (c2) {
1049 case '=': return PERCENTEQUAL;
1050 }
1051 break;
1052 case '&':
1053 switch (c2) {
1054 case '=': return AMPEREQUAL;
1055 }
1056 break;
1057 case '^':
1058 switch (c2) {
1059 case '=': return CIRCUMFLEXEQUAL;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001060 }
1061 break;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001062 }
1063 return OP;
1064}
1065
Thomas Wouters434d0822000-08-24 20:11:32 +00001066int
1067PyToken_ThreeChars(int c1, int c2, int c3)
1068{
1069 switch (c1) {
1070 case '<':
1071 switch (c2) {
1072 case '<':
1073 switch (c3) {
1074 case '=':
1075 return LEFTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001076 }
1077 break;
1078 }
1079 break;
1080 case '>':
1081 switch (c2) {
1082 case '>':
1083 switch (c3) {
1084 case '=':
1085 return RIGHTSHIFTEQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001086 }
1087 break;
1088 }
1089 break;
1090 case '*':
1091 switch (c2) {
1092 case '*':
1093 switch (c3) {
1094 case '=':
1095 return DOUBLESTAREQUAL;
Thomas Wouters434d0822000-08-24 20:11:32 +00001096 }
1097 break;
1098 }
1099 break;
Guido van Rossum4668b002001-08-08 05:00:18 +00001100 case '/':
1101 switch (c2) {
1102 case '/':
1103 switch (c3) {
1104 case '=':
1105 return DOUBLESLASHEQUAL;
1106 }
1107 break;
1108 }
1109 break;
Georg Brandldde00282007-03-18 19:01:53 +00001110 case '.':
1111 switch (c2) {
1112 case '.':
1113 switch (c3) {
1114 case '.':
1115 return ELLIPSIS;
1116 }
1117 break;
1118 }
1119 break;
Thomas Wouters434d0822000-08-24 20:11:32 +00001120 }
1121 return OP;
1122}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001123
Guido van Rossum926f13a1998-04-09 21:38:06 +00001124static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001125indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001126{
1127 if (tok->alterror) {
Fred Drake85f36392000-07-11 17:53:00 +00001128 tok->done = E_TABSPACE;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001129 tok->cur = tok->inp;
1130 return 1;
1131 }
1132 if (tok->altwarning) {
Fred Drake85f36392000-07-11 17:53:00 +00001133 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1134 "in indentation\n", tok->filename);
Guido van Rossum926f13a1998-04-09 21:38:06 +00001135 tok->altwarning = 0;
1136 }
1137 return 0;
1138}
1139
Martin v. Löwis47383402007-08-15 07:32:56 +00001140#ifdef PGEN
1141#define verify_identifier(s,e) 1
1142#else
1143/* Verify that the identifier follows PEP 3131. */
1144static int
1145verify_identifier(char *start, char *end)
1146{
Guido van Rossume3e37012007-08-29 18:54:41 +00001147 PyObject *s;
1148 int result;
1149 s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1150 if (s == NULL) {
1151 PyErr_Clear();
1152 return 0;
1153 }
1154 result = PyUnicode_IsIdentifier(s);
Martin v. Löwis47383402007-08-15 07:32:56 +00001155 Py_DECREF(s);
1156 return result;
1157}
1158#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001159
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001160/* Get next token, after space stripping etc. */
1161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001162static int
1163tok_get(register struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001164{
1165 register int c;
Martin v. Löwis47383402007-08-15 07:32:56 +00001166 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001167
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001168 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001169 nextline:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001170 tok->start = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001171 blankline = 0;
1172
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001173 /* Get indentation level */
1174 if (tok->atbol) {
1175 register int col = 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001176 register int altcol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001177 tok->atbol = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001178 for (;;) {
1179 c = tok_nextc(tok);
1180 if (c == ' ')
Guido van Rossum926f13a1998-04-09 21:38:06 +00001181 col++, altcol++;
1182 else if (c == '\t') {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001183 col = (col/tok->tabsize + 1) * tok->tabsize;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001184 altcol = (altcol/tok->alttabsize + 1)
1185 * tok->alttabsize;
1186 }
Guido van Rossum94d32b11995-07-07 22:27:27 +00001187 else if (c == '\014') /* Control-L (formfeed) */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001188 col = altcol = 0; /* For Emacs users */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001189 else
1190 break;
1191 }
1192 tok_backup(tok, c);
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001193 if (c == '#' || c == '\n') {
1194 /* Lines with only whitespace and/or comments
1195 shouldn't affect the indentation and are
1196 not passed to the parser as NEWLINE tokens,
1197 except *totally* empty lines in interactive
1198 mode, which signal the end of a command group. */
1199 if (col == 0 && c == '\n' && tok->prompt != NULL)
1200 blankline = 0; /* Let it through */
1201 else
1202 blankline = 1; /* Ignore completely */
1203 /* We can't jump back right here since we still
1204 may need to skip to the end of a comment */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001205 }
Guido van Rossuma849b831993-05-12 11:35:44 +00001206 if (!blankline && tok->level == 0) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001207 if (col == tok->indstack[tok->indent]) {
1208 /* No change */
Guido van Rossum926f13a1998-04-09 21:38:06 +00001209 if (altcol != tok->altindstack[tok->indent]) {
1210 if (indenterror(tok))
1211 return ERRORTOKEN;
1212 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001213 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001214 else if (col > tok->indstack[tok->indent]) {
1215 /* Indent -- always one */
1216 if (tok->indent+1 >= MAXINDENT) {
Fred Drake85f36392000-07-11 17:53:00 +00001217 tok->done = E_TOODEEP;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001218 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001219 return ERRORTOKEN;
1220 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001221 if (altcol <= tok->altindstack[tok->indent]) {
1222 if (indenterror(tok))
1223 return ERRORTOKEN;
1224 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001225 tok->pendin++;
1226 tok->indstack[++tok->indent] = col;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001227 tok->altindstack[tok->indent] = altcol;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001228 }
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001229 else /* col < tok->indstack[tok->indent] */ {
1230 /* Dedent -- any number, must be consistent */
1231 while (tok->indent > 0 &&
1232 col < tok->indstack[tok->indent]) {
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001233 tok->pendin--;
Guido van Rossum54758fa1998-02-16 22:18:00 +00001234 tok->indent--;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001235 }
1236 if (col != tok->indstack[tok->indent]) {
Fred Drake85f36392000-07-11 17:53:00 +00001237 tok->done = E_DEDENT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001238 tok->cur = tok->inp;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001239 return ERRORTOKEN;
1240 }
Guido van Rossum926f13a1998-04-09 21:38:06 +00001241 if (altcol != tok->altindstack[tok->indent]) {
1242 if (indenterror(tok))
1243 return ERRORTOKEN;
1244 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001245 }
1246 }
1247 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001248
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001249 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001250
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001251 /* Return pending indents/dedents */
1252 if (tok->pendin != 0) {
1253 if (tok->pendin < 0) {
1254 tok->pendin++;
1255 return DEDENT;
1256 }
1257 else {
1258 tok->pendin--;
1259 return INDENT;
1260 }
1261 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001262
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001263 again:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001264 tok->start = NULL;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001265 /* Skip spaces */
1266 do {
1267 c = tok_nextc(tok);
Guido van Rossum94d32b11995-07-07 22:27:27 +00001268 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001269
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001270 /* Set start of current token */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001271 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001272
Thomas Wouters6caa07b2006-04-14 11:33:28 +00001273 /* Skip comment */
1274 if (c == '#')
Guido van Rossumab5ca152000-03-31 00:52:27 +00001275 while (c != EOF && c != '\n')
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001276 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001277
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001278 /* Check for EOF and errors now */
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001279 if (c == EOF) {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001280 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001281 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001282
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001283 /* Identifier (most frequent token!) */
Martin v. Löwis47383402007-08-15 07:32:56 +00001284 nonascii = 0;
Martin v. Löwis5b222132007-06-10 09:51:05 +00001285 if (is_potential_identifier_start(c)) {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001286 /* Process b"", r"" and br"" */
1287 if (c == 'b' || c == 'B') {
Guido van Rossum5026cb41997-04-25 17:32:00 +00001288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
1291 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001292 if (c == 'r' || c == 'R') {
1293 c = tok_nextc(tok);
1294 if (c == '"' || c == '\'')
1295 goto letter_quote;
1296 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001297 while (is_potential_identifier_char(c)) {
Martin v. Löwis47383402007-08-15 07:32:56 +00001298 if (c >= 128)
1299 nonascii = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001300 c = tok_nextc(tok);
Guido van Rossum24dacb31997-04-06 03:46:20 +00001301 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001302 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001303 if (nonascii &&
Martin v. Löwis47383402007-08-15 07:32:56 +00001304 !verify_identifier(tok->start, tok->cur)) {
1305 tok->done = E_IDENTIFIER;
1306 return ERRORTOKEN;
1307 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001308 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001309 *p_end = tok->cur;
1310 return NAME;
1311 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001312
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001313 /* Newline */
1314 if (c == '\n') {
1315 tok->atbol = 1;
Guido van Rossuma849b831993-05-12 11:35:44 +00001316 if (blankline || tok->level > 0)
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001317 goto nextline;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001318 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001319 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001320 tok->cont_line = 0;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001321 return NEWLINE;
1322 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001323
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001324 /* Period or number starting with period? */
1325 if (c == '.') {
1326 c = tok_nextc(tok);
1327 if (isdigit(c)) {
1328 goto fraction;
Georg Brandldde00282007-03-18 19:01:53 +00001329 } else if (c == '.') {
1330 c = tok_nextc(tok);
1331 if (c == '.') {
1332 *p_start = tok->start;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001333 *p_end = tok->cur;
Georg Brandldde00282007-03-18 19:01:53 +00001334 return ELLIPSIS;
1335 } else {
1336 tok_backup(tok, c);
1337 }
1338 tok_backup(tok, '.');
1339 } else {
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001340 tok_backup(tok, c);
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001341 }
Georg Brandldde00282007-03-18 19:01:53 +00001342 *p_start = tok->start;
1343 *p_end = tok->cur;
1344 return DOT;
Guido van Rossumbaf0ebf1991-10-24 14:59:40 +00001345 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001346
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001347 /* Number */
1348 if (isdigit(c)) {
1349 if (c == '0') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001350 /* Hex, octal or binary -- maybe. */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001351 c = tok_nextc(tok);
1352 if (c == '.')
1353 goto fraction;
Guido van Rossumf595fde1996-01-12 01:31:58 +00001354#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001355 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001356 goto imaginary;
1357#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001358 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001359
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001360 /* Hex */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001361 c = tok_nextc(tok);
1362 if (!isxdigit(c)) {
1363 tok->done = E_TOKEN;
1364 tok_backup(tok, c);
1365 return ERRORTOKEN;
1366 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001367 do {
1368 c = tok_nextc(tok);
1369 } while (isxdigit(c));
1370 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001371 else if (c == 'o' || c == 'O') {
1372 /* Octal */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001373 c = tok_nextc(tok);
Christian Heimes81ee3ef2008-05-04 22:42:01 +00001374 if (c < '0' || c >= '8') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001375 tok->done = E_TOKEN;
1376 tok_backup(tok, c);
1377 return ERRORTOKEN;
1378 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001379 do {
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001380 c = tok_nextc(tok);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001381 } while ('0' <= c && c < '8');
1382 }
1383 else if (c == 'b' || c == 'B') {
1384 /* Binary */
Georg Brandlfceab5a2008-01-19 20:08:23 +00001385 c = tok_nextc(tok);
1386 if (c != '0' && c != '1') {
1387 tok->done = E_TOKEN;
1388 tok_backup(tok, c);
1389 return ERRORTOKEN;
1390 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001391 do {
1392 c = tok_nextc(tok);
1393 } while (c == '0' || c == '1');
1394 }
1395 else {
1396 int nonzero = 0;
1397 /* maybe old-style octal; c is first char of it */
1398 /* in any case, allow '0' as a literal */
1399 while (c == '0')
1400 c = tok_nextc(tok);
1401 while (isdigit(c)) {
1402 nonzero = 1;
1403 c = tok_nextc(tok);
Tim Petersd507dab2001-08-30 20:51:59 +00001404 }
1405 if (c == '.')
1406 goto fraction;
1407 else if (c == 'e' || c == 'E')
1408 goto exponent;
1409#ifndef WITHOUT_COMPLEX
1410 else if (c == 'j' || c == 'J')
1411 goto imaginary;
1412#endif
Guido van Rossumcd16bf62007-06-13 18:07:49 +00001413 else if (nonzero) {
Tim Petersd507dab2001-08-30 20:51:59 +00001414 tok->done = E_TOKEN;
1415 tok_backup(tok, c);
1416 return ERRORTOKEN;
1417 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001418 }
1419 }
1420 else {
1421 /* Decimal */
1422 do {
1423 c = tok_nextc(tok);
1424 } while (isdigit(c));
Guido van Rossume2a383d2007-01-15 16:59:06 +00001425 {
Tim Peters9aa70d92001-08-27 19:19:28 +00001426 /* Accept floating point numbers. */
Guido van Rossumf023c461991-05-05 20:16:20 +00001427 if (c == '.') {
1428 fraction:
1429 /* Fraction */
1430 do {
1431 c = tok_nextc(tok);
1432 } while (isdigit(c));
1433 }
1434 if (c == 'e' || c == 'E') {
Tim Petersd507dab2001-08-30 20:51:59 +00001435 exponent:
Guido van Rossumf023c461991-05-05 20:16:20 +00001436 /* Exponent part */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001437 c = tok_nextc(tok);
Guido van Rossumf023c461991-05-05 20:16:20 +00001438 if (c == '+' || c == '-')
1439 c = tok_nextc(tok);
Tim Peters9aa70d92001-08-27 19:19:28 +00001440 if (!isdigit(c)) {
1441 tok->done = E_TOKEN;
1442 tok_backup(tok, c);
1443 return ERRORTOKEN;
Guido van Rossumf023c461991-05-05 20:16:20 +00001444 }
Tim Peters9aa70d92001-08-27 19:19:28 +00001445 do {
1446 c = tok_nextc(tok);
1447 } while (isdigit(c));
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001448 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001449#ifndef WITHOUT_COMPLEX
Guido van Rossumfaa436c1996-01-26 18:59:07 +00001450 if (c == 'j' || c == 'J')
Guido van Rossumf595fde1996-01-12 01:31:58 +00001451 /* Imaginary part */
1452 imaginary:
1453 c = tok_nextc(tok);
1454#endif
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001455 }
1456 }
1457 tok_backup(tok, c);
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001458 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001459 *p_end = tok->cur;
1460 return NUMBER;
1461 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001462
1463 letter_quote:
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001464 /* String */
1465 if (c == '\'' || c == '"') {
Guido van Rossumcf171a72007-11-16 00:51:45 +00001466 int quote = c;
1467 int quote_size = 1; /* 1 or 3 */
1468 int end_quote_size = 0;
1469
1470 /* Find the quote size and start of string */
1471 c = tok_nextc(tok);
1472 if (c == quote) {
1473 c = tok_nextc(tok);
1474 if (c == quote)
1475 quote_size = 3;
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001476 else
Guido van Rossumcf171a72007-11-16 00:51:45 +00001477 end_quote_size = 1; /* empty string found */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001478 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001479 if (c != quote)
1480 tok_backup(tok, c);
1481
1482 /* Get rest of string */
1483 while (end_quote_size != quote_size) {
1484 c = tok_nextc(tok);
1485 if (c == EOF) {
1486 if (quote_size == 3)
1487 tok->done = E_EOFS;
1488 else
1489 tok->done = E_EOLS;
1490 tok->cur = tok->inp;
1491 return ERRORTOKEN;
1492 }
1493 if (quote_size == 1 && c == '\n') {
1494 tok->done = E_EOLS;
1495 tok->cur = tok->inp;
1496 return ERRORTOKEN;
1497 }
1498 if (c == quote)
1499 end_quote_size += 1;
1500 else {
1501 end_quote_size = 0;
1502 if (c == '\\')
1503 c = tok_nextc(tok); /* skip escaped char */
1504 }
1505 }
1506
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001507 *p_start = tok->start;
Guido van Rossum8054fad1993-10-26 15:19:44 +00001508 *p_end = tok->cur;
1509 return STRING;
1510 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001511
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001512 /* Line continuation */
1513 if (c == '\\') {
1514 c = tok_nextc(tok);
1515 if (c != '\n') {
Martin v. Löwis4bf108d2005-03-03 11:45:45 +00001516 tok->done = E_LINECONT;
Guido van Rossum6ac258d1993-05-12 08:24:20 +00001517 tok->cur = tok->inp;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001518 return ERRORTOKEN;
1519 }
Martin v. Löwisf62a89b2002-09-03 11:52:44 +00001520 tok->cont_line = 1;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001521 goto again; /* Read next line */
1522 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001523
Guido van Rossumfbab9051991-10-20 20:25:03 +00001524 /* Check for two-character token */
1525 {
1526 int c2 = tok_nextc(tok);
Guido van Rossum86bea461997-04-29 21:03:06 +00001527 int token = PyToken_TwoChars(c, c2);
Guido van Rossumfbab9051991-10-20 20:25:03 +00001528 if (token != OP) {
Thomas Wouters434d0822000-08-24 20:11:32 +00001529 int c3 = tok_nextc(tok);
1530 int token3 = PyToken_ThreeChars(c, c2, c3);
1531 if (token3 != OP) {
1532 token = token3;
1533 } else {
1534 tok_backup(tok, c3);
1535 }
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001536 *p_start = tok->start;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001537 *p_end = tok->cur;
1538 return token;
1539 }
1540 tok_backup(tok, c2);
1541 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001542
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001543 /* Keep track of parentheses nesting level */
Guido van Rossuma849b831993-05-12 11:35:44 +00001544 switch (c) {
1545 case '(':
1546 case '[':
1547 case '{':
1548 tok->level++;
1549 break;
1550 case ')':
1551 case ']':
1552 case '}':
1553 tok->level--;
1554 break;
1555 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001556
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001557 /* Punctuation character */
Guido van Rossumf4b1a641994-08-29 12:43:07 +00001558 *p_start = tok->start;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001559 *p_end = tok->cur;
Guido van Rossum86bea461997-04-29 21:03:06 +00001560 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001561}
1562
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001563int
1564PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1565{
1566 int result = tok_get(tok, p_start, p_end);
1567 if (tok->decoding_erred) {
1568 result = ERRORTOKEN;
1569 tok->done = E_DECODE;
1570 }
1571 return result;
1572}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001573
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001574/* Get -*- encoding -*- from a Python file.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001575
1576 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossumcf171a72007-11-16 00:51:45 +00001577 the first or second line of the file (in which case the encoding
Brett Cannone4539892007-10-20 03:46:49 +00001578 should be assumed to be PyUnicode_GetDefaultEncoding()).
1579
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001580 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1581 by the caller.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001582*/
1583char *
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001584PyTokenizer_FindEncoding(int fd)
1585{
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001586 struct tok_state *tok;
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001587 FILE *fp;
1588 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001589
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001590 fd = dup(fd);
1591 if (fd < 0) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001592 return NULL;
1593 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001594 fp = fdopen(fd, "r");
1595 if (fp == NULL) {
1596 return NULL;
1597 }
1598 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1599 if (tok == NULL) {
1600 fclose(fp);
1601 return NULL;
1602 }
1603 while (tok->lineno < 2 && tok->done == E_OK) {
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001604 PyTokenizer_Get(tok, &p_start, &p_end);
1605 }
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001606 fclose(fp);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001607 if (tok->encoding) {
Brett Cannonc2954e52007-10-21 02:45:33 +00001608 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001609 strcpy(encoding, tok->encoding);
1610 }
1611 PyTokenizer_Free(tok);
Brett Cannond5ec98c2007-10-20 02:54:14 +00001612 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001613}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001614
#ifdef Py_DEBUG

/* Debugging aid: print the name of a token type to stdout, followed by
   the token text in parentheses for NAME, NUMBER, STRING and OP. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    switch (type) {
    case NAME:
    case NUMBER:
    case STRING:
    case OP:
        printf("(%.*s)", (int)(end - start), start);
        break;
    default:
        break;
    }
}

#endif