blob: c32a3bfd1c805207371e422a2f4a64a3af479f99 [file] [log] [blame]
Guido van Rossumf70e43a1991-02-19 12:39:46 +00001
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00002/* Tokenizer implementation */
3
Jack Jansen7b8c7542002-04-14 20:12:41 +00004#include "Python.h"
Guido van Rossum3f5da241990-12-20 15:06:42 +00005#include "pgenheaders.h"
6
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00007#include <ctype.h>
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00008#include <assert.h>
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00009
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000010#include "tokenizer.h"
11#include "errcode.h"
12
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000013#ifndef PGEN
14#include "unicodeobject.h"
Christian Heimes2c9c7a52008-05-26 13:42:13 +000015#include "bytesobject.h"
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +000016#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if byte C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128 (a non-ASCII byte).
   The argument is fully parenthesized so that operator expressions such
   as `c & 0xff` expand correctly.  C is still evaluated several times, so
   it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000026
/* True if byte C may appear inside an identifier: an ASCII letter or
   digit, an underscore, or any value >= 128 (a non-ASCII byte).
   The argument is fully parenthesized so that operator expressions such
   as `c & 0xff` expand correctly.  C is still evaluated several times, so
   it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis5b222132007-06-10 09:51:05 +000033
Serhiy Storchakac6792272013-10-19 21:03:34 +030034extern char *PyOS_Readline(FILE *, FILE *, const char *);
Guido van Rossumf4b1a641994-08-29 12:43:07 +000035/* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
38
Guido van Rossum4fe87291992-02-26 15:24:44 +000039/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000040#define TABSIZE 8
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000041
Guido van Rossum3f5da241990-12-20 15:06:42 +000042/* Forward */
Tim Petersdbd9ba62000-07-09 03:09:57 +000043static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum3f5da241990-12-20 15:06:42 +000046
Brett Cannond5ec98c2007-10-20 02:54:14 +000047
Guido van Rossum85a5fbb1990-10-14 12:07:46 +000048/* Token names */
49
Benjamin Petersond0845582012-10-24 08:21:52 -070050const char *_PyParser_TokenNames[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000051 "ENDMARKER",
52 "NAME",
53 "NUMBER",
54 "STRING",
55 "NEWLINE",
56 "INDENT",
57 "DEDENT",
58 "LPAR",
59 "RPAR",
60 "LSQB",
61 "RSQB",
62 "COLON",
63 "COMMA",
64 "SEMI",
65 "PLUS",
66 "MINUS",
67 "STAR",
68 "SLASH",
69 "VBAR",
70 "AMPER",
71 "LESS",
72 "GREATER",
73 "EQUAL",
74 "DOT",
75 "PERCENT",
76 "LBRACE",
77 "RBRACE",
78 "EQEQUAL",
79 "NOTEQUAL",
80 "LESSEQUAL",
81 "GREATEREQUAL",
82 "TILDE",
83 "CIRCUMFLEX",
84 "LEFTSHIFT",
85 "RIGHTSHIFT",
86 "DOUBLESTAR",
87 "PLUSEQUAL",
88 "MINEQUAL",
89 "STAREQUAL",
90 "SLASHEQUAL",
91 "PERCENTEQUAL",
92 "AMPEREQUAL",
93 "VBAREQUAL",
94 "CIRCUMFLEXEQUAL",
95 "LEFTSHIFTEQUAL",
96 "RIGHTSHIFTEQUAL",
97 "DOUBLESTAREQUAL",
98 "DOUBLESLASH",
99 "DOUBLESLASHEQUAL",
100 "AT",
101 "RARROW",
102 "ELLIPSIS",
103 /* This table must match the #defines in token.h! */
104 "OP",
105 "<ERRORTOKEN>",
106 "<N_TOKENS>"
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000107};
108
109
110/* Create and initialize a new tok_state structure */
111
112static struct tok_state *
Thomas Wouters23c9e002000-07-22 19:20:54 +0000113tok_new(void)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000114{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->input = NULL;
123 tok->tabsize = TABSIZE;
124 tok->indent = 0;
125 tok->indstack[0] = 0;
126 tok->atbol = 1;
127 tok->pendin = 0;
128 tok->prompt = tok->nextprompt = NULL;
129 tok->lineno = 0;
130 tok->level = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000141#ifndef PGEN
Victor Stinner7f2fee32011-04-05 00:39:01 +0200142 tok->filename = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 tok->decoding_readline = NULL;
144 tok->decoding_buffer = NULL;
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000145#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000147}
148
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000149static char *
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700150new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000151{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700153 if (!result) {
154 tok->done = E_NOMEM;
155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700157 memcpy(result, s, len);
158 result[len] = '\0';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 return result;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000160}
161
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000162#ifdef PGEN
163
164static char *
165decoding_fgets(char *s, int size, struct tok_state *tok)
166{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 return fgets(s, size, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000168}
169
170static int
171decoding_feof(struct tok_state *tok)
172{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 return feof(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000174}
175
/* PGEN build: no decoding is done; return a newly allocated copy of STR
   (NULL on allocation failure).  EXEC_INPUT is ignored. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);
    return new_string(str, length, tok);
}
181
182#else /* PGEN */
183
184static char *
185error_ret(struct tok_state *tok) /* XXX */
186{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 tok->decoding_erred = 1;
188 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
189 PyMem_FREE(tok->buf);
190 tok->buf = NULL;
191 return NULL; /* as if it were EOF */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000192}
193
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000194
/* Normalize the spelling of the UTF-8 and Latin-1 encoding names.

   Up to the first 12 bytes of S are lower-cased, with '_' replaced by
   '-', and compared against the known aliases:
     - "utf-8" or anything starting with "utf-8-"     -> "utf-8"
     - "latin-1", "iso-8859-1", "iso-latin-1", or anything starting
       with one of those followed by '-'              -> "iso-8859-1"
   Any other name is returned unchanged, i.e. S itself is returned.

   When a canonical name is returned it is a string literal: the caller
   must not modify or free it, and can detect this case by comparing the
   result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
223
224/* Return the coding spec in S, or NULL if none is found. */
225
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700226static int
227get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 Py_ssize_t i;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700230 *spec = NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 /* Coding spec must be in a comment, and that comment must be
232 * the only statement on the source code line. */
233 for (i = 0; i < size - 6; i++) {
234 if (s[i] == '#')
235 break;
236 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700237 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
239 for (; i < size - 6; i++) { /* XXX inefficient search */
240 const char* t = s + i;
241 if (strncmp(t, "coding", 6) == 0) {
242 const char* begin = NULL;
243 t += 6;
244 if (t[0] != ':' && t[0] != '=')
245 continue;
246 do {
247 t++;
248 } while (t[0] == '\x20' || t[0] == '\t');
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 begin = t;
251 while (Py_ISALNUM(t[0]) ||
252 t[0] == '-' || t[0] == '_' || t[0] == '.')
253 t++;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000254
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 if (begin < t) {
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700256 char* r = new_string(begin, t - begin, tok);
Benjamin Peterson265fba42013-07-15 20:50:22 -0700257 char* q;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700258 if (!r)
259 return 0;
Benjamin Peterson265fba42013-07-15 20:50:22 -0700260 q = get_normal_name(r);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 if (r != q) {
262 PyMem_FREE(r);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700263 r = new_string(q, strlen(q), tok);
264 if (!r)
265 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700267 *spec = r;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000268 }
269 }
270 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700271 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000272}
273
274/* Check whether the line contains a coding spec. If it does,
275 invoke the set_readline function for the new encoding.
276 This function receives the tok_state and the new encoding.
277 Return 1 on success, 0 on failure. */
278
279static int
Martin v. Löwis18e16552006-02-15 17:27:45 +0000280check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 int set_readline(struct tok_state *, const char *))
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000282{
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700283 char *cs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 int r = 1;
Tim Peters17db21f2002-09-03 15:39:58 +0000285
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200286 if (tok->cont_line) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 /* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200288 tok->read_coding_spec = 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200290 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700291 if (!get_coding_spec(line, &cs, size, tok))
292 return 0;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200293 if (!cs) {
294 Py_ssize_t i;
295 for (i = 0; i < size; i++) {
296 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
297 break;
298 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
299 /* Stop checking coding spec after a line containing
300 * anything except a comment. */
301 tok->read_coding_spec = 1;
302 break;
303 }
304 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700305 return 1;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200306 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700307 tok->read_coding_spec = 1;
308 if (tok->encoding == NULL) {
309 assert(tok->decoding_state == STATE_RAW);
310 if (strcmp(cs, "utf-8") == 0) {
311 tok->encoding = cs;
312 } else {
313 r = set_readline(tok, cs);
314 if (r) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000315 tok->encoding = cs;
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700316 tok->decoding_state = STATE_NORMAL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700318 else {
Serhiy Storchaka3af14aa2013-06-09 16:51:52 +0300319 PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700320 "encoding problem: %s", cs);
321 PyMem_FREE(cs);
322 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 }
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700324 } else { /* then, compare cs with BOM */
325 r = (strcmp(tok->encoding, cs) == 0);
326 if (!r)
327 PyErr_Format(PyExc_SyntaxError,
328 "encoding problem: %s with BOM", cs);
329 PyMem_FREE(cs);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 return r;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000332}
333
334/* See whether the file starts with a BOM. If it does,
335 invoke the set_readline function with the new encoding.
336 Return 1 on success, 0 on failure. */
337
338static int
339check_bom(int get_char(struct tok_state *),
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 void unget_char(int, struct tok_state *),
341 int set_readline(struct tok_state *, const char *),
342 struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000343{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 int ch1, ch2, ch3;
345 ch1 = get_char(tok);
346 tok->decoding_state = STATE_RAW;
347 if (ch1 == EOF) {
348 return 1;
349 } else if (ch1 == 0xEF) {
350 ch2 = get_char(tok);
351 if (ch2 != 0xBB) {
352 unget_char(ch2, tok);
353 unget_char(ch1, tok);
354 return 1;
355 }
356 ch3 = get_char(tok);
357 if (ch3 != 0xBF) {
358 unget_char(ch3, tok);
359 unget_char(ch2, tok);
360 unget_char(ch1, tok);
361 return 1;
362 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000363#if 0
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 /* Disable support for UTF-16 BOMs until a decision
365 is made whether this needs to be supported. */
366 } else if (ch1 == 0xFE) {
367 ch2 = get_char(tok);
368 if (ch2 != 0xFF) {
369 unget_char(ch2, tok);
370 unget_char(ch1, tok);
371 return 1;
372 }
373 if (!set_readline(tok, "utf-16-be"))
374 return 0;
375 tok->decoding_state = STATE_NORMAL;
376 } else if (ch1 == 0xFF) {
377 ch2 = get_char(tok);
378 if (ch2 != 0xFE) {
379 unget_char(ch2, tok);
380 unget_char(ch1, tok);
381 return 1;
382 }
383 if (!set_readline(tok, "utf-16-le"))
384 return 0;
385 tok->decoding_state = STATE_NORMAL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000386#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 } else {
388 unget_char(ch1, tok);
389 return 1;
390 }
391 if (tok->encoding != NULL)
392 PyMem_FREE(tok->encoding);
Benjamin Peterson2dbfd882013-07-15 19:15:34 -0700393 tok->encoding = new_string("utf-8", 5, tok);
394 if (!tok->encoding)
395 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000396 /* No need to set_readline: input is already utf-8 */
397 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000398}
399
400/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000401 Return NULL on failure, else S.
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000403 On entry, tok->decoding_buffer will be one of:
404 1) NULL: need to call tok->decoding_readline to get a new line
405 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 stored the result in tok->decoding_buffer
Christian Heimes9c4756e2008-05-26 13:22:05 +0000407 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 (in the s buffer) to copy entire contents of the line read
409 by tok->decoding_readline. tok->decoding_buffer has the overflow.
410 In this case, fp_readl is called in a loop (with an expanded buffer)
411 until the buffer ends with a '\n' (or until the end of the file is
412 reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000413*/
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000414
415static char *
416fp_readl(char *s, int size, struct tok_state *tok)
417{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000418 PyObject* bufobj;
419 const char *buf;
420 Py_ssize_t buflen;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 /* Ask for one less byte so we can terminate it */
423 assert(size > 0);
424 size--;
Walter Dörwaldc1f5fff2005-07-12 21:53:43 +0000425
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 if (tok->decoding_buffer) {
427 bufobj = tok->decoding_buffer;
428 Py_INCREF(bufobj);
429 }
430 else
431 {
432 bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
433 if (bufobj == NULL)
434 goto error;
435 }
436 if (PyUnicode_CheckExact(bufobj))
437 {
438 buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
439 if (buf == NULL) {
440 goto error;
441 }
442 }
443 else
444 {
445 buf = PyByteArray_AsString(bufobj);
446 if (buf == NULL) {
447 goto error;
448 }
449 buflen = PyByteArray_GET_SIZE(bufobj);
450 }
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000451
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000452 Py_XDECREF(tok->decoding_buffer);
453 if (buflen > size) {
454 /* Too many chars, the rest goes into tok->decoding_buffer */
455 tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
456 buflen-size);
457 if (tok->decoding_buffer == NULL)
458 goto error;
459 buflen = size;
460 }
461 else
462 tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc65f9ace2007-11-15 23:19:43 +0000463
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 memcpy(s, buf, buflen);
465 s[buflen] = '\0';
466 if (buflen == 0) /* EOF */
467 s = NULL;
468 Py_DECREF(bufobj);
469 return s;
Neal Norwitz41eaedd2007-08-12 00:03:22 +0000470
471error:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 Py_XDECREF(bufobj);
473 return error_ret(tok);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000474}
475
476/* Set the readline function for TOK to a StreamReader's
477 readline function. The StreamReader is named ENC.
478
479 This function is called from check_bom and check_coding_spec.
480
481 ENC is usually identical to the future value of tok->encoding,
482 except for the (currently unsupported) case of UTF-16.
483
484 Return 1 on success, 0 on failure. */
485
486static int
487fp_setreadl(struct tok_state *tok, const char* enc)
488{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000489 PyObject *readline = NULL, *stream = NULL, *io = NULL;
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200490 _Py_IDENTIFIER(open);
491 _Py_IDENTIFIER(readline);
Victor Stinner22a351a2010-10-14 12:04:34 +0000492 int fd;
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200493 long pos;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000494
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495 io = PyImport_ImportModuleNoBlock("io");
496 if (io == NULL)
497 goto cleanup;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000498
Victor Stinner22a351a2010-10-14 12:04:34 +0000499 fd = fileno(tok->fp);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200500 /* Due to buffering the file offset for fd can be different from the file
501 * position of tok->fp. */
502 pos = ftell(tok->fp);
503 if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
Victor Stinner22a351a2010-10-14 12:04:34 +0000504 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
505 goto cleanup;
506 }
507
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200508 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner22a351a2010-10-14 12:04:34 +0000509 fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510 if (stream == NULL)
511 goto cleanup;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000512
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000513 Py_XDECREF(tok->decoding_readline);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200514 readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000515 tok->decoding_readline = readline;
Guido van Rossum9cbfffd2007-06-07 00:54:15 +0000516
517 cleanup:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000518 Py_XDECREF(stream);
519 Py_XDECREF(io);
520 return readline != NULL;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000521}
522
523/* Fetch the next byte from TOK. */
524
525static int fp_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000526 return getc(tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000527}
528
529/* Unfetch the last byte back into TOK. */
530
531static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000532 ungetc(c, tok->fp);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000533}
534
/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  S must point into a NUL-terminated buffer.

   Return the number of bytes forming the sequence (1 to 4) if yes,
   0 if not.  Only the lead byte range and the 10xxxxxx form of the
   continuation bytes are checked; overlong encodings and surrogates are
   not rejected here.

   Continuation bytes are examined front to back and the scan stops at
   the first invalid one.  A NUL terminator is itself not a valid
   continuation byte, so a sequence truncated by the end of the string is
   rejected without reading past the terminator. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
562
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000563/* Read a line of input from TOK. Determine encoding
564 if necessary. */
565
566static char *
567decoding_fgets(char *s, int size, struct tok_state *tok)
568{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000569 char *line = NULL;
570 int badchar = 0;
571 for (;;) {
572 if (tok->decoding_state == STATE_NORMAL) {
573 /* We already have a codec associated with
574 this input. */
575 line = fp_readl(s, size, tok);
576 break;
577 } else if (tok->decoding_state == STATE_RAW) {
578 /* We want a 'raw' read. */
579 line = Py_UniversalNewlineFgets(s, size,
580 tok->fp, NULL);
581 break;
582 } else {
583 /* We have not yet determined the encoding.
584 If an encoding is found, use the file-pointer
585 reader functions from now on. */
586 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
587 return error_ret(tok);
588 assert(tok->decoding_state != STATE_INIT);
589 }
590 }
591 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
592 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
593 return error_ret(tok);
594 }
595 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000596#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000597 /* The default encoding is UTF-8, so make sure we don't have any
598 non-UTF-8 sequences in it. */
599 if (line && !tok->encoding) {
600 unsigned char *c;
601 int length;
602 for (c = (unsigned char *)line; *c; c += length)
603 if (!(length = valid_utf8(c))) {
604 badchar = *c;
605 break;
606 }
607 }
608 if (badchar) {
609 /* Need to add 1 to the line number, since this line
610 has not been counted, yet. */
Jesus Ceac1935d22011-04-25 04:03:58 +0200611 PyErr_Format(PyExc_SyntaxError,
612 "Non-UTF-8 code starting with '\\x%.2x' "
613 "in file %U on line %i, "
614 "but no encoding declared; "
615 "see http://python.org/dev/peps/pep-0263/ for details",
616 badchar, tok->filename, tok->lineno + 1);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000617 return error_ret(tok);
618 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000619#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000620 return line;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000621}
622
623static int
624decoding_feof(struct tok_state *tok)
625{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 if (tok->decoding_state != STATE_NORMAL) {
627 return feof(tok->fp);
628 } else {
629 PyObject* buf = tok->decoding_buffer;
630 if (buf == NULL) {
631 buf = PyObject_CallObject(tok->decoding_readline, NULL);
632 if (buf == NULL) {
633 error_ret(tok);
634 return 1;
635 } else {
636 tok->decoding_buffer = buf;
637 }
638 }
639 return PyObject_Length(buf) == 0;
640 }
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000641}
642
643/* Fetch a byte from TOK, using the string buffer. */
644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000645static int
646buf_getc(struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 return Py_CHARMASK(*tok->str++);
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000648}
649
650/* Unfetch a byte from TOK, using the string buffer. */
651
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000652static void
653buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 tok->str--;
655 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000656}
657
658/* Set the readline function for TOK to ENC. For the string-based
659 tokenizer, this means to just record the encoding. */
660
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000661static int
662buf_setreadl(struct tok_state *tok, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663 tok->enc = enc;
664 return 1;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000665}
666
667/* Return a UTF-8 encoding Python string object from the
668 C byte string STR, which is encoded with ENC. */
669
670static PyObject *
671translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 PyObject *utf8;
673 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
674 if (buf == NULL)
675 return NULL;
676 utf8 = PyUnicode_AsUTF8String(buf);
677 Py_DECREF(buf);
678 return utf8;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000679}
680
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000681
682static char *
683translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
Victor Stinner79697732013-06-05 00:44:00 +0200684 int skip_next_lf = 0;
685 size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000686 char *buf, *current;
687 char c = '\0';
688 buf = PyMem_MALLOC(needed_length);
689 if (buf == NULL) {
690 tok->done = E_NOMEM;
691 return NULL;
692 }
693 for (current = buf; *s; s++, current++) {
694 c = *s;
695 if (skip_next_lf) {
696 skip_next_lf = 0;
697 if (c == '\n') {
698 c = *++s;
699 if (!c)
700 break;
701 }
702 }
703 if (c == '\r') {
704 skip_next_lf = 1;
705 c = '\n';
706 }
707 *current = c;
708 }
709 /* If this is exec input, add a newline to the end of the string if
710 there isn't one already. */
711 if (exec_input && c != '\n') {
712 *current = '\n';
713 current++;
714 }
715 *current = '\0';
716 final_length = current - buf + 1;
717 if (final_length < needed_length && final_length)
718 /* should never fail */
719 buf = PyMem_REALLOC(buf, final_length);
720 return buf;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000721}
722
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000723/* Decode a byte string STR for use as the buffer of TOK.
724 Look for encoding declarations inside STR, and record them
725 inside TOK. */
726
727static const char *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000728decode_str(const char *input, int single, struct tok_state *tok)
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000729{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000730 PyObject* utf8 = NULL;
731 const char *str;
732 const char *s;
733 const char *newl[2] = {NULL, NULL};
734 int lineno = 0;
735 tok->input = str = translate_newlines(input, single, tok);
736 if (str == NULL)
737 return NULL;
738 tok->enc = NULL;
739 tok->str = str;
740 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
741 return error_ret(tok);
742 str = tok->str; /* string after BOM if any */
743 assert(str);
744 if (tok->enc != NULL) {
745 utf8 = translate_into_utf8(str, tok->enc);
746 if (utf8 == NULL)
747 return error_ret(tok);
748 str = PyBytes_AsString(utf8);
749 }
750 for (s = str;; s++) {
751 if (*s == '\0') break;
752 else if (*s == '\n') {
753 assert(lineno < 2);
754 newl[lineno] = s;
755 lineno++;
756 if (lineno == 2) break;
757 }
758 }
759 tok->enc = NULL;
760 /* need to check line 1 and 2 separately since check_coding_spec
761 assumes a single line as input */
762 if (newl[0]) {
763 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
764 return error_ret(tok);
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200765 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
767 tok, buf_setreadl))
768 return error_ret(tok);
769 }
770 }
771 if (tok->enc != NULL) {
772 assert(utf8 == NULL);
773 utf8 = translate_into_utf8(str, tok->enc);
774 if (utf8 == NULL)
775 return error_ret(tok);
776 str = PyBytes_AS_STRING(utf8);
777 }
778 assert(tok->decoding_buffer == NULL);
779 tok->decoding_buffer = utf8; /* CAUTION */
780 return str;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +0000781}
782
783#endif /* PGEN */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000784
785/* Set up tokenizer for string */
786
787struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000788PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000789{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 struct tok_state *tok = tok_new();
791 if (tok == NULL)
792 return NULL;
Serhiy Storchakac6792272013-10-19 21:03:34 +0300793 str = decode_str(str, exec_input, tok);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 if (str == NULL) {
795 PyTokenizer_Free(tok);
796 return NULL;
797 }
Neal Norwitzdee2fd52005-11-16 05:12:59 +0000798
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 /* XXX: constify members. */
800 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
801 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000802}
803
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000804struct tok_state *
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000805PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000806{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 struct tok_state *tok = tok_new();
808 if (tok == NULL)
809 return NULL;
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000810#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Petersonaeaa5922009-11-13 00:17:59 +0000812#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 if (str == NULL) {
814 PyTokenizer_Free(tok);
815 return NULL;
816 }
817 tok->decoding_state = STATE_RAW;
818 tok->read_coding_spec = 1;
819 tok->enc = NULL;
820 tok->str = str;
821 tok->encoding = (char *)PyMem_MALLOC(6);
822 if (!tok->encoding) {
823 PyTokenizer_Free(tok);
824 return NULL;
825 }
826 strcpy(tok->encoding, "utf-8");
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000827
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000828 /* XXX: constify members. */
829 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
830 return tok;
Benjamin Petersonf5b52242009-03-02 23:31:26 +0000831}
832
Guido van Rossum8c11a5c1991-07-27 21:42:56 +0000833/* Set up tokenizer for file */
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000834
835struct tok_state *
Serhiy Storchakac6792272013-10-19 21:03:34 +0300836PyTokenizer_FromFile(FILE *fp, const char* enc,
837 const char *ps1, const char *ps2)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000838{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 struct tok_state *tok = tok_new();
840 if (tok == NULL)
841 return NULL;
842 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
843 PyTokenizer_Free(tok);
844 return NULL;
845 }
846 tok->cur = tok->inp = tok->buf;
847 tok->end = tok->buf + BUFSIZ;
848 tok->fp = fp;
849 tok->prompt = ps1;
850 tok->nextprompt = ps2;
851 if (enc != NULL) {
852 /* Must copy encoding declaration since it
853 gets copied into the parse tree. */
854 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
855 if (!tok->encoding) {
856 PyTokenizer_Free(tok);
857 return NULL;
858 }
859 strcpy(tok->encoding, enc);
860 tok->decoding_state = STATE_NORMAL;
861 }
862 return tok;
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000863}
864
865
866/* Free a tok_state structure */
867
868void
Thomas Wouters23c9e002000-07-22 19:20:54 +0000869PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000870{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000871 if (tok->encoding != NULL)
872 PyMem_FREE(tok->encoding);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000873#ifndef PGEN
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000874 Py_XDECREF(tok->decoding_readline);
875 Py_XDECREF(tok->decoding_buffer);
Victor Stinner7f2fee32011-04-05 00:39:01 +0200876 Py_XDECREF(tok->filename);
Martin v. Löwis1ee99d32002-08-04 20:10:29 +0000877#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000878 if (tok->fp != NULL && tok->buf != NULL)
879 PyMem_FREE(tok->buf);
880 if (tok->input)
881 PyMem_FREE((char *)tok->input);
882 PyMem_FREE(tok);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000883}
884
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000885/* Get next char, updating state; error code goes into tok->done */
886
887static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200888tok_nextc(struct tok_state *tok)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +0000889{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000890 for (;;) {
891 if (tok->cur != tok->inp) {
892 return Py_CHARMASK(*tok->cur++); /* Fast path */
893 }
894 if (tok->done != E_OK)
895 return EOF;
896 if (tok->fp == NULL) {
897 char *end = strchr(tok->inp, '\n');
898 if (end != NULL)
899 end++;
900 else {
901 end = strchr(tok->inp, '\0');
902 if (end == tok->inp) {
903 tok->done = E_EOF;
904 return EOF;
905 }
906 }
907 if (tok->start == NULL)
908 tok->buf = tok->cur;
909 tok->line_start = tok->cur;
910 tok->lineno++;
911 tok->inp = end;
912 return Py_CHARMASK(*tok->cur++);
913 }
914 if (tok->prompt != NULL) {
915 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner034c7532011-01-07 18:56:19 +0000916#ifndef PGEN
Victor Stinner89e34362011-01-07 18:47:22 +0000917 if (newtok != NULL) {
918 char *translated = translate_newlines(newtok, 0, tok);
919 PyMem_FREE(newtok);
920 if (translated == NULL)
921 return EOF;
922 newtok = translated;
923 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000924 if (tok->encoding && newtok && *newtok) {
925 /* Recode to UTF-8 */
926 Py_ssize_t buflen;
927 const char* buf;
928 PyObject *u = translate_into_utf8(newtok, tok->encoding);
929 PyMem_FREE(newtok);
930 if (!u) {
931 tok->done = E_DECODE;
932 return EOF;
933 }
934 buflen = PyBytes_GET_SIZE(u);
935 buf = PyBytes_AS_STRING(u);
936 if (!buf) {
937 Py_DECREF(u);
938 tok->done = E_DECODE;
939 return EOF;
940 }
941 newtok = PyMem_MALLOC(buflen+1);
942 strcpy(newtok, buf);
943 Py_DECREF(u);
944 }
Martin v. Löwis85bcc662007-09-04 09:18:06 +0000945#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000946 if (tok->nextprompt != NULL)
947 tok->prompt = tok->nextprompt;
948 if (newtok == NULL)
949 tok->done = E_INTR;
950 else if (*newtok == '\0') {
951 PyMem_FREE(newtok);
952 tok->done = E_EOF;
953 }
954 else if (tok->start != NULL) {
955 size_t start = tok->start - tok->buf;
956 size_t oldlen = tok->cur - tok->buf;
957 size_t newlen = oldlen + strlen(newtok);
958 char *buf = tok->buf;
959 buf = (char *)PyMem_REALLOC(buf, newlen+1);
960 tok->lineno++;
961 if (buf == NULL) {
962 PyMem_FREE(tok->buf);
963 tok->buf = NULL;
964 PyMem_FREE(newtok);
965 tok->done = E_NOMEM;
966 return EOF;
967 }
968 tok->buf = buf;
969 tok->cur = tok->buf + oldlen;
970 tok->line_start = tok->cur;
971 strcpy(tok->buf + oldlen, newtok);
972 PyMem_FREE(newtok);
973 tok->inp = tok->buf + newlen;
974 tok->end = tok->inp + 1;
975 tok->start = tok->buf + start;
976 }
977 else {
978 tok->lineno++;
979 if (tok->buf != NULL)
980 PyMem_FREE(tok->buf);
981 tok->buf = newtok;
982 tok->line_start = tok->buf;
983 tok->cur = tok->buf;
984 tok->line_start = tok->buf;
985 tok->inp = strchr(tok->buf, '\0');
986 tok->end = tok->inp + 1;
987 }
988 }
989 else {
990 int done = 0;
991 Py_ssize_t cur = 0;
992 char *pt;
993 if (tok->start == NULL) {
994 if (tok->buf == NULL) {
995 tok->buf = (char *)
996 PyMem_MALLOC(BUFSIZ);
997 if (tok->buf == NULL) {
998 tok->done = E_NOMEM;
999 return EOF;
1000 }
1001 tok->end = tok->buf + BUFSIZ;
1002 }
1003 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1004 tok) == NULL) {
1005 tok->done = E_EOF;
1006 done = 1;
1007 }
1008 else {
1009 tok->done = E_OK;
1010 tok->inp = strchr(tok->buf, '\0');
1011 done = tok->inp[-1] == '\n';
1012 }
1013 }
1014 else {
1015 cur = tok->cur - tok->buf;
1016 if (decoding_feof(tok)) {
1017 tok->done = E_EOF;
1018 done = 1;
1019 }
1020 else
1021 tok->done = E_OK;
1022 }
1023 tok->lineno++;
1024 /* Read until '\n' or EOF */
1025 while (!done) {
1026 Py_ssize_t curstart = tok->start == NULL ? -1 :
1027 tok->start - tok->buf;
1028 Py_ssize_t curvalid = tok->inp - tok->buf;
1029 Py_ssize_t newsize = curvalid + BUFSIZ;
1030 char *newbuf = tok->buf;
1031 newbuf = (char *)PyMem_REALLOC(newbuf,
1032 newsize);
1033 if (newbuf == NULL) {
1034 tok->done = E_NOMEM;
1035 tok->cur = tok->inp;
1036 return EOF;
1037 }
1038 tok->buf = newbuf;
1039 tok->inp = tok->buf + curvalid;
1040 tok->end = tok->buf + newsize;
1041 tok->start = curstart < 0 ? NULL :
1042 tok->buf + curstart;
1043 if (decoding_fgets(tok->inp,
1044 (int)(tok->end - tok->inp),
1045 tok) == NULL) {
1046 /* Break out early on decoding
1047 errors, as tok->buf will be NULL
1048 */
1049 if (tok->decoding_erred)
1050 return EOF;
1051 /* Last line does not end in \n,
1052 fake one */
1053 strcpy(tok->inp, "\n");
1054 }
1055 tok->inp = strchr(tok->inp, '\0');
1056 done = tok->inp[-1] == '\n';
1057 }
1058 if (tok->buf != NULL) {
1059 tok->cur = tok->buf + cur;
1060 tok->line_start = tok->cur;
1061 /* replace "\r\n" with "\n" */
1062 /* For Mac leave the \r, giving a syntax error */
1063 pt = tok->inp - 2;
1064 if (pt >= tok->buf && *pt == '\r') {
1065 *pt++ = '\n';
1066 *pt = '\0';
1067 tok->inp = pt;
1068 }
1069 }
1070 }
1071 if (tok->done != E_OK) {
1072 if (tok->prompt != NULL)
1073 PySys_WriteStderr("\n");
1074 tok->cur = tok->inp;
1075 return EOF;
1076 }
1077 }
1078 /*NOTREACHED*/
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001079}
1080
1081
1082/* Back-up one character */
1083
1084static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001085tok_backup(struct tok_state *tok, int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001086{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001087 if (c != EOF) {
1088 if (--tok->cur < tok->buf)
1089 Py_FatalError("tok_backup: beginning of buffer");
1090 if (*tok->cur != c)
1091 *tok->cur = c;
1092 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001093}
1094
1095
1096/* Return the token corresponding to a single character */
1097
1098int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001099PyToken_OneChar(int c)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001100{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 switch (c) {
1102 case '(': return LPAR;
1103 case ')': return RPAR;
1104 case '[': return LSQB;
1105 case ']': return RSQB;
1106 case ':': return COLON;
1107 case ',': return COMMA;
1108 case ';': return SEMI;
1109 case '+': return PLUS;
1110 case '-': return MINUS;
1111 case '*': return STAR;
1112 case '/': return SLASH;
1113 case '|': return VBAR;
1114 case '&': return AMPER;
1115 case '<': return LESS;
1116 case '>': return GREATER;
1117 case '=': return EQUAL;
1118 case '.': return DOT;
1119 case '%': return PERCENT;
1120 case '{': return LBRACE;
1121 case '}': return RBRACE;
1122 case '^': return CIRCUMFLEX;
1123 case '~': return TILDE;
1124 case '@': return AT;
1125 default: return OP;
1126 }
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001127}
1128
1129
Guido van Rossumfbab9051991-10-20 20:25:03 +00001130int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001131PyToken_TwoChars(int c1, int c2)
Guido van Rossumfbab9051991-10-20 20:25:03 +00001132{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001133 switch (c1) {
1134 case '=':
1135 switch (c2) {
1136 case '=': return EQEQUAL;
1137 }
1138 break;
1139 case '!':
1140 switch (c2) {
1141 case '=': return NOTEQUAL;
1142 }
1143 break;
1144 case '<':
1145 switch (c2) {
1146 case '>': return NOTEQUAL;
1147 case '=': return LESSEQUAL;
1148 case '<': return LEFTSHIFT;
1149 }
1150 break;
1151 case '>':
1152 switch (c2) {
1153 case '=': return GREATEREQUAL;
1154 case '>': return RIGHTSHIFT;
1155 }
1156 break;
1157 case '+':
1158 switch (c2) {
1159 case '=': return PLUSEQUAL;
1160 }
1161 break;
1162 case '-':
1163 switch (c2) {
1164 case '=': return MINEQUAL;
1165 case '>': return RARROW;
1166 }
1167 break;
1168 case '*':
1169 switch (c2) {
1170 case '*': return DOUBLESTAR;
1171 case '=': return STAREQUAL;
1172 }
1173 break;
1174 case '/':
1175 switch (c2) {
1176 case '/': return DOUBLESLASH;
1177 case '=': return SLASHEQUAL;
1178 }
1179 break;
1180 case '|':
1181 switch (c2) {
1182 case '=': return VBAREQUAL;
1183 }
1184 break;
1185 case '%':
1186 switch (c2) {
1187 case '=': return PERCENTEQUAL;
1188 }
1189 break;
1190 case '&':
1191 switch (c2) {
1192 case '=': return AMPEREQUAL;
1193 }
1194 break;
1195 case '^':
1196 switch (c2) {
1197 case '=': return CIRCUMFLEXEQUAL;
1198 }
1199 break;
1200 }
1201 return OP;
Guido van Rossumfbab9051991-10-20 20:25:03 +00001202}
1203
Thomas Wouters434d0822000-08-24 20:11:32 +00001204int
1205PyToken_ThreeChars(int c1, int c2, int c3)
1206{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 switch (c1) {
1208 case '<':
1209 switch (c2) {
1210 case '<':
1211 switch (c3) {
1212 case '=':
1213 return LEFTSHIFTEQUAL;
1214 }
1215 break;
1216 }
1217 break;
1218 case '>':
1219 switch (c2) {
1220 case '>':
1221 switch (c3) {
1222 case '=':
1223 return RIGHTSHIFTEQUAL;
1224 }
1225 break;
1226 }
1227 break;
1228 case '*':
1229 switch (c2) {
1230 case '*':
1231 switch (c3) {
1232 case '=':
1233 return DOUBLESTAREQUAL;
1234 }
1235 break;
1236 }
1237 break;
1238 case '/':
1239 switch (c2) {
1240 case '/':
1241 switch (c3) {
1242 case '=':
1243 return DOUBLESLASHEQUAL;
1244 }
1245 break;
1246 }
1247 break;
1248 case '.':
1249 switch (c2) {
Georg Brandldde00282007-03-18 19:01:53 +00001250 case '.':
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 switch (c3) {
1252 case '.':
1253 return ELLIPSIS;
1254 }
1255 break;
1256 }
1257 break;
1258 }
1259 return OP;
Thomas Wouters434d0822000-08-24 20:11:32 +00001260}
Guido van Rossumfbab9051991-10-20 20:25:03 +00001261
Guido van Rossum926f13a1998-04-09 21:38:06 +00001262static int
Thomas Wouters23c9e002000-07-22 19:20:54 +00001263indenterror(struct tok_state *tok)
Guido van Rossum926f13a1998-04-09 21:38:06 +00001264{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 if (tok->alterror) {
1266 tok->done = E_TABSPACE;
1267 tok->cur = tok->inp;
1268 return 1;
1269 }
1270 if (tok->altwarning) {
Victor Stinner7f2fee32011-04-05 00:39:01 +02001271#ifdef PGEN
1272 PySys_WriteStderr("inconsistent use of tabs and spaces "
1273 "in indentation\n");
1274#else
1275 PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001276 "in indentation\n", tok->filename);
Victor Stinner7f2fee32011-04-05 00:39:01 +02001277#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278 tok->altwarning = 0;
1279 }
1280 return 0;
Guido van Rossum926f13a1998-04-09 21:38:06 +00001281}
1282
Martin v. Löwis47383402007-08-15 07:32:56 +00001283#ifdef PGEN
Victor Stinner52f6dd72010-03-12 14:45:56 +00001284#define verify_identifier(tok) 1
Martin v. Löwis47383402007-08-15 07:32:56 +00001285#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286/* Verify that the identifier follows PEP 3131.
1287 All identifier strings are guaranteed to be "ready" unicode objects.
1288 */
Martin v. Löwis47383402007-08-15 07:32:56 +00001289static int
Victor Stinner52f6dd72010-03-12 14:45:56 +00001290verify_identifier(struct tok_state *tok)
Martin v. Löwis47383402007-08-15 07:32:56 +00001291{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 PyObject *s;
1293 int result;
1294 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295 if (s == NULL || PyUnicode_READY(s) == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1297 PyErr_Clear();
1298 tok->done = E_IDENTIFIER;
1299 } else {
1300 tok->done = E_ERROR;
1301 }
1302 return 0;
1303 }
1304 result = PyUnicode_IsIdentifier(s);
1305 Py_DECREF(s);
1306 if (result == 0)
1307 tok->done = E_IDENTIFIER;
1308 return result;
Martin v. Löwis47383402007-08-15 07:32:56 +00001309}
1310#endif
Guido van Rossum926f13a1998-04-09 21:38:06 +00001311
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001312/* Get next token, after space stripping etc. */
1313
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001314static int
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001315tok_get(struct tok_state *tok, char **p_start, char **p_end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001316{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001317 int c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 int blankline, nonascii;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001319
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 *p_start = *p_end = NULL;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001321 nextline:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 tok->start = NULL;
1323 blankline = 0;
Guido van Rossum8c11a5c1991-07-27 21:42:56 +00001324
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 /* Get indentation level */
1326 if (tok->atbol) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001327 int col = 0;
1328 int altcol = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 tok->atbol = 0;
1330 for (;;) {
1331 c = tok_nextc(tok);
1332 if (c == ' ')
1333 col++, altcol++;
1334 else if (c == '\t') {
1335 col = (col/tok->tabsize + 1) * tok->tabsize;
1336 altcol = (altcol/tok->alttabsize + 1)
1337 * tok->alttabsize;
1338 }
1339 else if (c == '\014') /* Control-L (formfeed) */
1340 col = altcol = 0; /* For Emacs users */
1341 else
1342 break;
1343 }
1344 tok_backup(tok, c);
1345 if (c == '#' || c == '\n') {
1346 /* Lines with only whitespace and/or comments
1347 shouldn't affect the indentation and are
1348 not passed to the parser as NEWLINE tokens,
1349 except *totally* empty lines in interactive
1350 mode, which signal the end of a command group. */
1351 if (col == 0 && c == '\n' && tok->prompt != NULL)
1352 blankline = 0; /* Let it through */
1353 else
1354 blankline = 1; /* Ignore completely */
1355 /* We can't jump back right here since we still
1356 may need to skip to the end of a comment */
1357 }
1358 if (!blankline && tok->level == 0) {
1359 if (col == tok->indstack[tok->indent]) {
1360 /* No change */
1361 if (altcol != tok->altindstack[tok->indent]) {
1362 if (indenterror(tok))
1363 return ERRORTOKEN;
1364 }
1365 }
1366 else if (col > tok->indstack[tok->indent]) {
1367 /* Indent -- always one */
1368 if (tok->indent+1 >= MAXINDENT) {
1369 tok->done = E_TOODEEP;
1370 tok->cur = tok->inp;
1371 return ERRORTOKEN;
1372 }
1373 if (altcol <= tok->altindstack[tok->indent]) {
1374 if (indenterror(tok))
1375 return ERRORTOKEN;
1376 }
1377 tok->pendin++;
1378 tok->indstack[++tok->indent] = col;
1379 tok->altindstack[tok->indent] = altcol;
1380 }
1381 else /* col < tok->indstack[tok->indent] */ {
1382 /* Dedent -- any number, must be consistent */
1383 while (tok->indent > 0 &&
1384 col < tok->indstack[tok->indent]) {
1385 tok->pendin--;
1386 tok->indent--;
1387 }
1388 if (col != tok->indstack[tok->indent]) {
1389 tok->done = E_DEDENT;
1390 tok->cur = tok->inp;
1391 return ERRORTOKEN;
1392 }
1393 if (altcol != tok->altindstack[tok->indent]) {
1394 if (indenterror(tok))
1395 return ERRORTOKEN;
1396 }
1397 }
1398 }
1399 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001400
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001401 tok->start = tok->cur;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001402
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 /* Return pending indents/dedents */
1404 if (tok->pendin != 0) {
1405 if (tok->pendin < 0) {
1406 tok->pendin++;
1407 return DEDENT;
1408 }
1409 else {
1410 tok->pendin--;
1411 return INDENT;
1412 }
1413 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001414
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001415 again:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 tok->start = NULL;
1417 /* Skip spaces */
1418 do {
1419 c = tok_nextc(tok);
1420 } while (c == ' ' || c == '\t' || c == '\014');
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001421
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 /* Set start of current token */
1423 tok->start = tok->cur - 1;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001424
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001425 /* Skip comment */
1426 if (c == '#')
1427 while (c != EOF && c != '\n')
1428 c = tok_nextc(tok);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001429
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001430 /* Check for EOF and errors now */
1431 if (c == EOF) {
1432 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1433 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001434
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 /* Identifier (most frequent token!) */
1436 nonascii = 0;
1437 if (is_potential_identifier_start(c)) {
Christian Heimes0b3847d2012-06-20 11:17:58 +02001438 /* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001439 int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001440 while (1) {
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001441 if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001442 saw_b = 1;
Armin Ronacher6ecf77b2012-03-04 12:04:06 +00001443 /* Since this is a backwards compatibility support literal we don't
1444 want to support it in arbitrary order like byte literals. */
1445 else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1446 saw_u = 1;
Christian Heimes0b3847d2012-06-20 11:17:58 +02001447 /* ur"" and ru"" are not supported */
1448 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
Antoine Pitrou3a5d4cb2012-01-12 22:46:19 +01001449 saw_r = 1;
1450 else
1451 break;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001452 c = tok_nextc(tok);
1453 if (c == '"' || c == '\'')
1454 goto letter_quote;
1455 }
1456 while (is_potential_identifier_char(c)) {
1457 if (c >= 128)
1458 nonascii = 1;
1459 c = tok_nextc(tok);
1460 }
1461 tok_backup(tok, c);
1462 if (nonascii &&
1463 !verify_identifier(tok)) {
1464 tok->done = E_IDENTIFIER;
1465 return ERRORTOKEN;
1466 }
1467 *p_start = tok->start;
1468 *p_end = tok->cur;
1469 return NAME;
1470 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001471
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001472 /* Newline */
1473 if (c == '\n') {
1474 tok->atbol = 1;
1475 if (blankline || tok->level > 0)
1476 goto nextline;
1477 *p_start = tok->start;
1478 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1479 tok->cont_line = 0;
1480 return NEWLINE;
1481 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001482
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001483 /* Period or number starting with period? */
1484 if (c == '.') {
1485 c = tok_nextc(tok);
1486 if (isdigit(c)) {
1487 goto fraction;
1488 } else if (c == '.') {
1489 c = tok_nextc(tok);
1490 if (c == '.') {
1491 *p_start = tok->start;
1492 *p_end = tok->cur;
1493 return ELLIPSIS;
1494 } else {
1495 tok_backup(tok, c);
1496 }
1497 tok_backup(tok, '.');
1498 } else {
1499 tok_backup(tok, c);
1500 }
1501 *p_start = tok->start;
1502 *p_end = tok->cur;
1503 return DOT;
1504 }
Guido van Rossumf595fde1996-01-12 01:31:58 +00001505
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001506 /* Number */
1507 if (isdigit(c)) {
1508 if (c == '0') {
1509 /* Hex, octal or binary -- maybe. */
1510 c = tok_nextc(tok);
1511 if (c == '.')
1512 goto fraction;
1513 if (c == 'j' || c == 'J')
1514 goto imaginary;
1515 if (c == 'x' || c == 'X') {
Georg Brandlfceab5a2008-01-19 20:08:23 +00001516
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001517 /* Hex */
1518 c = tok_nextc(tok);
1519 if (!isxdigit(c)) {
1520 tok->done = E_TOKEN;
1521 tok_backup(tok, c);
1522 return ERRORTOKEN;
1523 }
1524 do {
1525 c = tok_nextc(tok);
1526 } while (isxdigit(c));
1527 }
1528 else if (c == 'o' || c == 'O') {
1529 /* Octal */
1530 c = tok_nextc(tok);
1531 if (c < '0' || c >= '8') {
1532 tok->done = E_TOKEN;
1533 tok_backup(tok, c);
1534 return ERRORTOKEN;
1535 }
1536 do {
1537 c = tok_nextc(tok);
1538 } while ('0' <= c && c < '8');
1539 }
1540 else if (c == 'b' || c == 'B') {
1541 /* Binary */
1542 c = tok_nextc(tok);
1543 if (c != '0' && c != '1') {
1544 tok->done = E_TOKEN;
1545 tok_backup(tok, c);
1546 return ERRORTOKEN;
1547 }
1548 do {
1549 c = tok_nextc(tok);
1550 } while (c == '0' || c == '1');
1551 }
1552 else {
1553 int nonzero = 0;
1554 /* maybe old-style octal; c is first char of it */
1555 /* in any case, allow '0' as a literal */
1556 while (c == '0')
1557 c = tok_nextc(tok);
1558 while (isdigit(c)) {
1559 nonzero = 1;
1560 c = tok_nextc(tok);
1561 }
1562 if (c == '.')
1563 goto fraction;
1564 else if (c == 'e' || c == 'E')
1565 goto exponent;
1566 else if (c == 'j' || c == 'J')
1567 goto imaginary;
1568 else if (nonzero) {
1569 tok->done = E_TOKEN;
1570 tok_backup(tok, c);
1571 return ERRORTOKEN;
1572 }
1573 }
1574 }
1575 else {
1576 /* Decimal */
1577 do {
1578 c = tok_nextc(tok);
1579 } while (isdigit(c));
1580 {
1581 /* Accept floating point numbers. */
1582 if (c == '.') {
1583 fraction:
1584 /* Fraction */
1585 do {
1586 c = tok_nextc(tok);
1587 } while (isdigit(c));
1588 }
1589 if (c == 'e' || c == 'E') {
1590 exponent:
1591 /* Exponent part */
1592 c = tok_nextc(tok);
1593 if (c == '+' || c == '-')
1594 c = tok_nextc(tok);
1595 if (!isdigit(c)) {
1596 tok->done = E_TOKEN;
1597 tok_backup(tok, c);
1598 return ERRORTOKEN;
1599 }
1600 do {
1601 c = tok_nextc(tok);
1602 } while (isdigit(c));
1603 }
1604 if (c == 'j' || c == 'J')
1605 /* Imaginary part */
1606 imaginary:
1607 c = tok_nextc(tok);
1608 }
1609 }
1610 tok_backup(tok, c);
1611 *p_start = tok->start;
1612 *p_end = tok->cur;
1613 return NUMBER;
1614 }
Guido van Rossum24dacb31997-04-06 03:46:20 +00001615
1616 letter_quote:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001617 /* String */
1618 if (c == '\'' || c == '"') {
1619 int quote = c;
1620 int quote_size = 1; /* 1 or 3 */
1621 int end_quote_size = 0;
Guido van Rossumcf171a72007-11-16 00:51:45 +00001622
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001623 /* Find the quote size and start of string */
1624 c = tok_nextc(tok);
1625 if (c == quote) {
1626 c = tok_nextc(tok);
1627 if (c == quote)
1628 quote_size = 3;
1629 else
1630 end_quote_size = 1; /* empty string found */
1631 }
1632 if (c != quote)
1633 tok_backup(tok, c);
Guido van Rossumcf171a72007-11-16 00:51:45 +00001634
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001635 /* Get rest of string */
1636 while (end_quote_size != quote_size) {
1637 c = tok_nextc(tok);
1638 if (c == EOF) {
1639 if (quote_size == 3)
1640 tok->done = E_EOFS;
1641 else
1642 tok->done = E_EOLS;
1643 tok->cur = tok->inp;
1644 return ERRORTOKEN;
1645 }
1646 if (quote_size == 1 && c == '\n') {
1647 tok->done = E_EOLS;
1648 tok->cur = tok->inp;
1649 return ERRORTOKEN;
1650 }
1651 if (c == quote)
1652 end_quote_size += 1;
1653 else {
1654 end_quote_size = 0;
1655 if (c == '\\')
1656 c = tok_nextc(tok); /* skip escaped char */
1657 }
1658 }
Guido van Rossumcf171a72007-11-16 00:51:45 +00001659
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001660 *p_start = tok->start;
1661 *p_end = tok->cur;
1662 return STRING;
1663 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001664
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001665 /* Line continuation */
1666 if (c == '\\') {
1667 c = tok_nextc(tok);
1668 if (c != '\n') {
1669 tok->done = E_LINECONT;
1670 tok->cur = tok->inp;
1671 return ERRORTOKEN;
1672 }
1673 tok->cont_line = 1;
1674 goto again; /* Read next line */
1675 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001676
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001677 /* Check for two-character token */
1678 {
1679 int c2 = tok_nextc(tok);
1680 int token = PyToken_TwoChars(c, c2);
1681 if (token != OP) {
1682 int c3 = tok_nextc(tok);
1683 int token3 = PyToken_ThreeChars(c, c2, c3);
1684 if (token3 != OP) {
1685 token = token3;
1686 } else {
1687 tok_backup(tok, c3);
1688 }
1689 *p_start = tok->start;
1690 *p_end = tok->cur;
1691 return token;
1692 }
1693 tok_backup(tok, c2);
1694 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001695
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001696 /* Keep track of parentheses nesting level */
1697 switch (c) {
1698 case '(':
1699 case '[':
1700 case '{':
1701 tok->level++;
1702 break;
1703 case ')':
1704 case ']':
1705 case '}':
1706 tok->level--;
1707 break;
1708 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001709
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001710 /* Punctuation character */
1711 *p_start = tok->start;
1712 *p_end = tok->cur;
1713 return PyToken_OneChar(c);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001714}
1715
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001716int
1717PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1718{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001719 int result = tok_get(tok, p_start, p_end);
1720 if (tok->decoding_erred) {
1721 result = ERRORTOKEN;
1722 tok->done = E_DECODE;
1723 }
1724 return result;
Martin v. Löwis00f1e3f2002-08-04 17:29:52 +00001725}
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001726
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001727/* Get the encoding of a Python file. Check for the coding cookie and check if
1728 the file starts with a BOM.
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001729
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001730 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1731 encoding in the first or second line of the file (in which case the encoding
1732 should be assumed to be UTF-8).
Brett Cannone4539892007-10-20 03:46:49 +00001733
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001734 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1735 by the caller. */
1736
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001737char *
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001738PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum40d20bc2007-10-22 00:09:51 +00001739{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001740 struct tok_state *tok;
1741 FILE *fp;
1742 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001743
Victor Stinnerdaf45552013-08-28 00:53:59 +02001744#ifndef PGEN
1745 fd = _Py_dup(fd);
1746#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001747 fd = dup(fd);
Victor Stinnerdaf45552013-08-28 00:53:59 +02001748#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001749 if (fd < 0) {
1750 return NULL;
1751 }
Victor Stinnerdaf45552013-08-28 00:53:59 +02001752
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001753 fp = fdopen(fd, "r");
1754 if (fp == NULL) {
1755 return NULL;
1756 }
1757 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1758 if (tok == NULL) {
1759 fclose(fp);
1760 return NULL;
1761 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001762#ifndef PGEN
Victor Stinnerfe7c5b52011-04-05 01:48:03 +02001763 if (filename != NULL) {
1764 Py_INCREF(filename);
1765 tok->filename = filename;
1766 }
1767 else {
1768 tok->filename = PyUnicode_FromString("<string>");
1769 if (tok->filename == NULL) {
1770 fclose(fp);
1771 PyTokenizer_Free(tok);
1772 return encoding;
1773 }
1774 }
Victor Stinner7f2fee32011-04-05 00:39:01 +02001775#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001776 while (tok->lineno < 2 && tok->done == E_OK) {
1777 PyTokenizer_Get(tok, &p_start, &p_end);
1778 }
1779 fclose(fp);
1780 if (tok->encoding) {
1781 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1782 if (encoding)
1783 strcpy(encoding, tok->encoding);
1784 }
1785 PyTokenizer_Free(tok);
1786 return encoding;
Guido van Rossumce3a72a2007-10-19 23:16:50 +00001787}
Thomas Wouters89d996e2007-09-08 17:39:28 +00001788
/* Same as PyTokenizer_FindEncodingFilename(), called with a NULL filename. */
char *
PyTokenizer_FindEncoding(int fd)
{
    char *encoding = PyTokenizer_FindEncodingFilename(fd, NULL);
    return encoding;
}
1794
Guido van Rossum408027e1996-12-30 16:17:54 +00001795#ifdef Py_DEBUG
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001796
1797void
Thomas Wouters23c9e002000-07-22 19:20:54 +00001798tok_dump(int type, char *start, char *end)
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001799{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001800 printf("%s", _PyParser_TokenNames[type]);
1801 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1802 printf("(%.*s)", (int)(end - start), start);
Guido van Rossum85a5fbb1990-10-14 12:07:46 +00001803}
1804
1805#endif