Blame - Parser/tokenizer.c - platform/external/python/cpython3

blob: 1eb62aa2a093a13f21492e7894323cefeb0a2fb2 [file] [log] [blame]

Guido van Rossum	f70e43a	1991-02-19 12:39:46 +0000	[diff] [blame]	1
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	2	/* Tokenizer implementation */
				3
Jack Jansen	7b8c754	2002-04-14 20:12:41 +0000	[diff] [blame]	4	#include "Python.h"
Guido van Rossum	3f5da24	1990-12-20 15:06:42 +0000	[diff] [blame]	5	#include "pgenheaders.h"
				6
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	7	#include <ctype.h>
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	8	#include <assert.h>
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	9
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	10	#include "tokenizer.h"
				11	#include "errcode.h"
				12
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	13	#ifndef PGEN
				14	#include "unicodeobject.h"
Christian Heimes	2c9c7a5	2008-05-26 13:42:13 +0000	[diff] [blame]	15	#include "bytesobject.h"
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	16	#include "fileobject.h"
				17	#include "codecs.h"
				18	#include "abstract.h"
				19	#endif /* PGEN */
				20
/* Non-zero if C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.
   NOTE: C is evaluated more than once; do not pass an expression with
   side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis	5b22213	2007-06-10 09:51:05 +0000	[diff] [blame]	26
				27	#define is_potential_identifier_char(c) (\
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	28	(c >= 'a' && c <= 'z')\
				29	\|\| (c >= 'A' && c <= 'Z')\
				30	\|\| (c >= '0' && c <= '9')\
				31	\|\| c == '_'\
				32	\|\| (c >= 128))
Martin v. Löwis	5b22213	2007-06-10 09:51:05 +0000	[diff] [blame]	33
Martin v. Löwis	566f6af	2002-10-26 14:39:10 +0000	[diff] [blame]	34	extern char *PyOS_Readline(FILE *, FILE *, char *);
Guido van Rossum	f4b1a64	1994-08-29 12:43:07 +0000	[diff] [blame]	35	/* Return malloc'ed string including trailing \n;
				36	empty malloc'ed string for EOF;
				37	NULL if interrupted */
				38
Guido van Rossum	4fe8729	1992-02-26 15:24:44 +0000	[diff] [blame]	39	/* Don't ever change this -- it would break the portability of Python code */
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	40	#define TABSIZE 8
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	41
Guido van Rossum	3f5da24	1990-12-20 15:06:42 +0000	[diff] [blame]	42	/* Forward */
Tim Peters	dbd9ba6	2000-07-09 03:09:57 +0000	[diff] [blame]	43	static struct tok_state *tok_new(void);
				44	static int tok_nextc(struct tok_state *tok);
				45	static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum	3f5da24	1990-12-20 15:06:42 +0000	[diff] [blame]	46
Brett Cannon	d5ec98c	2007-10-20 02:54:14 +0000	[diff] [blame]	47
/* Token names */

/* Printable name of every token type.  The order of the entries must
   match the #defines in token.h. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
				108
				109
				110	/* Create and initialize a new tok_state structure */
				111
				112	static struct tok_state *
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	113	tok_new(void)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	114	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	115	struct tok_state tok = (struct tok_state )PyMem_MALLOC(
				116	sizeof(struct tok_state));
				117	if (tok == NULL)
				118	return NULL;
				119	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
				120	tok->done = E_OK;
				121	tok->fp = NULL;
				122	tok->input = NULL;
				123	tok->tabsize = TABSIZE;
				124	tok->indent = 0;
				125	tok->indstack[0] = 0;
				126	tok->atbol = 1;
				127	tok->pendin = 0;
				128	tok->prompt = tok->nextprompt = NULL;
				129	tok->lineno = 0;
				130	tok->level = 0;
				131	tok->filename = NULL;
				132	tok->altwarning = 1;
				133	tok->alterror = 1;
				134	tok->alttabsize = 1;
				135	tok->altindstack[0] = 0;
				136	tok->decoding_state = STATE_INIT;
				137	tok->decoding_erred = 0;
				138	tok->read_coding_spec = 0;
				139	tok->enc = NULL;
				140	tok->encoding = NULL;
				141	tok->cont_line = 0;
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	142	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	143	tok->decoding_readline = NULL;
				144	tok->decoding_buffer = NULL;
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	145	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	146	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	147	}
				148
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	149	static char *
				150	new_string(const char *s, Py_ssize_t len)
				151	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	152	char* result = (char *)PyMem_MALLOC(len + 1);
				153	if (result != NULL) {
				154	memcpy(result, s, len);
				155	result[len] = '\0';
				156	}
				157	return result;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	158	}
				159
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	160	#ifdef PGEN
				161
				162	static char *
				163	decoding_fgets(char s, int size, struct tok_state tok)
				164	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	165	return fgets(s, size, tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	166	}
				167
				168	static int
				169	decoding_feof(struct tok_state *tok)
				170	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	171	return feof(tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	172	}
				173
/* PGEN build: no decoding is done; return a new_string() copy of STR.
   EXEC_INPUT and TOK are unused here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}
				179
				180	#else /* PGEN */
				181
				182	static char *
				183	error_ret(struct tok_state tok) / XXX */
				184	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	185	tok->decoding_erred = 1;
				186	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
				187	PyMem_FREE(tok->buf);
				188	tok->buf = NULL;
				189	return NULL; /* as if it were EOF */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	190	}
				191
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	192
				193	static char *
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	194	get_normal_name(char s) / for utf-8 and latin-1 */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	195	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	196	char buf[13];
				197	int i;
				198	for (i = 0; i < 12; i++) {
				199	int c = s[i];
				200	if (c == '\0')
				201	break;
				202	else if (c == '_')
				203	buf[i] = '-';
				204	else
				205	buf[i] = tolower(c);
				206	}
				207	buf[i] = '\0';
				208	if (strcmp(buf, "utf-8") == 0 \|\|
				209	strncmp(buf, "utf-8-", 6) == 0)
				210	return "utf-8";
				211	else if (strcmp(buf, "latin-1") == 0 \|\|
				212	strcmp(buf, "iso-8859-1") == 0 \|\|
				213	strcmp(buf, "iso-latin-1") == 0 \|\|
				214	strncmp(buf, "latin-1-", 8) == 0 \|\|
				215	strncmp(buf, "iso-8859-1-", 11) == 0 \|\|
				216	strncmp(buf, "iso-latin-1-", 12) == 0)
				217	return "iso-8859-1";
				218	else
				219	return s;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	220	}
				221
				222	/* Return the coding spec in S, or NULL if none is found. */
				223
				224	static char *
Martin v. Löwis	18e1655	2006-02-15 17:27:45 +0000	[diff] [blame]	225	get_coding_spec(const char *s, Py_ssize_t size)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	226	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	227	Py_ssize_t i;
				228	/* Coding spec must be in a comment, and that comment must be
				229	* the only statement on the source code line. */
				230	for (i = 0; i < size - 6; i++) {
				231	if (s[i] == '#')
				232	break;
				233	if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
				234	return NULL;
				235	}
				236	for (; i < size - 6; i++) { /* XXX inefficient search */
				237	const char* t = s + i;
				238	if (strncmp(t, "coding", 6) == 0) {
				239	const char* begin = NULL;
				240	t += 6;
				241	if (t[0] != ':' && t[0] != '=')
				242	continue;
				243	do {
				244	t++;
				245	} while (t[0] == '\x20' \|\| t[0] == '\t');
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	246
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	247	begin = t;
				248	while (Py_ISALNUM(t[0]) \|\|
				249	t[0] == '-' \|\| t[0] == '_' \|\| t[0] == '.')
				250	t++;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	251
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	252	if (begin < t) {
				253	char* r = new_string(begin, t - begin);
				254	char* q = get_normal_name(r);
				255	if (r != q) {
				256	PyMem_FREE(r);
				257	r = new_string(q, strlen(q));
				258	}
				259	return r;
				260	}
				261	}
				262	}
				263	return NULL;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	264	}
				265
				266	/* Check whether the line contains a coding spec. If it does,
				267	invoke the set_readline function for the new encoding.
				268	This function receives the tok_state and the new encoding.
				269	Return 1 on success, 0 on failure. */
				270
				271	static int
Martin v. Löwis	18e1655	2006-02-15 17:27:45 +0000	[diff] [blame]	272	check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	273	int set_readline(struct tok_state , const char ))
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	274	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	275	char * cs;
				276	int r = 1;
Tim Peters	17db21f	2002-09-03 15:39:58 +0000	[diff] [blame]	277
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	278	if (tok->cont_line)
				279	/* It's a continuation line, so it can't be a coding spec. */
				280	return 1;
				281	cs = get_coding_spec(line, size);
				282	if (cs != NULL) {
				283	tok->read_coding_spec = 1;
				284	if (tok->encoding == NULL) {
				285	assert(tok->decoding_state == STATE_RAW);
				286	if (strcmp(cs, "utf-8") == 0) {
				287	tok->encoding = cs;
				288	} else {
				289	r = set_readline(tok, cs);
				290	if (r) {
				291	tok->encoding = cs;
				292	tok->decoding_state = STATE_NORMAL;
				293	}
				294	else
				295	PyMem_FREE(cs);
				296	}
				297	} else { /* then, compare cs with BOM */
				298	r = (strcmp(tok->encoding, cs) == 0);
				299	PyMem_FREE(cs);
				300	}
				301	}
				302	if (!r) {
				303	cs = tok->encoding;
				304	if (!cs)
				305	cs = "with BOM";
				306	PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
				307	}
				308	return r;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	309	}
				310
				311	/* See whether the file starts with a BOM. If it does,
				312	invoke the set_readline function with the new encoding.
				313	Return 1 on success, 0 on failure. */
				314
				315	static int
				316	check_bom(int get_char(struct tok_state *),
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	317	void unget_char(int, struct tok_state *),
				318	int set_readline(struct tok_state , const char ),
				319	struct tok_state *tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	320	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	321	int ch1, ch2, ch3;
				322	ch1 = get_char(tok);
				323	tok->decoding_state = STATE_RAW;
				324	if (ch1 == EOF) {
				325	return 1;
				326	} else if (ch1 == 0xEF) {
				327	ch2 = get_char(tok);
				328	if (ch2 != 0xBB) {
				329	unget_char(ch2, tok);
				330	unget_char(ch1, tok);
				331	return 1;
				332	}
				333	ch3 = get_char(tok);
				334	if (ch3 != 0xBF) {
				335	unget_char(ch3, tok);
				336	unget_char(ch2, tok);
				337	unget_char(ch1, tok);
				338	return 1;
				339	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	340	#if 0
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	341	/* Disable support for UTF-16 BOMs until a decision
				342	is made whether this needs to be supported. */
				343	} else if (ch1 == 0xFE) {
				344	ch2 = get_char(tok);
				345	if (ch2 != 0xFF) {
				346	unget_char(ch2, tok);
				347	unget_char(ch1, tok);
				348	return 1;
				349	}
				350	if (!set_readline(tok, "utf-16-be"))
				351	return 0;
				352	tok->decoding_state = STATE_NORMAL;
				353	} else if (ch1 == 0xFF) {
				354	ch2 = get_char(tok);
				355	if (ch2 != 0xFE) {
				356	unget_char(ch2, tok);
				357	unget_char(ch1, tok);
				358	return 1;
				359	}
				360	if (!set_readline(tok, "utf-16-le"))
				361	return 0;
				362	tok->decoding_state = STATE_NORMAL;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	363	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	364	} else {
				365	unget_char(ch1, tok);
				366	return 1;
				367	}
				368	if (tok->encoding != NULL)
				369	PyMem_FREE(tok->encoding);
				370	tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
				371	/* No need to set_readline: input is already utf-8 */
				372	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	373	}
				374
				375	/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	376	Return NULL on failure, else S.
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	377
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	378	On entry, tok->decoding_buffer will be one of:
				379	1) NULL: need to call tok->decoding_readline to get a new line
				380	2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	381	stored the result in tok->decoding_buffer
Christian Heimes	9c4756e	2008-05-26 13:22:05 +0000	[diff] [blame]	382	3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	383	(in the s buffer) to copy entire contents of the line read
				384	by tok->decoding_readline. tok->decoding_buffer has the overflow.
				385	In this case, fp_readl is called in a loop (with an expanded buffer)
				386	until the buffer ends with a '\n' (or until the end of the file is
				387	reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	388	*/
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	389
				390	static char *
				391	fp_readl(char s, int size, struct tok_state tok)
				392	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	393	PyObject* bufobj;
				394	const char *buf;
				395	Py_ssize_t buflen;
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	396
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	397	/* Ask for one less byte so we can terminate it */
				398	assert(size > 0);
				399	size--;
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	400
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	401	if (tok->decoding_buffer) {
				402	bufobj = tok->decoding_buffer;
				403	Py_INCREF(bufobj);
				404	}
				405	else
				406	{
				407	bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
				408	if (bufobj == NULL)
				409	goto error;
				410	}
				411	if (PyUnicode_CheckExact(bufobj))
				412	{
				413	buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
				414	if (buf == NULL) {
				415	goto error;
				416	}
				417	}
				418	else
				419	{
				420	buf = PyByteArray_AsString(bufobj);
				421	if (buf == NULL) {
				422	goto error;
				423	}
				424	buflen = PyByteArray_GET_SIZE(bufobj);
				425	}
Amaury Forgeot d'Arc	65f9ace	2007-11-15 23:19:43 +0000	[diff] [blame]	426
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	427	Py_XDECREF(tok->decoding_buffer);
				428	if (buflen > size) {
				429	/* Too many chars, the rest goes into tok->decoding_buffer */
				430	tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
				431	buflen-size);
				432	if (tok->decoding_buffer == NULL)
				433	goto error;
				434	buflen = size;
				435	}
				436	else
				437	tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc	65f9ace	2007-11-15 23:19:43 +0000	[diff] [blame]	438
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	439	memcpy(s, buf, buflen);
				440	s[buflen] = '\0';
				441	if (buflen == 0) /* EOF */
				442	s = NULL;
				443	Py_DECREF(bufobj);
				444	return s;
Neal Norwitz	41eaedd	2007-08-12 00:03:22 +0000	[diff] [blame]	445
				446	error:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	447	Py_XDECREF(bufobj);
				448	return error_ret(tok);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	449	}
				450
				451	/* Set the readline function for TOK to a StreamReader's
				452	readline function. The StreamReader is named ENC.
				453
				454	This function is called from check_bom and check_coding_spec.
				455
				456	ENC is usually identical to the future value of tok->encoding,
				457	except for the (currently unsupported) case of UTF-16.
				458
				459	Return 1 on success, 0 on failure. */
				460
				461	static int
				462	fp_setreadl(struct tok_state tok, const char enc)
				463	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	464	PyObject readline = NULL, stream = NULL, *io = NULL;
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	465	int fd;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	466
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	467	io = PyImport_ImportModuleNoBlock("io");
				468	if (io == NULL)
				469	goto cleanup;
Guido van Rossum	9cbfffd	2007-06-07 00:54:15 +0000	[diff] [blame]	470
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	471	fd = fileno(tok->fp);
				472	if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
				473	PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
				474	goto cleanup;
				475	}
				476
				477	stream = PyObject_CallMethod(io, "open", "isisOOO",
				478	fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	479	if (stream == NULL)
				480	goto cleanup;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	481
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	482	Py_XDECREF(tok->decoding_readline);
				483	readline = PyObject_GetAttrString(stream, "readline");
				484	tok->decoding_readline = readline;
Guido van Rossum	9cbfffd	2007-06-07 00:54:15 +0000	[diff] [blame]	485
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	486	/* The file has been reopened; parsing will restart from
				487	* the beginning of the file, we have to reset the line number.
				488	* But this function has been called from inside tok_nextc() which
				489	* will increment lineno before it returns. So we set it -1 so that
				490	* the next call to tok_nextc() will start with tok->lineno == 0.
				491	*/
				492	tok->lineno = -1;
Amaury Forgeot d'Arc	cf8016a	2008-10-09 23:37:48 +0000	[diff] [blame]	493
Guido van Rossum	9cbfffd	2007-06-07 00:54:15 +0000	[diff] [blame]	494	cleanup:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	495	Py_XDECREF(stream);
				496	Py_XDECREF(io);
				497	return readline != NULL;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	498	}
				499
				500	/* Fetch the next byte from TOK. */
				501
				502	static int fp_getc(struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	503	return getc(tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	504	}
				505
				506	/* Unfetch the last byte back into TOK. */
				507
				508	static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	509	ungetc(c, tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	510	}
				511
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the lead byte and the 10xxxxxx form of
   the continuation bytes are checked.  Continuation bytes are examined
   front to back so that the scan stops at the terminating NUL of a
   truncated sequence instead of reading beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
				539
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	540	/* Read a line of input from TOK. Determine encoding
				541	if necessary. */
				542
				543	static char *
				544	decoding_fgets(char s, int size, struct tok_state tok)
				545	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	546	char *line = NULL;
				547	int badchar = 0;
Victor Stinner	83098a4	2010-12-27 20:12:13 +0000	[diff] [blame]	548	PyObject *filename;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	549	for (;;) {
				550	if (tok->decoding_state == STATE_NORMAL) {
				551	/* We already have a codec associated with
				552	this input. */
				553	line = fp_readl(s, size, tok);
				554	break;
				555	} else if (tok->decoding_state == STATE_RAW) {
				556	/* We want a 'raw' read. */
				557	line = Py_UniversalNewlineFgets(s, size,
				558	tok->fp, NULL);
				559	break;
				560	} else {
				561	/* We have not yet determined the encoding.
				562	If an encoding is found, use the file-pointer
				563	reader functions from now on. */
				564	if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				565	return error_ret(tok);
				566	assert(tok->decoding_state != STATE_INIT);
				567	}
				568	}
				569	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
				570	if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
				571	return error_ret(tok);
				572	}
				573	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	574	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	575	/* The default encoding is UTF-8, so make sure we don't have any
				576	non-UTF-8 sequences in it. */
				577	if (line && !tok->encoding) {
				578	unsigned char *c;
				579	int length;
				580	for (c = (unsigned char )line; c; c += length)
				581	if (!(length = valid_utf8(c))) {
				582	badchar = *c;
				583	break;
				584	}
				585	}
				586	if (badchar) {
				587	/* Need to add 1 to the line number, since this line
				588	has not been counted, yet. */
Victor Stinner	83098a4	2010-12-27 20:12:13 +0000	[diff] [blame]	589	filename = PyUnicode_DecodeFSDefault(tok->filename);
				590	if (filename != NULL) {
				591	PyErr_Format(PyExc_SyntaxError,
				592	"Non-UTF-8 code starting with '\\x%.2x' "
Victor Stinner	aaa4e9a	2011-01-05 03:33:26 +0000	[diff] [blame]	593	"in file %U on line %i, "
Victor Stinner	83098a4	2010-12-27 20:12:13 +0000	[diff] [blame]	594	"but no encoding declared; "
				595	"see http://python.org/dev/peps/pep-0263/ for details",
				596	badchar, filename, tok->lineno + 1);
				597	Py_DECREF(filename);
				598	}
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	599	return error_ret(tok);
				600	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	601	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	602	return line;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	603	}
				604
				605	static int
				606	decoding_feof(struct tok_state *tok)
				607	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	608	if (tok->decoding_state != STATE_NORMAL) {
				609	return feof(tok->fp);
				610	} else {
				611	PyObject* buf = tok->decoding_buffer;
				612	if (buf == NULL) {
				613	buf = PyObject_CallObject(tok->decoding_readline, NULL);
				614	if (buf == NULL) {
				615	error_ret(tok);
				616	return 1;
				617	} else {
				618	tok->decoding_buffer = buf;
				619	}
				620	}
				621	return PyObject_Length(buf) == 0;
				622	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	623	}
				624
				625	/* Fetch a byte from TOK, using the string buffer. */
				626
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	627	static int
				628	buf_getc(struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	629	return Py_CHARMASK(*tok->str++);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	630	}
				631
				632	/* Unfetch a byte from TOK, using the string buffer. */
				633
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	634	static void
				635	buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	636	tok->str--;
				637	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	638	}
				639
				640	/* Set the readline function for TOK to ENC. For the string-based
				641	tokenizer, this means to just record the encoding. */
				642
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	643	static int
				644	buf_setreadl(struct tok_state tok, const char enc) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	645	tok->enc = enc;
				646	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	647	}
				648
				649	/* Return a UTF-8 encoding Python string object from the
				650	C byte string STR, which is encoded with ENC. */
				651
				652	static PyObject *
				653	translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	654	PyObject *utf8;
				655	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
				656	if (buf == NULL)
				657	return NULL;
				658	utf8 = PyUnicode_AsUTF8String(buf);
				659	Py_DECREF(buf);
				660	return utf8;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	661	}
				662
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	663
				664	static char *
				665	translate_newlines(const char s, int exec_input, struct tok_state tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	666	int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
				667	char buf, current;
				668	char c = '\0';
				669	buf = PyMem_MALLOC(needed_length);
				670	if (buf == NULL) {
				671	tok->done = E_NOMEM;
				672	return NULL;
				673	}
				674	for (current = buf; *s; s++, current++) {
				675	c = *s;
				676	if (skip_next_lf) {
				677	skip_next_lf = 0;
				678	if (c == '\n') {
				679	c = *++s;
				680	if (!c)
				681	break;
				682	}
				683	}
				684	if (c == '\r') {
				685	skip_next_lf = 1;
				686	c = '\n';
				687	}
				688	*current = c;
				689	}
				690	/* If this is exec input, add a newline to the end of the string if
				691	there isn't one already. */
				692	if (exec_input && c != '\n') {
				693	*current = '\n';
				694	current++;
				695	}
				696	*current = '\0';
				697	final_length = current - buf + 1;
				698	if (final_length < needed_length && final_length)
				699	/* should never fail */
				700	buf = PyMem_REALLOC(buf, final_length);
				701	return buf;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	702	}
				703
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	704	/* Decode a byte string STR for use as the buffer of TOK.
				705	Look for encoding declarations inside STR, and record them
				706	inside TOK. */
				707
				708	static const char *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	709	decode_str(const char input, int single, struct tok_state tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	710	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	711	PyObject* utf8 = NULL;
				712	const char *str;
				713	const char *s;
				714	const char *newl[2] = {NULL, NULL};
				715	int lineno = 0;
				716	tok->input = str = translate_newlines(input, single, tok);
				717	if (str == NULL)
				718	return NULL;
				719	tok->enc = NULL;
				720	tok->str = str;
				721	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
				722	return error_ret(tok);
				723	str = tok->str; /* string after BOM if any */
				724	assert(str);
				725	if (tok->enc != NULL) {
				726	utf8 = translate_into_utf8(str, tok->enc);
				727	if (utf8 == NULL)
				728	return error_ret(tok);
				729	str = PyBytes_AsString(utf8);
				730	}
				731	for (s = str;; s++) {
				732	if (*s == '\0') break;
				733	else if (*s == '\n') {
				734	assert(lineno < 2);
				735	newl[lineno] = s;
				736	lineno++;
				737	if (lineno == 2) break;
				738	}
				739	}
				740	tok->enc = NULL;
				741	/* need to check line 1 and 2 separately since check_coding_spec
				742	assumes a single line as input */
				743	if (newl[0]) {
				744	if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
				745	return error_ret(tok);
				746	if (tok->enc == NULL && newl[1]) {
				747	if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
				748	tok, buf_setreadl))
				749	return error_ret(tok);
				750	}
				751	}
				752	if (tok->enc != NULL) {
				753	assert(utf8 == NULL);
				754	utf8 = translate_into_utf8(str, tok->enc);
				755	if (utf8 == NULL)
				756	return error_ret(tok);
				757	str = PyBytes_AS_STRING(utf8);
				758	}
				759	assert(tok->decoding_buffer == NULL);
				760	tok->decoding_buffer = utf8; /* CAUTION */
				761	return str;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	762	}
				763
				764	#endif /* PGEN */
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	765
				766	/* Set up tokenizer for string */
				767
				768	struct tok_state *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	769	PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	770	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	771	struct tok_state *tok = tok_new();
				772	if (tok == NULL)
				773	return NULL;
				774	str = (char *)decode_str(str, exec_input, tok);
				775	if (str == NULL) {
				776	PyTokenizer_Free(tok);
				777	return NULL;
				778	}
Neal Norwitz	dee2fd5	2005-11-16 05:12:59 +0000	[diff] [blame]	779
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	780	/* XXX: constify members. */
				781	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
				782	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	783	}
				784
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	785	struct tok_state *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	786	PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	787	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	788	struct tok_state *tok = tok_new();
				789	if (tok == NULL)
				790	return NULL;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	791	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	792	tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	793	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	794	if (str == NULL) {
				795	PyTokenizer_Free(tok);
				796	return NULL;
				797	}
				798	tok->decoding_state = STATE_RAW;
				799	tok->read_coding_spec = 1;
				800	tok->enc = NULL;
				801	tok->str = str;
				802	tok->encoding = (char *)PyMem_MALLOC(6);
				803	if (!tok->encoding) {
				804	PyTokenizer_Free(tok);
				805	return NULL;
				806	}
				807	strcpy(tok->encoding, "utf-8");
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	808
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	809	/* XXX: constify members. */
				810	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
				811	return tok;
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	812	}
				813
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	814	/* Set up tokenizer for file */
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	815
				816	struct tok_state *
Martin v. Löwis	85bcc66	2007-09-04 09:18:06 +0000	[diff] [blame]	817	PyTokenizer_FromFile(FILE fp, char enc, char ps1, char ps2)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	818	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	819	struct tok_state *tok = tok_new();
				820	if (tok == NULL)
				821	return NULL;
				822	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
				823	PyTokenizer_Free(tok);
				824	return NULL;
				825	}
				826	tok->cur = tok->inp = tok->buf;
				827	tok->end = tok->buf + BUFSIZ;
				828	tok->fp = fp;
				829	tok->prompt = ps1;
				830	tok->nextprompt = ps2;
				831	if (enc != NULL) {
				832	/* Must copy encoding declaration since it
				833	gets copied into the parse tree. */
				834	tok->encoding = PyMem_MALLOC(strlen(enc)+1);
				835	if (!tok->encoding) {
				836	PyTokenizer_Free(tok);
				837	return NULL;
				838	}
				839	strcpy(tok->encoding, enc);
				840	tok->decoding_state = STATE_NORMAL;
				841	}
				842	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	843	}
				844
				845
				846	/* Free a tok_state structure */
				847
				848	void
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	849	PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	850	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	851	if (tok->encoding != NULL)
				852	PyMem_FREE(tok->encoding);
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	853	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	854	Py_XDECREF(tok->decoding_readline);
				855	Py_XDECREF(tok->decoding_buffer);
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	856	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	857	if (tok->fp != NULL && tok->buf != NULL)
				858	PyMem_FREE(tok->buf);
				859	if (tok->input)
				860	PyMem_FREE((char *)tok->input);
				861	PyMem_FREE(tok);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	862	}
				863
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	864	/* Get next char, updating state; error code goes into tok->done */
				865
				866	static int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	867	tok_nextc(register struct tok_state *tok)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	868	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	869	for (;;) {
				870	if (tok->cur != tok->inp) {
				871	return Py_CHARMASK(tok->cur++); / Fast path */
				872	}
				873	if (tok->done != E_OK)
				874	return EOF;
				875	if (tok->fp == NULL) {
				876	char *end = strchr(tok->inp, '\n');
				877	if (end != NULL)
				878	end++;
				879	else {
				880	end = strchr(tok->inp, '\0');
				881	if (end == tok->inp) {
				882	tok->done = E_EOF;
				883	return EOF;
				884	}
				885	}
				886	if (tok->start == NULL)
				887	tok->buf = tok->cur;
				888	tok->line_start = tok->cur;
				889	tok->lineno++;
				890	tok->inp = end;
				891	return Py_CHARMASK(*tok->cur++);
				892	}
				893	if (tok->prompt != NULL) {
				894	char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner	89e3436	2011-01-07 18:47:22 +0000	[diff] [blame^]	895	if (newtok != NULL) {
				896	char *translated = translate_newlines(newtok, 0, tok);
				897	PyMem_FREE(newtok);
				898	if (translated == NULL)
				899	return EOF;
				900	newtok = translated;
				901	}
Martin v. Löwis	85bcc66	2007-09-04 09:18:06 +0000	[diff] [blame]	902	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	903	if (tok->encoding && newtok && *newtok) {
				904	/* Recode to UTF-8 */
				905	Py_ssize_t buflen;
				906	const char* buf;
				907	PyObject *u = translate_into_utf8(newtok, tok->encoding);
				908	PyMem_FREE(newtok);
				909	if (!u) {
				910	tok->done = E_DECODE;
				911	return EOF;
				912	}
				913	buflen = PyBytes_GET_SIZE(u);
				914	buf = PyBytes_AS_STRING(u);
				915	if (!buf) {
				916	Py_DECREF(u);
				917	tok->done = E_DECODE;
				918	return EOF;
				919	}
				920	newtok = PyMem_MALLOC(buflen+1);
				921	strcpy(newtok, buf);
				922	Py_DECREF(u);
				923	}
Martin v. Löwis	85bcc66	2007-09-04 09:18:06 +0000	[diff] [blame]	924	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	925	if (tok->nextprompt != NULL)
				926	tok->prompt = tok->nextprompt;
				927	if (newtok == NULL)
				928	tok->done = E_INTR;
				929	else if (*newtok == '\0') {
				930	PyMem_FREE(newtok);
				931	tok->done = E_EOF;
				932	}
				933	else if (tok->start != NULL) {
				934	size_t start = tok->start - tok->buf;
				935	size_t oldlen = tok->cur - tok->buf;
				936	size_t newlen = oldlen + strlen(newtok);
				937	char *buf = tok->buf;
				938	buf = (char *)PyMem_REALLOC(buf, newlen+1);
				939	tok->lineno++;
				940	if (buf == NULL) {
				941	PyMem_FREE(tok->buf);
				942	tok->buf = NULL;
				943	PyMem_FREE(newtok);
				944	tok->done = E_NOMEM;
				945	return EOF;
				946	}
				947	tok->buf = buf;
				948	tok->cur = tok->buf + oldlen;
				949	tok->line_start = tok->cur;
				950	strcpy(tok->buf + oldlen, newtok);
				951	PyMem_FREE(newtok);
				952	tok->inp = tok->buf + newlen;
				953	tok->end = tok->inp + 1;
				954	tok->start = tok->buf + start;
				955	}
				956	else {
				957	tok->lineno++;
				958	if (tok->buf != NULL)
				959	PyMem_FREE(tok->buf);
				960	tok->buf = newtok;
				961	tok->line_start = tok->buf;
				962	tok->cur = tok->buf;
				963	tok->line_start = tok->buf;
				964	tok->inp = strchr(tok->buf, '\0');
				965	tok->end = tok->inp + 1;
				966	}
				967	}
				968	else {
				969	int done = 0;
				970	Py_ssize_t cur = 0;
				971	char *pt;
				972	if (tok->start == NULL) {
				973	if (tok->buf == NULL) {
				974	tok->buf = (char *)
				975	PyMem_MALLOC(BUFSIZ);
				976	if (tok->buf == NULL) {
				977	tok->done = E_NOMEM;
				978	return EOF;
				979	}
				980	tok->end = tok->buf + BUFSIZ;
				981	}
				982	if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
				983	tok) == NULL) {
				984	tok->done = E_EOF;
				985	done = 1;
				986	}
				987	else {
				988	tok->done = E_OK;
				989	tok->inp = strchr(tok->buf, '\0');
				990	done = tok->inp[-1] == '\n';
				991	}
				992	}
				993	else {
				994	cur = tok->cur - tok->buf;
				995	if (decoding_feof(tok)) {
				996	tok->done = E_EOF;
				997	done = 1;
				998	}
				999	else
				1000	tok->done = E_OK;
				1001	}
				1002	tok->lineno++;
				1003	/* Read until '\n' or EOF */
				1004	while (!done) {
				1005	Py_ssize_t curstart = tok->start == NULL ? -1 :
				1006	tok->start - tok->buf;
				1007	Py_ssize_t curvalid = tok->inp - tok->buf;
				1008	Py_ssize_t newsize = curvalid + BUFSIZ;
				1009	char *newbuf = tok->buf;
				1010	newbuf = (char *)PyMem_REALLOC(newbuf,
				1011	newsize);
				1012	if (newbuf == NULL) {
				1013	tok->done = E_NOMEM;
				1014	tok->cur = tok->inp;
				1015	return EOF;
				1016	}
				1017	tok->buf = newbuf;
				1018	tok->inp = tok->buf + curvalid;
				1019	tok->end = tok->buf + newsize;
				1020	tok->start = curstart < 0 ? NULL :
				1021	tok->buf + curstart;
				1022	if (decoding_fgets(tok->inp,
				1023	(int)(tok->end - tok->inp),
				1024	tok) == NULL) {
				1025	/* Break out early on decoding
				1026	errors, as tok->buf will be NULL
				1027	*/
				1028	if (tok->decoding_erred)
				1029	return EOF;
				1030	/* Last line does not end in \n,
				1031	fake one */
				1032	strcpy(tok->inp, "\n");
				1033	}
				1034	tok->inp = strchr(tok->inp, '\0');
				1035	done = tok->inp[-1] == '\n';
				1036	}
				1037	if (tok->buf != NULL) {
				1038	tok->cur = tok->buf + cur;
				1039	tok->line_start = tok->cur;
				1040	/* replace "\r\n" with "\n" */
				1041	/* For Mac leave the \r, giving a syntax error */
				1042	pt = tok->inp - 2;
				1043	if (pt >= tok->buf && *pt == '\r') {
				1044	*pt++ = '\n';
				1045	*pt = '\0';
				1046	tok->inp = pt;
				1047	}
				1048	}
				1049	}
				1050	if (tok->done != E_OK) {
				1051	if (tok->prompt != NULL)
				1052	PySys_WriteStderr("\n");
				1053	tok->cur = tok->inp;
				1054	return EOF;
				1055	}
				1056	}
				1057	/NOTREACHED/
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1058	}
				1059
				1060
				1061	/* Back-up one character */
				1062
				1063	static void
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1064	tok_backup(register struct tok_state *tok, register int c)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1065	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1066	if (c != EOF) {
				1067	if (--tok->cur < tok->buf)
				1068	Py_FatalError("tok_backup: beginning of buffer");
				1069	if (*tok->cur != c)
				1070	*tok->cur = c;
				1071	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1072	}
				1073
				1074
				1075	/* Return the token corresponding to a single character */
				1076
				1077	int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1078	PyToken_OneChar(int c)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1079	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1080	switch (c) {
				1081	case '(': return LPAR;
				1082	case ')': return RPAR;
				1083	case '[': return LSQB;
				1084	case ']': return RSQB;
				1085	case ':': return COLON;
				1086	case ',': return COMMA;
				1087	case ';': return SEMI;
				1088	case '+': return PLUS;
				1089	case '-': return MINUS;
				1090	case '*': return STAR;
				1091	case '/': return SLASH;
				1092	case '\|': return VBAR;
				1093	case '&': return AMPER;
				1094	case '<': return LESS;
				1095	case '>': return GREATER;
				1096	case '=': return EQUAL;
				1097	case '.': return DOT;
				1098	case '%': return PERCENT;
				1099	case '{': return LBRACE;
				1100	case '}': return RBRACE;
				1101	case '^': return CIRCUMFLEX;
				1102	case '~': return TILDE;
				1103	case '@': return AT;
				1104	default: return OP;
				1105	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1106	}
				1107
				1108
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1109	int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1110	PyToken_TwoChars(int c1, int c2)
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1111	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1112	switch (c1) {
				1113	case '=':
				1114	switch (c2) {
				1115	case '=': return EQEQUAL;
				1116	}
				1117	break;
				1118	case '!':
				1119	switch (c2) {
				1120	case '=': return NOTEQUAL;
				1121	}
				1122	break;
				1123	case '<':
				1124	switch (c2) {
				1125	case '>': return NOTEQUAL;
				1126	case '=': return LESSEQUAL;
				1127	case '<': return LEFTSHIFT;
				1128	}
				1129	break;
				1130	case '>':
				1131	switch (c2) {
				1132	case '=': return GREATEREQUAL;
				1133	case '>': return RIGHTSHIFT;
				1134	}
				1135	break;
				1136	case '+':
				1137	switch (c2) {
				1138	case '=': return PLUSEQUAL;
				1139	}
				1140	break;
				1141	case '-':
				1142	switch (c2) {
				1143	case '=': return MINEQUAL;
				1144	case '>': return RARROW;
				1145	}
				1146	break;
				1147	case '*':
				1148	switch (c2) {
				1149	case '*': return DOUBLESTAR;
				1150	case '=': return STAREQUAL;
				1151	}
				1152	break;
				1153	case '/':
				1154	switch (c2) {
				1155	case '/': return DOUBLESLASH;
				1156	case '=': return SLASHEQUAL;
				1157	}
				1158	break;
				1159	case '\|':
				1160	switch (c2) {
				1161	case '=': return VBAREQUAL;
				1162	}
				1163	break;
				1164	case '%':
				1165	switch (c2) {
				1166	case '=': return PERCENTEQUAL;
				1167	}
				1168	break;
				1169	case '&':
				1170	switch (c2) {
				1171	case '=': return AMPEREQUAL;
				1172	}
				1173	break;
				1174	case '^':
				1175	switch (c2) {
				1176	case '=': return CIRCUMFLEXEQUAL;
				1177	}
				1178	break;
				1179	}
				1180	return OP;
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1181	}
				1182
Thomas Wouters	434d082	2000-08-24 20:11:32 +0000	[diff] [blame]	1183	int
				1184	PyToken_ThreeChars(int c1, int c2, int c3)
				1185	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1186	switch (c1) {
				1187	case '<':
				1188	switch (c2) {
				1189	case '<':
				1190	switch (c3) {
				1191	case '=':
				1192	return LEFTSHIFTEQUAL;
				1193	}
				1194	break;
				1195	}
				1196	break;
				1197	case '>':
				1198	switch (c2) {
				1199	case '>':
				1200	switch (c3) {
				1201	case '=':
				1202	return RIGHTSHIFTEQUAL;
				1203	}
				1204	break;
				1205	}
				1206	break;
				1207	case '*':
				1208	switch (c2) {
				1209	case '*':
				1210	switch (c3) {
				1211	case '=':
				1212	return DOUBLESTAREQUAL;
				1213	}
				1214	break;
				1215	}
				1216	break;
				1217	case '/':
				1218	switch (c2) {
				1219	case '/':
				1220	switch (c3) {
				1221	case '=':
				1222	return DOUBLESLASHEQUAL;
				1223	}
				1224	break;
				1225	}
				1226	break;
				1227	case '.':
				1228	switch (c2) {
Georg Brandl	dde0028	2007-03-18 19:01:53 +0000	[diff] [blame]	1229	case '.':
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1230	switch (c3) {
				1231	case '.':
				1232	return ELLIPSIS;
				1233	}
				1234	break;
				1235	}
				1236	break;
				1237	}
				1238	return OP;
Thomas Wouters	434d082	2000-08-24 20:11:32 +0000	[diff] [blame]	1239	}
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1240
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1241	static int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1242	indenterror(struct tok_state *tok)
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1243	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1244	if (tok->alterror) {
				1245	tok->done = E_TABSPACE;
				1246	tok->cur = tok->inp;
				1247	return 1;
				1248	}
				1249	if (tok->altwarning) {
				1250	PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
				1251	"in indentation\n", tok->filename);
				1252	tok->altwarning = 0;
				1253	}
				1254	return 0;
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1255	}
				1256
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1257	#ifdef PGEN
Victor Stinner	52f6dd7	2010-03-12 14:45:56 +0000	[diff] [blame]	1258	#define verify_identifier(tok) 1
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1259	#else
				1260	/* Verify that the identifier follows PEP 3131. */
				1261	static int
Victor Stinner	52f6dd7	2010-03-12 14:45:56 +0000	[diff] [blame]	1262	verify_identifier(struct tok_state *tok)
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1263	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1264	PyObject *s;
				1265	int result;
				1266	s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
				1267	if (s == NULL) {
				1268	if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
				1269	PyErr_Clear();
				1270	tok->done = E_IDENTIFIER;
				1271	} else {
				1272	tok->done = E_ERROR;
				1273	}
				1274	return 0;
				1275	}
				1276	result = PyUnicode_IsIdentifier(s);
				1277	Py_DECREF(s);
				1278	if (result == 0)
				1279	tok->done = E_IDENTIFIER;
				1280	return result;
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1281	}
				1282	#endif
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1283
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1284	/* Get next token, after space stripping etc. */
				1285
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1286	static int
				1287	tok_get(register struct tok_state tok, char p_start, char *p_end)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1288	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1289	register int c;
				1290	int blankline, nonascii;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1291
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1292	p_start = p_end = NULL;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1293	nextline:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1294	tok->start = NULL;
				1295	blankline = 0;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1296
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1297	/* Get indentation level */
				1298	if (tok->atbol) {
				1299	register int col = 0;
				1300	register int altcol = 0;
				1301	tok->atbol = 0;
				1302	for (;;) {
				1303	c = tok_nextc(tok);
				1304	if (c == ' ')
				1305	col++, altcol++;
				1306	else if (c == '\t') {
				1307	col = (col/tok->tabsize + 1) * tok->tabsize;
				1308	altcol = (altcol/tok->alttabsize + 1)
				1309	* tok->alttabsize;
				1310	}
				1311	else if (c == '\014') /* Control-L (formfeed) */
				1312	col = altcol = 0; /* For Emacs users */
				1313	else
				1314	break;
				1315	}
				1316	tok_backup(tok, c);
				1317	if (c == '#' \|\| c == '\n') {
				1318	/* Lines with only whitespace and/or comments
				1319	shouldn't affect the indentation and are
				1320	not passed to the parser as NEWLINE tokens,
				1321	except totally empty lines in interactive
				1322	mode, which signal the end of a command group. */
				1323	if (col == 0 && c == '\n' && tok->prompt != NULL)
				1324	blankline = 0; /* Let it through */
				1325	else
				1326	blankline = 1; /* Ignore completely */
				1327	/* We can't jump back right here since we still
				1328	may need to skip to the end of a comment */
				1329	}
				1330	if (!blankline && tok->level == 0) {
				1331	if (col == tok->indstack[tok->indent]) {
				1332	/* No change */
				1333	if (altcol != tok->altindstack[tok->indent]) {
				1334	if (indenterror(tok))
				1335	return ERRORTOKEN;
				1336	}
				1337	}
				1338	else if (col > tok->indstack[tok->indent]) {
				1339	/* Indent -- always one */
				1340	if (tok->indent+1 >= MAXINDENT) {
				1341	tok->done = E_TOODEEP;
				1342	tok->cur = tok->inp;
				1343	return ERRORTOKEN;
				1344	}
				1345	if (altcol <= tok->altindstack[tok->indent]) {
				1346	if (indenterror(tok))
				1347	return ERRORTOKEN;
				1348	}
				1349	tok->pendin++;
				1350	tok->indstack[++tok->indent] = col;
				1351	tok->altindstack[tok->indent] = altcol;
				1352	}
				1353	else /* col < tok->indstack[tok->indent] */ {
				1354	/* Dedent -- any number, must be consistent */
				1355	while (tok->indent > 0 &&
				1356	col < tok->indstack[tok->indent]) {
				1357	tok->pendin--;
				1358	tok->indent--;
				1359	}
				1360	if (col != tok->indstack[tok->indent]) {
				1361	tok->done = E_DEDENT;
				1362	tok->cur = tok->inp;
				1363	return ERRORTOKEN;
				1364	}
				1365	if (altcol != tok->altindstack[tok->indent]) {
				1366	if (indenterror(tok))
				1367	return ERRORTOKEN;
				1368	}
				1369	}
				1370	}
				1371	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1372
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1373	tok->start = tok->cur;
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1374
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1375	/* Return pending indents/dedents */
				1376	if (tok->pendin != 0) {
				1377	if (tok->pendin < 0) {
				1378	tok->pendin++;
				1379	return DEDENT;
				1380	}
				1381	else {
				1382	tok->pendin--;
				1383	return INDENT;
				1384	}
				1385	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1386
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1387	again:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1388	tok->start = NULL;
				1389	/* Skip spaces */
				1390	do {
				1391	c = tok_nextc(tok);
				1392	} while (c == ' ' \|\| c == '\t' \|\| c == '\014');
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1393
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1394	/* Set start of current token */
				1395	tok->start = tok->cur - 1;
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1396
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1397	/* Skip comment */
				1398	if (c == '#')
				1399	while (c != EOF && c != '\n')
				1400	c = tok_nextc(tok);
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1401
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1402	/* Check for EOF and errors now */
				1403	if (c == EOF) {
				1404	return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
				1405	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1406
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1407	/* Identifier (most frequent token!) */
				1408	nonascii = 0;
				1409	if (is_potential_identifier_start(c)) {
				1410	/* Process b"", r"" and br"" */
				1411	if (c == 'b' \|\| c == 'B') {
				1412	c = tok_nextc(tok);
				1413	if (c == '"' \|\| c == '\'')
				1414	goto letter_quote;
				1415	}
				1416	if (c == 'r' \|\| c == 'R') {
				1417	c = tok_nextc(tok);
				1418	if (c == '"' \|\| c == '\'')
				1419	goto letter_quote;
				1420	}
				1421	while (is_potential_identifier_char(c)) {
				1422	if (c >= 128)
				1423	nonascii = 1;
				1424	c = tok_nextc(tok);
				1425	}
				1426	tok_backup(tok, c);
				1427	if (nonascii &&
				1428	!verify_identifier(tok)) {
				1429	tok->done = E_IDENTIFIER;
				1430	return ERRORTOKEN;
				1431	}
				1432	*p_start = tok->start;
				1433	*p_end = tok->cur;
				1434	return NAME;
				1435	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1436
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1437	/* Newline */
				1438	if (c == '\n') {
				1439	tok->atbol = 1;
				1440	if (blankline \|\| tok->level > 0)
				1441	goto nextline;
				1442	*p_start = tok->start;
				1443	p_end = tok->cur - 1; / Leave '\n' out of the string */
				1444	tok->cont_line = 0;
				1445	return NEWLINE;
				1446	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1447
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1448	/* Period or number starting with period? */
				1449	if (c == '.') {
				1450	c = tok_nextc(tok);
				1451	if (isdigit(c)) {
				1452	goto fraction;
				1453	} else if (c == '.') {
				1454	c = tok_nextc(tok);
				1455	if (c == '.') {
				1456	*p_start = tok->start;
				1457	*p_end = tok->cur;
				1458	return ELLIPSIS;
				1459	} else {
				1460	tok_backup(tok, c);
				1461	}
				1462	tok_backup(tok, '.');
				1463	} else {
				1464	tok_backup(tok, c);
				1465	}
				1466	*p_start = tok->start;
				1467	*p_end = tok->cur;
				1468	return DOT;
				1469	}
Guido van Rossum	f595fde	1996-01-12 01:31:58 +0000	[diff] [blame]	1470
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1471	/* Number */
				1472	if (isdigit(c)) {
				1473	if (c == '0') {
				1474	/* Hex, octal or binary -- maybe. */
				1475	c = tok_nextc(tok);
				1476	if (c == '.')
				1477	goto fraction;
				1478	if (c == 'j' \|\| c == 'J')
				1479	goto imaginary;
				1480	if (c == 'x' \|\| c == 'X') {
Georg Brandl	fceab5a	2008-01-19 20:08:23 +0000	[diff] [blame]	1481
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1482	/* Hex */
				1483	c = tok_nextc(tok);
				1484	if (!isxdigit(c)) {
				1485	tok->done = E_TOKEN;
				1486	tok_backup(tok, c);
				1487	return ERRORTOKEN;
				1488	}
				1489	do {
				1490	c = tok_nextc(tok);
				1491	} while (isxdigit(c));
				1492	}
				1493	else if (c == 'o' \|\| c == 'O') {
				1494	/* Octal */
				1495	c = tok_nextc(tok);
				1496	if (c < '0' \|\| c >= '8') {
				1497	tok->done = E_TOKEN;
				1498	tok_backup(tok, c);
				1499	return ERRORTOKEN;
				1500	}
				1501	do {
				1502	c = tok_nextc(tok);
				1503	} while ('0' <= c && c < '8');
				1504	}
				1505	else if (c == 'b' \|\| c == 'B') {
				1506	/* Binary */
				1507	c = tok_nextc(tok);
				1508	if (c != '0' && c != '1') {
				1509	tok->done = E_TOKEN;
				1510	tok_backup(tok, c);
				1511	return ERRORTOKEN;
				1512	}
				1513	do {
				1514	c = tok_nextc(tok);
				1515	} while (c == '0' \|\| c == '1');
				1516	}
				1517	else {
				1518	int nonzero = 0;
				1519	/* maybe old-style octal; c is first char of it */
				1520	/* in any case, allow '0' as a literal */
				1521	while (c == '0')
				1522	c = tok_nextc(tok);
				1523	while (isdigit(c)) {
				1524	nonzero = 1;
				1525	c = tok_nextc(tok);
				1526	}
				1527	if (c == '.')
				1528	goto fraction;
				1529	else if (c == 'e' \|\| c == 'E')
				1530	goto exponent;
				1531	else if (c == 'j' \|\| c == 'J')
				1532	goto imaginary;
				1533	else if (nonzero) {
				1534	tok->done = E_TOKEN;
				1535	tok_backup(tok, c);
				1536	return ERRORTOKEN;
				1537	}
				1538	}
				1539	}
				1540	else {
				1541	/* Decimal */
				1542	do {
				1543	c = tok_nextc(tok);
				1544	} while (isdigit(c));
				1545	{
				1546	/* Accept floating point numbers. */
				1547	if (c == '.') {
				1548	fraction:
				1549	/* Fraction */
				1550	do {
				1551	c = tok_nextc(tok);
				1552	} while (isdigit(c));
				1553	}
				1554	if (c == 'e' \|\| c == 'E') {
				1555	exponent:
				1556	/* Exponent part */
				1557	c = tok_nextc(tok);
				1558	if (c == '+' \|\| c == '-')
				1559	c = tok_nextc(tok);
				1560	if (!isdigit(c)) {
				1561	tok->done = E_TOKEN;
				1562	tok_backup(tok, c);
				1563	return ERRORTOKEN;
				1564	}
				1565	do {
				1566	c = tok_nextc(tok);
				1567	} while (isdigit(c));
				1568	}
				1569	if (c == 'j' \|\| c == 'J')
				1570	/* Imaginary part */
				1571	imaginary:
				1572	c = tok_nextc(tok);
				1573	}
				1574	}
				1575	tok_backup(tok, c);
				1576	*p_start = tok->start;
				1577	*p_end = tok->cur;
				1578	return NUMBER;
				1579	}
Guido van Rossum	24dacb3	1997-04-06 03:46:20 +0000	[diff] [blame]	1580
				1581	letter_quote:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1582	/* String */
				1583	if (c == '\'' \|\| c == '"') {
				1584	int quote = c;
				1585	int quote_size = 1; /* 1 or 3 */
				1586	int end_quote_size = 0;
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1587
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1588	/* Find the quote size and start of string */
				1589	c = tok_nextc(tok);
				1590	if (c == quote) {
				1591	c = tok_nextc(tok);
				1592	if (c == quote)
				1593	quote_size = 3;
				1594	else
				1595	end_quote_size = 1; /* empty string found */
				1596	}
				1597	if (c != quote)
				1598	tok_backup(tok, c);
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1599
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1600	/* Get rest of string */
				1601	while (end_quote_size != quote_size) {
				1602	c = tok_nextc(tok);
				1603	if (c == EOF) {
				1604	if (quote_size == 3)
				1605	tok->done = E_EOFS;
				1606	else
				1607	tok->done = E_EOLS;
				1608	tok->cur = tok->inp;
				1609	return ERRORTOKEN;
				1610	}
				1611	if (quote_size == 1 && c == '\n') {
				1612	tok->done = E_EOLS;
				1613	tok->cur = tok->inp;
				1614	return ERRORTOKEN;
				1615	}
				1616	if (c == quote)
				1617	end_quote_size += 1;
				1618	else {
				1619	end_quote_size = 0;
				1620	if (c == '\\')
				1621	c = tok_nextc(tok); /* skip escaped char */
				1622	}
				1623	}
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1624
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1625	*p_start = tok->start;
				1626	*p_end = tok->cur;
				1627	return STRING;
				1628	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1629
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1630	/* Line continuation */
				1631	if (c == '\\') {
				1632	c = tok_nextc(tok);
				1633	if (c != '\n') {
				1634	tok->done = E_LINECONT;
				1635	tok->cur = tok->inp;
				1636	return ERRORTOKEN;
				1637	}
				1638	tok->cont_line = 1;
				1639	goto again; /* Read next line */
				1640	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1641
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1642	/* Check for two-character token */
				1643	{
				1644	int c2 = tok_nextc(tok);
				1645	int token = PyToken_TwoChars(c, c2);
				1646	if (token != OP) {
				1647	int c3 = tok_nextc(tok);
				1648	int token3 = PyToken_ThreeChars(c, c2, c3);
				1649	if (token3 != OP) {
				1650	token = token3;
				1651	} else {
				1652	tok_backup(tok, c3);
				1653	}
				1654	*p_start = tok->start;
				1655	*p_end = tok->cur;
				1656	return token;
				1657	}
				1658	tok_backup(tok, c2);
				1659	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1660
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1661	/* Keep track of parentheses nesting level */
				1662	switch (c) {
				1663	case '(':
				1664	case '[':
				1665	case '{':
				1666	tok->level++;
				1667	break;
				1668	case ')':
				1669	case ']':
				1670	case '}':
				1671	tok->level--;
				1672	break;
				1673	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1674
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1675	/* Punctuation character */
				1676	*p_start = tok->start;
				1677	*p_end = tok->cur;
				1678	return PyToken_OneChar(c);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1679	}
				1680
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1681	int
				1682	PyTokenizer_Get(struct tok_state tok, char p_start, char *p_end)
				1683	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1684	int result = tok_get(tok, p_start, p_end);
				1685	if (tok->decoding_erred) {
				1686	result = ERRORTOKEN;
				1687	tok->done = E_DECODE;
				1688	}
				1689	return result;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1690	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1691
Guido van Rossum	40d20bc	2007-10-22 00:09:51 +0000	[diff] [blame]	1692	/* Get -- encoding -- from a Python file.
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1693
				1694	PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1695	the first or second line of the file (in which case the encoding
Brett Cannon	e453989	2007-10-20 03:46:49 +0000	[diff] [blame]	1696	should be assumed to be PyUnicode_GetDefaultEncoding()).
				1697
Guido van Rossum	40d20bc	2007-10-22 00:09:51 +0000	[diff] [blame]	1698	The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
				1699	by the caller.
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1700	*/
				1701	char *
Guido van Rossum	40d20bc	2007-10-22 00:09:51 +0000	[diff] [blame]	1702	PyTokenizer_FindEncoding(int fd)
				1703	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1704	struct tok_state *tok;
				1705	FILE *fp;
				1706	char p_start =NULL , p_end =NULL , *encoding = NULL;
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1707
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1708	fd = dup(fd);
				1709	if (fd < 0) {
				1710	return NULL;
				1711	}
				1712	fp = fdopen(fd, "r");
				1713	if (fp == NULL) {
				1714	return NULL;
				1715	}
				1716	tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
				1717	if (tok == NULL) {
				1718	fclose(fp);
				1719	return NULL;
				1720	}
				1721	while (tok->lineno < 2 && tok->done == E_OK) {
				1722	PyTokenizer_Get(tok, &p_start, &p_end);
				1723	}
				1724	fclose(fp);
				1725	if (tok->encoding) {
				1726	encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
				1727	if (encoding)
				1728	strcpy(encoding, tok->encoding);
				1729	}
				1730	PyTokenizer_Free(tok);
				1731	return encoding;
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1732	}
Thomas Wouters	89d996e	2007-09-08 17:39:28 +0000	[diff] [blame]	1733
Guido van Rossum	408027e	1996-12-30 16:17:54 +0000	[diff] [blame]	1734	#ifdef Py_DEBUG
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1735
				1736	void
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1737	tok_dump(int type, char start, char end)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1738	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1739	printf("%s", _PyParser_TokenNames[type]);
				1740	if (type == NAME \|\| type == NUMBER \|\| type == STRING \|\| type == OP)
				1741	printf("(%.*s)", (int)(end - start), start);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1742	}
				1743
				1744	#endif