Blame - Parser/tokenizer.c - platform/external/python/cpython3

blob: 22accd1061aeaffaeca1bfdf013404fa169b0853 [file] [log] [blame]

Guido van Rossum	f70e43a	1991-02-19 12:39:46 +0000	[diff] [blame]	1
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	2	/* Tokenizer implementation */
				3
Jack Jansen	7b8c754	2002-04-14 20:12:41 +0000	[diff] [blame]	4	#include "Python.h"
Guido van Rossum	3f5da24	1990-12-20 15:06:42 +0000	[diff] [blame]	5	#include "pgenheaders.h"
				6
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	7	#include <ctype.h>
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	8	#include <assert.h>
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	9
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	10	#include "tokenizer.h"
				11	#include "errcode.h"
				12
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	13	#ifndef PGEN
				14	#include "unicodeobject.h"
Christian Heimes	2c9c7a5	2008-05-26 13:42:13 +0000	[diff] [blame]	15	#include "bytesobject.h"
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	16	#include "fileobject.h"
				17	#include "codecs.h"
				18	#include "abstract.h"
				19	#endif /* PGEN */
				20
/* True when C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.  C is evaluated several times, so it must be free of
   side effects; it is parenthesized so expression arguments such as
   (x & 0xff) are compared as a whole. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

/* True when C may continue an identifier: everything accepted by
   is_potential_identifier_start plus the ASCII digits.  C is evaluated
   several times, so it must be free of side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis	5b22213	2007-06-10 09:51:05 +0000	[diff] [blame]	33
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, const char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum	3f5da24	1990-12-20 15:06:42 +0000	[diff] [blame]	46
Brett Cannon	d5ec98c	2007-10-20 02:54:14 +0000	[diff] [blame]	47
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	48	/* Token names */
				49
/* Printable names of the token types, indexed by token number.
   This table must match the #defines in token.h! */
const char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT",
    "LBRACE", "RBRACE",
    /* comparisons and remaining operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
    "DOUBLESTAR",
    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT", "RARROW", "ELLIPSIS",
    /* trailing entries */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
				108
				109
				110	/* Create and initialize a new tok_state structure */
				111
				112	static struct tok_state *
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	113	tok_new(void)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	114	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	115	struct tok_state tok = (struct tok_state )PyMem_MALLOC(
				116	sizeof(struct tok_state));
				117	if (tok == NULL)
				118	return NULL;
				119	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
				120	tok->done = E_OK;
				121	tok->fp = NULL;
				122	tok->input = NULL;
				123	tok->tabsize = TABSIZE;
				124	tok->indent = 0;
				125	tok->indstack[0] = 0;
				126	tok->atbol = 1;
				127	tok->pendin = 0;
				128	tok->prompt = tok->nextprompt = NULL;
				129	tok->lineno = 0;
				130	tok->level = 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	131	tok->altwarning = 1;
				132	tok->alterror = 1;
				133	tok->alttabsize = 1;
				134	tok->altindstack[0] = 0;
				135	tok->decoding_state = STATE_INIT;
				136	tok->decoding_erred = 0;
				137	tok->read_coding_spec = 0;
				138	tok->enc = NULL;
				139	tok->encoding = NULL;
				140	tok->cont_line = 0;
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	141	#ifndef PGEN
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	142	tok->filename = NULL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	143	tok->decoding_readline = NULL;
				144	tok->decoding_buffer = NULL;
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	145	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	146	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	147	}
				148
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	149	static char *
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	150	new_string(const char s, Py_ssize_t len, struct tok_state tok)
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	151	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	152	char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	153	if (!result) {
				154	tok->done = E_NOMEM;
				155	return NULL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	156	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	157	memcpy(result, s, len);
				158	result[len] = '\0';
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	159	return result;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	160	}
				161
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	162	#ifdef PGEN
				163
				164	static char *
				165	decoding_fgets(char s, int size, struct tok_state tok)
				166	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	167	return fgets(s, size, tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	168	}
				169
				170	static int
				171	decoding_feof(struct tok_state *tok)
				172	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	173	return feof(tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	174	}
				175
/* PGEN build: no decoding is performed; return a fresh copy of STR made
   by new_string (NULL on allocation failure).  EXEC_INPUT is unused. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str), tok);
}
				181
				182	#else /* PGEN */
				183
				184	static char *
				185	error_ret(struct tok_state tok) / XXX */
				186	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	187	tok->decoding_erred = 1;
				188	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
				189	PyMem_FREE(tok->buf);
				190	tok->buf = NULL;
				191	return NULL; /* as if it were EOF */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	192	}
				193
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	194
				195	static char *
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	196	get_normal_name(char s) / for utf-8 and latin-1 */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	197	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	198	char buf[13];
				199	int i;
				200	for (i = 0; i < 12; i++) {
				201	int c = s[i];
				202	if (c == '\0')
				203	break;
				204	else if (c == '_')
				205	buf[i] = '-';
				206	else
				207	buf[i] = tolower(c);
				208	}
				209	buf[i] = '\0';
				210	if (strcmp(buf, "utf-8") == 0 \|\|
				211	strncmp(buf, "utf-8-", 6) == 0)
				212	return "utf-8";
				213	else if (strcmp(buf, "latin-1") == 0 \|\|
				214	strcmp(buf, "iso-8859-1") == 0 \|\|
				215	strcmp(buf, "iso-latin-1") == 0 \|\|
				216	strncmp(buf, "latin-1-", 8) == 0 \|\|
				217	strncmp(buf, "iso-8859-1-", 11) == 0 \|\|
				218	strncmp(buf, "iso-latin-1-", 12) == 0)
				219	return "iso-8859-1";
				220	else
				221	return s;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	222	}
				223
				224	/* Return the coding spec in S, or NULL if none is found. */
				225
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	226	static int
				227	get_coding_spec(const char s, char spec, Py_ssize_t size, struct tok_state tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	228	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	229	Py_ssize_t i;
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	230	*spec = NULL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	231	/* Coding spec must be in a comment, and that comment must be
				232	* the only statement on the source code line. */
				233	for (i = 0; i < size - 6; i++) {
				234	if (s[i] == '#')
				235	break;
				236	if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	237	return 1;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	238	}
				239	for (; i < size - 6; i++) { /* XXX inefficient search */
				240	const char* t = s + i;
				241	if (strncmp(t, "coding", 6) == 0) {
				242	const char* begin = NULL;
				243	t += 6;
				244	if (t[0] != ':' && t[0] != '=')
				245	continue;
				246	do {
				247	t++;
				248	} while (t[0] == '\x20' \|\| t[0] == '\t');
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	249
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	250	begin = t;
				251	while (Py_ISALNUM(t[0]) \|\|
				252	t[0] == '-' \|\| t[0] == '_' \|\| t[0] == '.')
				253	t++;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	254
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	255	if (begin < t) {
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	256	char* r = new_string(begin, t - begin, tok);
Benjamin Peterson	265fba4	2013-07-15 20:50:22 -0700	[diff] [blame]	257	char* q;
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	258	if (!r)
				259	return 0;
Benjamin Peterson	265fba4	2013-07-15 20:50:22 -0700	[diff] [blame]	260	q = get_normal_name(r);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	261	if (r != q) {
				262	PyMem_FREE(r);
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	263	r = new_string(q, strlen(q), tok);
				264	if (!r)
				265	return 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	266	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	267	*spec = r;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	268	}
				269	}
				270	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	271	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	272	}
				273
				274	/* Check whether the line contains a coding spec. If it does,
				275	invoke the set_readline function for the new encoding.
				276	This function receives the tok_state and the new encoding.
				277	Return 1 on success, 0 on failure. */
				278
				279	static int
Martin v. Löwis	18e1655	2006-02-15 17:27:45 +0000	[diff] [blame]	280	check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	281	int set_readline(struct tok_state , const char ))
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	282	{
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	283	char *cs;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	284	int r = 1;
Tim Peters	17db21f	2002-09-03 15:39:58 +0000	[diff] [blame]	285
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	286	if (tok->cont_line) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	287	/* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	288	tok->read_coding_spec = 1;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	289	return 1;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	290	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	291	if (!get_coding_spec(line, &cs, size, tok))
				292	return 0;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	293	if (!cs) {
				294	Py_ssize_t i;
				295	for (i = 0; i < size; i++) {
				296	if (line[i] == '#' \|\| line[i] == '\n' \|\| line[i] == '\r')
				297	break;
				298	if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
				299	/* Stop checking coding spec after a line containing
				300	* anything except a comment. */
				301	tok->read_coding_spec = 1;
				302	break;
				303	}
				304	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	305	return 1;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	306	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	307	tok->read_coding_spec = 1;
				308	if (tok->encoding == NULL) {
				309	assert(tok->decoding_state == STATE_RAW);
				310	if (strcmp(cs, "utf-8") == 0) {
				311	tok->encoding = cs;
				312	} else {
				313	r = set_readline(tok, cs);
				314	if (r) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	315	tok->encoding = cs;
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	316	tok->decoding_state = STATE_NORMAL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	317	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	318	else {
Serhiy Storchaka	3af14aa	2013-06-09 16:51:52 +0300	[diff] [blame]	319	PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	320	"encoding problem: %s", cs);
				321	PyMem_FREE(cs);
				322	}
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	323	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	324	} else { /* then, compare cs with BOM */
				325	r = (strcmp(tok->encoding, cs) == 0);
				326	if (!r)
				327	PyErr_Format(PyExc_SyntaxError,
				328	"encoding problem: %s with BOM", cs);
				329	PyMem_FREE(cs);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	330	}
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	331	return r;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	332	}
				333
				334	/* See whether the file starts with a BOM. If it does,
				335	invoke the set_readline function with the new encoding.
				336	Return 1 on success, 0 on failure. */
				337
				338	static int
				339	check_bom(int get_char(struct tok_state *),
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	340	void unget_char(int, struct tok_state *),
				341	int set_readline(struct tok_state , const char ),
				342	struct tok_state *tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	343	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	344	int ch1, ch2, ch3;
				345	ch1 = get_char(tok);
				346	tok->decoding_state = STATE_RAW;
				347	if (ch1 == EOF) {
				348	return 1;
				349	} else if (ch1 == 0xEF) {
				350	ch2 = get_char(tok);
				351	if (ch2 != 0xBB) {
				352	unget_char(ch2, tok);
				353	unget_char(ch1, tok);
				354	return 1;
				355	}
				356	ch3 = get_char(tok);
				357	if (ch3 != 0xBF) {
				358	unget_char(ch3, tok);
				359	unget_char(ch2, tok);
				360	unget_char(ch1, tok);
				361	return 1;
				362	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	363	#if 0
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	364	/* Disable support for UTF-16 BOMs until a decision
				365	is made whether this needs to be supported. */
				366	} else if (ch1 == 0xFE) {
				367	ch2 = get_char(tok);
				368	if (ch2 != 0xFF) {
				369	unget_char(ch2, tok);
				370	unget_char(ch1, tok);
				371	return 1;
				372	}
				373	if (!set_readline(tok, "utf-16-be"))
				374	return 0;
				375	tok->decoding_state = STATE_NORMAL;
				376	} else if (ch1 == 0xFF) {
				377	ch2 = get_char(tok);
				378	if (ch2 != 0xFE) {
				379	unget_char(ch2, tok);
				380	unget_char(ch1, tok);
				381	return 1;
				382	}
				383	if (!set_readline(tok, "utf-16-le"))
				384	return 0;
				385	tok->decoding_state = STATE_NORMAL;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	386	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	387	} else {
				388	unget_char(ch1, tok);
				389	return 1;
				390	}
				391	if (tok->encoding != NULL)
				392	PyMem_FREE(tok->encoding);
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	393	tok->encoding = new_string("utf-8", 5, tok);
				394	if (!tok->encoding)
				395	return 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	396	/* No need to set_readline: input is already utf-8 */
				397	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	398	}
				399
				400	/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	401	Return NULL on failure, else S.
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	402
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	403	On entry, tok->decoding_buffer will be one of:
				404	1) NULL: need to call tok->decoding_readline to get a new line
				405	2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	406	stored the result in tok->decoding_buffer
Christian Heimes	9c4756e	2008-05-26 13:22:05 +0000	[diff] [blame]	407	3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	408	(in the s buffer) to copy entire contents of the line read
				409	by tok->decoding_readline. tok->decoding_buffer has the overflow.
				410	In this case, fp_readl is called in a loop (with an expanded buffer)
				411	until the buffer ends with a '\n' (or until the end of the file is
				412	reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	413	*/
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	414
				415	static char *
				416	fp_readl(char s, int size, struct tok_state tok)
				417	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	418	PyObject* bufobj;
				419	const char *buf;
				420	Py_ssize_t buflen;
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	421
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	422	/* Ask for one less byte so we can terminate it */
				423	assert(size > 0);
				424	size--;
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	425
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	426	if (tok->decoding_buffer) {
				427	bufobj = tok->decoding_buffer;
				428	Py_INCREF(bufobj);
				429	}
				430	else
				431	{
				432	bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
				433	if (bufobj == NULL)
				434	goto error;
				435	}
				436	if (PyUnicode_CheckExact(bufobj))
				437	{
				438	buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
				439	if (buf == NULL) {
				440	goto error;
				441	}
				442	}
				443	else
				444	{
				445	buf = PyByteArray_AsString(bufobj);
				446	if (buf == NULL) {
				447	goto error;
				448	}
				449	buflen = PyByteArray_GET_SIZE(bufobj);
				450	}
Amaury Forgeot d'Arc	65f9ace	2007-11-15 23:19:43 +0000	[diff] [blame]	451
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	452	Py_XDECREF(tok->decoding_buffer);
				453	if (buflen > size) {
				454	/* Too many chars, the rest goes into tok->decoding_buffer */
				455	tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
				456	buflen-size);
				457	if (tok->decoding_buffer == NULL)
				458	goto error;
				459	buflen = size;
				460	}
				461	else
				462	tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc	65f9ace	2007-11-15 23:19:43 +0000	[diff] [blame]	463
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	464	memcpy(s, buf, buflen);
				465	s[buflen] = '\0';
				466	if (buflen == 0) /* EOF */
				467	s = NULL;
				468	Py_DECREF(bufobj);
				469	return s;
Neal Norwitz	41eaedd	2007-08-12 00:03:22 +0000	[diff] [blame]	470
				471	error:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	472	Py_XDECREF(bufobj);
				473	return error_ret(tok);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	474	}
				475
				476	/* Set the readline function for TOK to a StreamReader's
				477	readline function. The StreamReader is named ENC.
				478
				479	This function is called from check_bom and check_coding_spec.
				480
				481	ENC is usually identical to the future value of tok->encoding,
				482	except for the (currently unsupported) case of UTF-16.
				483
				484	Return 1 on success, 0 on failure. */
				485
				486	static int
				487	fp_setreadl(struct tok_state tok, const char enc)
				488	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	489	PyObject readline = NULL, stream = NULL, *io = NULL;
Martin v. Löwis	bd928fe	2011-10-14 10:20:37 +0200	[diff] [blame]	490	_Py_IDENTIFIER(open);
				491	_Py_IDENTIFIER(readline);
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	492	int fd;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	493	long pos;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	494
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	495	io = PyImport_ImportModuleNoBlock("io");
				496	if (io == NULL)
				497	goto cleanup;
Guido van Rossum	9cbfffd	2007-06-07 00:54:15 +0000	[diff] [blame]	498
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	499	fd = fileno(tok->fp);
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	500	/* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis	815b41b	2014-02-28 15:27:29 +0100	[diff] [blame]	501	* position of tok->fp. If tok->fp was opened in text mode on Windows,
				502	* its file position counts CRLF as one char and can't be directly mapped
				503	* to the file offset for fd. Instead we step back one byte and read to
				504	* the end of line.*/
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	505	pos = ftell(tok->fp);
Martin v. Löwis	815b41b	2014-02-28 15:27:29 +0100	[diff] [blame]	506	if (pos == -1 \|\|
				507	lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	508	PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
				509	goto cleanup;
				510	}
				511
Martin v. Löwis	afe55bb	2011-10-09 10:38:36 +0200	[diff] [blame]	512	stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	513	fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	514	if (stream == NULL)
				515	goto cleanup;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	516
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	517	Py_XDECREF(tok->decoding_readline);
Martin v. Löwis	1ee1b6f	2011-10-10 18:11:30 +0200	[diff] [blame]	518	readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	519	tok->decoding_readline = readline;
Martin v. Löwis	815b41b	2014-02-28 15:27:29 +0100	[diff] [blame]	520	if (pos > 0) {
				521	if (PyObject_CallObject(readline, NULL) == NULL) {
				522	readline = NULL;
				523	goto cleanup;
				524	}
				525	}
Guido van Rossum	9cbfffd	2007-06-07 00:54:15 +0000	[diff] [blame]	526
				527	cleanup:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	528	Py_XDECREF(stream);
				529	Py_XDECREF(io);
				530	return readline != NULL;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	531	}
				532
				533	/* Fetch the next byte from TOK. */
				534
				535	static int fp_getc(struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	536	return getc(tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	537	}
				538
				539	/* Unfetch the last byte back into TOK. */
				540
				541	static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	542	ungetc(c, tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	543	}
				544
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must point into a NUL-terminated string.  Only the lead byte range
   and the 10xxxxxx form of the continuation bytes are checked. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Walk the continuation bytes front to back.  A NUL terminator is
       < 0x80 and ends the check at once, so a truncated sequence never
       causes a read past the end of the string. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
				572
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	573	/* Read a line of input from TOK. Determine encoding
				574	if necessary. */
				575
				576	static char *
				577	decoding_fgets(char s, int size, struct tok_state tok)
				578	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	579	char *line = NULL;
				580	int badchar = 0;
				581	for (;;) {
				582	if (tok->decoding_state == STATE_NORMAL) {
				583	/* We already have a codec associated with
				584	this input. */
				585	line = fp_readl(s, size, tok);
				586	break;
				587	} else if (tok->decoding_state == STATE_RAW) {
				588	/* We want a 'raw' read. */
				589	line = Py_UniversalNewlineFgets(s, size,
				590	tok->fp, NULL);
				591	break;
				592	} else {
				593	/* We have not yet determined the encoding.
				594	If an encoding is found, use the file-pointer
				595	reader functions from now on. */
				596	if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				597	return error_ret(tok);
				598	assert(tok->decoding_state != STATE_INIT);
				599	}
				600	}
				601	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
				602	if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
				603	return error_ret(tok);
				604	}
				605	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	606	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	607	/* The default encoding is UTF-8, so make sure we don't have any
				608	non-UTF-8 sequences in it. */
				609	if (line && !tok->encoding) {
				610	unsigned char *c;
				611	int length;
				612	for (c = (unsigned char )line; c; c += length)
				613	if (!(length = valid_utf8(c))) {
				614	badchar = *c;
				615	break;
				616	}
				617	}
				618	if (badchar) {
				619	/* Need to add 1 to the line number, since this line
				620	has not been counted, yet. */
Jesus Cea	c1935d2	2011-04-25 04:03:58 +0200	[diff] [blame]	621	PyErr_Format(PyExc_SyntaxError,
				622	"Non-UTF-8 code starting with '\\x%.2x' "
				623	"in file %U on line %i, "
				624	"but no encoding declared; "
				625	"see http://python.org/dev/peps/pep-0263/ for details",
				626	badchar, tok->filename, tok->lineno + 1);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	627	return error_ret(tok);
				628	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	629	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	630	return line;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	631	}
				632
				633	static int
				634	decoding_feof(struct tok_state *tok)
				635	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	636	if (tok->decoding_state != STATE_NORMAL) {
				637	return feof(tok->fp);
				638	} else {
				639	PyObject* buf = tok->decoding_buffer;
				640	if (buf == NULL) {
				641	buf = PyObject_CallObject(tok->decoding_readline, NULL);
				642	if (buf == NULL) {
				643	error_ret(tok);
				644	return 1;
				645	} else {
				646	tok->decoding_buffer = buf;
				647	}
				648	}
				649	return PyObject_Length(buf) == 0;
				650	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	651	}
				652
				653	/* Fetch a byte from TOK, using the string buffer. */
				654
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	655	static int
				656	buf_getc(struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	657	return Py_CHARMASK(*tok->str++);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	658	}
				659
				660	/* Unfetch a byte from TOK, using the string buffer. */
				661
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	662	static void
				663	buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	664	tok->str--;
				665	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	666	}
				667
				668	/* Set the readline function for TOK to ENC. For the string-based
				669	tokenizer, this means to just record the encoding. */
				670
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	671	static int
				672	buf_setreadl(struct tok_state tok, const char enc) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	673	tok->enc = enc;
				674	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	675	}
				676
				677	/* Return a UTF-8 encoding Python string object from the
				678	C byte string STR, which is encoded with ENC. */
				679
				680	static PyObject *
				681	translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	682	PyObject *utf8;
				683	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
				684	if (buf == NULL)
				685	return NULL;
				686	utf8 = PyUnicode_AsUTF8String(buf);
				687	Py_DECREF(buf);
				688	return utf8;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	689	}
				690
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	691
				692	static char *
				693	translate_newlines(const char s, int exec_input, struct tok_state tok) {
Victor Stinner	7969773	2013-06-05 00:44:00 +0200	[diff] [blame]	694	int skip_next_lf = 0;
				695	size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	696	char buf, current;
				697	char c = '\0';
				698	buf = PyMem_MALLOC(needed_length);
				699	if (buf == NULL) {
				700	tok->done = E_NOMEM;
				701	return NULL;
				702	}
				703	for (current = buf; *s; s++, current++) {
				704	c = *s;
				705	if (skip_next_lf) {
				706	skip_next_lf = 0;
				707	if (c == '\n') {
				708	c = *++s;
				709	if (!c)
				710	break;
				711	}
				712	}
				713	if (c == '\r') {
				714	skip_next_lf = 1;
				715	c = '\n';
				716	}
				717	*current = c;
				718	}
				719	/* If this is exec input, add a newline to the end of the string if
				720	there isn't one already. */
				721	if (exec_input && c != '\n') {
				722	*current = '\n';
				723	current++;
				724	}
				725	*current = '\0';
				726	final_length = current - buf + 1;
				727	if (final_length < needed_length && final_length)
				728	/* should never fail */
				729	buf = PyMem_REALLOC(buf, final_length);
				730	return buf;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	731	}
				732
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	733	/* Decode a byte string STR for use as the buffer of TOK.
				734	Look for encoding declarations inside STR, and record them
				735	inside TOK. */
				736
				737	static const char *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	738	decode_str(const char input, int single, struct tok_state tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	739	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	740	PyObject* utf8 = NULL;
				741	const char *str;
				742	const char *s;
				743	const char *newl[2] = {NULL, NULL};
				744	int lineno = 0;
				745	tok->input = str = translate_newlines(input, single, tok);
				746	if (str == NULL)
				747	return NULL;
				748	tok->enc = NULL;
				749	tok->str = str;
				750	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
				751	return error_ret(tok);
				752	str = tok->str; /* string after BOM if any */
				753	assert(str);
				754	if (tok->enc != NULL) {
				755	utf8 = translate_into_utf8(str, tok->enc);
				756	if (utf8 == NULL)
				757	return error_ret(tok);
				758	str = PyBytes_AsString(utf8);
				759	}
				760	for (s = str;; s++) {
				761	if (*s == '\0') break;
				762	else if (*s == '\n') {
				763	assert(lineno < 2);
				764	newl[lineno] = s;
				765	lineno++;
				766	if (lineno == 2) break;
				767	}
				768	}
				769	tok->enc = NULL;
				770	/* need to check line 1 and 2 separately since check_coding_spec
				771	assumes a single line as input */
				772	if (newl[0]) {
				773	if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
				774	return error_ret(tok);
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	775	if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	776	if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
				777	tok, buf_setreadl))
				778	return error_ret(tok);
				779	}
				780	}
				781	if (tok->enc != NULL) {
				782	assert(utf8 == NULL);
				783	utf8 = translate_into_utf8(str, tok->enc);
				784	if (utf8 == NULL)
				785	return error_ret(tok);
				786	str = PyBytes_AS_STRING(utf8);
				787	}
				788	assert(tok->decoding_buffer == NULL);
				789	tok->decoding_buffer = utf8; /* CAUTION */
				790	return str;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	791	}
				792
				793	#endif /* PGEN */
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	794
				795	/* Set up tokenizer for string */
				796
				797	struct tok_state *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	798	PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	799	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	800	struct tok_state *tok = tok_new();
				801	if (tok == NULL)
				802	return NULL;
Serhiy Storchaka	c679227	2013-10-19 21:03:34 +0300	[diff] [blame]	803	str = decode_str(str, exec_input, tok);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	804	if (str == NULL) {
				805	PyTokenizer_Free(tok);
				806	return NULL;
				807	}
Neal Norwitz	dee2fd5	2005-11-16 05:12:59 +0000	[diff] [blame]	808
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	809	/* XXX: constify members. */
				810	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
				811	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	812	}
				813
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	814	struct tok_state *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	815	PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	816	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	817	struct tok_state *tok = tok_new();
				818	if (tok == NULL)
				819	return NULL;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	820	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	821	tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	822	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	823	if (str == NULL) {
				824	PyTokenizer_Free(tok);
				825	return NULL;
				826	}
				827	tok->decoding_state = STATE_RAW;
				828	tok->read_coding_spec = 1;
				829	tok->enc = NULL;
				830	tok->str = str;
				831	tok->encoding = (char *)PyMem_MALLOC(6);
				832	if (!tok->encoding) {
				833	PyTokenizer_Free(tok);
				834	return NULL;
				835	}
				836	strcpy(tok->encoding, "utf-8");
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	837
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	838	/* XXX: constify members. */
				839	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
				840	return tok;
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	841	}
				842
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	843	/* Set up tokenizer for file */
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	844
				845	struct tok_state *
Serhiy Storchaka	c679227	2013-10-19 21:03:34 +0300	[diff] [blame]	846	PyTokenizer_FromFile(FILE fp, const char enc,
				847	const char ps1, const char ps2)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	848	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	849	struct tok_state *tok = tok_new();
				850	if (tok == NULL)
				851	return NULL;
				852	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
				853	PyTokenizer_Free(tok);
				854	return NULL;
				855	}
				856	tok->cur = tok->inp = tok->buf;
				857	tok->end = tok->buf + BUFSIZ;
				858	tok->fp = fp;
				859	tok->prompt = ps1;
				860	tok->nextprompt = ps2;
				861	if (enc != NULL) {
				862	/* Must copy encoding declaration since it
				863	gets copied into the parse tree. */
				864	tok->encoding = PyMem_MALLOC(strlen(enc)+1);
				865	if (!tok->encoding) {
				866	PyTokenizer_Free(tok);
				867	return NULL;
				868	}
				869	strcpy(tok->encoding, enc);
				870	tok->decoding_state = STATE_NORMAL;
				871	}
				872	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	873	}
				874
				875
				876	/* Free a tok_state structure */
				877
				878	void
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	879	PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	880	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	881	if (tok->encoding != NULL)
				882	PyMem_FREE(tok->encoding);
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	883	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	884	Py_XDECREF(tok->decoding_readline);
				885	Py_XDECREF(tok->decoding_buffer);
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	886	Py_XDECREF(tok->filename);
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	887	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	888	if (tok->fp != NULL && tok->buf != NULL)
				889	PyMem_FREE(tok->buf);
				890	if (tok->input)
				891	PyMem_FREE((char *)tok->input);
				892	PyMem_FREE(tok);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	893	}
				894
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	895	/* Get next char, updating state; error code goes into tok->done */
				896
				897	static int
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	898	tok_nextc(struct tok_state *tok)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	899	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	900	for (;;) {
				901	if (tok->cur != tok->inp) {
				902	return Py_CHARMASK(tok->cur++); / Fast path */
				903	}
				904	if (tok->done != E_OK)
				905	return EOF;
				906	if (tok->fp == NULL) {
				907	char *end = strchr(tok->inp, '\n');
				908	if (end != NULL)
				909	end++;
				910	else {
				911	end = strchr(tok->inp, '\0');
				912	if (end == tok->inp) {
				913	tok->done = E_EOF;
				914	return EOF;
				915	}
				916	}
				917	if (tok->start == NULL)
				918	tok->buf = tok->cur;
				919	tok->line_start = tok->cur;
				920	tok->lineno++;
				921	tok->inp = end;
				922	return Py_CHARMASK(*tok->cur++);
				923	}
				924	if (tok->prompt != NULL) {
				925	char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner	034c753	2011-01-07 18:56:19 +0000	[diff] [blame]	926	#ifndef PGEN
Victor Stinner	89e3436	2011-01-07 18:47:22 +0000	[diff] [blame]	927	if (newtok != NULL) {
				928	char *translated = translate_newlines(newtok, 0, tok);
				929	PyMem_FREE(newtok);
				930	if (translated == NULL)
				931	return EOF;
				932	newtok = translated;
				933	}
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	934	if (tok->encoding && newtok && *newtok) {
				935	/* Recode to UTF-8 */
				936	Py_ssize_t buflen;
				937	const char* buf;
				938	PyObject *u = translate_into_utf8(newtok, tok->encoding);
				939	PyMem_FREE(newtok);
				940	if (!u) {
				941	tok->done = E_DECODE;
				942	return EOF;
				943	}
				944	buflen = PyBytes_GET_SIZE(u);
				945	buf = PyBytes_AS_STRING(u);
				946	if (!buf) {
				947	Py_DECREF(u);
				948	tok->done = E_DECODE;
				949	return EOF;
				950	}
				951	newtok = PyMem_MALLOC(buflen+1);
				952	strcpy(newtok, buf);
				953	Py_DECREF(u);
				954	}
Martin v. Löwis	85bcc66	2007-09-04 09:18:06 +0000	[diff] [blame]	955	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	956	if (tok->nextprompt != NULL)
				957	tok->prompt = tok->nextprompt;
				958	if (newtok == NULL)
				959	tok->done = E_INTR;
				960	else if (*newtok == '\0') {
				961	PyMem_FREE(newtok);
				962	tok->done = E_EOF;
				963	}
				964	else if (tok->start != NULL) {
				965	size_t start = tok->start - tok->buf;
				966	size_t oldlen = tok->cur - tok->buf;
				967	size_t newlen = oldlen + strlen(newtok);
				968	char *buf = tok->buf;
				969	buf = (char *)PyMem_REALLOC(buf, newlen+1);
				970	tok->lineno++;
				971	if (buf == NULL) {
				972	PyMem_FREE(tok->buf);
				973	tok->buf = NULL;
				974	PyMem_FREE(newtok);
				975	tok->done = E_NOMEM;
				976	return EOF;
				977	}
				978	tok->buf = buf;
				979	tok->cur = tok->buf + oldlen;
				980	tok->line_start = tok->cur;
				981	strcpy(tok->buf + oldlen, newtok);
				982	PyMem_FREE(newtok);
				983	tok->inp = tok->buf + newlen;
				984	tok->end = tok->inp + 1;
				985	tok->start = tok->buf + start;
				986	}
				987	else {
				988	tok->lineno++;
				989	if (tok->buf != NULL)
				990	PyMem_FREE(tok->buf);
				991	tok->buf = newtok;
				992	tok->line_start = tok->buf;
				993	tok->cur = tok->buf;
				994	tok->line_start = tok->buf;
				995	tok->inp = strchr(tok->buf, '\0');
				996	tok->end = tok->inp + 1;
				997	}
				998	}
				999	else {
				1000	int done = 0;
				1001	Py_ssize_t cur = 0;
				1002	char *pt;
				1003	if (tok->start == NULL) {
				1004	if (tok->buf == NULL) {
				1005	tok->buf = (char *)
				1006	PyMem_MALLOC(BUFSIZ);
				1007	if (tok->buf == NULL) {
				1008	tok->done = E_NOMEM;
				1009	return EOF;
				1010	}
				1011	tok->end = tok->buf + BUFSIZ;
				1012	}
				1013	if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
				1014	tok) == NULL) {
				1015	tok->done = E_EOF;
				1016	done = 1;
				1017	}
				1018	else {
				1019	tok->done = E_OK;
				1020	tok->inp = strchr(tok->buf, '\0');
				1021	done = tok->inp[-1] == '\n';
				1022	}
				1023	}
				1024	else {
				1025	cur = tok->cur - tok->buf;
				1026	if (decoding_feof(tok)) {
				1027	tok->done = E_EOF;
				1028	done = 1;
				1029	}
				1030	else
				1031	tok->done = E_OK;
				1032	}
				1033	tok->lineno++;
				1034	/* Read until '\n' or EOF */
				1035	while (!done) {
				1036	Py_ssize_t curstart = tok->start == NULL ? -1 :
				1037	tok->start - tok->buf;
				1038	Py_ssize_t curvalid = tok->inp - tok->buf;
				1039	Py_ssize_t newsize = curvalid + BUFSIZ;
				1040	char *newbuf = tok->buf;
				1041	newbuf = (char *)PyMem_REALLOC(newbuf,
				1042	newsize);
				1043	if (newbuf == NULL) {
				1044	tok->done = E_NOMEM;
				1045	tok->cur = tok->inp;
				1046	return EOF;
				1047	}
				1048	tok->buf = newbuf;
				1049	tok->inp = tok->buf + curvalid;
				1050	tok->end = tok->buf + newsize;
				1051	tok->start = curstart < 0 ? NULL :
				1052	tok->buf + curstart;
				1053	if (decoding_fgets(tok->inp,
				1054	(int)(tok->end - tok->inp),
				1055	tok) == NULL) {
				1056	/* Break out early on decoding
				1057	errors, as tok->buf will be NULL
				1058	*/
				1059	if (tok->decoding_erred)
				1060	return EOF;
				1061	/* Last line does not end in \n,
				1062	fake one */
				1063	strcpy(tok->inp, "\n");
				1064	}
				1065	tok->inp = strchr(tok->inp, '\0');
				1066	done = tok->inp[-1] == '\n';
				1067	}
				1068	if (tok->buf != NULL) {
				1069	tok->cur = tok->buf + cur;
				1070	tok->line_start = tok->cur;
				1071	/* replace "\r\n" with "\n" */
				1072	/* For Mac leave the \r, giving a syntax error */
				1073	pt = tok->inp - 2;
				1074	if (pt >= tok->buf && *pt == '\r') {
				1075	*pt++ = '\n';
				1076	*pt = '\0';
				1077	tok->inp = pt;
				1078	}
				1079	}
				1080	}
				1081	if (tok->done != E_OK) {
				1082	if (tok->prompt != NULL)
				1083	PySys_WriteStderr("\n");
				1084	tok->cur = tok->inp;
				1085	return EOF;
				1086	}
				1087	}
				1088	/NOTREACHED/
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1089	}
				1090
				1091
				1092	/* Back-up one character */
				1093
				1094	static void
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1095	tok_backup(struct tok_state *tok, int c)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1096	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1097	if (c != EOF) {
				1098	if (--tok->cur < tok->buf)
				1099	Py_FatalError("tok_backup: beginning of buffer");
				1100	if (*tok->cur != c)
				1101	*tok->cur = c;
				1102	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1103	}
				1104
				1105
				1106	/* Return the token corresponding to a single character */
				1107
				1108	int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1109	PyToken_OneChar(int c)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1110	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1111	switch (c) {
				1112	case '(': return LPAR;
				1113	case ')': return RPAR;
				1114	case '[': return LSQB;
				1115	case ']': return RSQB;
				1116	case ':': return COLON;
				1117	case ',': return COMMA;
				1118	case ';': return SEMI;
				1119	case '+': return PLUS;
				1120	case '-': return MINUS;
				1121	case '*': return STAR;
				1122	case '/': return SLASH;
				1123	case '\|': return VBAR;
				1124	case '&': return AMPER;
				1125	case '<': return LESS;
				1126	case '>': return GREATER;
				1127	case '=': return EQUAL;
				1128	case '.': return DOT;
				1129	case '%': return PERCENT;
				1130	case '{': return LBRACE;
				1131	case '}': return RBRACE;
				1132	case '^': return CIRCUMFLEX;
				1133	case '~': return TILDE;
				1134	case '@': return AT;
				1135	default: return OP;
				1136	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1137	}
				1138
				1139
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1140	int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1141	PyToken_TwoChars(int c1, int c2)
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1142	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1143	switch (c1) {
				1144	case '=':
				1145	switch (c2) {
				1146	case '=': return EQEQUAL;
				1147	}
				1148	break;
				1149	case '!':
				1150	switch (c2) {
				1151	case '=': return NOTEQUAL;
				1152	}
				1153	break;
				1154	case '<':
				1155	switch (c2) {
				1156	case '>': return NOTEQUAL;
				1157	case '=': return LESSEQUAL;
				1158	case '<': return LEFTSHIFT;
				1159	}
				1160	break;
				1161	case '>':
				1162	switch (c2) {
				1163	case '=': return GREATEREQUAL;
				1164	case '>': return RIGHTSHIFT;
				1165	}
				1166	break;
				1167	case '+':
				1168	switch (c2) {
				1169	case '=': return PLUSEQUAL;
				1170	}
				1171	break;
				1172	case '-':
				1173	switch (c2) {
				1174	case '=': return MINEQUAL;
				1175	case '>': return RARROW;
				1176	}
				1177	break;
				1178	case '*':
				1179	switch (c2) {
				1180	case '*': return DOUBLESTAR;
				1181	case '=': return STAREQUAL;
				1182	}
				1183	break;
				1184	case '/':
				1185	switch (c2) {
				1186	case '/': return DOUBLESLASH;
				1187	case '=': return SLASHEQUAL;
				1188	}
				1189	break;
				1190	case '\|':
				1191	switch (c2) {
				1192	case '=': return VBAREQUAL;
				1193	}
				1194	break;
				1195	case '%':
				1196	switch (c2) {
				1197	case '=': return PERCENTEQUAL;
				1198	}
				1199	break;
				1200	case '&':
				1201	switch (c2) {
				1202	case '=': return AMPEREQUAL;
				1203	}
				1204	break;
				1205	case '^':
				1206	switch (c2) {
				1207	case '=': return CIRCUMFLEXEQUAL;
				1208	}
				1209	break;
				1210	}
				1211	return OP;
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1212	}
				1213
Thomas Wouters	434d082	2000-08-24 20:11:32 +0000	[diff] [blame]	1214	int
				1215	PyToken_ThreeChars(int c1, int c2, int c3)
				1216	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1217	switch (c1) {
				1218	case '<':
				1219	switch (c2) {
				1220	case '<':
				1221	switch (c3) {
				1222	case '=':
				1223	return LEFTSHIFTEQUAL;
				1224	}
				1225	break;
				1226	}
				1227	break;
				1228	case '>':
				1229	switch (c2) {
				1230	case '>':
				1231	switch (c3) {
				1232	case '=':
				1233	return RIGHTSHIFTEQUAL;
				1234	}
				1235	break;
				1236	}
				1237	break;
				1238	case '*':
				1239	switch (c2) {
				1240	case '*':
				1241	switch (c3) {
				1242	case '=':
				1243	return DOUBLESTAREQUAL;
				1244	}
				1245	break;
				1246	}
				1247	break;
				1248	case '/':
				1249	switch (c2) {
				1250	case '/':
				1251	switch (c3) {
				1252	case '=':
				1253	return DOUBLESLASHEQUAL;
				1254	}
				1255	break;
				1256	}
				1257	break;
				1258	case '.':
				1259	switch (c2) {
Georg Brandl	dde0028	2007-03-18 19:01:53 +0000	[diff] [blame]	1260	case '.':
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1261	switch (c3) {
				1262	case '.':
				1263	return ELLIPSIS;
				1264	}
				1265	break;
				1266	}
				1267	break;
				1268	}
				1269	return OP;
Thomas Wouters	434d082	2000-08-24 20:11:32 +0000	[diff] [blame]	1270	}
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1271
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1272	static int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1273	indenterror(struct tok_state *tok)
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1274	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1275	if (tok->alterror) {
				1276	tok->done = E_TABSPACE;
				1277	tok->cur = tok->inp;
				1278	return 1;
				1279	}
				1280	if (tok->altwarning) {
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1281	#ifdef PGEN
				1282	PySys_WriteStderr("inconsistent use of tabs and spaces "
				1283	"in indentation\n");
				1284	#else
				1285	PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1286	"in indentation\n", tok->filename);
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1287	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1288	tok->altwarning = 0;
				1289	}
				1290	return 0;
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1291	}
				1292
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1293	#ifdef PGEN
Victor Stinner	52f6dd7	2010-03-12 14:45:56 +0000	[diff] [blame]	1294	#define verify_identifier(tok) 1
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1295	#else
Martin v. Löwis	d63a3b8	2011-09-28 07:41:54 +0200	[diff] [blame]	1296	/* Verify that the identifier follows PEP 3131.
				1297	All identifier strings are guaranteed to be "ready" unicode objects.
				1298	*/
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1299	static int
Victor Stinner	52f6dd7	2010-03-12 14:45:56 +0000	[diff] [blame]	1300	verify_identifier(struct tok_state *tok)
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1301	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1302	PyObject *s;
				1303	int result;
				1304	s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwis	d63a3b8	2011-09-28 07:41:54 +0200	[diff] [blame]	1305	if (s == NULL \|\| PyUnicode_READY(s) == -1) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1306	if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
				1307	PyErr_Clear();
				1308	tok->done = E_IDENTIFIER;
				1309	} else {
				1310	tok->done = E_ERROR;
				1311	}
				1312	return 0;
				1313	}
				1314	result = PyUnicode_IsIdentifier(s);
				1315	Py_DECREF(s);
				1316	if (result == 0)
				1317	tok->done = E_IDENTIFIER;
				1318	return result;
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1319	}
				1320	#endif
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1321
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1322	/* Get next token, after space stripping etc. */
				1323
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1324	static int
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1325	tok_get(struct tok_state tok, char p_start, char *p_end)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1326	{
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1327	int c;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1328	int blankline, nonascii;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1329
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1330	p_start = p_end = NULL;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1331	nextline:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1332	tok->start = NULL;
				1333	blankline = 0;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1334
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1335	/* Get indentation level */
				1336	if (tok->atbol) {
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1337	int col = 0;
				1338	int altcol = 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1339	tok->atbol = 0;
				1340	for (;;) {
				1341	c = tok_nextc(tok);
				1342	if (c == ' ')
				1343	col++, altcol++;
				1344	else if (c == '\t') {
				1345	col = (col/tok->tabsize + 1) * tok->tabsize;
				1346	altcol = (altcol/tok->alttabsize + 1)
				1347	* tok->alttabsize;
				1348	}
				1349	else if (c == '\014') /* Control-L (formfeed) */
				1350	col = altcol = 0; /* For Emacs users */
				1351	else
				1352	break;
				1353	}
				1354	tok_backup(tok, c);
				1355	if (c == '#' \|\| c == '\n') {
				1356	/* Lines with only whitespace and/or comments
				1357	shouldn't affect the indentation and are
				1358	not passed to the parser as NEWLINE tokens,
				1359	except totally empty lines in interactive
				1360	mode, which signal the end of a command group. */
				1361	if (col == 0 && c == '\n' && tok->prompt != NULL)
				1362	blankline = 0; /* Let it through */
				1363	else
				1364	blankline = 1; /* Ignore completely */
				1365	/* We can't jump back right here since we still
				1366	may need to skip to the end of a comment */
				1367	}
				1368	if (!blankline && tok->level == 0) {
				1369	if (col == tok->indstack[tok->indent]) {
				1370	/* No change */
				1371	if (altcol != tok->altindstack[tok->indent]) {
				1372	if (indenterror(tok))
				1373	return ERRORTOKEN;
				1374	}
				1375	}
				1376	else if (col > tok->indstack[tok->indent]) {
				1377	/* Indent -- always one */
				1378	if (tok->indent+1 >= MAXINDENT) {
				1379	tok->done = E_TOODEEP;
				1380	tok->cur = tok->inp;
				1381	return ERRORTOKEN;
				1382	}
				1383	if (altcol <= tok->altindstack[tok->indent]) {
				1384	if (indenterror(tok))
				1385	return ERRORTOKEN;
				1386	}
				1387	tok->pendin++;
				1388	tok->indstack[++tok->indent] = col;
				1389	tok->altindstack[tok->indent] = altcol;
				1390	}
				1391	else /* col < tok->indstack[tok->indent] */ {
				1392	/* Dedent -- any number, must be consistent */
				1393	while (tok->indent > 0 &&
				1394	col < tok->indstack[tok->indent]) {
				1395	tok->pendin--;
				1396	tok->indent--;
				1397	}
				1398	if (col != tok->indstack[tok->indent]) {
				1399	tok->done = E_DEDENT;
				1400	tok->cur = tok->inp;
				1401	return ERRORTOKEN;
				1402	}
				1403	if (altcol != tok->altindstack[tok->indent]) {
				1404	if (indenterror(tok))
				1405	return ERRORTOKEN;
				1406	}
				1407	}
				1408	}
				1409	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1410
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1411	tok->start = tok->cur;
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1412
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1413	/* Return pending indents/dedents */
				1414	if (tok->pendin != 0) {
				1415	if (tok->pendin < 0) {
				1416	tok->pendin++;
				1417	return DEDENT;
				1418	}
				1419	else {
				1420	tok->pendin--;
				1421	return INDENT;
				1422	}
				1423	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1424
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1425	again:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1426	tok->start = NULL;
				1427	/* Skip spaces */
				1428	do {
				1429	c = tok_nextc(tok);
				1430	} while (c == ' ' \|\| c == '\t' \|\| c == '\014');
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1431
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1432	/* Set start of current token */
				1433	tok->start = tok->cur - 1;
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1434
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1435	/* Skip comment */
				1436	if (c == '#')
				1437	while (c != EOF && c != '\n')
				1438	c = tok_nextc(tok);
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1439
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1440	/* Check for EOF and errors now */
				1441	if (c == EOF) {
				1442	return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
				1443	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1444
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1445	/* Identifier (most frequent token!) */
				1446	nonascii = 0;
				1447	if (is_potential_identifier_start(c)) {
Christian Heimes	0b3847d	2012-06-20 11:17:58 +0200	[diff] [blame]	1448	/* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher	6ecf77b	2012-03-04 12:04:06 +0000	[diff] [blame]	1449	int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou	3a5d4cb	2012-01-12 22:46:19 +0100	[diff] [blame]	1450	while (1) {
Armin Ronacher	6ecf77b	2012-03-04 12:04:06 +0000	[diff] [blame]	1451	if (!(saw_b \|\| saw_u) && (c == 'b' \|\| c == 'B'))
Antoine Pitrou	3a5d4cb	2012-01-12 22:46:19 +0100	[diff] [blame]	1452	saw_b = 1;
Armin Ronacher	6ecf77b	2012-03-04 12:04:06 +0000	[diff] [blame]	1453	/* Since this is a backwards compatibility support literal we don't
				1454	want to support it in arbitrary order like byte literals. */
				1455	else if (!(saw_b \|\| saw_u \|\| saw_r) && (c == 'u' \|\| c == 'U'))
				1456	saw_u = 1;
Christian Heimes	0b3847d	2012-06-20 11:17:58 +0200	[diff] [blame]	1457	/* ur"" and ru"" are not supported */
				1458	else if (!(saw_r \|\| saw_u) && (c == 'r' \|\| c == 'R'))
Antoine Pitrou	3a5d4cb	2012-01-12 22:46:19 +0100	[diff] [blame]	1459	saw_r = 1;
				1460	else
				1461	break;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1462	c = tok_nextc(tok);
				1463	if (c == '"' \|\| c == '\'')
				1464	goto letter_quote;
				1465	}
				1466	while (is_potential_identifier_char(c)) {
				1467	if (c >= 128)
				1468	nonascii = 1;
				1469	c = tok_nextc(tok);
				1470	}
				1471	tok_backup(tok, c);
				1472	if (nonascii &&
				1473	!verify_identifier(tok)) {
				1474	tok->done = E_IDENTIFIER;
				1475	return ERRORTOKEN;
				1476	}
				1477	*p_start = tok->start;
				1478	*p_end = tok->cur;
				1479	return NAME;
				1480	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1481
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1482	/* Newline */
				1483	if (c == '\n') {
				1484	tok->atbol = 1;
				1485	if (blankline \|\| tok->level > 0)
				1486	goto nextline;
				1487	*p_start = tok->start;
				1488	p_end = tok->cur - 1; / Leave '\n' out of the string */
				1489	tok->cont_line = 0;
				1490	return NEWLINE;
				1491	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1492
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1493	/* Period or number starting with period? */
				1494	if (c == '.') {
				1495	c = tok_nextc(tok);
				1496	if (isdigit(c)) {
				1497	goto fraction;
				1498	} else if (c == '.') {
				1499	c = tok_nextc(tok);
				1500	if (c == '.') {
				1501	*p_start = tok->start;
				1502	*p_end = tok->cur;
				1503	return ELLIPSIS;
				1504	} else {
				1505	tok_backup(tok, c);
				1506	}
				1507	tok_backup(tok, '.');
				1508	} else {
				1509	tok_backup(tok, c);
				1510	}
				1511	*p_start = tok->start;
				1512	*p_end = tok->cur;
				1513	return DOT;
				1514	}
Guido van Rossum	f595fde	1996-01-12 01:31:58 +0000	[diff] [blame]	1515
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1516	/* Number */
				1517	if (isdigit(c)) {
				1518	if (c == '0') {
				1519	/* Hex, octal or binary -- maybe. */
				1520	c = tok_nextc(tok);
				1521	if (c == '.')
				1522	goto fraction;
				1523	if (c == 'j' \|\| c == 'J')
				1524	goto imaginary;
				1525	if (c == 'x' \|\| c == 'X') {
Georg Brandl	fceab5a	2008-01-19 20:08:23 +0000	[diff] [blame]	1526
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1527	/* Hex */
				1528	c = tok_nextc(tok);
				1529	if (!isxdigit(c)) {
				1530	tok->done = E_TOKEN;
				1531	tok_backup(tok, c);
				1532	return ERRORTOKEN;
				1533	}
				1534	do {
				1535	c = tok_nextc(tok);
				1536	} while (isxdigit(c));
				1537	}
				1538	else if (c == 'o' \|\| c == 'O') {
				1539	/* Octal */
				1540	c = tok_nextc(tok);
				1541	if (c < '0' \|\| c >= '8') {
				1542	tok->done = E_TOKEN;
				1543	tok_backup(tok, c);
				1544	return ERRORTOKEN;
				1545	}
				1546	do {
				1547	c = tok_nextc(tok);
				1548	} while ('0' <= c && c < '8');
				1549	}
				1550	else if (c == 'b' \|\| c == 'B') {
				1551	/* Binary */
				1552	c = tok_nextc(tok);
				1553	if (c != '0' && c != '1') {
				1554	tok->done = E_TOKEN;
				1555	tok_backup(tok, c);
				1556	return ERRORTOKEN;
				1557	}
				1558	do {
				1559	c = tok_nextc(tok);
				1560	} while (c == '0' \|\| c == '1');
				1561	}
				1562	else {
				1563	int nonzero = 0;
				1564	/* maybe old-style octal; c is first char of it */
				1565	/* in any case, allow '0' as a literal */
				1566	while (c == '0')
				1567	c = tok_nextc(tok);
				1568	while (isdigit(c)) {
				1569	nonzero = 1;
				1570	c = tok_nextc(tok);
				1571	}
				1572	if (c == '.')
				1573	goto fraction;
				1574	else if (c == 'e' \|\| c == 'E')
				1575	goto exponent;
				1576	else if (c == 'j' \|\| c == 'J')
				1577	goto imaginary;
				1578	else if (nonzero) {
				1579	tok->done = E_TOKEN;
				1580	tok_backup(tok, c);
				1581	return ERRORTOKEN;
				1582	}
				1583	}
				1584	}
				1585	else {
				1586	/* Decimal */
				1587	do {
				1588	c = tok_nextc(tok);
				1589	} while (isdigit(c));
				1590	{
				1591	/* Accept floating point numbers. */
				1592	if (c == '.') {
				1593	fraction:
				1594	/* Fraction */
				1595	do {
				1596	c = tok_nextc(tok);
				1597	} while (isdigit(c));
				1598	}
				1599	if (c == 'e' \|\| c == 'E') {
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1600	int e;
				1601	exponent:
				1602	e = c;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1603	/* Exponent part */
				1604	c = tok_nextc(tok);
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1605	if (c == '+' \|\| c == '-') {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1606	c = tok_nextc(tok);
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1607	if (!isdigit(c)) {
				1608	tok->done = E_TOKEN;
				1609	tok_backup(tok, c);
				1610	return ERRORTOKEN;
				1611	}
				1612	} else if (!isdigit(c)) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1613	tok_backup(tok, c);
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1614	tok_backup(tok, e);
				1615	*p_start = tok->start;
				1616	*p_end = tok->cur;
				1617	return NUMBER;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1618	}
				1619	do {
				1620	c = tok_nextc(tok);
				1621	} while (isdigit(c));
				1622	}
				1623	if (c == 'j' \|\| c == 'J')
				1624	/* Imaginary part */
				1625	imaginary:
				1626	c = tok_nextc(tok);
				1627	}
				1628	}
				1629	tok_backup(tok, c);
				1630	*p_start = tok->start;
				1631	*p_end = tok->cur;
				1632	return NUMBER;
				1633	}
Guido van Rossum	24dacb3	1997-04-06 03:46:20 +0000	[diff] [blame]	1634
				1635	letter_quote:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1636	/* String */
				1637	if (c == '\'' \|\| c == '"') {
				1638	int quote = c;
				1639	int quote_size = 1; /* 1 or 3 */
				1640	int end_quote_size = 0;
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1641
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1642	/* Find the quote size and start of string */
				1643	c = tok_nextc(tok);
				1644	if (c == quote) {
				1645	c = tok_nextc(tok);
				1646	if (c == quote)
				1647	quote_size = 3;
				1648	else
				1649	end_quote_size = 1; /* empty string found */
				1650	}
				1651	if (c != quote)
				1652	tok_backup(tok, c);
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1653
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1654	/* Get rest of string */
				1655	while (end_quote_size != quote_size) {
				1656	c = tok_nextc(tok);
				1657	if (c == EOF) {
				1658	if (quote_size == 3)
				1659	tok->done = E_EOFS;
				1660	else
				1661	tok->done = E_EOLS;
				1662	tok->cur = tok->inp;
				1663	return ERRORTOKEN;
				1664	}
				1665	if (quote_size == 1 && c == '\n') {
				1666	tok->done = E_EOLS;
				1667	tok->cur = tok->inp;
				1668	return ERRORTOKEN;
				1669	}
				1670	if (c == quote)
				1671	end_quote_size += 1;
				1672	else {
				1673	end_quote_size = 0;
				1674	if (c == '\\')
				1675	c = tok_nextc(tok); /* skip escaped char */
				1676	}
				1677	}
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1678
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1679	*p_start = tok->start;
				1680	*p_end = tok->cur;
				1681	return STRING;
				1682	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1683
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1684	/* Line continuation */
				1685	if (c == '\\') {
				1686	c = tok_nextc(tok);
				1687	if (c != '\n') {
				1688	tok->done = E_LINECONT;
				1689	tok->cur = tok->inp;
				1690	return ERRORTOKEN;
				1691	}
				1692	tok->cont_line = 1;
				1693	goto again; /* Read next line */
				1694	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1695
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1696	/* Check for two-character token */
				1697	{
				1698	int c2 = tok_nextc(tok);
				1699	int token = PyToken_TwoChars(c, c2);
				1700	if (token != OP) {
				1701	int c3 = tok_nextc(tok);
				1702	int token3 = PyToken_ThreeChars(c, c2, c3);
				1703	if (token3 != OP) {
				1704	token = token3;
				1705	} else {
				1706	tok_backup(tok, c3);
				1707	}
				1708	*p_start = tok->start;
				1709	*p_end = tok->cur;
				1710	return token;
				1711	}
				1712	tok_backup(tok, c2);
				1713	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1714
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1715	/* Keep track of parentheses nesting level */
				1716	switch (c) {
				1717	case '(':
				1718	case '[':
				1719	case '{':
				1720	tok->level++;
				1721	break;
				1722	case ')':
				1723	case ']':
				1724	case '}':
				1725	tok->level--;
				1726	break;
				1727	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1728
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1729	/* Punctuation character */
				1730	*p_start = tok->start;
				1731	*p_end = tok->cur;
				1732	return PyToken_OneChar(c);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1733	}
				1734
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1735	int
				1736	PyTokenizer_Get(struct tok_state tok, char p_start, char *p_end)
				1737	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1738	int result = tok_get(tok, p_start, p_end);
				1739	if (tok->decoding_erred) {
				1740	result = ERRORTOKEN;
				1741	tok->done = E_DECODE;
				1742	}
				1743	return result;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1744	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1745
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1746	/* Get the encoding of a Python file. Check for the coding cookie and check if
				1747	the file starts with a BOM.
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1748
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1749	PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
				1750	encoding in the first or second line of the file (in which case the encoding
				1751	should be assumed to be UTF-8).
Brett Cannon	e453989	2007-10-20 03:46:49 +0000	[diff] [blame]	1752
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1753	The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
				1754	by the caller. */
				1755
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1756	char *
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1757	PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum	40d20bc	2007-10-22 00:09:51 +0000	[diff] [blame]	1758	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1759	struct tok_state *tok;
				1760	FILE *fp;
				1761	char p_start =NULL , p_end =NULL , *encoding = NULL;
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1762
Victor Stinner	daf4555	2013-08-28 00:53:59 +0200	[diff] [blame]	1763	#ifndef PGEN
				1764	fd = _Py_dup(fd);
				1765	#else
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1766	fd = dup(fd);
Victor Stinner	daf4555	2013-08-28 00:53:59 +0200	[diff] [blame]	1767	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1768	if (fd < 0) {
				1769	return NULL;
				1770	}
Victor Stinner	daf4555	2013-08-28 00:53:59 +0200	[diff] [blame]	1771
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1772	fp = fdopen(fd, "r");
				1773	if (fp == NULL) {
				1774	return NULL;
				1775	}
				1776	tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
				1777	if (tok == NULL) {
				1778	fclose(fp);
				1779	return NULL;
				1780	}
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1781	#ifndef PGEN
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1782	if (filename != NULL) {
				1783	Py_INCREF(filename);
				1784	tok->filename = filename;
				1785	}
				1786	else {
				1787	tok->filename = PyUnicode_FromString("<string>");
				1788	if (tok->filename == NULL) {
				1789	fclose(fp);
				1790	PyTokenizer_Free(tok);
				1791	return encoding;
				1792	}
				1793	}
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1794	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1795	while (tok->lineno < 2 && tok->done == E_OK) {
				1796	PyTokenizer_Get(tok, &p_start, &p_end);
				1797	}
				1798	fclose(fp);
				1799	if (tok->encoding) {
				1800	encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
				1801	if (encoding)
				1802	strcpy(encoding, tok->encoding);
				1803	}
				1804	PyTokenizer_Free(tok);
				1805	return encoding;
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1806	}
Thomas Wouters	89d996e	2007-09-08 17:39:28 +0000	[diff] [blame]	1807
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1808	char *
				1809	PyTokenizer_FindEncoding(int fd)
				1810	{
				1811	return PyTokenizer_FindEncodingFilename(fd, NULL);
				1812	}
				1813
#ifdef Py_DEBUG

/* Debugging aid: print a token to stdout.

   The token's name is looked up in _PyParser_TokenNames using `type` as the
   index (no range check is made).  For NAME, NUMBER, STRING and OP tokens
   the token text, i.e. the bytes from `start` up to but not including
   `end`, is appended in parentheses.  No newline is written. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP) {
        /* The text is not NUL-terminated; bound the print by its length. */
        printf("(%.*s)", (int)(end - start), start);
    }
}

#endif