Blame - Parser/tokenizer.c - platform/external/python/cpython3

blob: ef7b19fb42f61209a2a92fc26c2a6491cb1dc386 [file] [log] [blame]

Guido van Rossum	f70e43a	1991-02-19 12:39:46 +0000	[diff] [blame]	1
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	2	/* Tokenizer implementation */
				3
Jack Jansen	7b8c754	2002-04-14 20:12:41 +0000	[diff] [blame]	4	#include "Python.h"
Guido van Rossum	3f5da24	1990-12-20 15:06:42 +0000	[diff] [blame]	5	#include "pgenheaders.h"
				6
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	7	#include <ctype.h>
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	8	#include <assert.h>
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	9
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	10	#include "tokenizer.h"
				11	#include "errcode.h"
				12
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	13	#ifndef PGEN
				14	#include "unicodeobject.h"
Christian Heimes	2c9c7a5	2008-05-26 13:42:13 +0000	[diff] [blame]	15	#include "bytesobject.h"
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	16	#include "fileobject.h"
				17	#include "codecs.h"
				18	#include "abstract.h"
				19	#endif /* PGEN */
				20
/* Non-zero when C may begin an identifier: an ASCII letter, an underscore,
   or any value >= 128.  C is evaluated more than once, so do not pass an
   expression with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
Martin v. Löwis	5b22213	2007-06-10 09:51:05 +0000	[diff] [blame]	26
				27	#define is_potential_identifier_char(c) (\
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	28	(c >= 'a' && c <= 'z')\
				29	\|\| (c >= 'A' && c <= 'Z')\
				30	\|\| (c >= '0' && c <= '9')\
				31	\|\| c == '_'\
				32	\|\| (c >= 128))
Martin v. Löwis	5b22213	2007-06-10 09:51:05 +0000	[diff] [blame]	33
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, const char *);
				38
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	41
/* Forward declarations of file-local helpers */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
Guido van Rossum	3f5da24	1990-12-20 15:06:42 +0000	[diff] [blame]	46
Brett Cannon	d5ec98c	2007-10-20 02:54:14 +0000	[diff] [blame]	47
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

const char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "ATEQUAL",
    "RARROW",
    "ELLIPSIS",
    /* The three entries below are not operators or grammar tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
				109
				110
				111	/* Create and initialize a new tok_state structure */
				112
				113	static struct tok_state *
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	114	tok_new(void)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	115	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	116	struct tok_state tok = (struct tok_state )PyMem_MALLOC(
				117	sizeof(struct tok_state));
				118	if (tok == NULL)
				119	return NULL;
				120	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
				121	tok->done = E_OK;
				122	tok->fp = NULL;
				123	tok->input = NULL;
				124	tok->tabsize = TABSIZE;
				125	tok->indent = 0;
				126	tok->indstack[0] = 0;
				127	tok->atbol = 1;
				128	tok->pendin = 0;
				129	tok->prompt = tok->nextprompt = NULL;
				130	tok->lineno = 0;
				131	tok->level = 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	132	tok->altwarning = 1;
				133	tok->alterror = 1;
				134	tok->alttabsize = 1;
				135	tok->altindstack[0] = 0;
				136	tok->decoding_state = STATE_INIT;
				137	tok->decoding_erred = 0;
				138	tok->read_coding_spec = 0;
				139	tok->enc = NULL;
				140	tok->encoding = NULL;
				141	tok->cont_line = 0;
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	142	#ifndef PGEN
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	143	tok->filename = NULL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	144	tok->decoding_readline = NULL;
				145	tok->decoding_buffer = NULL;
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	146	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	147	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	148	}
				149
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	150	static char *
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	151	new_string(const char s, Py_ssize_t len, struct tok_state tok)
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	152	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	153	char* result = (char *)PyMem_MALLOC(len + 1);
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	154	if (!result) {
				155	tok->done = E_NOMEM;
				156	return NULL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	157	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	158	memcpy(result, s, len);
				159	result[len] = '\0';
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	160	return result;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	161	}
				162
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	163	#ifdef PGEN
				164
				165	static char *
				166	decoding_fgets(char s, int size, struct tok_state tok)
				167	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	168	return fgets(s, size, tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	169	}
				170
				171	static int
				172	decoding_feof(struct tok_state *tok)
				173	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	174	return feof(tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	175	}
				176
/* PGEN build: no decoding, return a fresh copy of STR (NULL on allocation
   failure, see new_string).  EXEC_INPUT is ignored here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str), tok);
}
				182
				183	#else /* PGEN */
				184
				185	static char *
				186	error_ret(struct tok_state tok) / XXX */
				187	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	188	tok->decoding_erred = 1;
				189	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
				190	PyMem_FREE(tok->buf);
				191	tok->buf = NULL;
				192	return NULL; /* as if it were EOF */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	193	}
				194
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	195
				196	static char *
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	197	get_normal_name(char s) / for utf-8 and latin-1 */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	198	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	199	char buf[13];
				200	int i;
				201	for (i = 0; i < 12; i++) {
				202	int c = s[i];
				203	if (c == '\0')
				204	break;
				205	else if (c == '_')
				206	buf[i] = '-';
				207	else
				208	buf[i] = tolower(c);
				209	}
				210	buf[i] = '\0';
				211	if (strcmp(buf, "utf-8") == 0 \|\|
				212	strncmp(buf, "utf-8-", 6) == 0)
				213	return "utf-8";
				214	else if (strcmp(buf, "latin-1") == 0 \|\|
				215	strcmp(buf, "iso-8859-1") == 0 \|\|
				216	strcmp(buf, "iso-latin-1") == 0 \|\|
				217	strncmp(buf, "latin-1-", 8) == 0 \|\|
				218	strncmp(buf, "iso-8859-1-", 11) == 0 \|\|
				219	strncmp(buf, "iso-latin-1-", 12) == 0)
				220	return "iso-8859-1";
				221	else
				222	return s;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	223	}
				224
				225	/* Return the coding spec in S, or NULL if none is found. */
				226
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	227	static int
				228	get_coding_spec(const char s, char spec, Py_ssize_t size, struct tok_state tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	229	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	230	Py_ssize_t i;
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	231	*spec = NULL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	232	/* Coding spec must be in a comment, and that comment must be
				233	* the only statement on the source code line. */
				234	for (i = 0; i < size - 6; i++) {
				235	if (s[i] == '#')
				236	break;
				237	if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	238	return 1;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	239	}
				240	for (; i < size - 6; i++) { /* XXX inefficient search */
				241	const char* t = s + i;
				242	if (strncmp(t, "coding", 6) == 0) {
				243	const char* begin = NULL;
				244	t += 6;
				245	if (t[0] != ':' && t[0] != '=')
				246	continue;
				247	do {
				248	t++;
				249	} while (t[0] == '\x20' \|\| t[0] == '\t');
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	250
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	251	begin = t;
				252	while (Py_ISALNUM(t[0]) \|\|
				253	t[0] == '-' \|\| t[0] == '_' \|\| t[0] == '.')
				254	t++;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	255
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	256	if (begin < t) {
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	257	char* r = new_string(begin, t - begin, tok);
Benjamin Peterson	265fba4	2013-07-15 20:50:22 -0700	[diff] [blame]	258	char* q;
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	259	if (!r)
				260	return 0;
Benjamin Peterson	265fba4	2013-07-15 20:50:22 -0700	[diff] [blame]	261	q = get_normal_name(r);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	262	if (r != q) {
				263	PyMem_FREE(r);
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	264	r = new_string(q, strlen(q), tok);
				265	if (!r)
				266	return 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	267	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	268	*spec = r;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	269	}
				270	}
				271	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	272	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	273	}
				274
				275	/* Check whether the line contains a coding spec. If it does,
				276	invoke the set_readline function for the new encoding.
				277	This function receives the tok_state and the new encoding.
				278	Return 1 on success, 0 on failure. */
				279
				280	static int
Martin v. Löwis	18e1655	2006-02-15 17:27:45 +0000	[diff] [blame]	281	check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	282	int set_readline(struct tok_state , const char ))
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	283	{
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	284	char *cs;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	285	int r = 1;
Tim Peters	17db21f	2002-09-03 15:39:58 +0000	[diff] [blame]	286
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	287	if (tok->cont_line) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	288	/* It's a continuation line, so it can't be a coding spec. */
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	289	tok->read_coding_spec = 1;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	290	return 1;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	291	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	292	if (!get_coding_spec(line, &cs, size, tok))
				293	return 0;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	294	if (!cs) {
				295	Py_ssize_t i;
				296	for (i = 0; i < size; i++) {
				297	if (line[i] == '#' \|\| line[i] == '\n' \|\| line[i] == '\r')
				298	break;
				299	if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
				300	/* Stop checking coding spec after a line containing
				301	* anything except a comment. */
				302	tok->read_coding_spec = 1;
				303	break;
				304	}
				305	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	306	return 1;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	307	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	308	tok->read_coding_spec = 1;
				309	if (tok->encoding == NULL) {
				310	assert(tok->decoding_state == STATE_RAW);
				311	if (strcmp(cs, "utf-8") == 0) {
				312	tok->encoding = cs;
				313	} else {
				314	r = set_readline(tok, cs);
				315	if (r) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	316	tok->encoding = cs;
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	317	tok->decoding_state = STATE_NORMAL;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	318	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	319	else {
Serhiy Storchaka	3af14aa	2013-06-09 16:51:52 +0300	[diff] [blame]	320	PyErr_Format(PyExc_SyntaxError,
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	321	"encoding problem: %s", cs);
				322	PyMem_FREE(cs);
				323	}
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	324	}
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	325	} else { /* then, compare cs with BOM */
				326	r = (strcmp(tok->encoding, cs) == 0);
				327	if (!r)
				328	PyErr_Format(PyExc_SyntaxError,
				329	"encoding problem: %s with BOM", cs);
				330	PyMem_FREE(cs);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	331	}
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	332	return r;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	333	}
				334
				335	/* See whether the file starts with a BOM. If it does,
				336	invoke the set_readline function with the new encoding.
				337	Return 1 on success, 0 on failure. */
				338
				339	static int
				340	check_bom(int get_char(struct tok_state *),
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	341	void unget_char(int, struct tok_state *),
				342	int set_readline(struct tok_state , const char ),
				343	struct tok_state *tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	344	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	345	int ch1, ch2, ch3;
				346	ch1 = get_char(tok);
				347	tok->decoding_state = STATE_RAW;
				348	if (ch1 == EOF) {
				349	return 1;
				350	} else if (ch1 == 0xEF) {
				351	ch2 = get_char(tok);
				352	if (ch2 != 0xBB) {
				353	unget_char(ch2, tok);
				354	unget_char(ch1, tok);
				355	return 1;
				356	}
				357	ch3 = get_char(tok);
				358	if (ch3 != 0xBF) {
				359	unget_char(ch3, tok);
				360	unget_char(ch2, tok);
				361	unget_char(ch1, tok);
				362	return 1;
				363	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	364	#if 0
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	365	/* Disable support for UTF-16 BOMs until a decision
				366	is made whether this needs to be supported. */
				367	} else if (ch1 == 0xFE) {
				368	ch2 = get_char(tok);
				369	if (ch2 != 0xFF) {
				370	unget_char(ch2, tok);
				371	unget_char(ch1, tok);
				372	return 1;
				373	}
				374	if (!set_readline(tok, "utf-16-be"))
				375	return 0;
				376	tok->decoding_state = STATE_NORMAL;
				377	} else if (ch1 == 0xFF) {
				378	ch2 = get_char(tok);
				379	if (ch2 != 0xFE) {
				380	unget_char(ch2, tok);
				381	unget_char(ch1, tok);
				382	return 1;
				383	}
				384	if (!set_readline(tok, "utf-16-le"))
				385	return 0;
				386	tok->decoding_state = STATE_NORMAL;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	387	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	388	} else {
				389	unget_char(ch1, tok);
				390	return 1;
				391	}
				392	if (tok->encoding != NULL)
				393	PyMem_FREE(tok->encoding);
Benjamin Peterson	2dbfd88	2013-07-15 19:15:34 -0700	[diff] [blame]	394	tok->encoding = new_string("utf-8", 5, tok);
				395	if (!tok->encoding)
				396	return 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	397	/* No need to set_readline: input is already utf-8 */
				398	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	399	}
				400
				401	/* Read a line of text from TOK into S, using the stream in TOK.
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	402	Return NULL on failure, else S.
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	403
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	404	On entry, tok->decoding_buffer will be one of:
				405	1) NULL: need to call tok->decoding_readline to get a new line
				406	2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	407	stored the result in tok->decoding_buffer
Christian Heimes	9c4756e	2008-05-26 13:22:05 +0000	[diff] [blame]	408	3) PyByteArrayObject *: previous call to fp_readl did not have enough room
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	409	(in the s buffer) to copy entire contents of the line read
				410	by tok->decoding_readline. tok->decoding_buffer has the overflow.
				411	In this case, fp_readl is called in a loop (with an expanded buffer)
				412	until the buffer ends with a '\n' (or until the end of the file is
				413	reached): see tok_nextc and its calls to decoding_fgets.
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	414	*/
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	415
				416	static char *
				417	fp_readl(char s, int size, struct tok_state tok)
				418	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	419	PyObject* bufobj;
				420	const char *buf;
				421	Py_ssize_t buflen;
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	422
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	423	/* Ask for one less byte so we can terminate it */
				424	assert(size > 0);
				425	size--;
Walter Dörwald	c1f5fff	2005-07-12 21:53:43 +0000	[diff] [blame]	426
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	427	if (tok->decoding_buffer) {
				428	bufobj = tok->decoding_buffer;
				429	Py_INCREF(bufobj);
				430	}
				431	else
				432	{
				433	bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
				434	if (bufobj == NULL)
				435	goto error;
				436	}
				437	if (PyUnicode_CheckExact(bufobj))
				438	{
				439	buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
				440	if (buf == NULL) {
				441	goto error;
				442	}
				443	}
				444	else
				445	{
				446	buf = PyByteArray_AsString(bufobj);
				447	if (buf == NULL) {
				448	goto error;
				449	}
				450	buflen = PyByteArray_GET_SIZE(bufobj);
				451	}
Amaury Forgeot d'Arc	65f9ace	2007-11-15 23:19:43 +0000	[diff] [blame]	452
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	453	Py_XDECREF(tok->decoding_buffer);
				454	if (buflen > size) {
				455	/* Too many chars, the rest goes into tok->decoding_buffer */
				456	tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
				457	buflen-size);
				458	if (tok->decoding_buffer == NULL)
				459	goto error;
				460	buflen = size;
				461	}
				462	else
				463	tok->decoding_buffer = NULL;
Amaury Forgeot d'Arc	65f9ace	2007-11-15 23:19:43 +0000	[diff] [blame]	464
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	465	memcpy(s, buf, buflen);
				466	s[buflen] = '\0';
				467	if (buflen == 0) /* EOF */
				468	s = NULL;
				469	Py_DECREF(bufobj);
				470	return s;
Neal Norwitz	41eaedd	2007-08-12 00:03:22 +0000	[diff] [blame]	471
				472	error:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	473	Py_XDECREF(bufobj);
				474	return error_ret(tok);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	475	}
				476
				477	/* Set the readline function for TOK to a StreamReader's
				478	readline function. The StreamReader is named ENC.
				479
				480	This function is called from check_bom and check_coding_spec.
				481
				482	ENC is usually identical to the future value of tok->encoding,
				483	except for the (currently unsupported) case of UTF-16.
				484
				485	Return 1 on success, 0 on failure. */
				486
				487	static int
				488	fp_setreadl(struct tok_state tok, const char enc)
				489	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	490	PyObject readline = NULL, stream = NULL, *io = NULL;
Martin v. Löwis	bd928fe	2011-10-14 10:20:37 +0200	[diff] [blame]	491	_Py_IDENTIFIER(open);
				492	_Py_IDENTIFIER(readline);
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	493	int fd;
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	494	long pos;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	495
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	496	io = PyImport_ImportModuleNoBlock("io");
				497	if (io == NULL)
				498	goto cleanup;
Guido van Rossum	9cbfffd	2007-06-07 00:54:15 +0000	[diff] [blame]	499
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	500	fd = fileno(tok->fp);
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	501	/* Due to buffering the file offset for fd can be different from the file
Martin v. Löwis	815b41b	2014-02-28 15:27:29 +0100	[diff] [blame]	502	* position of tok->fp. If tok->fp was opened in text mode on Windows,
				503	* its file position counts CRLF as one char and can't be directly mapped
				504	* to the file offset for fd. Instead we step back one byte and read to
				505	* the end of line.*/
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	506	pos = ftell(tok->fp);
Martin v. Löwis	815b41b	2014-02-28 15:27:29 +0100	[diff] [blame]	507	if (pos == -1 \|\|
				508	lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	509	PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
				510	goto cleanup;
				511	}
				512
Martin v. Löwis	afe55bb	2011-10-09 10:38:36 +0200	[diff] [blame]	513	stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
Victor Stinner	22a351a	2010-10-14 12:04:34 +0000	[diff] [blame]	514	fd, "r", -1, enc, Py_None, Py_None, Py_False);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	515	if (stream == NULL)
				516	goto cleanup;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	517
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	518	Py_XDECREF(tok->decoding_readline);
Martin v. Löwis	1ee1b6f	2011-10-10 18:11:30 +0200	[diff] [blame]	519	readline = _PyObject_GetAttrId(stream, &PyId_readline);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	520	tok->decoding_readline = readline;
Martin v. Löwis	815b41b	2014-02-28 15:27:29 +0100	[diff] [blame]	521	if (pos > 0) {
				522	if (PyObject_CallObject(readline, NULL) == NULL) {
				523	readline = NULL;
				524	goto cleanup;
				525	}
				526	}
Guido van Rossum	9cbfffd	2007-06-07 00:54:15 +0000	[diff] [blame]	527
				528	cleanup:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	529	Py_XDECREF(stream);
				530	Py_XDECREF(io);
				531	return readline != NULL;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	532	}
				533
				534	/* Fetch the next byte from TOK. */
				535
				536	static int fp_getc(struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	537	return getc(tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	538	}
				539
				540	/* Unfetch the last byte back into TOK. */
				541
				542	static void fp_ungetc(int c, struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	543	ungetc(c, tok->fp);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	544	}
				545
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   Only the byte pattern is checked (lead byte followed by the right
   number of 10xxxxxx continuation bytes).  S must be NUL-terminated:
   continuation bytes are examined front to back, so the scan stops at
   the terminator and never reads beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
				573
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	574	/* Read a line of input from TOK. Determine encoding
				575	if necessary. */
				576
				577	static char *
				578	decoding_fgets(char s, int size, struct tok_state tok)
				579	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	580	char *line = NULL;
				581	int badchar = 0;
				582	for (;;) {
				583	if (tok->decoding_state == STATE_NORMAL) {
				584	/* We already have a codec associated with
				585	this input. */
				586	line = fp_readl(s, size, tok);
				587	break;
				588	} else if (tok->decoding_state == STATE_RAW) {
				589	/* We want a 'raw' read. */
				590	line = Py_UniversalNewlineFgets(s, size,
				591	tok->fp, NULL);
				592	break;
				593	} else {
				594	/* We have not yet determined the encoding.
				595	If an encoding is found, use the file-pointer
				596	reader functions from now on. */
				597	if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				598	return error_ret(tok);
				599	assert(tok->decoding_state != STATE_INIT);
				600	}
				601	}
				602	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
				603	if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
				604	return error_ret(tok);
				605	}
				606	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	607	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	608	/* The default encoding is UTF-8, so make sure we don't have any
				609	non-UTF-8 sequences in it. */
				610	if (line && !tok->encoding) {
				611	unsigned char *c;
				612	int length;
				613	for (c = (unsigned char )line; c; c += length)
				614	if (!(length = valid_utf8(c))) {
				615	badchar = *c;
				616	break;
				617	}
				618	}
				619	if (badchar) {
				620	/* Need to add 1 to the line number, since this line
				621	has not been counted, yet. */
Jesus Cea	c1935d2	2011-04-25 04:03:58 +0200	[diff] [blame]	622	PyErr_Format(PyExc_SyntaxError,
				623	"Non-UTF-8 code starting with '\\x%.2x' "
				624	"in file %U on line %i, "
				625	"but no encoding declared; "
				626	"see http://python.org/dev/peps/pep-0263/ for details",
				627	badchar, tok->filename, tok->lineno + 1);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	628	return error_ret(tok);
				629	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	630	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	631	return line;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	632	}
				633
				634	static int
				635	decoding_feof(struct tok_state *tok)
				636	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	637	if (tok->decoding_state != STATE_NORMAL) {
				638	return feof(tok->fp);
				639	} else {
				640	PyObject* buf = tok->decoding_buffer;
				641	if (buf == NULL) {
				642	buf = PyObject_CallObject(tok->decoding_readline, NULL);
				643	if (buf == NULL) {
				644	error_ret(tok);
				645	return 1;
				646	} else {
				647	tok->decoding_buffer = buf;
				648	}
				649	}
				650	return PyObject_Length(buf) == 0;
				651	}
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	652	}
				653
				654	/* Fetch a byte from TOK, using the string buffer. */
				655
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	656	static int
				657	buf_getc(struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	658	return Py_CHARMASK(*tok->str++);
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	659	}
				660
				661	/* Unfetch a byte from TOK, using the string buffer. */
				662
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	663	static void
				664	buf_ungetc(int c, struct tok_state *tok) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	665	tok->str--;
				666	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	667	}
				668
				669	/* Set the readline function for TOK to ENC. For the string-based
				670	tokenizer, this means to just record the encoding. */
				671
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	672	static int
				673	buf_setreadl(struct tok_state tok, const char enc) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	674	tok->enc = enc;
				675	return 1;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	676	}
				677
				678	/* Return a UTF-8 encoding Python string object from the
				679	C byte string STR, which is encoded with ENC. */
				680
				681	static PyObject *
				682	translate_into_utf8(const char* str, const char* enc) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	683	PyObject *utf8;
				684	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
				685	if (buf == NULL)
				686	return NULL;
				687	utf8 = PyUnicode_AsUTF8String(buf);
				688	Py_DECREF(buf);
				689	return utf8;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	690	}
				691
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	692
				693	static char *
				694	translate_newlines(const char s, int exec_input, struct tok_state tok) {
Victor Stinner	7969773	2013-06-05 00:44:00 +0200	[diff] [blame]	695	int skip_next_lf = 0;
				696	size_t needed_length = strlen(s) + 2, final_length;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	697	char buf, current;
				698	char c = '\0';
				699	buf = PyMem_MALLOC(needed_length);
				700	if (buf == NULL) {
				701	tok->done = E_NOMEM;
				702	return NULL;
				703	}
				704	for (current = buf; *s; s++, current++) {
				705	c = *s;
				706	if (skip_next_lf) {
				707	skip_next_lf = 0;
				708	if (c == '\n') {
				709	c = *++s;
				710	if (!c)
				711	break;
				712	}
				713	}
				714	if (c == '\r') {
				715	skip_next_lf = 1;
				716	c = '\n';
				717	}
				718	*current = c;
				719	}
				720	/* If this is exec input, add a newline to the end of the string if
				721	there isn't one already. */
				722	if (exec_input && c != '\n') {
				723	*current = '\n';
				724	current++;
				725	}
				726	*current = '\0';
				727	final_length = current - buf + 1;
				728	if (final_length < needed_length && final_length)
				729	/* should never fail */
				730	buf = PyMem_REALLOC(buf, final_length);
				731	return buf;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	732	}
				733
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	734	/* Decode a byte string STR for use as the buffer of TOK.
				735	Look for encoding declarations inside STR, and record them
				736	inside TOK. */
				737
				738	static const char *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	739	decode_str(const char input, int single, struct tok_state tok)
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	740	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	741	PyObject* utf8 = NULL;
				742	const char *str;
				743	const char *s;
				744	const char *newl[2] = {NULL, NULL};
				745	int lineno = 0;
				746	tok->input = str = translate_newlines(input, single, tok);
				747	if (str == NULL)
				748	return NULL;
				749	tok->enc = NULL;
				750	tok->str = str;
				751	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
				752	return error_ret(tok);
				753	str = tok->str; /* string after BOM if any */
				754	assert(str);
				755	if (tok->enc != NULL) {
				756	utf8 = translate_into_utf8(str, tok->enc);
				757	if (utf8 == NULL)
				758	return error_ret(tok);
				759	str = PyBytes_AsString(utf8);
				760	}
				761	for (s = str;; s++) {
				762	if (*s == '\0') break;
				763	else if (*s == '\n') {
				764	assert(lineno < 2);
				765	newl[lineno] = s;
				766	lineno++;
				767	if (lineno == 2) break;
				768	}
				769	}
				770	tok->enc = NULL;
				771	/* need to check line 1 and 2 separately since check_coding_spec
				772	assumes a single line as input */
				773	if (newl[0]) {
				774	if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
				775	return error_ret(tok);
Serhiy Storchaka	768c16c	2014-01-09 18:36:09 +0200	[diff] [blame]	776	if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	777	if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
				778	tok, buf_setreadl))
				779	return error_ret(tok);
				780	}
				781	}
				782	if (tok->enc != NULL) {
				783	assert(utf8 == NULL);
				784	utf8 = translate_into_utf8(str, tok->enc);
				785	if (utf8 == NULL)
				786	return error_ret(tok);
				787	str = PyBytes_AS_STRING(utf8);
				788	}
				789	assert(tok->decoding_buffer == NULL);
				790	tok->decoding_buffer = utf8; /* CAUTION */
				791	return str;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	792	}
				793
				794	#endif /* PGEN */
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	795
				796	/* Set up tokenizer for string */
				797
				798	struct tok_state *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	799	PyTokenizer_FromString(const char *str, int exec_input)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	800	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	801	struct tok_state *tok = tok_new();
				802	if (tok == NULL)
				803	return NULL;
Serhiy Storchaka	c679227	2013-10-19 21:03:34 +0300	[diff] [blame]	804	str = decode_str(str, exec_input, tok);
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	805	if (str == NULL) {
				806	PyTokenizer_Free(tok);
				807	return NULL;
				808	}
Neal Norwitz	dee2fd5	2005-11-16 05:12:59 +0000	[diff] [blame]	809
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	810	/* XXX: constify members. */
				811	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
				812	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	813	}
				814
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	815	struct tok_state *
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	816	PyTokenizer_FromUTF8(const char *str, int exec_input)
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	817	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	818	struct tok_state *tok = tok_new();
				819	if (tok == NULL)
				820	return NULL;
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	821	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	822	tok->input = str = translate_newlines(str, exec_input, tok);
Benjamin Peterson	aeaa592	2009-11-13 00:17:59 +0000	[diff] [blame]	823	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	824	if (str == NULL) {
				825	PyTokenizer_Free(tok);
				826	return NULL;
				827	}
				828	tok->decoding_state = STATE_RAW;
				829	tok->read_coding_spec = 1;
				830	tok->enc = NULL;
				831	tok->str = str;
				832	tok->encoding = (char *)PyMem_MALLOC(6);
				833	if (!tok->encoding) {
				834	PyTokenizer_Free(tok);
				835	return NULL;
				836	}
				837	strcpy(tok->encoding, "utf-8");
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	838
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	839	/* XXX: constify members. */
				840	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
				841	return tok;
Benjamin Peterson	f5b5224	2009-03-02 23:31:26 +0000	[diff] [blame]	842	}
				843
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	844	/* Set up tokenizer for file */
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	845
				846	struct tok_state *
Serhiy Storchaka	c679227	2013-10-19 21:03:34 +0300	[diff] [blame]	847	PyTokenizer_FromFile(FILE fp, const char enc,
				848	const char ps1, const char ps2)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	849	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	850	struct tok_state *tok = tok_new();
				851	if (tok == NULL)
				852	return NULL;
				853	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
				854	PyTokenizer_Free(tok);
				855	return NULL;
				856	}
				857	tok->cur = tok->inp = tok->buf;
				858	tok->end = tok->buf + BUFSIZ;
				859	tok->fp = fp;
				860	tok->prompt = ps1;
				861	tok->nextprompt = ps2;
				862	if (enc != NULL) {
				863	/* Must copy encoding declaration since it
				864	gets copied into the parse tree. */
				865	tok->encoding = PyMem_MALLOC(strlen(enc)+1);
				866	if (!tok->encoding) {
				867	PyTokenizer_Free(tok);
				868	return NULL;
				869	}
				870	strcpy(tok->encoding, enc);
				871	tok->decoding_state = STATE_NORMAL;
				872	}
				873	return tok;
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	874	}
				875
				876
				877	/* Free a tok_state structure */
				878
				879	void
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	880	PyTokenizer_Free(struct tok_state *tok)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	881	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	882	if (tok->encoding != NULL)
				883	PyMem_FREE(tok->encoding);
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	884	#ifndef PGEN
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	885	Py_XDECREF(tok->decoding_readline);
				886	Py_XDECREF(tok->decoding_buffer);
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	887	Py_XDECREF(tok->filename);
Martin v. Löwis	1ee99d3	2002-08-04 20:10:29 +0000	[diff] [blame]	888	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	889	if (tok->fp != NULL && tok->buf != NULL)
				890	PyMem_FREE(tok->buf);
				891	if (tok->input)
				892	PyMem_FREE((char *)tok->input);
				893	PyMem_FREE(tok);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	894	}
				895
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	896	/* Get next char, updating state; error code goes into tok->done */
				897
				898	static int
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	899	tok_nextc(struct tok_state *tok)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	900	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	901	for (;;) {
				902	if (tok->cur != tok->inp) {
				903	return Py_CHARMASK(tok->cur++); / Fast path */
				904	}
				905	if (tok->done != E_OK)
				906	return EOF;
				907	if (tok->fp == NULL) {
				908	char *end = strchr(tok->inp, '\n');
				909	if (end != NULL)
				910	end++;
				911	else {
				912	end = strchr(tok->inp, '\0');
				913	if (end == tok->inp) {
				914	tok->done = E_EOF;
				915	return EOF;
				916	}
				917	}
				918	if (tok->start == NULL)
				919	tok->buf = tok->cur;
				920	tok->line_start = tok->cur;
				921	tok->lineno++;
				922	tok->inp = end;
				923	return Py_CHARMASK(*tok->cur++);
				924	}
				925	if (tok->prompt != NULL) {
				926	char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
Victor Stinner	034c753	2011-01-07 18:56:19 +0000	[diff] [blame]	927	#ifndef PGEN
Victor Stinner	89e3436	2011-01-07 18:47:22 +0000	[diff] [blame]	928	if (newtok != NULL) {
				929	char *translated = translate_newlines(newtok, 0, tok);
				930	PyMem_FREE(newtok);
				931	if (translated == NULL)
				932	return EOF;
				933	newtok = translated;
				934	}
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	935	if (tok->encoding && newtok && *newtok) {
				936	/* Recode to UTF-8 */
				937	Py_ssize_t buflen;
				938	const char* buf;
				939	PyObject *u = translate_into_utf8(newtok, tok->encoding);
				940	PyMem_FREE(newtok);
				941	if (!u) {
				942	tok->done = E_DECODE;
				943	return EOF;
				944	}
				945	buflen = PyBytes_GET_SIZE(u);
				946	buf = PyBytes_AS_STRING(u);
				947	if (!buf) {
				948	Py_DECREF(u);
				949	tok->done = E_DECODE;
				950	return EOF;
				951	}
				952	newtok = PyMem_MALLOC(buflen+1);
				953	strcpy(newtok, buf);
				954	Py_DECREF(u);
				955	}
Martin v. Löwis	85bcc66	2007-09-04 09:18:06 +0000	[diff] [blame]	956	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	957	if (tok->nextprompt != NULL)
				958	tok->prompt = tok->nextprompt;
				959	if (newtok == NULL)
				960	tok->done = E_INTR;
				961	else if (*newtok == '\0') {
				962	PyMem_FREE(newtok);
				963	tok->done = E_EOF;
				964	}
				965	else if (tok->start != NULL) {
				966	size_t start = tok->start - tok->buf;
				967	size_t oldlen = tok->cur - tok->buf;
				968	size_t newlen = oldlen + strlen(newtok);
				969	char *buf = tok->buf;
				970	buf = (char *)PyMem_REALLOC(buf, newlen+1);
				971	tok->lineno++;
				972	if (buf == NULL) {
				973	PyMem_FREE(tok->buf);
				974	tok->buf = NULL;
				975	PyMem_FREE(newtok);
				976	tok->done = E_NOMEM;
				977	return EOF;
				978	}
				979	tok->buf = buf;
				980	tok->cur = tok->buf + oldlen;
				981	tok->line_start = tok->cur;
				982	strcpy(tok->buf + oldlen, newtok);
				983	PyMem_FREE(newtok);
				984	tok->inp = tok->buf + newlen;
				985	tok->end = tok->inp + 1;
				986	tok->start = tok->buf + start;
				987	}
				988	else {
				989	tok->lineno++;
				990	if (tok->buf != NULL)
				991	PyMem_FREE(tok->buf);
				992	tok->buf = newtok;
				993	tok->line_start = tok->buf;
				994	tok->cur = tok->buf;
				995	tok->line_start = tok->buf;
				996	tok->inp = strchr(tok->buf, '\0');
				997	tok->end = tok->inp + 1;
				998	}
				999	}
				1000	else {
				1001	int done = 0;
				1002	Py_ssize_t cur = 0;
				1003	char *pt;
				1004	if (tok->start == NULL) {
				1005	if (tok->buf == NULL) {
				1006	tok->buf = (char *)
				1007	PyMem_MALLOC(BUFSIZ);
				1008	if (tok->buf == NULL) {
				1009	tok->done = E_NOMEM;
				1010	return EOF;
				1011	}
				1012	tok->end = tok->buf + BUFSIZ;
				1013	}
				1014	if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
				1015	tok) == NULL) {
				1016	tok->done = E_EOF;
				1017	done = 1;
				1018	}
				1019	else {
				1020	tok->done = E_OK;
				1021	tok->inp = strchr(tok->buf, '\0');
				1022	done = tok->inp[-1] == '\n';
				1023	}
				1024	}
				1025	else {
				1026	cur = tok->cur - tok->buf;
				1027	if (decoding_feof(tok)) {
				1028	tok->done = E_EOF;
				1029	done = 1;
				1030	}
				1031	else
				1032	tok->done = E_OK;
				1033	}
				1034	tok->lineno++;
				1035	/* Read until '\n' or EOF */
				1036	while (!done) {
				1037	Py_ssize_t curstart = tok->start == NULL ? -1 :
				1038	tok->start - tok->buf;
				1039	Py_ssize_t curvalid = tok->inp - tok->buf;
				1040	Py_ssize_t newsize = curvalid + BUFSIZ;
				1041	char *newbuf = tok->buf;
				1042	newbuf = (char *)PyMem_REALLOC(newbuf,
				1043	newsize);
				1044	if (newbuf == NULL) {
				1045	tok->done = E_NOMEM;
				1046	tok->cur = tok->inp;
				1047	return EOF;
				1048	}
				1049	tok->buf = newbuf;
				1050	tok->inp = tok->buf + curvalid;
				1051	tok->end = tok->buf + newsize;
				1052	tok->start = curstart < 0 ? NULL :
				1053	tok->buf + curstart;
				1054	if (decoding_fgets(tok->inp,
				1055	(int)(tok->end - tok->inp),
				1056	tok) == NULL) {
				1057	/* Break out early on decoding
				1058	errors, as tok->buf will be NULL
				1059	*/
				1060	if (tok->decoding_erred)
				1061	return EOF;
				1062	/* Last line does not end in \n,
				1063	fake one */
				1064	strcpy(tok->inp, "\n");
				1065	}
				1066	tok->inp = strchr(tok->inp, '\0');
				1067	done = tok->inp[-1] == '\n';
				1068	}
				1069	if (tok->buf != NULL) {
				1070	tok->cur = tok->buf + cur;
				1071	tok->line_start = tok->cur;
				1072	/* replace "\r\n" with "\n" */
				1073	/* For Mac leave the \r, giving a syntax error */
				1074	pt = tok->inp - 2;
				1075	if (pt >= tok->buf && *pt == '\r') {
				1076	*pt++ = '\n';
				1077	*pt = '\0';
				1078	tok->inp = pt;
				1079	}
				1080	}
				1081	}
				1082	if (tok->done != E_OK) {
				1083	if (tok->prompt != NULL)
				1084	PySys_WriteStderr("\n");
				1085	tok->cur = tok->inp;
				1086	return EOF;
				1087	}
				1088	}
				1089	/NOTREACHED/
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1090	}
				1091
				1092
				1093	/* Back-up one character */
				1094
				1095	static void
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1096	tok_backup(struct tok_state *tok, int c)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1097	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1098	if (c != EOF) {
				1099	if (--tok->cur < tok->buf)
				1100	Py_FatalError("tok_backup: beginning of buffer");
				1101	if (*tok->cur != c)
				1102	*tok->cur = c;
				1103	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1104	}
				1105
				1106
				1107	/* Return the token corresponding to a single character */
				1108
				1109	int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1110	PyToken_OneChar(int c)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1111	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1112	switch (c) {
				1113	case '(': return LPAR;
				1114	case ')': return RPAR;
				1115	case '[': return LSQB;
				1116	case ']': return RSQB;
				1117	case ':': return COLON;
				1118	case ',': return COMMA;
				1119	case ';': return SEMI;
				1120	case '+': return PLUS;
				1121	case '-': return MINUS;
				1122	case '*': return STAR;
				1123	case '/': return SLASH;
				1124	case '\|': return VBAR;
				1125	case '&': return AMPER;
				1126	case '<': return LESS;
				1127	case '>': return GREATER;
				1128	case '=': return EQUAL;
				1129	case '.': return DOT;
				1130	case '%': return PERCENT;
				1131	case '{': return LBRACE;
				1132	case '}': return RBRACE;
				1133	case '^': return CIRCUMFLEX;
				1134	case '~': return TILDE;
Benjamin Peterson	d51374e	2014-04-09 23:55:56 -0400	[diff] [blame]	1135	case '@': return AT;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1136	default: return OP;
				1137	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1138	}
				1139
				1140
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1141	int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1142	PyToken_TwoChars(int c1, int c2)
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1143	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1144	switch (c1) {
				1145	case '=':
				1146	switch (c2) {
				1147	case '=': return EQEQUAL;
				1148	}
				1149	break;
				1150	case '!':
				1151	switch (c2) {
				1152	case '=': return NOTEQUAL;
				1153	}
				1154	break;
				1155	case '<':
				1156	switch (c2) {
				1157	case '>': return NOTEQUAL;
				1158	case '=': return LESSEQUAL;
				1159	case '<': return LEFTSHIFT;
				1160	}
				1161	break;
				1162	case '>':
				1163	switch (c2) {
				1164	case '=': return GREATEREQUAL;
				1165	case '>': return RIGHTSHIFT;
				1166	}
				1167	break;
				1168	case '+':
				1169	switch (c2) {
				1170	case '=': return PLUSEQUAL;
				1171	}
				1172	break;
				1173	case '-':
				1174	switch (c2) {
				1175	case '=': return MINEQUAL;
				1176	case '>': return RARROW;
				1177	}
				1178	break;
				1179	case '*':
				1180	switch (c2) {
				1181	case '*': return DOUBLESTAR;
				1182	case '=': return STAREQUAL;
				1183	}
				1184	break;
				1185	case '/':
				1186	switch (c2) {
				1187	case '/': return DOUBLESLASH;
				1188	case '=': return SLASHEQUAL;
				1189	}
				1190	break;
				1191	case '\|':
				1192	switch (c2) {
				1193	case '=': return VBAREQUAL;
				1194	}
				1195	break;
				1196	case '%':
				1197	switch (c2) {
				1198	case '=': return PERCENTEQUAL;
				1199	}
				1200	break;
				1201	case '&':
				1202	switch (c2) {
				1203	case '=': return AMPEREQUAL;
				1204	}
				1205	break;
				1206	case '^':
				1207	switch (c2) {
				1208	case '=': return CIRCUMFLEXEQUAL;
				1209	}
				1210	break;
Benjamin Peterson	d51374e	2014-04-09 23:55:56 -0400	[diff] [blame]	1211	case '@':
				1212	switch (c2) {
				1213	case '=': return ATEQUAL;
				1214	}
				1215	break;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1216	}
				1217	return OP;
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1218	}
				1219
Thomas Wouters	434d082	2000-08-24 20:11:32 +0000	[diff] [blame]	1220	int
				1221	PyToken_ThreeChars(int c1, int c2, int c3)
				1222	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1223	switch (c1) {
				1224	case '<':
				1225	switch (c2) {
				1226	case '<':
				1227	switch (c3) {
				1228	case '=':
				1229	return LEFTSHIFTEQUAL;
				1230	}
				1231	break;
				1232	}
				1233	break;
				1234	case '>':
				1235	switch (c2) {
				1236	case '>':
				1237	switch (c3) {
				1238	case '=':
				1239	return RIGHTSHIFTEQUAL;
				1240	}
				1241	break;
				1242	}
				1243	break;
				1244	case '*':
				1245	switch (c2) {
				1246	case '*':
				1247	switch (c3) {
				1248	case '=':
				1249	return DOUBLESTAREQUAL;
				1250	}
				1251	break;
				1252	}
				1253	break;
				1254	case '/':
				1255	switch (c2) {
				1256	case '/':
				1257	switch (c3) {
				1258	case '=':
				1259	return DOUBLESLASHEQUAL;
				1260	}
				1261	break;
				1262	}
				1263	break;
				1264	case '.':
				1265	switch (c2) {
Georg Brandl	dde0028	2007-03-18 19:01:53 +0000	[diff] [blame]	1266	case '.':
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1267	switch (c3) {
				1268	case '.':
				1269	return ELLIPSIS;
				1270	}
				1271	break;
				1272	}
				1273	break;
				1274	}
				1275	return OP;
Thomas Wouters	434d082	2000-08-24 20:11:32 +0000	[diff] [blame]	1276	}
Guido van Rossum	fbab905	1991-10-20 20:25:03 +0000	[diff] [blame]	1277
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1278	static int
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1279	indenterror(struct tok_state *tok)
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1280	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1281	if (tok->alterror) {
				1282	tok->done = E_TABSPACE;
				1283	tok->cur = tok->inp;
				1284	return 1;
				1285	}
				1286	if (tok->altwarning) {
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1287	#ifdef PGEN
				1288	PySys_WriteStderr("inconsistent use of tabs and spaces "
				1289	"in indentation\n");
				1290	#else
				1291	PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1292	"in indentation\n", tok->filename);
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1293	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1294	tok->altwarning = 0;
				1295	}
				1296	return 0;
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1297	}
				1298
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1299	#ifdef PGEN
Victor Stinner	52f6dd7	2010-03-12 14:45:56 +0000	[diff] [blame]	1300	#define verify_identifier(tok) 1
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1301	#else
Martin v. Löwis	d63a3b8	2011-09-28 07:41:54 +0200	[diff] [blame]	1302	/* Verify that the identifier follows PEP 3131.
				1303	All identifier strings are guaranteed to be "ready" unicode objects.
				1304	*/
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1305	static int
Victor Stinner	52f6dd7	2010-03-12 14:45:56 +0000	[diff] [blame]	1306	verify_identifier(struct tok_state *tok)
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1307	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1308	PyObject *s;
				1309	int result;
				1310	s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
Martin v. Löwis	d63a3b8	2011-09-28 07:41:54 +0200	[diff] [blame]	1311	if (s == NULL \|\| PyUnicode_READY(s) == -1) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1312	if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
				1313	PyErr_Clear();
				1314	tok->done = E_IDENTIFIER;
				1315	} else {
				1316	tok->done = E_ERROR;
				1317	}
				1318	return 0;
				1319	}
				1320	result = PyUnicode_IsIdentifier(s);
				1321	Py_DECREF(s);
				1322	if (result == 0)
				1323	tok->done = E_IDENTIFIER;
				1324	return result;
Martin v. Löwis	4738340	2007-08-15 07:32:56 +0000	[diff] [blame]	1325	}
				1326	#endif
Guido van Rossum	926f13a	1998-04-09 21:38:06 +0000	[diff] [blame]	1327
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1328	/* Get next token, after space stripping etc. */
				1329
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1330	static int
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1331	tok_get(struct tok_state tok, char p_start, char *p_end)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1332	{
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1333	int c;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1334	int blankline, nonascii;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1335
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1336	p_start = p_end = NULL;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1337	nextline:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1338	tok->start = NULL;
				1339	blankline = 0;
Guido van Rossum	8c11a5c	1991-07-27 21:42:56 +0000	[diff] [blame]	1340
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1341	/* Get indentation level */
				1342	if (tok->atbol) {
Antoine Pitrou	9ed5f27	2013-08-13 20:18:52 +0200	[diff] [blame]	1343	int col = 0;
				1344	int altcol = 0;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1345	tok->atbol = 0;
				1346	for (;;) {
				1347	c = tok_nextc(tok);
				1348	if (c == ' ')
				1349	col++, altcol++;
				1350	else if (c == '\t') {
				1351	col = (col/tok->tabsize + 1) * tok->tabsize;
				1352	altcol = (altcol/tok->alttabsize + 1)
				1353	* tok->alttabsize;
				1354	}
				1355	else if (c == '\014') /* Control-L (formfeed) */
				1356	col = altcol = 0; /* For Emacs users */
				1357	else
				1358	break;
				1359	}
				1360	tok_backup(tok, c);
				1361	if (c == '#' \|\| c == '\n') {
				1362	/* Lines with only whitespace and/or comments
				1363	shouldn't affect the indentation and are
				1364	not passed to the parser as NEWLINE tokens,
				1365	except totally empty lines in interactive
				1366	mode, which signal the end of a command group. */
				1367	if (col == 0 && c == '\n' && tok->prompt != NULL)
				1368	blankline = 0; /* Let it through */
				1369	else
				1370	blankline = 1; /* Ignore completely */
				1371	/* We can't jump back right here since we still
				1372	may need to skip to the end of a comment */
				1373	}
				1374	if (!blankline && tok->level == 0) {
				1375	if (col == tok->indstack[tok->indent]) {
				1376	/* No change */
				1377	if (altcol != tok->altindstack[tok->indent]) {
				1378	if (indenterror(tok))
				1379	return ERRORTOKEN;
				1380	}
				1381	}
				1382	else if (col > tok->indstack[tok->indent]) {
				1383	/* Indent -- always one */
				1384	if (tok->indent+1 >= MAXINDENT) {
				1385	tok->done = E_TOODEEP;
				1386	tok->cur = tok->inp;
				1387	return ERRORTOKEN;
				1388	}
				1389	if (altcol <= tok->altindstack[tok->indent]) {
				1390	if (indenterror(tok))
				1391	return ERRORTOKEN;
				1392	}
				1393	tok->pendin++;
				1394	tok->indstack[++tok->indent] = col;
				1395	tok->altindstack[tok->indent] = altcol;
				1396	}
				1397	else /* col < tok->indstack[tok->indent] */ {
				1398	/* Dedent -- any number, must be consistent */
				1399	while (tok->indent > 0 &&
				1400	col < tok->indstack[tok->indent]) {
				1401	tok->pendin--;
				1402	tok->indent--;
				1403	}
				1404	if (col != tok->indstack[tok->indent]) {
				1405	tok->done = E_DEDENT;
				1406	tok->cur = tok->inp;
				1407	return ERRORTOKEN;
				1408	}
				1409	if (altcol != tok->altindstack[tok->indent]) {
				1410	if (indenterror(tok))
				1411	return ERRORTOKEN;
				1412	}
				1413	}
				1414	}
				1415	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1416
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1417	tok->start = tok->cur;
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1418
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1419	/* Return pending indents/dedents */
				1420	if (tok->pendin != 0) {
				1421	if (tok->pendin < 0) {
				1422	tok->pendin++;
				1423	return DEDENT;
				1424	}
				1425	else {
				1426	tok->pendin--;
				1427	return INDENT;
				1428	}
				1429	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1430
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1431	again:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1432	tok->start = NULL;
				1433	/* Skip spaces */
				1434	do {
				1435	c = tok_nextc(tok);
				1436	} while (c == ' ' \|\| c == '\t' \|\| c == '\014');
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1437
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1438	/* Set start of current token */
				1439	tok->start = tok->cur - 1;
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1440
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1441	/* Skip comment */
				1442	if (c == '#')
				1443	while (c != EOF && c != '\n')
				1444	c = tok_nextc(tok);
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1445
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1446	/* Check for EOF and errors now */
				1447	if (c == EOF) {
				1448	return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
				1449	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1450
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1451	/* Identifier (most frequent token!) */
				1452	nonascii = 0;
				1453	if (is_potential_identifier_start(c)) {
Christian Heimes	0b3847d	2012-06-20 11:17:58 +0200	[diff] [blame]	1454	/* Process b"", r"", u"", br"" and rb"" */
Armin Ronacher	6ecf77b	2012-03-04 12:04:06 +0000	[diff] [blame]	1455	int saw_b = 0, saw_r = 0, saw_u = 0;
Antoine Pitrou	3a5d4cb	2012-01-12 22:46:19 +0100	[diff] [blame]	1456	while (1) {
Armin Ronacher	6ecf77b	2012-03-04 12:04:06 +0000	[diff] [blame]	1457	if (!(saw_b \|\| saw_u) && (c == 'b' \|\| c == 'B'))
Antoine Pitrou	3a5d4cb	2012-01-12 22:46:19 +0100	[diff] [blame]	1458	saw_b = 1;
Armin Ronacher	6ecf77b	2012-03-04 12:04:06 +0000	[diff] [blame]	1459	/* Since this is a backwards compatibility support literal we don't
				1460	want to support it in arbitrary order like byte literals. */
				1461	else if (!(saw_b \|\| saw_u \|\| saw_r) && (c == 'u' \|\| c == 'U'))
				1462	saw_u = 1;
Christian Heimes	0b3847d	2012-06-20 11:17:58 +0200	[diff] [blame]	1463	/* ur"" and ru"" are not supported */
				1464	else if (!(saw_r \|\| saw_u) && (c == 'r' \|\| c == 'R'))
Antoine Pitrou	3a5d4cb	2012-01-12 22:46:19 +0100	[diff] [blame]	1465	saw_r = 1;
				1466	else
				1467	break;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1468	c = tok_nextc(tok);
				1469	if (c == '"' \|\| c == '\'')
				1470	goto letter_quote;
				1471	}
				1472	while (is_potential_identifier_char(c)) {
				1473	if (c >= 128)
				1474	nonascii = 1;
				1475	c = tok_nextc(tok);
				1476	}
				1477	tok_backup(tok, c);
				1478	if (nonascii &&
				1479	!verify_identifier(tok)) {
				1480	tok->done = E_IDENTIFIER;
				1481	return ERRORTOKEN;
				1482	}
				1483	*p_start = tok->start;
				1484	*p_end = tok->cur;
				1485	return NAME;
				1486	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1487
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1488	/* Newline */
				1489	if (c == '\n') {
				1490	tok->atbol = 1;
				1491	if (blankline \|\| tok->level > 0)
				1492	goto nextline;
				1493	*p_start = tok->start;
				1494	p_end = tok->cur - 1; / Leave '\n' out of the string */
				1495	tok->cont_line = 0;
				1496	return NEWLINE;
				1497	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1498
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1499	/* Period or number starting with period? */
				1500	if (c == '.') {
				1501	c = tok_nextc(tok);
				1502	if (isdigit(c)) {
				1503	goto fraction;
				1504	} else if (c == '.') {
				1505	c = tok_nextc(tok);
				1506	if (c == '.') {
				1507	*p_start = tok->start;
				1508	*p_end = tok->cur;
				1509	return ELLIPSIS;
				1510	} else {
				1511	tok_backup(tok, c);
				1512	}
				1513	tok_backup(tok, '.');
				1514	} else {
				1515	tok_backup(tok, c);
				1516	}
				1517	*p_start = tok->start;
				1518	*p_end = tok->cur;
				1519	return DOT;
				1520	}
Guido van Rossum	f595fde	1996-01-12 01:31:58 +0000	[diff] [blame]	1521
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1522	/* Number */
				1523	if (isdigit(c)) {
				1524	if (c == '0') {
				1525	/* Hex, octal or binary -- maybe. */
				1526	c = tok_nextc(tok);
				1527	if (c == '.')
				1528	goto fraction;
				1529	if (c == 'j' \|\| c == 'J')
				1530	goto imaginary;
				1531	if (c == 'x' \|\| c == 'X') {
Georg Brandl	fceab5a	2008-01-19 20:08:23 +0000	[diff] [blame]	1532
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1533	/* Hex */
				1534	c = tok_nextc(tok);
				1535	if (!isxdigit(c)) {
				1536	tok->done = E_TOKEN;
				1537	tok_backup(tok, c);
				1538	return ERRORTOKEN;
				1539	}
				1540	do {
				1541	c = tok_nextc(tok);
				1542	} while (isxdigit(c));
				1543	}
				1544	else if (c == 'o' \|\| c == 'O') {
				1545	/* Octal */
				1546	c = tok_nextc(tok);
				1547	if (c < '0' \|\| c >= '8') {
				1548	tok->done = E_TOKEN;
				1549	tok_backup(tok, c);
				1550	return ERRORTOKEN;
				1551	}
				1552	do {
				1553	c = tok_nextc(tok);
				1554	} while ('0' <= c && c < '8');
				1555	}
				1556	else if (c == 'b' \|\| c == 'B') {
				1557	/* Binary */
				1558	c = tok_nextc(tok);
				1559	if (c != '0' && c != '1') {
				1560	tok->done = E_TOKEN;
				1561	tok_backup(tok, c);
				1562	return ERRORTOKEN;
				1563	}
				1564	do {
				1565	c = tok_nextc(tok);
				1566	} while (c == '0' \|\| c == '1');
				1567	}
				1568	else {
				1569	int nonzero = 0;
				1570	/* maybe old-style octal; c is first char of it */
				1571	/* in any case, allow '0' as a literal */
				1572	while (c == '0')
				1573	c = tok_nextc(tok);
				1574	while (isdigit(c)) {
				1575	nonzero = 1;
				1576	c = tok_nextc(tok);
				1577	}
				1578	if (c == '.')
				1579	goto fraction;
				1580	else if (c == 'e' \|\| c == 'E')
				1581	goto exponent;
				1582	else if (c == 'j' \|\| c == 'J')
				1583	goto imaginary;
				1584	else if (nonzero) {
				1585	tok->done = E_TOKEN;
				1586	tok_backup(tok, c);
				1587	return ERRORTOKEN;
				1588	}
				1589	}
				1590	}
				1591	else {
				1592	/* Decimal */
				1593	do {
				1594	c = tok_nextc(tok);
				1595	} while (isdigit(c));
				1596	{
				1597	/* Accept floating point numbers. */
				1598	if (c == '.') {
				1599	fraction:
				1600	/* Fraction */
				1601	do {
				1602	c = tok_nextc(tok);
				1603	} while (isdigit(c));
				1604	}
				1605	if (c == 'e' \|\| c == 'E') {
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1606	int e;
				1607	exponent:
				1608	e = c;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1609	/* Exponent part */
				1610	c = tok_nextc(tok);
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1611	if (c == '+' \|\| c == '-') {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1612	c = tok_nextc(tok);
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1613	if (!isdigit(c)) {
				1614	tok->done = E_TOKEN;
				1615	tok_backup(tok, c);
				1616	return ERRORTOKEN;
				1617	}
				1618	} else if (!isdigit(c)) {
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1619	tok_backup(tok, c);
Benjamin Peterson	c416162	2014-06-07 12:36:39 -0700	[diff] [blame]	1620	tok_backup(tok, e);
				1621	*p_start = tok->start;
				1622	*p_end = tok->cur;
				1623	return NUMBER;
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1624	}
				1625	do {
				1626	c = tok_nextc(tok);
				1627	} while (isdigit(c));
				1628	}
				1629	if (c == 'j' \|\| c == 'J')
				1630	/* Imaginary part */
				1631	imaginary:
				1632	c = tok_nextc(tok);
				1633	}
				1634	}
				1635	tok_backup(tok, c);
				1636	*p_start = tok->start;
				1637	*p_end = tok->cur;
				1638	return NUMBER;
				1639	}
Guido van Rossum	24dacb3	1997-04-06 03:46:20 +0000	[diff] [blame]	1640
				1641	letter_quote:
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1642	/* String */
				1643	if (c == '\'' \|\| c == '"') {
				1644	int quote = c;
				1645	int quote_size = 1; /* 1 or 3 */
				1646	int end_quote_size = 0;
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1647
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1648	/* Find the quote size and start of string */
				1649	c = tok_nextc(tok);
				1650	if (c == quote) {
				1651	c = tok_nextc(tok);
				1652	if (c == quote)
				1653	quote_size = 3;
				1654	else
				1655	end_quote_size = 1; /* empty string found */
				1656	}
				1657	if (c != quote)
				1658	tok_backup(tok, c);
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1659
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1660	/* Get rest of string */
				1661	while (end_quote_size != quote_size) {
				1662	c = tok_nextc(tok);
				1663	if (c == EOF) {
				1664	if (quote_size == 3)
				1665	tok->done = E_EOFS;
				1666	else
				1667	tok->done = E_EOLS;
				1668	tok->cur = tok->inp;
				1669	return ERRORTOKEN;
				1670	}
				1671	if (quote_size == 1 && c == '\n') {
				1672	tok->done = E_EOLS;
				1673	tok->cur = tok->inp;
				1674	return ERRORTOKEN;
				1675	}
				1676	if (c == quote)
				1677	end_quote_size += 1;
				1678	else {
				1679	end_quote_size = 0;
				1680	if (c == '\\')
				1681	c = tok_nextc(tok); /* skip escaped char */
				1682	}
				1683	}
Guido van Rossum	cf171a7	2007-11-16 00:51:45 +0000	[diff] [blame]	1684
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1685	*p_start = tok->start;
				1686	*p_end = tok->cur;
				1687	return STRING;
				1688	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1689
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1690	/* Line continuation */
				1691	if (c == '\\') {
				1692	c = tok_nextc(tok);
				1693	if (c != '\n') {
				1694	tok->done = E_LINECONT;
				1695	tok->cur = tok->inp;
				1696	return ERRORTOKEN;
				1697	}
				1698	tok->cont_line = 1;
				1699	goto again; /* Read next line */
				1700	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1701
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1702	/* Check for two-character token */
				1703	{
				1704	int c2 = tok_nextc(tok);
				1705	int token = PyToken_TwoChars(c, c2);
				1706	if (token != OP) {
				1707	int c3 = tok_nextc(tok);
				1708	int token3 = PyToken_ThreeChars(c, c2, c3);
				1709	if (token3 != OP) {
				1710	token = token3;
				1711	} else {
				1712	tok_backup(tok, c3);
				1713	}
				1714	*p_start = tok->start;
				1715	*p_end = tok->cur;
				1716	return token;
				1717	}
				1718	tok_backup(tok, c2);
				1719	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1720
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1721	/* Keep track of parentheses nesting level */
				1722	switch (c) {
				1723	case '(':
				1724	case '[':
				1725	case '{':
				1726	tok->level++;
				1727	break;
				1728	case ')':
				1729	case ']':
				1730	case '}':
				1731	tok->level--;
				1732	break;
				1733	}
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	1734
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1735	/* Punctuation character */
				1736	*p_start = tok->start;
				1737	*p_end = tok->cur;
				1738	return PyToken_OneChar(c);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1739	}
				1740
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1741	int
				1742	PyTokenizer_Get(struct tok_state tok, char p_start, char *p_end)
				1743	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1744	int result = tok_get(tok, p_start, p_end);
				1745	if (tok->decoding_erred) {
				1746	result = ERRORTOKEN;
				1747	tok->done = E_DECODE;
				1748	}
				1749	return result;
Martin v. Löwis	00f1e3f	2002-08-04 17:29:52 +0000	[diff] [blame]	1750	}
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1751
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1752	/* Get the encoding of a Python file. Check for the coding cookie and check if
				1753	the file starts with a BOM.
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1754
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1755	PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
				1756	encoding in the first or second line of the file (in which case the encoding
				1757	should be assumed to be UTF-8).
Brett Cannon	e453989	2007-10-20 03:46:49 +0000	[diff] [blame]	1758
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1759	The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
				1760	by the caller. */
				1761
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1762	char *
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1763	PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
Guido van Rossum	40d20bc	2007-10-22 00:09:51 +0000	[diff] [blame]	1764	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1765	struct tok_state *tok;
				1766	FILE *fp;
				1767	char p_start =NULL , p_end =NULL , *encoding = NULL;
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1768
Victor Stinner	daf4555	2013-08-28 00:53:59 +0200	[diff] [blame]	1769	#ifndef PGEN
				1770	fd = _Py_dup(fd);
				1771	#else
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1772	fd = dup(fd);
Victor Stinner	daf4555	2013-08-28 00:53:59 +0200	[diff] [blame]	1773	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1774	if (fd < 0) {
				1775	return NULL;
				1776	}
Victor Stinner	daf4555	2013-08-28 00:53:59 +0200	[diff] [blame]	1777
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1778	fp = fdopen(fd, "r");
				1779	if (fp == NULL) {
				1780	return NULL;
				1781	}
				1782	tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
				1783	if (tok == NULL) {
				1784	fclose(fp);
				1785	return NULL;
				1786	}
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1787	#ifndef PGEN
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1788	if (filename != NULL) {
				1789	Py_INCREF(filename);
				1790	tok->filename = filename;
				1791	}
				1792	else {
				1793	tok->filename = PyUnicode_FromString("<string>");
				1794	if (tok->filename == NULL) {
				1795	fclose(fp);
				1796	PyTokenizer_Free(tok);
				1797	return encoding;
				1798	}
				1799	}
Victor Stinner	7f2fee3	2011-04-05 00:39:01 +0200	[diff] [blame]	1800	#endif
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1801	while (tok->lineno < 2 && tok->done == E_OK) {
				1802	PyTokenizer_Get(tok, &p_start, &p_end);
				1803	}
				1804	fclose(fp);
				1805	if (tok->encoding) {
				1806	encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
				1807	if (encoding)
				1808	strcpy(encoding, tok->encoding);
				1809	}
				1810	PyTokenizer_Free(tok);
				1811	return encoding;
Guido van Rossum	ce3a72a	2007-10-19 23:16:50 +0000	[diff] [blame]	1812	}
Thomas Wouters	89d996e	2007-09-08 17:39:28 +0000	[diff] [blame]	1813
Victor Stinner	fe7c5b5	2011-04-05 01:48:03 +0200	[diff] [blame]	1814	char *
				1815	PyTokenizer_FindEncoding(int fd)
				1816	{
				1817	return PyTokenizer_FindEncodingFilename(fd, NULL);
				1818	}
				1819
Guido van Rossum	408027e	1996-12-30 16:17:54 +0000	[diff] [blame]	1820	#ifdef Py_DEBUG
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1821
				1822	void
Thomas Wouters	23c9e00	2000-07-22 19:20:54 +0000	[diff] [blame]	1823	tok_dump(int type, char start, char end)
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1824	{
Antoine Pitrou	f95a1b3	2010-05-09 15:52:27 +0000	[diff] [blame]	1825	printf("%s", _PyParser_TokenNames[type]);
				1826	if (type == NAME \|\| type == NUMBER \|\| type == STRING \|\| type == OP)
				1827	printf("(%.*s)", (int)(end - start), start);
Guido van Rossum	85a5fbb	1990-10-14 12:07:46 +0000	[diff] [blame]	1828	}
				1829
				1830	#endif