Blame - Parser/pegen_errors.c - platform/external/python/cpython3

blob: 93057d151db3865ee713db103280c365c918b371 [file] [log] [blame]

Pablo Galindo Salgado	07cf66f	2021-11-21 04:15:22 +0000	[diff] [blame]	1	#include <Python.h>
				2	#include <errcode.h>
				3
				4	#include "tokenizer.h"
				5	#include "pegen.h"
				6
				7	// TOKENIZER ERRORS
				8
				9	void
				10	_PyPegen_raise_tokenizer_init_error(PyObject *filename)
				11	{
				12	if (!(PyErr_ExceptionMatches(PyExc_LookupError)
				13	\|\| PyErr_ExceptionMatches(PyExc_SyntaxError)
				14	\|\| PyErr_ExceptionMatches(PyExc_ValueError)
				15	\|\| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
				16	return;
				17	}
				18	PyObject *errstr = NULL;
				19	PyObject *tuple = NULL;
				20	PyObject *type;
				21	PyObject *value;
				22	PyObject *tback;
				23	PyErr_Fetch(&type, &value, &tback);
				24	errstr = PyObject_Str(value);
				25	if (!errstr) {
				26	goto error;
				27	}
				28
				29	PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
				30	if (!tmp) {
				31	goto error;
				32	}
				33
				34	tuple = PyTuple_Pack(2, errstr, tmp);
				35	Py_DECREF(tmp);
				36	if (!value) {
				37	goto error;
				38	}
				39	PyErr_SetObject(PyExc_SyntaxError, tuple);
				40
				41	error:
				42	Py_XDECREF(type);
				43	Py_XDECREF(value);
				44	Py_XDECREF(tback);
				45	Py_XDECREF(errstr);
				46	Py_XDECREF(tuple);
				47	}
				48
				49	static inline void
				50	raise_unclosed_parentheses_error(Parser *p) {
				51	int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
				52	int error_col = p->tok->parencolstack[p->tok->level-1];
				53	RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
				54	error_lineno, error_col, error_lineno, -1,
				55	"'%c' was never closed",
				56	p->tok->parenstack[p->tok->level-1]);
				57	}
				58
				59	int
				60	_Pypegen_tokenizer_error(Parser *p)
				61	{
				62	if (PyErr_Occurred()) {
				63	return -1;
				64	}
				65
				66	const char *msg = NULL;
				67	PyObject* errtype = PyExc_SyntaxError;
				68	Py_ssize_t col_offset = -1;
				69	switch (p->tok->done) {
				70	case E_TOKEN:
				71	msg = "invalid token";
				72	break;
				73	case E_EOF:
				74	if (p->tok->level) {
				75	raise_unclosed_parentheses_error(p);
				76	} else {
				77	RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
				78	}
				79	return -1;
				80	case E_DEDENT:
				81	RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
				82	return -1;
				83	case E_INTR:
				84	if (!PyErr_Occurred()) {
				85	PyErr_SetNone(PyExc_KeyboardInterrupt);
				86	}
				87	return -1;
				88	case E_NOMEM:
				89	PyErr_NoMemory();
				90	return -1;
				91	case E_TABSPACE:
				92	errtype = PyExc_TabError;
				93	msg = "inconsistent use of tabs and spaces in indentation";
				94	break;
				95	case E_TOODEEP:
				96	errtype = PyExc_IndentationError;
				97	msg = "too many levels of indentation";
				98	break;
				99	case E_LINECONT: {
				100	col_offset = p->tok->cur - p->tok->buf - 1;
				101	msg = "unexpected character after line continuation character";
				102	break;
				103	}
				104	default:
				105	msg = "unknown parsing error";
				106	}
				107
				108	RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
				109	col_offset >= 0 ? col_offset : 0,
				110	p->tok->lineno, -1, msg);
				111	return -1;
				112	}
				113
				114	int
				115	_Pypegen_raise_decode_error(Parser *p)
				116	{
				117	assert(PyErr_Occurred());
				118	const char *errtype = NULL;
				119	if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
				120	errtype = "unicode error";
				121	}
				122	else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
				123	errtype = "value error";
				124	}
				125	if (errtype) {
				126	PyObject *type;
				127	PyObject *value;
				128	PyObject *tback;
				129	PyObject *errstr;
				130	PyErr_Fetch(&type, &value, &tback);
				131	errstr = PyObject_Str(value);
				132	if (errstr) {
				133	RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
				134	Py_DECREF(errstr);
				135	}
				136	else {
				137	PyErr_Clear();
				138	RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
				139	}
				140	Py_XDECREF(type);
				141	Py_XDECREF(value);
				142	Py_XDECREF(tback);
				143	}
				144
				145	return -1;
				146	}
				147
				148	static int
				149	_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
				150	// Tokenize the whole input to see if there are any tokenization
				151	// errors such as mistmatching parentheses. These will get priority
				152	// over generic syntax errors only if the line number of the error is
				153	// before the one that we had for the generic error.
				154
				155	// We don't want to tokenize to the end for interactive input
				156	if (p->tok->prompt != NULL) {
				157	return 0;
				158	}
				159
				160	PyObject type, value, *traceback;
				161	PyErr_Fetch(&type, &value, &traceback);
				162
				163	Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
				164	Py_ssize_t current_err_line = current_token->lineno;
				165
				166	int ret = 0;
				167
				168	for (;;) {
				169	const char *start;
				170	const char *end;
				171	switch (_PyTokenizer_Get(p->tok, &start, &end)) {
				172	case ERRORTOKEN:
				173	if (p->tok->level != 0) {
				174	int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
				175	if (current_err_line > error_lineno) {
				176	raise_unclosed_parentheses_error(p);
				177	ret = -1;
				178	goto exit;
				179	}
				180	}
				181	break;
				182	case ENDMARKER:
				183	break;
				184	default:
				185	continue;
				186	}
				187	break;
				188	}
				189
				190
				191	exit:
				192	if (PyErr_Occurred()) {
				193	Py_XDECREF(value);
				194	Py_XDECREF(type);
				195	Py_XDECREF(traceback);
				196	} else {
				197	PyErr_Restore(type, value, traceback);
				198	}
				199	return ret;
				200	}
				201
				202	// PARSER ERRORS
				203
				204	void *
				205	_PyPegen_raise_error(Parser p, PyObject errtype, const char *errmsg, ...)
				206	{
				207	if (p->fill == 0) {
				208	va_list va;
				209	va_start(va, errmsg);
				210	_PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
				211	va_end(va);
				212	return NULL;
				213	}
				214
				215	Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
				216	Py_ssize_t col_offset;
				217	Py_ssize_t end_col_offset = -1;
				218	if (t->col_offset == -1) {
				219	if (p->tok->cur == p->tok->buf) {
				220	col_offset = 0;
				221	} else {
				222	const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
				223	col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
				224	}
				225	} else {
				226	col_offset = t->col_offset + 1;
				227	}
				228
				229	if (t->end_col_offset != -1) {
				230	end_col_offset = t->end_col_offset + 1;
				231	}
				232
				233	va_list va;
				234	va_start(va, errmsg);
				235	_PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
				236	va_end(va);
				237
				238	return NULL;
				239	}
				240
				241	static PyObject *
				242	get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
				243	{
				244	/* If the file descriptor is interactive, the source lines of the current
				245	* (multi-line) statement are stored in p->tok->interactive_src_start.
				246	* If not, we're parsing from a string, which means that the whole source
				247	* is stored in p->tok->str. */
				248	assert((p->tok->fp == NULL && p->tok->str != NULL) \|\| p->tok->fp == stdin);
				249
				250	char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
				251	assert(cur_line != NULL);
				252
				253	for (int i = 0; i < lineno - 1; i++) {
				254	cur_line = strchr(cur_line, '\n') + 1;
				255	}
				256
				257	char *next_newline;
				258	if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
				259	next_newline = cur_line + strlen(cur_line);
				260	}
				261	return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
				262	}
				263
				264	void *
				265	_PyPegen_raise_error_known_location(Parser p, PyObject errtype,
				266	Py_ssize_t lineno, Py_ssize_t col_offset,
				267	Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
				268	const char *errmsg, va_list va)
				269	{
				270	PyObject *value = NULL;
				271	PyObject *errstr = NULL;
				272	PyObject *error_line = NULL;
				273	PyObject *tmp = NULL;
				274	p->error_indicator = 1;
				275
				276	if (end_lineno == CURRENT_POS) {
				277	end_lineno = p->tok->lineno;
				278	}
				279	if (end_col_offset == CURRENT_POS) {
				280	end_col_offset = p->tok->cur - p->tok->line_start;
				281	}
				282
				283	if (p->start_rule == Py_fstring_input) {
				284	const char *fstring_msg = "f-string: ";
				285	Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
				286
				287	char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
				288	if (!new_errmsg) {
				289	return (void *) PyErr_NoMemory();
				290	}
				291
				292	// Copy both strings into new buffer
				293	memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
				294	memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
				295	new_errmsg[len] = 0;
				296	errmsg = new_errmsg;
				297	}
				298	errstr = PyUnicode_FromFormatV(errmsg, va);
				299	if (!errstr) {
				300	goto error;
				301	}
				302
				303	if (p->tok->fp_interactive) {
				304	error_line = get_error_line_from_tokenizer_buffers(p, lineno);
				305	}
				306	else if (p->start_rule == Py_file_input) {
				307	error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
				308	(int) lineno, p->tok->encoding);
				309	}
				310
				311	if (!error_line) {
				312	/* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
				313	then we need to find the error line from some other source, because
				314	p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
				315	failed or we're parsing from a string or the REPL. There's a third edge case where
				316	we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
				317	`PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
				318	does not physically exist */
				319	assert(p->tok->fp == NULL \|\| p->tok->fp == stdin \|\| p->tok->done == E_EOF);
				320
				321	if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
				322	Py_ssize_t size = p->tok->inp - p->tok->buf;
				323	error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
				324	}
				325	else if (p->tok->fp == NULL \|\| p->tok->fp == stdin) {
				326	error_line = get_error_line_from_tokenizer_buffers(p, lineno);
				327	}
				328	else {
				329	error_line = PyUnicode_FromStringAndSize("", 0);
				330	}
				331	if (!error_line) {
				332	goto error;
				333	}
				334	}
				335
				336	if (p->start_rule == Py_fstring_input) {
				337	col_offset -= p->starting_col_offset;
				338	end_col_offset -= p->starting_col_offset;
				339	}
				340
				341	Py_ssize_t col_number = col_offset;
				342	Py_ssize_t end_col_number = end_col_offset;
				343
				344	if (p->tok->encoding != NULL) {
				345	col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
				346	if (col_number < 0) {
				347	goto error;
				348	}
				349	if (end_col_number > 0) {
				350	Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
				351	if (end_col_offset < 0) {
				352	goto error;
				353	} else {
				354	end_col_number = end_col_offset;
				355	}
				356	}
				357	}
				358	tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
				359	if (!tmp) {
				360	goto error;
				361	}
				362	value = PyTuple_Pack(2, errstr, tmp);
				363	Py_DECREF(tmp);
				364	if (!value) {
				365	goto error;
				366	}
				367	PyErr_SetObject(errtype, value);
				368
				369	Py_DECREF(errstr);
				370	Py_DECREF(value);
				371	if (p->start_rule == Py_fstring_input) {
				372	PyMem_Free((void *)errmsg);
				373	}
				374	return NULL;
				375
				376	error:
				377	Py_XDECREF(errstr);
				378	Py_XDECREF(error_line);
				379	if (p->start_rule == Py_fstring_input) {
				380	PyMem_Free((void *)errmsg);
				381	}
				382	return NULL;
				383	}
				384
				385	void
				386	_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
				387	// Existing sintax error
				388	if (PyErr_Occurred()) {
				389	// Prioritize tokenizer errors to custom syntax errors raised
				390	// on the second phase only if the errors come from the parser.
				391	if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
				392	_PyPegen_tokenize_full_source_to_check_for_errors(p);
				393	}
				394	// Propagate the existing syntax error.
				395	return;
				396	}
				397	// Initialization error
				398	if (p->fill == 0) {
				399	RAISE_SYNTAX_ERROR("error at start before reading any input");
				400	}
				401	// Parser encountered EOF (End of File) unexpectedtly
Pablo Galindo Salgado	c72311d	2021-11-25 01:01:40 +0000	[diff] [blame]	402	if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
Pablo Galindo Salgado	07cf66f	2021-11-21 04:15:22 +0000	[diff] [blame]	403	if (p->tok->level) {
				404	raise_unclosed_parentheses_error(p);
				405	} else {
				406	RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
				407	}
				408	return;
				409	}
				410	// Indentation error in the tokenizer
				411	if (last_token->type == INDENT \|\| last_token->type == DEDENT) {
				412	RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
				413	return;
				414	}
				415	// Unknown error (generic case)
				416
				417	// Use the last token we found on the first pass to avoid reporting
				418	// incorrect locations for generic syntax errors just because we reached
				419	// further away when trying to find specific syntax errors in the second
				420	// pass.
				421	RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
				422	// _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
				423	// generic SyntaxError we just raised if errors are found.
				424	_PyPegen_tokenize_full_source_to_check_for_errors(p);
Pablo Galindo Salgado	c72311d	2021-11-25 01:01:40 +0000	[diff] [blame]	425	}