bpo-40334: Improve column offsets for syntax errors raised by Pegen (GH-19782)
diff --git a/Parser/pegen/parse.c b/Parser/pegen/parse.c
index 2be5e38..33c92c2 100644
--- a/Parser/pegen/parse.c
+++ b/Parser/pegen/parse.c
@@ -10515,7 +10515,7 @@
(_tmp_132_var = _tmp_132_rule(p))
)
{
- res = RAISE_SYNTAX_ERROR ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) );
+ res = RAISE_SYNTAX_ERROR_NO_COL_OFFSET ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) );
if (res == NULL && PyErr_Occurred()) {
p->error_indicator = 1;
return NULL;
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index 40c09ff..a7add8f 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -145,11 +145,15 @@
if (!str) {
return 0;
}
- PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL);
+ PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
if (!text) {
return 0;
}
Py_ssize_t size = PyUnicode_GET_LENGTH(text);
+ str = PyUnicode_AsUTF8(text);
+ if (str != NULL && (int)strlen(str) == col_offset) {
+ size = strlen(str);
+ }
Py_DECREF(text);
return size;
}
@@ -297,69 +301,24 @@
}
static inline PyObject *
-get_error_line(char *buffer)
+get_error_line(char *buffer, int is_file)
{
- char *newline = strchr(buffer, '\n');
+ const char *newline;
+ if (is_file) {
+ newline = strrchr(buffer, '\n');
+ } else {
+ newline = strchr(buffer, '\n');
+ }
+
if (newline) {
- return PyUnicode_FromStringAndSize(buffer, newline - buffer);
+ return PyUnicode_DecodeUTF8(buffer, newline - buffer, "replace");
}
else {
- return PyUnicode_FromString(buffer);
+ return PyUnicode_DecodeUTF8(buffer, strlen(buffer), "replace");
}
}
static int
-tokenizer_error_with_col_offset(Parser *p, PyObject *errtype, const char *errmsg)
-{
- PyObject *errstr = NULL;
- PyObject *value = NULL;
- size_t col_number = -1;
-
- errstr = PyUnicode_FromString(errmsg);
- if (!errstr) {
- return -1;
- }
-
- PyObject *loc = NULL;
- if (p->start_rule == Py_file_input) {
- loc = PyErr_ProgramTextObject(p->tok->filename, p->tok->lineno);
- }
- if (!loc) {
- loc = get_error_line(p->tok->buf);
- }
-
- if (loc) {
- col_number = p->tok->cur - p->tok->buf;
- }
- else {
- Py_INCREF(Py_None);
- loc = Py_None;
- }
-
- PyObject *tmp = Py_BuildValue("(OiiN)", p->tok->filename, p->tok->lineno,
- col_number, loc);
- if (!tmp) {
- goto error;
- }
-
- value = PyTuple_Pack(2, errstr, tmp);
- Py_DECREF(tmp);
- if (!value) {
- goto error;
- }
- PyErr_SetObject(errtype, value);
-
- Py_XDECREF(value);
- Py_XDECREF(errstr);
- return -1;
-
-error:
- Py_XDECREF(errstr);
- Py_XDECREF(loc);
- return -1;
-}
-
-static int
tokenizer_error(Parser *p)
{
if (PyErr_Occurred()) {
@@ -376,20 +335,20 @@
msg = "invalid character in identifier";
break;
case E_BADPREFIX:
- return tokenizer_error_with_col_offset(p,
- errtype, "invalid string prefix");
+ RAISE_SYNTAX_ERROR("invalid string prefix");
+ return -1;
case E_EOFS:
- return tokenizer_error_with_col_offset(p,
- errtype, "EOF while scanning triple-quoted string literal");
+ RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
+ return -1;
case E_EOLS:
- return tokenizer_error_with_col_offset(p,
- errtype, "EOL while scanning string literal");
+ RAISE_SYNTAX_ERROR("EOL while scanning string literal");
+ return -1;
case E_EOF:
- return tokenizer_error_with_col_offset(p,
- errtype, "unexpected EOF while parsing");
+ RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+ return -1;
case E_DEDENT:
- return tokenizer_error_with_col_offset(p,
- PyExc_IndentationError, "unindent does not match any outer indentation level");
+ RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
+ return -1;
case E_INTR:
if (!PyErr_Occurred()) {
PyErr_SetNone(PyExc_KeyboardInterrupt);
@@ -421,14 +380,14 @@
}
void *
-_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
+_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...)
{
PyObject *value = NULL;
PyObject *errstr = NULL;
PyObject *loc = NULL;
PyObject *tmp = NULL;
Token *t = p->tokens[p->fill - 1];
- Py_ssize_t col_number = 0;
+ Py_ssize_t col_number = !with_col_number;
va_list va;
va_start(va, errmsg);
@@ -443,14 +402,20 @@
}
if (!loc) {
- loc = get_error_line(p->tok->buf);
+ loc = get_error_line(p->tok->buf, p->start_rule == Py_file_input);
}
- if (loc) {
- int col_offset = t->col_offset == -1 ? 0 : t->col_offset;
- col_number = byte_offset_to_character_offset(loc, col_offset) + 1;
+ if (loc && with_col_number) {
+ int col_offset;
+ if (t->col_offset == -1) {
+ col_offset = Py_SAFE_DOWNCAST(p->tok->cur - p->tok->buf,
+ intptr_t, int);
+ } else {
+ col_offset = t->col_offset + 1;
+ }
+ col_number = byte_offset_to_character_offset(loc, col_offset);
}
- else {
+ else if (!loc) {
Py_INCREF(Py_None);
loc = Py_None;
}
@@ -632,14 +597,6 @@
type = PyTokenizer_Get(p->tok, &start, &end);
}
- if (type == ERRORTOKEN) {
- if (p->tok->done == E_DECODE) {
- return raise_decode_error(p);
- }
- else {
- return tokenizer_error(p);
- }
- }
if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) {
type = NEWLINE; /* Add an extra newline */
p->parsing_started = 0;
@@ -700,6 +657,16 @@
t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;
p->fill += 1;
+
+ if (type == ERRORTOKEN) {
+ if (p->tok->done == E_DECODE) {
+ return raise_decode_error(p);
+ }
+ else {
+ return tokenizer_error(p);
+ }
+ }
+
return 0;
}
diff --git a/Parser/pegen/pegen.h b/Parser/pegen/pegen.h
index 1620f92..cbe6f19 100644
--- a/Parser/pegen/pegen.h
+++ b/Parser/pegen/pegen.h
@@ -126,14 +126,15 @@
expr_ty _PyPegen_number_token(Parser *p);
void *_PyPegen_string_token(Parser *p);
const char *_PyPegen_get_expr_name(expr_ty);
-void *_PyPegen_raise_error(Parser *p, PyObject *, const char *errmsg, ...);
+void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...);
void *_PyPegen_dummy_name(Parser *p, ...);
#define UNUSED(expr) do { (void)(expr); } while (0)
#define EXTRA_EXPR(head, tail) head->lineno, head->col_offset, tail->end_lineno, tail->end_col_offset, p->arena
#define EXTRA start_lineno, start_col_offset, end_lineno, end_col_offset, p->arena
-#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, msg, ##__VA_ARGS__)
-#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, msg, ##__VA_ARGS__)
+#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 1, msg, ##__VA_ARGS__)
+#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, 1, msg, ##__VA_ARGS__)
+#define RAISE_SYNTAX_ERROR_NO_COL_OFFSET(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 0, msg, ##__VA_ARGS__)
Py_LOCAL_INLINE(void *)
CHECK_CALL(Parser *p, void *result)
@@ -190,8 +191,8 @@
}
if (p->feature_version < version) {
p->error_indicator = 1;
- return _PyPegen_raise_error(p, PyExc_SyntaxError, "%s only supported in Python 3.%i and greater",
- msg, version);
+ return RAISE_SYNTAX_ERROR("%s only supported in Python 3.%i and greater",
+ msg, version);
}
return node;
}