Pablo Galindo Salgado | 07cf66f | 2021-11-21 04:15:22 +0000 | [diff] [blame] | 1 | #include <Python.h> |
| 2 | #include <errcode.h> |
| 3 | |
| 4 | #include "tokenizer.h" |
| 5 | #include "pegen.h" |
| 6 | |
| 7 | // TOKENIZER ERRORS |
| 8 | |
| 9 | void |
| 10 | _PyPegen_raise_tokenizer_init_error(PyObject *filename) |
| 11 | { |
| 12 | if (!(PyErr_ExceptionMatches(PyExc_LookupError) |
| 13 | || PyErr_ExceptionMatches(PyExc_SyntaxError) |
| 14 | || PyErr_ExceptionMatches(PyExc_ValueError) |
| 15 | || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { |
| 16 | return; |
| 17 | } |
| 18 | PyObject *errstr = NULL; |
| 19 | PyObject *tuple = NULL; |
| 20 | PyObject *type; |
| 21 | PyObject *value; |
| 22 | PyObject *tback; |
| 23 | PyErr_Fetch(&type, &value, &tback); |
| 24 | errstr = PyObject_Str(value); |
| 25 | if (!errstr) { |
| 26 | goto error; |
| 27 | } |
| 28 | |
| 29 | PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); |
| 30 | if (!tmp) { |
| 31 | goto error; |
| 32 | } |
| 33 | |
| 34 | tuple = PyTuple_Pack(2, errstr, tmp); |
| 35 | Py_DECREF(tmp); |
| 36 | if (!value) { |
| 37 | goto error; |
| 38 | } |
| 39 | PyErr_SetObject(PyExc_SyntaxError, tuple); |
| 40 | |
| 41 | error: |
| 42 | Py_XDECREF(type); |
| 43 | Py_XDECREF(value); |
| 44 | Py_XDECREF(tback); |
| 45 | Py_XDECREF(errstr); |
| 46 | Py_XDECREF(tuple); |
| 47 | } |
| 48 | |
| 49 | static inline void |
| 50 | raise_unclosed_parentheses_error(Parser *p) { |
| 51 | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
| 52 | int error_col = p->tok->parencolstack[p->tok->level-1]; |
| 53 | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, |
| 54 | error_lineno, error_col, error_lineno, -1, |
| 55 | "'%c' was never closed", |
| 56 | p->tok->parenstack[p->tok->level-1]); |
| 57 | } |
| 58 | |
| 59 | int |
| 60 | _Pypegen_tokenizer_error(Parser *p) |
| 61 | { |
| 62 | if (PyErr_Occurred()) { |
| 63 | return -1; |
| 64 | } |
| 65 | |
| 66 | const char *msg = NULL; |
| 67 | PyObject* errtype = PyExc_SyntaxError; |
| 68 | Py_ssize_t col_offset = -1; |
| 69 | switch (p->tok->done) { |
| 70 | case E_TOKEN: |
| 71 | msg = "invalid token"; |
| 72 | break; |
| 73 | case E_EOF: |
| 74 | if (p->tok->level) { |
| 75 | raise_unclosed_parentheses_error(p); |
| 76 | } else { |
| 77 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
| 78 | } |
| 79 | return -1; |
| 80 | case E_DEDENT: |
| 81 | RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); |
| 82 | return -1; |
| 83 | case E_INTR: |
| 84 | if (!PyErr_Occurred()) { |
| 85 | PyErr_SetNone(PyExc_KeyboardInterrupt); |
| 86 | } |
| 87 | return -1; |
| 88 | case E_NOMEM: |
| 89 | PyErr_NoMemory(); |
| 90 | return -1; |
| 91 | case E_TABSPACE: |
| 92 | errtype = PyExc_TabError; |
| 93 | msg = "inconsistent use of tabs and spaces in indentation"; |
| 94 | break; |
| 95 | case E_TOODEEP: |
| 96 | errtype = PyExc_IndentationError; |
| 97 | msg = "too many levels of indentation"; |
| 98 | break; |
| 99 | case E_LINECONT: { |
| 100 | col_offset = p->tok->cur - p->tok->buf - 1; |
| 101 | msg = "unexpected character after line continuation character"; |
| 102 | break; |
| 103 | } |
| 104 | default: |
| 105 | msg = "unknown parsing error"; |
| 106 | } |
| 107 | |
| 108 | RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, |
| 109 | col_offset >= 0 ? col_offset : 0, |
| 110 | p->tok->lineno, -1, msg); |
| 111 | return -1; |
| 112 | } |
| 113 | |
| 114 | int |
| 115 | _Pypegen_raise_decode_error(Parser *p) |
| 116 | { |
| 117 | assert(PyErr_Occurred()); |
| 118 | const char *errtype = NULL; |
| 119 | if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { |
| 120 | errtype = "unicode error"; |
| 121 | } |
| 122 | else if (PyErr_ExceptionMatches(PyExc_ValueError)) { |
| 123 | errtype = "value error"; |
| 124 | } |
| 125 | if (errtype) { |
| 126 | PyObject *type; |
| 127 | PyObject *value; |
| 128 | PyObject *tback; |
| 129 | PyObject *errstr; |
| 130 | PyErr_Fetch(&type, &value, &tback); |
| 131 | errstr = PyObject_Str(value); |
| 132 | if (errstr) { |
| 133 | RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); |
| 134 | Py_DECREF(errstr); |
| 135 | } |
| 136 | else { |
| 137 | PyErr_Clear(); |
| 138 | RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); |
| 139 | } |
| 140 | Py_XDECREF(type); |
| 141 | Py_XDECREF(value); |
| 142 | Py_XDECREF(tback); |
| 143 | } |
| 144 | |
| 145 | return -1; |
| 146 | } |
| 147 | |
| 148 | static int |
| 149 | _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { |
| 150 | // Tokenize the whole input to see if there are any tokenization |
| 151 | // errors such as mistmatching parentheses. These will get priority |
| 152 | // over generic syntax errors only if the line number of the error is |
| 153 | // before the one that we had for the generic error. |
| 154 | |
| 155 | // We don't want to tokenize to the end for interactive input |
| 156 | if (p->tok->prompt != NULL) { |
| 157 | return 0; |
| 158 | } |
| 159 | |
| 160 | PyObject *type, *value, *traceback; |
| 161 | PyErr_Fetch(&type, &value, &traceback); |
| 162 | |
| 163 | Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; |
| 164 | Py_ssize_t current_err_line = current_token->lineno; |
| 165 | |
| 166 | int ret = 0; |
| 167 | |
| 168 | for (;;) { |
| 169 | const char *start; |
| 170 | const char *end; |
| 171 | switch (_PyTokenizer_Get(p->tok, &start, &end)) { |
| 172 | case ERRORTOKEN: |
| 173 | if (p->tok->level != 0) { |
| 174 | int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; |
| 175 | if (current_err_line > error_lineno) { |
| 176 | raise_unclosed_parentheses_error(p); |
| 177 | ret = -1; |
| 178 | goto exit; |
| 179 | } |
| 180 | } |
| 181 | break; |
| 182 | case ENDMARKER: |
| 183 | break; |
| 184 | default: |
| 185 | continue; |
| 186 | } |
| 187 | break; |
| 188 | } |
| 189 | |
| 190 | |
| 191 | exit: |
| 192 | if (PyErr_Occurred()) { |
| 193 | Py_XDECREF(value); |
| 194 | Py_XDECREF(type); |
| 195 | Py_XDECREF(traceback); |
| 196 | } else { |
| 197 | PyErr_Restore(type, value, traceback); |
| 198 | } |
| 199 | return ret; |
| 200 | } |
| 201 | |
| 202 | // PARSER ERRORS |
| 203 | |
| 204 | void * |
| 205 | _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) |
| 206 | { |
| 207 | if (p->fill == 0) { |
| 208 | va_list va; |
| 209 | va_start(va, errmsg); |
| 210 | _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); |
| 211 | va_end(va); |
| 212 | return NULL; |
| 213 | } |
| 214 | |
| 215 | Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; |
| 216 | Py_ssize_t col_offset; |
| 217 | Py_ssize_t end_col_offset = -1; |
| 218 | if (t->col_offset == -1) { |
| 219 | if (p->tok->cur == p->tok->buf) { |
| 220 | col_offset = 0; |
| 221 | } else { |
| 222 | const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; |
| 223 | col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); |
| 224 | } |
| 225 | } else { |
| 226 | col_offset = t->col_offset + 1; |
| 227 | } |
| 228 | |
| 229 | if (t->end_col_offset != -1) { |
| 230 | end_col_offset = t->end_col_offset + 1; |
| 231 | } |
| 232 | |
| 233 | va_list va; |
| 234 | va_start(va, errmsg); |
| 235 | _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); |
| 236 | va_end(va); |
| 237 | |
| 238 | return NULL; |
| 239 | } |
| 240 | |
| 241 | static PyObject * |
| 242 | get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) |
| 243 | { |
| 244 | /* If the file descriptor is interactive, the source lines of the current |
| 245 | * (multi-line) statement are stored in p->tok->interactive_src_start. |
| 246 | * If not, we're parsing from a string, which means that the whole source |
| 247 | * is stored in p->tok->str. */ |
| 248 | assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin); |
| 249 | |
| 250 | char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; |
| 251 | assert(cur_line != NULL); |
| 252 | |
| 253 | for (int i = 0; i < lineno - 1; i++) { |
| 254 | cur_line = strchr(cur_line, '\n') + 1; |
| 255 | } |
| 256 | |
| 257 | char *next_newline; |
| 258 | if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line |
| 259 | next_newline = cur_line + strlen(cur_line); |
| 260 | } |
| 261 | return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); |
| 262 | } |
| 263 | |
| 264 | void * |
| 265 | _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, |
| 266 | Py_ssize_t lineno, Py_ssize_t col_offset, |
| 267 | Py_ssize_t end_lineno, Py_ssize_t end_col_offset, |
| 268 | const char *errmsg, va_list va) |
| 269 | { |
| 270 | PyObject *value = NULL; |
| 271 | PyObject *errstr = NULL; |
| 272 | PyObject *error_line = NULL; |
| 273 | PyObject *tmp = NULL; |
| 274 | p->error_indicator = 1; |
| 275 | |
| 276 | if (end_lineno == CURRENT_POS) { |
| 277 | end_lineno = p->tok->lineno; |
| 278 | } |
| 279 | if (end_col_offset == CURRENT_POS) { |
| 280 | end_col_offset = p->tok->cur - p->tok->line_start; |
| 281 | } |
| 282 | |
| 283 | if (p->start_rule == Py_fstring_input) { |
| 284 | const char *fstring_msg = "f-string: "; |
| 285 | Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg); |
| 286 | |
| 287 | char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character |
| 288 | if (!new_errmsg) { |
| 289 | return (void *) PyErr_NoMemory(); |
| 290 | } |
| 291 | |
| 292 | // Copy both strings into new buffer |
| 293 | memcpy(new_errmsg, fstring_msg, strlen(fstring_msg)); |
| 294 | memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg)); |
| 295 | new_errmsg[len] = 0; |
| 296 | errmsg = new_errmsg; |
| 297 | } |
| 298 | errstr = PyUnicode_FromFormatV(errmsg, va); |
| 299 | if (!errstr) { |
| 300 | goto error; |
| 301 | } |
| 302 | |
| 303 | if (p->tok->fp_interactive) { |
| 304 | error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
| 305 | } |
| 306 | else if (p->start_rule == Py_file_input) { |
| 307 | error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, |
| 308 | (int) lineno, p->tok->encoding); |
| 309 | } |
| 310 | |
| 311 | if (!error_line) { |
| 312 | /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, |
| 313 | then we need to find the error line from some other source, because |
| 314 | p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly |
| 315 | failed or we're parsing from a string or the REPL. There's a third edge case where |
| 316 | we're actually parsing from a file, which has an E_EOF SyntaxError and in that case |
| 317 | `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which |
| 318 | does not physically exist */ |
| 319 | assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); |
| 320 | |
| 321 | if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { |
| 322 | Py_ssize_t size = p->tok->inp - p->tok->buf; |
| 323 | error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); |
| 324 | } |
| 325 | else if (p->tok->fp == NULL || p->tok->fp == stdin) { |
| 326 | error_line = get_error_line_from_tokenizer_buffers(p, lineno); |
| 327 | } |
| 328 | else { |
| 329 | error_line = PyUnicode_FromStringAndSize("", 0); |
| 330 | } |
| 331 | if (!error_line) { |
| 332 | goto error; |
| 333 | } |
| 334 | } |
| 335 | |
| 336 | if (p->start_rule == Py_fstring_input) { |
| 337 | col_offset -= p->starting_col_offset; |
| 338 | end_col_offset -= p->starting_col_offset; |
| 339 | } |
| 340 | |
| 341 | Py_ssize_t col_number = col_offset; |
| 342 | Py_ssize_t end_col_number = end_col_offset; |
| 343 | |
| 344 | if (p->tok->encoding != NULL) { |
| 345 | col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); |
| 346 | if (col_number < 0) { |
| 347 | goto error; |
| 348 | } |
| 349 | if (end_col_number > 0) { |
| 350 | Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number); |
| 351 | if (end_col_offset < 0) { |
| 352 | goto error; |
| 353 | } else { |
| 354 | end_col_number = end_col_offset; |
| 355 | } |
| 356 | } |
| 357 | } |
| 358 | tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); |
| 359 | if (!tmp) { |
| 360 | goto error; |
| 361 | } |
| 362 | value = PyTuple_Pack(2, errstr, tmp); |
| 363 | Py_DECREF(tmp); |
| 364 | if (!value) { |
| 365 | goto error; |
| 366 | } |
| 367 | PyErr_SetObject(errtype, value); |
| 368 | |
| 369 | Py_DECREF(errstr); |
| 370 | Py_DECREF(value); |
| 371 | if (p->start_rule == Py_fstring_input) { |
| 372 | PyMem_Free((void *)errmsg); |
| 373 | } |
| 374 | return NULL; |
| 375 | |
| 376 | error: |
| 377 | Py_XDECREF(errstr); |
| 378 | Py_XDECREF(error_line); |
| 379 | if (p->start_rule == Py_fstring_input) { |
| 380 | PyMem_Free((void *)errmsg); |
| 381 | } |
| 382 | return NULL; |
| 383 | } |
| 384 | |
| 385 | void |
| 386 | _Pypegen_set_syntax_error(Parser* p, Token* last_token) { |
| 387 | // Existing sintax error |
| 388 | if (PyErr_Occurred()) { |
| 389 | // Prioritize tokenizer errors to custom syntax errors raised |
| 390 | // on the second phase only if the errors come from the parser. |
| 391 | if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) { |
| 392 | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
| 393 | } |
| 394 | // Propagate the existing syntax error. |
| 395 | return; |
| 396 | } |
| 397 | // Initialization error |
| 398 | if (p->fill == 0) { |
| 399 | RAISE_SYNTAX_ERROR("error at start before reading any input"); |
| 400 | } |
| 401 | // Parser encountered EOF (End of File) unexpectedtly |
Pablo Galindo Salgado | c72311d | 2021-11-25 01:01:40 +0000 | [diff] [blame] | 402 | if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { |
Pablo Galindo Salgado | 07cf66f | 2021-11-21 04:15:22 +0000 | [diff] [blame] | 403 | if (p->tok->level) { |
| 404 | raise_unclosed_parentheses_error(p); |
| 405 | } else { |
| 406 | RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); |
| 407 | } |
| 408 | return; |
| 409 | } |
| 410 | // Indentation error in the tokenizer |
| 411 | if (last_token->type == INDENT || last_token->type == DEDENT) { |
| 412 | RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); |
| 413 | return; |
| 414 | } |
| 415 | // Unknown error (generic case) |
| 416 | |
| 417 | // Use the last token we found on the first pass to avoid reporting |
| 418 | // incorrect locations for generic syntax errors just because we reached |
| 419 | // further away when trying to find specific syntax errors in the second |
| 420 | // pass. |
| 421 | RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); |
| 422 | // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing |
| 423 | // generic SyntaxError we just raised if errors are found. |
| 424 | _PyPegen_tokenize_full_source_to_check_for_errors(p); |
Pablo Galindo Salgado | c72311d | 2021-11-25 01:01:40 +0000 | [diff] [blame] | 425 | } |