Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 1 | #include "Python.h" |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 2 | #include "frameobject.h" |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 3 | |
| 4 | #include "pycore_pyerrors.h" |
| 5 | |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 6 | #define MAX_CANDIDATE_ITEMS 750 |
| 7 | #define MAX_STRING_SIZE 40 |
| 8 | |
| 9 | #define MOVE_COST 2 |
| 10 | #define CASE_COST 1 |
| 11 | |
| 12 | #define LEAST_FIVE_BITS(n) ((n) & 31) |
| 13 | |
| 14 | static inline int |
| 15 | substitution_cost(char a, char b) |
| 16 | { |
| 17 | if (LEAST_FIVE_BITS(a) != LEAST_FIVE_BITS(b)) { |
| 18 | // Not the same, not a case flip. |
| 19 | return MOVE_COST; |
| 20 | } |
| 21 | if (a == b) { |
| 22 | return 0; |
| 23 | } |
| 24 | if ('A' <= a && a <= 'Z') { |
| 25 | a += ('a' - 'A'); |
| 26 | } |
| 27 | if ('A' <= b && b <= 'Z') { |
| 28 | b += ('a' - 'A'); |
| 29 | } |
| 30 | if (a == b) { |
| 31 | return CASE_COST; |
| 32 | } |
| 33 | return MOVE_COST; |
| 34 | } |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 35 | |
| 36 | /* Calculate the Levenshtein distance between string1 and string2 */ |
Pablo Galindo | 0b1c169 | 2021-04-17 23:28:45 +0100 | [diff] [blame] | 37 | static Py_ssize_t |
Pablo Galindo | 3fc65b9 | 2021-04-15 00:03:43 +0100 | [diff] [blame] | 38 | levenshtein_distance(const char *a, size_t a_size, |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 39 | const char *b, size_t b_size, |
| 40 | size_t max_cost) |
| 41 | { |
| 42 | static size_t buffer[MAX_STRING_SIZE]; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 43 | |
| 44 | // Both strings are the same (by identity) |
| 45 | if (a == b) { |
| 46 | return 0; |
| 47 | } |
| 48 | |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 49 | // Trim away common affixes. |
| 50 | while (a_size && b_size && a[0] == b[0]) { |
| 51 | a++; a_size--; |
| 52 | b++; b_size--; |
| 53 | } |
| 54 | while (a_size && b_size && a[a_size-1] == b[b_size-1]) { |
| 55 | a_size--; |
| 56 | b_size--; |
| 57 | } |
| 58 | if (a_size == 0 || b_size == 0) { |
| 59 | return (a_size + b_size) * MOVE_COST; |
| 60 | } |
| 61 | if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) { |
| 62 | return max_cost + 1; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 63 | } |
| 64 | |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 65 | // Prefer shorter buffer |
| 66 | if (b_size < a_size) { |
| 67 | const char *t = a; a = b; b = t; |
| 68 | size_t t_size = a_size; a_size = b_size; b_size = t_size; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 69 | } |
| 70 | |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 71 | // quick fail when a match is impossible. |
| 72 | if ((b_size - a_size) * MOVE_COST > max_cost) { |
| 73 | return max_cost + 1; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 74 | } |
| 75 | |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 76 | // Instead of producing the whole traditional len(a)-by-len(b) |
| 77 | // matrix, we can update just one row in place. |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 78 | // Initialize the buffer row |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 79 | for (size_t i = 0; i < a_size; i++) { |
| 80 | // cost from b[:0] to a[:i+1] |
| 81 | buffer[i] = (i + 1) * MOVE_COST; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 82 | } |
| 83 | |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 84 | size_t result = 0; |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 85 | for (size_t b_index = 0; b_index < b_size; b_index++) { |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 86 | char code = b[b_index]; |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 87 | // cost(b[:b_index], a[:0]) == b_index * MOVE_COST |
| 88 | size_t distance = result = b_index * MOVE_COST; |
| 89 | size_t minimum = SIZE_MAX; |
| 90 | for (size_t index = 0; index < a_size; index++) { |
| 91 | |
| 92 | // cost(b[:b_index+1], a[:index+1]) = min( |
| 93 | // // 1) substitute |
| 94 | // cost(b[:b_index], a[:index]) |
| 95 | // + substitution_cost(b[b_index], a[index]), |
| 96 | // // 2) delete from b |
| 97 | // cost(b[:b_index], a[:index+1]) + MOVE_COST, |
| 98 | // // 3) delete from a |
| 99 | // cost(b[:b_index+1], a[index]) + MOVE_COST |
| 100 | // ) |
| 101 | |
| 102 | // 1) Previous distance in this row is cost(b[:b_index], a[:index]) |
| 103 | size_t substitute = distance + substitution_cost(code, a[index]); |
| 104 | // 2) cost(b[:b_index], a[:index+1]) from previous row |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 105 | distance = buffer[index]; |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 106 | // 3) existing result is cost(b[:b_index+1], a[index]) |
| 107 | |
| 108 | size_t insert_delete = Py_MIN(result, distance) + MOVE_COST; |
| 109 | result = Py_MIN(insert_delete, substitute); |
| 110 | |
| 111 | // cost(b[:b_index+1], a[:index+1]) |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 112 | buffer[index] = result; |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 113 | if (result < minimum) { |
| 114 | minimum = result; |
| 115 | } |
| 116 | } |
| 117 | if (minimum > max_cost) { |
| 118 | // Everything in this row is too big, so bail early. |
| 119 | return max_cost + 1; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 120 | } |
| 121 | } |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 122 | return result; |
| 123 | } |
| 124 | |
| 125 | static inline PyObject * |
| 126 | calculate_suggestions(PyObject *dir, |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 127 | PyObject *name) |
| 128 | { |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 129 | assert(!PyErr_Occurred()); |
| 130 | assert(PyList_CheckExact(dir)); |
| 131 | |
| 132 | Py_ssize_t dir_size = PyList_GET_SIZE(dir); |
| 133 | if (dir_size >= MAX_CANDIDATE_ITEMS) { |
| 134 | return NULL; |
| 135 | } |
| 136 | |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 137 | Py_ssize_t suggestion_distance = PY_SSIZE_T_MAX; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 138 | PyObject *suggestion = NULL; |
Pablo Galindo | 3fc65b9 | 2021-04-15 00:03:43 +0100 | [diff] [blame] | 139 | Py_ssize_t name_size; |
| 140 | const char *name_str = PyUnicode_AsUTF8AndSize(name, &name_size); |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 141 | if (name_str == NULL) { |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 142 | return NULL; |
| 143 | } |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 144 | |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 145 | for (int i = 0; i < dir_size; ++i) { |
| 146 | PyObject *item = PyList_GET_ITEM(dir, i); |
Pablo Galindo | 3fc65b9 | 2021-04-15 00:03:43 +0100 | [diff] [blame] | 147 | Py_ssize_t item_size; |
| 148 | const char *item_str = PyUnicode_AsUTF8AndSize(item, &item_size); |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 149 | if (item_str == NULL) { |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 150 | return NULL; |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 151 | } |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 152 | // No more than 1/3 of the involved characters should need changed. |
| 153 | Py_ssize_t max_distance = (name_size + item_size + 3) * MOVE_COST / 6; |
| 154 | // Don't take matches we've already beaten. |
| 155 | max_distance = Py_MIN(max_distance, suggestion_distance - 1); |
| 156 | Py_ssize_t current_distance = |
| 157 | levenshtein_distance(name_str, name_size, |
| 158 | item_str, item_size, max_distance); |
| 159 | if (current_distance > max_distance) { |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 160 | continue; |
| 161 | } |
| 162 | if (!suggestion || current_distance < suggestion_distance) { |
| 163 | suggestion = item; |
| 164 | suggestion_distance = current_distance; |
| 165 | } |
| 166 | } |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 167 | Py_XINCREF(suggestion); |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 168 | return suggestion; |
| 169 | } |
| 170 | |
| 171 | static PyObject * |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 172 | offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc) |
| 173 | { |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 174 | PyObject *name = exc->name; // borrowed reference |
| 175 | PyObject *obj = exc->obj; // borrowed reference |
| 176 | |
| 177 | // Abort if we don't have an attribute name or we have an invalid one |
| 178 | if (name == NULL || obj == NULL || !PyUnicode_CheckExact(name)) { |
| 179 | return NULL; |
| 180 | } |
| 181 | |
| 182 | PyObject *dir = PyObject_Dir(obj); |
| 183 | if (dir == NULL) { |
| 184 | return NULL; |
| 185 | } |
| 186 | |
| 187 | PyObject *suggestions = calculate_suggestions(dir, name); |
| 188 | Py_DECREF(dir); |
| 189 | return suggestions; |
| 190 | } |
| 191 | |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 192 | |
| 193 | static PyObject * |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 194 | offer_suggestions_for_name_error(PyNameErrorObject *exc) |
| 195 | { |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 196 | PyObject *name = exc->name; // borrowed reference |
| 197 | PyTracebackObject *traceback = (PyTracebackObject *) exc->traceback; // borrowed reference |
Pablo Galindo | 3fc65b9 | 2021-04-15 00:03:43 +0100 | [diff] [blame] | 198 | // Abort if we don't have a variable name or we have an invalid one |
| 199 | // or if we don't have a traceback to work with |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 200 | if (name == NULL || traceback == NULL || !PyUnicode_CheckExact(name)) { |
| 201 | return NULL; |
| 202 | } |
| 203 | |
| 204 | // Move to the traceback of the exception |
| 205 | while (traceback->tb_next != NULL) { |
| 206 | traceback = traceback->tb_next; |
| 207 | } |
| 208 | |
| 209 | PyFrameObject *frame = traceback->tb_frame; |
| 210 | assert(frame != NULL); |
| 211 | PyCodeObject *code = frame->f_code; |
| 212 | assert(code != NULL && code->co_varnames != NULL); |
| 213 | PyObject *dir = PySequence_List(code->co_varnames); |
| 214 | if (dir == NULL) { |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 215 | return NULL; |
| 216 | } |
| 217 | |
| 218 | PyObject *suggestions = calculate_suggestions(dir, name); |
| 219 | Py_DECREF(dir); |
| 220 | if (suggestions != NULL) { |
| 221 | return suggestions; |
| 222 | } |
| 223 | |
| 224 | dir = PySequence_List(frame->f_globals); |
| 225 | if (dir == NULL) { |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 226 | return NULL; |
| 227 | } |
| 228 | suggestions = calculate_suggestions(dir, name); |
| 229 | Py_DECREF(dir); |
Pablo Galindo | 3ab4bea | 2021-04-17 22:26:54 +0100 | [diff] [blame] | 230 | if (suggestions != NULL) { |
| 231 | return suggestions; |
| 232 | } |
| 233 | |
| 234 | dir = PySequence_List(frame->f_builtins); |
| 235 | if (dir == NULL) { |
| 236 | return NULL; |
| 237 | } |
| 238 | suggestions = calculate_suggestions(dir, name); |
| 239 | Py_DECREF(dir); |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 240 | |
| 241 | return suggestions; |
| 242 | } |
| 243 | |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 244 | // Offer suggestions for a given exception. Returns a python string object containing the |
Pablo Galindo | e07f4ab | 2021-04-14 18:58:28 +0100 | [diff] [blame] | 245 | // suggestions. This function returns NULL if no suggestion was found or if an exception happened, |
| 246 | // users must call PyErr_Occurred() to disambiguate. |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 247 | PyObject * |
| 248 | _Py_Offer_Suggestions(PyObject *exception) |
| 249 | { |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 250 | PyObject *result = NULL; |
Pablo Galindo | e07f4ab | 2021-04-14 18:58:28 +0100 | [diff] [blame] | 251 | assert(!PyErr_Occurred()); |
Pablo Galindo | 0ad81d4 | 2021-04-16 17:12:03 +0100 | [diff] [blame] | 252 | if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_AttributeError)) { |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 253 | result = offer_suggestions_for_attribute_error((PyAttributeErrorObject *) exception); |
Pablo Galindo | 0ad81d4 | 2021-04-16 17:12:03 +0100 | [diff] [blame] | 254 | } else if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_NameError)) { |
Pablo Galindo | 5bf8bf2 | 2021-04-14 15:10:33 +0100 | [diff] [blame] | 255 | result = offer_suggestions_for_name_error((PyNameErrorObject *) exception); |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 256 | } |
Pablo Galindo | 37494b4 | 2021-04-14 02:36:07 +0100 | [diff] [blame] | 257 | return result; |
| 258 | } |
| 259 | |
Dennis Sweeney | 80a2a4e | 2021-05-03 11:47:27 -0400 | [diff] [blame^] | 260 | Py_ssize_t |
| 261 | _Py_UTF8_Edit_Cost(PyObject *a, PyObject *b, Py_ssize_t max_cost) |
| 262 | { |
| 263 | assert(PyUnicode_Check(a) && PyUnicode_Check(b)); |
| 264 | Py_ssize_t size_a, size_b; |
| 265 | const char *utf8_a = PyUnicode_AsUTF8AndSize(a, &size_a); |
| 266 | if (utf8_a == NULL) { |
| 267 | return -1; |
| 268 | } |
| 269 | const char *utf8_b = PyUnicode_AsUTF8AndSize(b, &size_b); |
| 270 | if (utf8_b == NULL) { |
| 271 | return -1; |
| 272 | } |
| 273 | if (max_cost == -1) { |
| 274 | max_cost = MOVE_COST * Py_MAX(size_a, size_b); |
| 275 | } |
| 276 | return levenshtein_distance(utf8_a, size_a, utf8_b, size_b, max_cost); |
| 277 | } |
| 278 | |