blob: 7fd62fbfabf6425744d468031e0085c2685c81fb [file] [log] [blame]
Pablo Galindo37494b42021-04-14 02:36:07 +01001#include "Python.h"
Pablo Galindo5bf8bf22021-04-14 15:10:33 +01002#include "frameobject.h"
Pablo Galindo37494b42021-04-14 02:36:07 +01003
4#include "pycore_pyerrors.h"
5
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -04006#define MAX_CANDIDATE_ITEMS 750
7#define MAX_STRING_SIZE 40
8
9#define MOVE_COST 2
10#define CASE_COST 1
11
12#define LEAST_FIVE_BITS(n) ((n) & 31)
13
14static inline int
15substitution_cost(char a, char b)
16{
17 if (LEAST_FIVE_BITS(a) != LEAST_FIVE_BITS(b)) {
18 // Not the same, not a case flip.
19 return MOVE_COST;
20 }
21 if (a == b) {
22 return 0;
23 }
24 if ('A' <= a && a <= 'Z') {
25 a += ('a' - 'A');
26 }
27 if ('A' <= b && b <= 'Z') {
28 b += ('a' - 'A');
29 }
30 if (a == b) {
31 return CASE_COST;
32 }
33 return MOVE_COST;
34}
Pablo Galindo37494b42021-04-14 02:36:07 +010035
36/* Calculate the Levenshtein distance between string1 and string2 */
Pablo Galindo0b1c1692021-04-17 23:28:45 +010037static Py_ssize_t
Pablo Galindo3fc65b92021-04-15 00:03:43 +010038levenshtein_distance(const char *a, size_t a_size,
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040039 const char *b, size_t b_size,
40 size_t max_cost)
41{
42 static size_t buffer[MAX_STRING_SIZE];
Pablo Galindo37494b42021-04-14 02:36:07 +010043
44 // Both strings are the same (by identity)
45 if (a == b) {
46 return 0;
47 }
48
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040049 // Trim away common affixes.
50 while (a_size && b_size && a[0] == b[0]) {
51 a++; a_size--;
52 b++; b_size--;
53 }
54 while (a_size && b_size && a[a_size-1] == b[b_size-1]) {
55 a_size--;
56 b_size--;
57 }
58 if (a_size == 0 || b_size == 0) {
59 return (a_size + b_size) * MOVE_COST;
60 }
61 if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) {
62 return max_cost + 1;
Pablo Galindo37494b42021-04-14 02:36:07 +010063 }
64
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040065 // Prefer shorter buffer
66 if (b_size < a_size) {
67 const char *t = a; a = b; b = t;
68 size_t t_size = a_size; a_size = b_size; b_size = t_size;
Pablo Galindo37494b42021-04-14 02:36:07 +010069 }
70
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040071 // quick fail when a match is impossible.
72 if ((b_size - a_size) * MOVE_COST > max_cost) {
73 return max_cost + 1;
Pablo Galindo37494b42021-04-14 02:36:07 +010074 }
75
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040076 // Instead of producing the whole traditional len(a)-by-len(b)
77 // matrix, we can update just one row in place.
Pablo Galindo37494b42021-04-14 02:36:07 +010078 // Initialize the buffer row
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040079 for (size_t i = 0; i < a_size; i++) {
80 // cost from b[:0] to a[:i+1]
81 buffer[i] = (i + 1) * MOVE_COST;
Pablo Galindo37494b42021-04-14 02:36:07 +010082 }
83
Pablo Galindo37494b42021-04-14 02:36:07 +010084 size_t result = 0;
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040085 for (size_t b_index = 0; b_index < b_size; b_index++) {
Pablo Galindo37494b42021-04-14 02:36:07 +010086 char code = b[b_index];
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -040087 // cost(b[:b_index], a[:0]) == b_index * MOVE_COST
88 size_t distance = result = b_index * MOVE_COST;
89 size_t minimum = SIZE_MAX;
90 for (size_t index = 0; index < a_size; index++) {
91
92 // cost(b[:b_index+1], a[:index+1]) = min(
93 // // 1) substitute
94 // cost(b[:b_index], a[:index])
95 // + substitution_cost(b[b_index], a[index]),
96 // // 2) delete from b
97 // cost(b[:b_index], a[:index+1]) + MOVE_COST,
98 // // 3) delete from a
99 // cost(b[:b_index+1], a[index]) + MOVE_COST
100 // )
101
102 // 1) Previous distance in this row is cost(b[:b_index], a[:index])
103 size_t substitute = distance + substitution_cost(code, a[index]);
104 // 2) cost(b[:b_index], a[:index+1]) from previous row
Pablo Galindo37494b42021-04-14 02:36:07 +0100105 distance = buffer[index];
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400106 // 3) existing result is cost(b[:b_index+1], a[index])
107
108 size_t insert_delete = Py_MIN(result, distance) + MOVE_COST;
109 result = Py_MIN(insert_delete, substitute);
110
111 // cost(b[:b_index+1], a[:index+1])
Pablo Galindo37494b42021-04-14 02:36:07 +0100112 buffer[index] = result;
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400113 if (result < minimum) {
114 minimum = result;
115 }
116 }
117 if (minimum > max_cost) {
118 // Everything in this row is too big, so bail early.
119 return max_cost + 1;
Pablo Galindo37494b42021-04-14 02:36:07 +0100120 }
121 }
Pablo Galindo37494b42021-04-14 02:36:07 +0100122 return result;
123}
124
125static inline PyObject *
126calculate_suggestions(PyObject *dir,
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400127 PyObject *name)
128{
Pablo Galindo37494b42021-04-14 02:36:07 +0100129 assert(!PyErr_Occurred());
130 assert(PyList_CheckExact(dir));
131
132 Py_ssize_t dir_size = PyList_GET_SIZE(dir);
133 if (dir_size >= MAX_CANDIDATE_ITEMS) {
134 return NULL;
135 }
136
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400137 Py_ssize_t suggestion_distance = PY_SSIZE_T_MAX;
Pablo Galindo37494b42021-04-14 02:36:07 +0100138 PyObject *suggestion = NULL;
Pablo Galindo3fc65b92021-04-15 00:03:43 +0100139 Py_ssize_t name_size;
140 const char *name_str = PyUnicode_AsUTF8AndSize(name, &name_size);
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100141 if (name_str == NULL) {
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100142 return NULL;
143 }
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400144
Pablo Galindo37494b42021-04-14 02:36:07 +0100145 for (int i = 0; i < dir_size; ++i) {
146 PyObject *item = PyList_GET_ITEM(dir, i);
Pablo Galindo3fc65b92021-04-15 00:03:43 +0100147 Py_ssize_t item_size;
148 const char *item_str = PyUnicode_AsUTF8AndSize(item, &item_size);
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100149 if (item_str == NULL) {
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100150 return NULL;
Pablo Galindo37494b42021-04-14 02:36:07 +0100151 }
Miss Islington (bot)a0b1d402021-07-16 14:16:08 -0700152 if (PyUnicode_CompareWithASCIIString(name, item_str) == 0) {
153 continue;
154 }
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400155 // No more than 1/3 of the involved characters should need changed.
156 Py_ssize_t max_distance = (name_size + item_size + 3) * MOVE_COST / 6;
157 // Don't take matches we've already beaten.
158 max_distance = Py_MIN(max_distance, suggestion_distance - 1);
159 Py_ssize_t current_distance =
160 levenshtein_distance(name_str, name_size,
161 item_str, item_size, max_distance);
162 if (current_distance > max_distance) {
Pablo Galindo37494b42021-04-14 02:36:07 +0100163 continue;
164 }
165 if (!suggestion || current_distance < suggestion_distance) {
166 suggestion = item;
167 suggestion_distance = current_distance;
168 }
169 }
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400170 Py_XINCREF(suggestion);
Pablo Galindo37494b42021-04-14 02:36:07 +0100171 return suggestion;
172}
173
174static PyObject *
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400175offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc)
176{
Pablo Galindo37494b42021-04-14 02:36:07 +0100177 PyObject *name = exc->name; // borrowed reference
178 PyObject *obj = exc->obj; // borrowed reference
179
180 // Abort if we don't have an attribute name or we have an invalid one
181 if (name == NULL || obj == NULL || !PyUnicode_CheckExact(name)) {
182 return NULL;
183 }
184
185 PyObject *dir = PyObject_Dir(obj);
186 if (dir == NULL) {
187 return NULL;
188 }
189
190 PyObject *suggestions = calculate_suggestions(dir, name);
191 Py_DECREF(dir);
192 return suggestions;
193}
194
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100195
196static PyObject *
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400197offer_suggestions_for_name_error(PyNameErrorObject *exc)
198{
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100199 PyObject *name = exc->name; // borrowed reference
200 PyTracebackObject *traceback = (PyTracebackObject *) exc->traceback; // borrowed reference
Pablo Galindo3fc65b92021-04-15 00:03:43 +0100201 // Abort if we don't have a variable name or we have an invalid one
202 // or if we don't have a traceback to work with
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100203 if (name == NULL || traceback == NULL || !PyUnicode_CheckExact(name)) {
204 return NULL;
205 }
206
207 // Move to the traceback of the exception
208 while (traceback->tb_next != NULL) {
209 traceback = traceback->tb_next;
210 }
211
212 PyFrameObject *frame = traceback->tb_frame;
213 assert(frame != NULL);
214 PyCodeObject *code = frame->f_code;
215 assert(code != NULL && code->co_varnames != NULL);
216 PyObject *dir = PySequence_List(code->co_varnames);
217 if (dir == NULL) {
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100218 return NULL;
219 }
220
221 PyObject *suggestions = calculate_suggestions(dir, name);
222 Py_DECREF(dir);
223 if (suggestions != NULL) {
224 return suggestions;
225 }
226
227 dir = PySequence_List(frame->f_globals);
228 if (dir == NULL) {
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100229 return NULL;
230 }
231 suggestions = calculate_suggestions(dir, name);
232 Py_DECREF(dir);
Pablo Galindo3ab4bea2021-04-17 22:26:54 +0100233 if (suggestions != NULL) {
234 return suggestions;
235 }
236
237 dir = PySequence_List(frame->f_builtins);
238 if (dir == NULL) {
239 return NULL;
240 }
241 suggestions = calculate_suggestions(dir, name);
242 Py_DECREF(dir);
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100243
244 return suggestions;
245}
246
Pablo Galindo37494b42021-04-14 02:36:07 +0100247// Offer suggestions for a given exception. Returns a python string object containing the
Pablo Galindoe07f4ab2021-04-14 18:58:28 +0100248// suggestions. This function returns NULL if no suggestion was found or if an exception happened,
249// users must call PyErr_Occurred() to disambiguate.
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400250PyObject *
251_Py_Offer_Suggestions(PyObject *exception)
252{
Pablo Galindo37494b42021-04-14 02:36:07 +0100253 PyObject *result = NULL;
Pablo Galindoe07f4ab2021-04-14 18:58:28 +0100254 assert(!PyErr_Occurred());
Pablo Galindo0ad81d42021-04-16 17:12:03 +0100255 if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_AttributeError)) {
Pablo Galindo37494b42021-04-14 02:36:07 +0100256 result = offer_suggestions_for_attribute_error((PyAttributeErrorObject *) exception);
Pablo Galindo0ad81d42021-04-16 17:12:03 +0100257 } else if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_NameError)) {
Pablo Galindo5bf8bf22021-04-14 15:10:33 +0100258 result = offer_suggestions_for_name_error((PyNameErrorObject *) exception);
Pablo Galindo37494b42021-04-14 02:36:07 +0100259 }
Pablo Galindo37494b42021-04-14 02:36:07 +0100260 return result;
261}
262
Dennis Sweeney80a2a4e2021-05-03 11:47:27 -0400263Py_ssize_t
264_Py_UTF8_Edit_Cost(PyObject *a, PyObject *b, Py_ssize_t max_cost)
265{
266 assert(PyUnicode_Check(a) && PyUnicode_Check(b));
267 Py_ssize_t size_a, size_b;
268 const char *utf8_a = PyUnicode_AsUTF8AndSize(a, &size_a);
269 if (utf8_a == NULL) {
270 return -1;
271 }
272 const char *utf8_b = PyUnicode_AsUTF8AndSize(b, &size_b);
273 if (utf8_b == NULL) {
274 return -1;
275 }
276 if (max_cost == -1) {
277 max_cost = MOVE_COST * Py_MAX(size_a, size_b);
278 }
279 return levenshtein_distance(utf8_a, size_a, utf8_b, size_b, max_cost);
280}
281