blob: cf680280e7966e8e67d42441b78061c7504f6d75 [file] [log] [blame]
Sam McCall87496412017-12-01 17:08:02 +00001//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
11// we consider the possible partial match states:
12//
13// u n i q u e _ p t r
14// +---------------------
15// |A . . . . . . . . . .
16// u|
17// |. . . . . . . . . . .
18// _|
19// |. . . . . . . O . . .
20// p|
21// |. . . . . . . . . . B
22//
23// Each dot represents some prefix of the pattern being matched against some
24// prefix of the word.
25// - A is the initial state: '' matched against ''
26// - O is an intermediate state: 'u_' matched against 'unique_'
27// - B is the target state: 'u_p' matched against 'unique_ptr'
28//
29// We aim to find the best path from A->B.
30// - Moving right (consuming a word character)
31// Always legal: not all word characters must match.
32// - Moving diagonally (consuming both a word and pattern character)
33// Legal if the characters match.
34// - Moving down (consuming a pattern character) is never legal.
35// Never legal: all pattern characters must match something.
36//
37// The scoring is based on heuristics:
38// - when matching a character, apply a bonus or penalty depending on the
39// match quality (does case match, do word segments align, etc)
40// - when skipping a character, apply a penalty if it hurts the match
41// (it starts a word segment, or splits the matched region, etc)
42//
43// These heuristics require the ability to "look backward" one character, to
44// see whether it was matched or not. Therefore the dynamic-programming matrix
45// has an extra dimension (last character matched).
46// Each entry also has an additional flag indicating whether the last-but-one
47// character matched, which is needed to trace back through the scoring table
48// and reconstruct the match.
49//
50// We treat strings as byte-sequences, so only ASCII has first-class support.
51//
52// This algorithm was inspired by VS code's client-side filtering, and aims
53// to be mostly-compatible.
54//
55//===----------------------------------------------------------------------===//
56
57#include "FuzzyMatch.h"
58#include "llvm/ADT/Optional.h"
59#include "llvm/Support/Format.h"
60
Sam McCall8fed6342017-12-01 20:03:19 +000061namespace clang {
62namespace clangd {
Sam McCall87496412017-12-01 17:08:02 +000063using namespace llvm;
Sam McCall87496412017-12-01 17:08:02 +000064
Sam McCalla8c5d3a2017-12-02 02:28:29 +000065constexpr int FuzzyMatcher::MaxPat;
66constexpr int FuzzyMatcher::MaxWord;
Sam McCall87496412017-12-01 17:08:02 +000067
68static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
69// A "negative infinity" score that won't overflow.
70// We use this to mark unreachable states and forbidden solutions.
71// Score field is 15 bits wide, min value is -2^14, we use half of that.
72static constexpr int AwfulScore = -(1 << 13);
73static bool isAwful(int S) { return S < AwfulScore / 2; }
74static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
75
76FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
77 : PatN(std::min<int>(MaxPat, Pattern.size())), CaseSensitive(false),
78 ScoreScale(float{1} / (PerfectBonus * PatN)), WordN(0) {
79 memcpy(Pat, Pattern.data(), PatN);
80 for (int I = 0; I < PatN; ++I) {
81 LowPat[I] = lower(Pat[I]);
82 CaseSensitive |= LowPat[I] != Pat[I];
83 }
84 Scores[0][0][Miss] = {0, Miss};
85 Scores[0][0][Match] = {AwfulScore, Miss};
86 for (int P = 0; P <= PatN; ++P)
87 for (int W = 0; W < P; ++W)
88 for (Action A : {Miss, Match})
89 Scores[P][W][A] = {AwfulScore, Miss};
90 calculateRoles(Pat, PatRole, PatN);
91}
92
93Optional<float> FuzzyMatcher::match(StringRef Word) {
94 if (!PatN)
95 return 1;
96 if (!(WordContainsPattern = init(Word)))
97 return None;
98 buildGraph();
99 auto Best = std::max(Scores[PatN][WordN][Miss].Score,
100 Scores[PatN][WordN][Match].Score);
101 if (isAwful(Best))
102 return None;
103 return ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
104}
105
106// Segmentation of words and patterns.
107// A name like "fooBar_baz" consists of several parts foo, bar, baz.
108// Aligning segmentation of word and pattern improves the fuzzy-match.
109// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
110//
111// First we classify each character into types (uppercase, lowercase, etc).
112// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
113
114// We only distinguish the types of characters that affect segmentation.
115// It's not obvious how to segment digits, we treat them as lowercase letters.
116// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
117// This means we require exact (case-sensitive) match.
Sam McCall8e97cca2017-12-02 03:35:19 +0000118enum FuzzyMatcher::CharType : unsigned char {
Sam McCall87496412017-12-01 17:08:02 +0000119 Empty = 0, // Before-the-start and after-the-end (and control chars).
120 Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.
121 Upper = 2, // Uppercase letters.
122 Punctuation = 3, // ASCII punctuation (including Space)
123};
124
125// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
126// The top 6 bits of the char select the byte, the bottom 2 select the offset.
127// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
128constexpr static uint8_t CharTypes[] = {
129 0x00, 0x00, 0x00, 0x00, // Control characters
130 0x00, 0x00, 0x00, 0x00, // Control characters
131 0xff, 0xff, 0xff, 0xff, // Punctuation
132 0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
133 0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
134 0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
135 0x57, 0x55, 0x55, 0x55, // ` and a-o
136 0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
137 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
138 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
139 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
140 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
141};
142
143// Each character's Role is the Head or Tail of a segment, or a Separator.
144// e.g. XMLHttpRequest_Async
145// +--+---+------ +----
146// ^Head ^Tail ^Separator
Sam McCall8e97cca2017-12-02 03:35:19 +0000147enum FuzzyMatcher::CharRole : unsigned char {
Sam McCall87496412017-12-01 17:08:02 +0000148 Unknown = 0, // Stray control characters or impossible states.
149 Tail = 1, // Part of a word segment, but not the first character.
150 Head = 2, // The first character of a word segment.
151 Separator = 3, // Punctuation characters that separate word segments.
152};
153
154// The Role can be determined from the Type of a character and its neighbors:
155//
156// Example | Chars | Type | Role
157// ---------+--------------+-----
158// F(o)oBar | Foo | Ull | Tail
159// Foo(B)ar | oBa | lUl | Head
160// (f)oo | ^fo | Ell | Head
161// H(T)TP | HTT | UUU | Tail
162//
163// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
164// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
165// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
166constexpr static uint8_t CharRoles[] = {
167 // clang-format off
168 // Curr= Empty Lower Upper Separ
169 /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
170 /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
171 /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
172 /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
173 // clang-format on
174};
175
176template <typename T> static T packedLookup(const uint8_t *Data, int I) {
177 return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
178}
179void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
180 // Types holds a sliding window of (Prev, Curr, Next) types.
181 // Initial value is (Empty, Empty, type of Text[0]).
182 int Types = packedLookup<CharType>(CharTypes, Text[0]);
183 // Rotate slides in the type of the next character.
184 auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
185 for (int I = 0; I < N - 1; ++I) {
186 // For each character, rotate in the next, and look up the role.
187 Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
188 *Out++ = packedLookup<CharRole>(CharRoles, Types);
189 }
190 // For the last character, the "next character" is Empty.
191 Rotate(Empty);
192 *Out++ = packedLookup<CharRole>(CharRoles, Types);
193}
194
195// Sets up the data structures matching Word.
196// Returns false if we can cheaply determine that no match is possible.
197bool FuzzyMatcher::init(StringRef NewWord) {
198 WordN = std::min<int>(MaxWord, NewWord.size());
199 if (PatN > WordN)
200 return false;
201 memcpy(Word, NewWord.data(), WordN);
202 for (int I = 0; I < WordN; ++I)
203 LowWord[I] = lower(Word[I]);
204
205 // Cheap subsequence check.
206 for (int W = 0, P = 0; P != PatN; ++W) {
207 if (W == WordN)
208 return false;
209 if (LowWord[W] == LowPat[P])
210 ++P;
211 }
212
213 calculateRoles(Word, WordRole, WordN);
214 return true;
215}
216
217// The forwards pass finds the mappings of Pattern onto Word.
218// Score = best score achieved matching Word[..W] against Pat[..P].
219// Unlike other tables, indices range from 0 to N *inclusive*
220// Matched = whether we chose to match Word[W] with Pat[P] or not.
221//
222// Points are mostly assigned to matched characters, with 1 being a good score
223// and 3 being a great one. So we treat the score range as [0, 3 * PatN].
224// This range is not strict: we can apply larger bonuses/penalties, or penalize
225// non-matched characters.
226void FuzzyMatcher::buildGraph() {
227 for (int W = 0; W < WordN; ++W) {
228 Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
229 Miss};
230 Scores[0][W + 1][Match] = {AwfulScore, Miss};
231 }
232 for (int P = 0; P < PatN; ++P) {
233 for (int W = P; W < WordN; ++W) {
234 auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
235
236 auto MatchMissScore = PreMiss[Match].Score;
237 auto MissMissScore = PreMiss[Miss].Score;
238 if (P < PatN - 1) { // Skipping trailing characters is always free.
239 MatchMissScore -= skipPenalty(W, Match);
240 MissMissScore -= skipPenalty(W, Miss);
241 }
242 Score[Miss] = (MatchMissScore > MissMissScore)
243 ? ScoreInfo{MatchMissScore, Match}
244 : ScoreInfo{MissMissScore, Miss};
245
246 if (LowPat[P] != LowWord[W]) { // No match possible.
247 Score[Match] = {AwfulScore, Miss};
248 } else {
249 auto &PreMatch = Scores[P][W];
250 auto MatchMatchScore = PreMatch[Match].Score + matchBonus(P, W, Match);
251 auto MissMatchScore = PreMatch[Miss].Score + matchBonus(P, W, Miss);
252 Score[Match] = (MatchMatchScore > MissMatchScore)
253 ? ScoreInfo{MatchMatchScore, Match}
254 : ScoreInfo{MissMatchScore, Miss};
255 }
256 }
257 }
258}
259
260int FuzzyMatcher::skipPenalty(int W, Action Last) {
261 int S = 0;
262 if (WordRole[W] == Head) // Skipping a segment.
263 S += 1;
264 if (Last == Match) // Non-consecutive match.
265 S += 2; // We'd rather skip a segment than split our match.
266 return S;
267}
268
269int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
270 assert(LowPat[P] == LowWord[W]);
271 int S = 1;
272 // Bonus: pattern so far is a (case-insensitive) prefix of the word.
273 if (P == W) // We can't skip pattern characters, so we must have matched all.
274 ++S;
275 // Bonus: case matches, or a Head in the pattern aligns with one in the word.
276 if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) ||
277 (PatRole[P] == Head && WordRole[W] == Head))
278 ++S;
279 // Penalty: matching inside a segment (and previous char wasn't matched).
280 if (WordRole[W] == Tail && P && Last == Miss)
281 S -= 3;
282 // Penalty: a Head in the pattern matches in the middle of a word segment.
283 if (PatRole[P] == Head && WordRole[W] == Tail)
284 --S;
285 // Penalty: matching the first pattern character in the middle of a segment.
286 if (P == 0 && WordRole[W] == Tail)
287 S -= 4;
288 assert(S <= PerfectBonus);
289 return S;
290}
291
292llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
293 llvm::SmallString<256> Result;
294 OS << "=== Match \"" << StringRef(Word, WordN) << "\" against ["
295 << StringRef(Pat, PatN) << "] ===\n";
296 if (!WordContainsPattern) {
297 OS << "Substring check failed.\n";
298 return Result;
299 } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
300 Scores[PatN][WordN][Miss].Score))) {
301 OS << "Substring check passed, but all matches are forbidden\n";
302 }
303 if (!CaseSensitive)
304 OS << "Lowercase query, so scoring ignores case\n";
305
306 // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
307 // The Score table has cumulative scores, subtracting along this path gives
308 // us the per-letter scores.
309 Action Last =
310 (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
311 ? Match
312 : Miss;
313 int S[MaxWord];
314 Action A[MaxWord];
315 for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
316 A[W] = Last;
317 const auto &Cell = Scores[P + 1][W + 1][Last];
318 if (Last == Match)
319 --P;
320 const auto &Prev = Scores[P + 1][W][Cell.Prev];
321 S[W] = Cell.Score - Prev.Score;
322 Last = Cell.Prev;
323 }
324 for (int I = 0; I < WordN; ++I) {
325 if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
326 Result.push_back('[');
327 if (A[I] == Miss && I > 0 && A[I - 1] == Match)
328 Result.push_back(']');
329 Result.push_back(Word[I]);
330 }
331 if (A[WordN - 1] == Match)
332 Result.push_back(']');
333
334 for (char C : StringRef(Word, WordN))
335 OS << " " << C << " ";
336 OS << "\n";
337 for (int I = 0, J = 0; I < WordN; I++)
338 OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
339 OS << "\n";
340 for (int I = 0; I < WordN; I++)
341 OS << format("%2d ", S[I]);
342 OS << "\n";
343
344 OS << "\nSegmentation:";
345 OS << "\n'" << StringRef(Word, WordN) << "'\n ";
346 for (int I = 0; I < WordN; ++I)
347 OS << "?-+ "[static_cast<int>(WordRole[I])];
348 OS << "\n[" << StringRef(Pat, PatN) << "]\n ";
349 for (int I = 0; I < PatN; ++I)
350 OS << "?-+ "[static_cast<int>(PatRole[I])];
351 OS << "\n";
352
353 OS << "\nScoring table (last-Miss, last-Match):\n";
354 OS << " | ";
355 for (char C : StringRef(Word, WordN))
356 OS << " " << C << " ";
357 OS << "\n";
358 OS << "-+----" << std::string(WordN * 4, '-') << "\n";
359 for (int I = 0; I <= PatN; ++I) {
360 for (Action A : {Miss, Match}) {
361 OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
362 for (int J = 0; J <= WordN; ++J) {
363 if (!isAwful(Scores[I][J][A].Score))
364 OS << format("%3d%c", Scores[I][J][A].Score,
365 Scores[I][J][A].Prev == Match ? '*' : ' ');
366 else
367 OS << " ";
368 }
369 OS << "\n";
370 }
371 }
372
373 return Result;
374}
Sam McCall8fed6342017-12-01 20:03:19 +0000375
376} // namespace clangd
377} // namespace clang