Sam McCall | 8749641 | 2017-12-01 17:08:02 +0000 | [diff] [blame] | 1 | //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // This file implements fuzzy-matching of strings against identifiers. |
| 11 | // It indicates both the existence and quality of a match: |
| 12 | // 'eb' matches both 'emplace_back' and 'embed', the former has a better score. |
| 13 | // |
| 14 | //===----------------------------------------------------------------------===// |
| 15 | |
| 16 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H |
| 17 | #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H |
| 18 | |
| 19 | #include "llvm/ADT/Optional.h" |
| 20 | #include "llvm/ADT/SmallString.h" |
| 21 | #include "llvm/ADT/StringRef.h" |
| 22 | #include "llvm/Support/raw_ostream.h" |
| 23 | |
| 24 | namespace clang { |
| 25 | namespace clangd { |
| 26 | |
| 27 | // A matcher capable of matching and scoring strings against a single pattern. |
| 28 | // It's optimized for matching against many strings - match() does not allocate. |
| 29 | class FuzzyMatcher { |
| 30 | public: |
| 31 | // Characters beyond MaxPat are ignored. |
| 32 | FuzzyMatcher(llvm::StringRef Pattern); |
| 33 | |
| 34 | // If Word matches the pattern, return a score in [0,1] (higher is better). |
| 35 | // Characters beyond MaxWord are ignored. |
| 36 | llvm::Optional<float> match(llvm::StringRef Word); |
| 37 | |
Sam McCall | 545a20d | 2018-01-19 14:34:02 +0000 | [diff] [blame] | 38 | llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); } |
| 39 | bool empty() const { return PatN == 0; } |
Sam McCall | 84652cc | 2018-01-12 16:16:09 +0000 | [diff] [blame] | 40 | |
Sam McCall | 8749641 | 2017-12-01 17:08:02 +0000 | [diff] [blame] | 41 | // Dump internal state from the last match() to the stream, for debugging. |
| 42 | // Returns the pattern with [] around matched characters, e.g. |
| 43 | // [u_p] + "unique_ptr" --> "[u]nique[_p]tr" |
| 44 | llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const; |
| 45 | |
| 46 | private: |
| 47 | // We truncate the pattern and the word to bound the cost of matching. |
| 48 | constexpr static int MaxPat = 63, MaxWord = 127; |
Sam McCall | 8e97cca | 2017-12-02 03:35:19 +0000 | [diff] [blame] | 49 | enum CharRole : unsigned char; // For segmentation. |
| 50 | enum CharType : unsigned char; // For segmentation. |
Sam McCall | 3ea9640 | 2017-12-02 04:15:55 +0000 | [diff] [blame] | 51 | // Action should be an enum, but this causes bitfield problems: |
| 52 | // - for MSVC the enum type must be explicitly unsigned for correctness |
| 53 | // - GCC 4.8 complains not all values fit if the type is unsigned |
| 54 | using Action = bool; |
| 55 | constexpr static Action Miss = false, Match = true; |
Sam McCall | 8749641 | 2017-12-01 17:08:02 +0000 | [diff] [blame] | 56 | |
| 57 | bool init(llvm::StringRef Word); |
| 58 | void buildGraph(); |
Sam McCall | 77a719c | 2018-03-05 17:34:33 +0000 | [diff] [blame] | 59 | void calculateRoles(const char *Text, CharRole *Out, int &Types, int N); |
| 60 | bool allowMatch(int P, int W) const; |
| 61 | int skipPenalty(int W, Action Last) const; |
| 62 | int matchBonus(int P, int W, Action Last) const; |
Sam McCall | 8749641 | 2017-12-01 17:08:02 +0000 | [diff] [blame] | 63 | |
| 64 | // Pattern data is initialized by the constructor, then constant. |
| 65 | char Pat[MaxPat]; // Pattern data |
| 66 | int PatN; // Length |
| 67 | char LowPat[MaxPat]; // Pattern in lowercase |
| 68 | CharRole PatRole[MaxPat]; // Pattern segmentation info |
Sam McCall | 77a719c | 2018-03-05 17:34:33 +0000 | [diff] [blame] | 69 | int PatTypeSet; // Bitmask of 1<<CharType |
Sam McCall | 8749641 | 2017-12-01 17:08:02 +0000 | [diff] [blame] | 70 | float ScoreScale; // Normalizes scores for the pattern length. |
| 71 | |
| 72 | // Word data is initialized on each call to match(), mostly by init(). |
| 73 | char Word[MaxWord]; // Word data |
| 74 | int WordN; // Length |
| 75 | char LowWord[MaxWord]; // Word in lowercase |
| 76 | CharRole WordRole[MaxWord]; // Word segmentation info |
Sam McCall | 77a719c | 2018-03-05 17:34:33 +0000 | [diff] [blame] | 77 | int WordTypeSet; // Bitmask of 1<<CharType |
Sam McCall | 8749641 | 2017-12-01 17:08:02 +0000 | [diff] [blame] | 78 | bool WordContainsPattern; // Simple substring check |
| 79 | |
| 80 | // Cumulative best-match score table. |
| 81 | // Boundary conditions are filled in by the constructor. |
| 82 | // The rest is repopulated for each match(), by buildGraph(). |
| 83 | struct ScoreInfo { |
| 84 | signed int Score : 15; |
| 85 | Action Prev : 1; |
| 86 | }; |
| 87 | ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2]; |
| 88 | }; |
| 89 | |
| 90 | } // namespace clangd |
| 91 | } // namespace clang |
| 92 | |
| 93 | #endif |