//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements fuzzy-matching of strings against identifiers.
// It indicates both the existence and quality of a match:
// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
//
//===----------------------------------------------------------------------===//
| 15 | |
| 16 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H |
| 17 | #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H |
| 18 | |
| 19 | #include "llvm/ADT/Optional.h" |
| 20 | #include "llvm/ADT/SmallString.h" |
| 21 | #include "llvm/ADT/StringRef.h" |
| 22 | #include "llvm/Support/raw_ostream.h" |
| 23 | |
| 24 | namespace clang { |
| 25 | namespace clangd { |
| 26 | |
| 27 | // A matcher capable of matching and scoring strings against a single pattern. |
| 28 | // It's optimized for matching against many strings - match() does not allocate. |
| 29 | class FuzzyMatcher { |
| 30 | public: |
| 31 | // Characters beyond MaxPat are ignored. |
| 32 | FuzzyMatcher(llvm::StringRef Pattern); |
| 33 | |
| 34 | // If Word matches the pattern, return a score in [0,1] (higher is better). |
| 35 | // Characters beyond MaxWord are ignored. |
| 36 | llvm::Optional<float> match(llvm::StringRef Word); |
| 37 | |
| 38 | // Dump internal state from the last match() to the stream, for debugging. |
| 39 | // Returns the pattern with [] around matched characters, e.g. |
| 40 | // [u_p] + "unique_ptr" --> "[u]nique[_p]tr" |
| 41 | llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const; |
| 42 | |
| 43 | private: |
| 44 | // We truncate the pattern and the word to bound the cost of matching. |
| 45 | constexpr static int MaxPat = 63, MaxWord = 127; |
Sam McCall | 8e97cca | 2017-12-02 03:35:19 +0000 | [diff] [blame] | 46 | enum CharRole : unsigned char; // For segmentation. |
| 47 | enum CharType : unsigned char; // For segmentation. |
| 48 | enum Action : unsigned char { Miss = 0, Match = 1 }; |
Sam McCall | 8749641 | 2017-12-01 17:08:02 +0000 | [diff] [blame] | 49 | |
| 50 | bool init(llvm::StringRef Word); |
| 51 | void buildGraph(); |
| 52 | void calculateRoles(const char *Text, CharRole *Out, int N); |
| 53 | int skipPenalty(int W, Action Last); |
| 54 | int matchBonus(int P, int W, Action Last); |
| 55 | |
| 56 | // Pattern data is initialized by the constructor, then constant. |
| 57 | char Pat[MaxPat]; // Pattern data |
| 58 | int PatN; // Length |
| 59 | char LowPat[MaxPat]; // Pattern in lowercase |
| 60 | CharRole PatRole[MaxPat]; // Pattern segmentation info |
| 61 | bool CaseSensitive; // Case-sensitive match if pattern has uppercase |
| 62 | float ScoreScale; // Normalizes scores for the pattern length. |
| 63 | |
| 64 | // Word data is initialized on each call to match(), mostly by init(). |
| 65 | char Word[MaxWord]; // Word data |
| 66 | int WordN; // Length |
| 67 | char LowWord[MaxWord]; // Word in lowercase |
| 68 | CharRole WordRole[MaxWord]; // Word segmentation info |
| 69 | bool WordContainsPattern; // Simple substring check |
| 70 | |
| 71 | // Cumulative best-match score table. |
| 72 | // Boundary conditions are filled in by the constructor. |
| 73 | // The rest is repopulated for each match(), by buildGraph(). |
| 74 | struct ScoreInfo { |
| 75 | signed int Score : 15; |
| 76 | Action Prev : 1; |
| 77 | }; |
| 78 | ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2]; |
| 79 | }; |
| 80 | |
| 81 | } // namespace clangd |
| 82 | } // namespace clang |
| 83 | |
| 84 | #endif |