//===--- SourceCode.cpp - Manipulating source code as strings ---*- C++ -*-===//
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | #include "SourceCode.h" |
| 10 | |
Marc-Andre Laperle | 1be6970 | 2018-07-05 19:35:01 +0000 | [diff] [blame] | 11 | #include "Logger.h" |
| 12 | #include "clang/AST/ASTContext.h" |
Marc-Andre Laperle | 63a1098 | 2018-02-21 02:39:08 +0000 | [diff] [blame] | 13 | #include "clang/Basic/SourceManager.h" |
Marc-Andre Laperle | 1be6970 | 2018-07-05 19:35:01 +0000 | [diff] [blame] | 14 | #include "clang/Lex/Lexer.h" |
Simon Marchi | 766338a | 2018-03-21 14:36:46 +0000 | [diff] [blame] | 15 | #include "llvm/Support/Errc.h" |
| 16 | #include "llvm/Support/Error.h" |
Marc-Andre Laperle | 1be6970 | 2018-07-05 19:35:01 +0000 | [diff] [blame] | 17 | #include "llvm/Support/Path.h" |
Marc-Andre Laperle | 63a1098 | 2018-02-21 02:39:08 +0000 | [diff] [blame] | 18 | |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 19 | using namespace llvm; |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 20 | namespace clang { |
| 21 | namespace clangd { |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 22 | |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 23 | // Here be dragons. LSP positions use columns measured in *UTF-16 code units*! |
| 24 | // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. |
| 25 | |
| 26 | // Iterates over unicode codepoints in the (UTF-8) string. For each, |
| 27 | // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. |
| 28 | // Returns true if CB returned true, false if we hit the end of string. |
| 29 | template <typename Callback> |
| 30 | static bool iterateCodepoints(StringRef U8, const Callback &CB) { |
| 31 | for (size_t I = 0; I < U8.size();) { |
| 32 | unsigned char C = static_cast<unsigned char>(U8[I]); |
| 33 | if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. |
| 34 | if (CB(1, 1)) |
| 35 | return true; |
| 36 | ++I; |
| 37 | continue; |
| 38 | } |
| 39 | // This convenient property of UTF-8 holds for all non-ASCII characters. |
| 40 | size_t UTF8Length = countLeadingOnes(C); |
| 41 | // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. |
| 42 | // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug. |
| 43 | assert((UTF8Length >= 2 && UTF8Length <= 4) && |
| 44 | "Invalid UTF-8, or transcoding bug?"); |
| 45 | I += UTF8Length; // Skip over all trailing bytes. |
| 46 | // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). |
| 47 | // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) |
| 48 | if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) |
| 49 | return true; |
| 50 | } |
| 51 | return false; |
| 52 | } |
| 53 | |
| 54 | // Returns the offset into the string that matches \p Units UTF-16 code units. |
| 55 | // Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back |
| 56 | // to UTF-8, and returns the length in bytes. |
| 57 | static size_t measureUTF16(StringRef U8, int U16Units, bool &Valid) { |
| 58 | size_t Result = 0; |
| 59 | Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) { |
| 60 | Result += U8Len; |
| 61 | U16Units -= U16Len; |
| 62 | return U16Units <= 0; |
| 63 | }); |
| 64 | if (U16Units < 0) // Offset was into the middle of a surrogate pair. |
| 65 | Valid = false; |
| 66 | // Don't return an out-of-range index if we overran. |
| 67 | return std::min(Result, U8.size()); |
| 68 | } |
| 69 | |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 70 | // Like most strings in clangd, the input is UTF-8 encoded. |
Sam McCall | 7189112 | 2018-10-23 11:51:53 +0000 | [diff] [blame] | 71 | size_t lspLength(StringRef Code) { |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 72 | // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). |
| 73 | // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. |
| 74 | size_t Count = 0; |
Sam McCall | 7189112 | 2018-10-23 11:51:53 +0000 | [diff] [blame] | 75 | iterateCodepoints(Code, [&](int U8Len, int U16Len) { |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 76 | Count += U16Len; |
| 77 | return false; |
| 78 | }); |
| 79 | return Count; |
| 80 | } |
| 81 | |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 82 | Expected<size_t> positionToOffset(StringRef Code, Position P, |
| 83 | bool AllowColumnsBeyondLineLength) { |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 84 | if (P.line < 0) |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 85 | return make_error<StringError>( |
| 86 | formatv("Line value can't be negative ({0})", P.line), |
| 87 | errc::invalid_argument); |
Simon Marchi | 766338a | 2018-03-21 14:36:46 +0000 | [diff] [blame] | 88 | if (P.character < 0) |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 89 | return make_error<StringError>( |
| 90 | formatv("Character value can't be negative ({0})", P.character), |
| 91 | errc::invalid_argument); |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 92 | size_t StartOfLine = 0; |
| 93 | for (int I = 0; I != P.line; ++I) { |
| 94 | size_t NextNL = Code.find('\n', StartOfLine); |
| 95 | if (NextNL == StringRef::npos) |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 96 | return make_error<StringError>( |
| 97 | formatv("Line value is out of range ({0})", P.line), |
| 98 | errc::invalid_argument); |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 99 | StartOfLine = NextNL + 1; |
| 100 | } |
Simon Marchi | 766338a | 2018-03-21 14:36:46 +0000 | [diff] [blame] | 101 | |
| 102 | size_t NextNL = Code.find('\n', StartOfLine); |
| 103 | if (NextNL == StringRef::npos) |
| 104 | NextNL = Code.size(); |
| 105 | |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 106 | bool Valid; |
| 107 | size_t ByteOffsetInLine = measureUTF16( |
| 108 | Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid); |
| 109 | if (!Valid && !AllowColumnsBeyondLineLength) |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 110 | return make_error<StringError>( |
| 111 | formatv("UTF-16 offset {0} is invalid for line {1}", P.character, |
| 112 | P.line), |
| 113 | errc::invalid_argument); |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 114 | return StartOfLine + ByteOffsetInLine; |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 115 | } |
| 116 | |
| 117 | Position offsetToPosition(StringRef Code, size_t Offset) { |
| 118 | Offset = std::min(Code.size(), Offset); |
| 119 | StringRef Before = Code.substr(0, Offset); |
| 120 | int Lines = Before.count('\n'); |
| 121 | size_t PrevNL = Before.rfind('\n'); |
| 122 | size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); |
Ilya Biryukov | 7beea3a | 2018-02-14 10:52:04 +0000 | [diff] [blame] | 123 | Position Pos; |
| 124 | Pos.line = Lines; |
Sam McCall | 7189112 | 2018-10-23 11:51:53 +0000 | [diff] [blame] | 125 | Pos.character = lspLength(Before.substr(StartOfLine)); |
Ilya Biryukov | 7beea3a | 2018-02-14 10:52:04 +0000 | [diff] [blame] | 126 | return Pos; |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 127 | } |
| 128 | |
Marc-Andre Laperle | 63a1098 | 2018-02-21 02:39:08 +0000 | [diff] [blame] | 129 | Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 130 | // We use the SourceManager's line tables, but its column number is in bytes. |
| 131 | FileID FID; |
| 132 | unsigned Offset; |
| 133 | std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc); |
Marc-Andre Laperle | 63a1098 | 2018-02-21 02:39:08 +0000 | [diff] [blame] | 134 | Position P; |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 135 | P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1; |
| 136 | bool Invalid = false; |
| 137 | StringRef Code = SM.getBufferData(FID, &Invalid); |
| 138 | if (!Invalid) { |
| 139 | auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; |
| 140 | auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); |
Sam McCall | 7189112 | 2018-10-23 11:51:53 +0000 | [diff] [blame] | 141 | P.character = lspLength(LineSoFar); |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 142 | } |
Marc-Andre Laperle | 63a1098 | 2018-02-21 02:39:08 +0000 | [diff] [blame] | 143 | return P; |
| 144 | } |
| 145 | |
Ilya Biryukov | 71028b8 | 2018-03-12 15:28:22 +0000 | [diff] [blame] | 146 | Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) { |
| 147 | // Clang is 1-based, LSP uses 0-based indexes. |
| 148 | Position Begin = sourceLocToPosition(SM, R.getBegin()); |
| 149 | Position End = sourceLocToPosition(SM, R.getEnd()); |
| 150 | |
| 151 | return {Begin, End}; |
| 152 | } |
| 153 | |
Sam McCall | a4962cc | 2018-04-27 11:59:28 +0000 | [diff] [blame] | 154 | std::pair<size_t, size_t> offsetToClangLineColumn(StringRef Code, |
| 155 | size_t Offset) { |
| 156 | Offset = std::min(Code.size(), Offset); |
| 157 | StringRef Before = Code.substr(0, Offset); |
| 158 | int Lines = Before.count('\n'); |
| 159 | size_t PrevNL = Before.rfind('\n'); |
| 160 | size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1); |
| 161 | return {Lines + 1, Offset - StartOfLine + 1}; |
| 162 | } |
| 163 | |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 164 | std::pair<StringRef, StringRef> splitQualifiedName(StringRef QName) { |
Marc-Andre Laperle | b387b6e | 2018-04-23 20:00:52 +0000 | [diff] [blame] | 165 | size_t Pos = QName.rfind("::"); |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 166 | if (Pos == StringRef::npos) |
Marc-Andre Laperle | b387b6e | 2018-04-23 20:00:52 +0000 | [diff] [blame] | 167 | return {StringRef(), QName}; |
| 168 | return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)}; |
| 169 | } |
| 170 | |
Eric Liu | 9133ecd | 2018-05-11 12:12:08 +0000 | [diff] [blame] | 171 | TextEdit replacementToEdit(StringRef Code, const tooling::Replacement &R) { |
| 172 | Range ReplacementRange = { |
| 173 | offsetToPosition(Code, R.getOffset()), |
| 174 | offsetToPosition(Code, R.getOffset() + R.getLength())}; |
| 175 | return {ReplacementRange, R.getReplacementText()}; |
| 176 | } |
| 177 | |
| 178 | std::vector<TextEdit> replacementsToEdits(StringRef Code, |
| 179 | const tooling::Replacements &Repls) { |
| 180 | std::vector<TextEdit> Edits; |
| 181 | for (const auto &R : Repls) |
| 182 | Edits.push_back(replacementToEdit(Code, R)); |
| 183 | return Edits; |
| 184 | } |
| 185 | |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 186 | Optional<std::string> getRealPath(const FileEntry *F, |
| 187 | const SourceManager &SourceMgr) { |
Simon Marchi | 25f1f73 | 2018-08-10 22:27:53 +0000 | [diff] [blame] | 188 | // Ideally, we get the real path from the FileEntry object. |
| 189 | SmallString<128> FilePath = F->tryGetRealPathName(); |
| 190 | if (!FilePath.empty()) { |
| 191 | return FilePath.str().str(); |
| 192 | } |
| 193 | |
| 194 | // Otherwise, we try to compute ourselves. |
| 195 | vlog("FileEntry for {0} did not contain the real path.", F->getName()); |
| 196 | |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 197 | SmallString<128> Path = F->getName(); |
Simon Marchi | 25f1f73 | 2018-08-10 22:27:53 +0000 | [diff] [blame] | 198 | |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 199 | if (!sys::path::is_absolute(Path)) { |
Simon Marchi | 25f1f73 | 2018-08-10 22:27:53 +0000 | [diff] [blame] | 200 | if (!SourceMgr.getFileManager().makeAbsolutePath(Path)) { |
| 201 | log("Could not turn relative path to absolute: {0}", Path); |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 202 | return None; |
Marc-Andre Laperle | 1be6970 | 2018-07-05 19:35:01 +0000 | [diff] [blame] | 203 | } |
| 204 | } |
Simon Marchi | 25f1f73 | 2018-08-10 22:27:53 +0000 | [diff] [blame] | 205 | |
Sam McCall | c008af6 | 2018-10-20 15:30:37 +0000 | [diff] [blame] | 206 | SmallString<128> RealPath; |
Simon Marchi | 25f1f73 | 2018-08-10 22:27:53 +0000 | [diff] [blame] | 207 | if (SourceMgr.getFileManager().getVirtualFileSystem()->getRealPath( |
| 208 | Path, RealPath)) { |
| 209 | log("Could not compute real path: {0}", Path); |
| 210 | return Path.str().str(); |
| 211 | } |
| 212 | |
| 213 | return RealPath.str().str(); |
Marc-Andre Laperle | 1be6970 | 2018-07-05 19:35:01 +0000 | [diff] [blame] | 214 | } |
| 215 | |
Kadir Cetinkaya | 2f84d91 | 2018-08-08 08:59:29 +0000 | [diff] [blame] | 216 | TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, |
| 217 | const LangOptions &L) { |
| 218 | TextEdit Result; |
| 219 | Result.range = |
| 220 | halfOpenToRange(M, Lexer::makeFileCharRange(FixIt.RemoveRange, M, L)); |
| 221 | Result.newText = FixIt.CodeToInsert; |
| 222 | return Result; |
| 223 | } |
| 224 | |
Kadir Cetinkaya | a9c9d00 | 2018-08-13 08:23:01 +0000 | [diff] [blame] | 225 | bool IsRangeConsecutive(const Range &Left, const Range &Right) { |
| 226 | return Left.end.line == Right.start.line && |
| 227 | Left.end.character == Right.start.character; |
| 228 | } |
| 229 | |
Kadir Cetinkaya | d08eab4 | 2018-11-27 16:08:53 +0000 | [diff] [blame^] | 230 | FileDigest digest(StringRef Content) { |
| 231 | return llvm::SHA1::hash({(const uint8_t *)Content.data(), Content.size()}); |
| 232 | } |
| 233 | |
| 234 | Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) { |
| 235 | bool Invalid = false; |
| 236 | StringRef Content = SM.getBufferData(FID, &Invalid); |
| 237 | if (Invalid) |
| 238 | return None; |
| 239 | return digest(Content); |
| 240 | } |
| 241 | |
Sam McCall | b536a2a | 2017-12-19 12:23:48 +0000 | [diff] [blame] | 242 | } // namespace clangd |
| 243 | } // namespace clang |