blob: cdafaf9636c54b349ca904b12c13fbd781282059 [file] [log] [blame]
Sam McCallb536a2a2017-12-19 12:23:48 +00001//===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===//
2//
Chandler Carruth2946cd72019-01-19 08:50:56 +00003// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Sam McCallb536a2a2017-12-19 12:23:48 +00006//
7//===----------------------------------------------------------------------===//
8#include "SourceCode.h"
9
Sam McCalla69698f2019-03-27 17:47:49 +000010#include "Context.h"
Marc-Andre Laperle1be69702018-07-05 19:35:01 +000011#include "Logger.h"
Sam McCalla69698f2019-03-27 17:47:49 +000012#include "Protocol.h"
Marc-Andre Laperle1be69702018-07-05 19:35:01 +000013#include "clang/AST/ASTContext.h"
Marc-Andre Laperle63a10982018-02-21 02:39:08 +000014#include "clang/Basic/SourceManager.h"
Marc-Andre Laperle1be69702018-07-05 19:35:01 +000015#include "clang/Lex/Lexer.h"
Ilya Biryukov43998782019-01-31 21:30:05 +000016#include "llvm/ADT/None.h"
17#include "llvm/ADT/StringRef.h"
Simon Marchi766338a2018-03-21 14:36:46 +000018#include "llvm/Support/Errc.h"
19#include "llvm/Support/Error.h"
Marc-Andre Laperle1be69702018-07-05 19:35:01 +000020#include "llvm/Support/Path.h"
Marc-Andre Laperle63a10982018-02-21 02:39:08 +000021
Sam McCallb536a2a2017-12-19 12:23:48 +000022namespace clang {
23namespace clangd {
Sam McCallb536a2a2017-12-19 12:23:48 +000024
Sam McCalla4962cc2018-04-27 11:59:28 +000025// Here be dragons. LSP positions use columns measured in *UTF-16 code units*!
26// Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial.
27
28// Iterates over unicode codepoints in the (UTF-8) string. For each,
29// invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true.
30// Returns true if CB returned true, false if we hit the end of string.
31template <typename Callback>
Ilya Biryukovf2001aa2019-01-07 15:45:19 +000032static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
Sam McCalla4962cc2018-04-27 11:59:28 +000033 for (size_t I = 0; I < U8.size();) {
34 unsigned char C = static_cast<unsigned char>(U8[I]);
35 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
36 if (CB(1, 1))
37 return true;
38 ++I;
39 continue;
40 }
41 // This convenient property of UTF-8 holds for all non-ASCII characters.
Ilya Biryukovf2001aa2019-01-07 15:45:19 +000042 size_t UTF8Length = llvm::countLeadingOnes(C);
Sam McCalla4962cc2018-04-27 11:59:28 +000043 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
44 // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
45 assert((UTF8Length >= 2 && UTF8Length <= 4) &&
46 "Invalid UTF-8, or transcoding bug?");
47 I += UTF8Length; // Skip over all trailing bytes.
48 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
49 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
50 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
51 return true;
52 }
53 return false;
54}
55
56// Returns the offset into the string that matches \p Units UTF-16 code units.
57// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
58// to UTF-8, and returns the length in bytes.
Ilya Biryukovf2001aa2019-01-07 15:45:19 +000059static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) {
Sam McCalla4962cc2018-04-27 11:59:28 +000060 size_t Result = 0;
61 Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
62 Result += U8Len;
63 U16Units -= U16Len;
64 return U16Units <= 0;
65 });
66 if (U16Units < 0) // Offset was into the middle of a surrogate pair.
67 Valid = false;
68 // Don't return an out-of-range index if we overran.
69 return std::min(Result, U8.size());
70}
71
Sam McCalla69698f2019-03-27 17:47:49 +000072Key<OffsetEncoding> kCurrentOffsetEncoding;
73static bool useUTF16ForLSP() {
74 auto *Enc = Context::current().get(kCurrentOffsetEncoding);
75 switch (Enc ? *Enc : OffsetEncoding::UTF16) {
76 case OffsetEncoding::UTF16:
77 return true;
78 case OffsetEncoding::UTF8:
79 return false;
80 case OffsetEncoding::UnsupportedEncoding:
81 llvm_unreachable("cannot use an unsupported encoding");
82 }
83}
84
Sam McCalla4962cc2018-04-27 11:59:28 +000085// Like most strings in clangd, the input is UTF-8 encoded.
Ilya Biryukovf2001aa2019-01-07 15:45:19 +000086size_t lspLength(llvm::StringRef Code) {
Sam McCalla69698f2019-03-27 17:47:49 +000087 if (!useUTF16ForLSP())
88 return Code.size();
Sam McCalla4962cc2018-04-27 11:59:28 +000089 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
90 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
91 size_t Count = 0;
Sam McCall71891122018-10-23 11:51:53 +000092 iterateCodepoints(Code, [&](int U8Len, int U16Len) {
Sam McCalla4962cc2018-04-27 11:59:28 +000093 Count += U16Len;
94 return false;
95 });
96 return Count;
97}
98
Ilya Biryukovf2001aa2019-01-07 15:45:19 +000099llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P,
100 bool AllowColumnsBeyondLineLength) {
Sam McCallb536a2a2017-12-19 12:23:48 +0000101 if (P.line < 0)
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000102 return llvm::make_error<llvm::StringError>(
103 llvm::formatv("Line value can't be negative ({0})", P.line),
104 llvm::errc::invalid_argument);
Simon Marchi766338a2018-03-21 14:36:46 +0000105 if (P.character < 0)
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000106 return llvm::make_error<llvm::StringError>(
107 llvm::formatv("Character value can't be negative ({0})", P.character),
108 llvm::errc::invalid_argument);
Sam McCallb536a2a2017-12-19 12:23:48 +0000109 size_t StartOfLine = 0;
110 for (int I = 0; I != P.line; ++I) {
111 size_t NextNL = Code.find('\n', StartOfLine);
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000112 if (NextNL == llvm::StringRef::npos)
113 return llvm::make_error<llvm::StringError>(
114 llvm::formatv("Line value is out of range ({0})", P.line),
115 llvm::errc::invalid_argument);
Sam McCallb536a2a2017-12-19 12:23:48 +0000116 StartOfLine = NextNL + 1;
117 }
Sam McCalla69698f2019-03-27 17:47:49 +0000118 StringRef Line =
119 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
Simon Marchi766338a2018-03-21 14:36:46 +0000120
Sam McCalla69698f2019-03-27 17:47:49 +0000121 if (!useUTF16ForLSP()) {
122 // Bounds-checking only.
123 if (P.character > int(Line.size())) {
124 if (AllowColumnsBeyondLineLength)
125 return StartOfLine + Line.size();
126 else
127 return llvm::make_error<llvm::StringError>(
128 llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character,
129 P.line),
130 llvm::errc::invalid_argument);
131 }
132 return StartOfLine + P.character;
133 }
134 // P.character is in UTF-16 code units, so we have to transcode.
Sam McCalla4962cc2018-04-27 11:59:28 +0000135 bool Valid;
Sam McCalla69698f2019-03-27 17:47:49 +0000136 size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid);
Sam McCalla4962cc2018-04-27 11:59:28 +0000137 if (!Valid && !AllowColumnsBeyondLineLength)
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000138 return llvm::make_error<llvm::StringError>(
139 llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
140 P.line),
141 llvm::errc::invalid_argument);
Sam McCalla4962cc2018-04-27 11:59:28 +0000142 return StartOfLine + ByteOffsetInLine;
Sam McCallb536a2a2017-12-19 12:23:48 +0000143}
144
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000145Position offsetToPosition(llvm::StringRef Code, size_t Offset) {
Sam McCallb536a2a2017-12-19 12:23:48 +0000146 Offset = std::min(Code.size(), Offset);
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000147 llvm::StringRef Before = Code.substr(0, Offset);
Sam McCallb536a2a2017-12-19 12:23:48 +0000148 int Lines = Before.count('\n');
149 size_t PrevNL = Before.rfind('\n');
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000150 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1);
Ilya Biryukov7beea3a2018-02-14 10:52:04 +0000151 Position Pos;
152 Pos.line = Lines;
Sam McCall71891122018-10-23 11:51:53 +0000153 Pos.character = lspLength(Before.substr(StartOfLine));
Ilya Biryukov7beea3a2018-02-14 10:52:04 +0000154 return Pos;
Sam McCallb536a2a2017-12-19 12:23:48 +0000155}
156
Marc-Andre Laperle63a10982018-02-21 02:39:08 +0000157Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) {
Sam McCalla4962cc2018-04-27 11:59:28 +0000158 // We use the SourceManager's line tables, but its column number is in bytes.
159 FileID FID;
160 unsigned Offset;
161 std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc);
Marc-Andre Laperle63a10982018-02-21 02:39:08 +0000162 Position P;
Sam McCalla4962cc2018-04-27 11:59:28 +0000163 P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1;
164 bool Invalid = false;
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000165 llvm::StringRef Code = SM.getBufferData(FID, &Invalid);
Sam McCalla4962cc2018-04-27 11:59:28 +0000166 if (!Invalid) {
167 auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1;
168 auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes);
Sam McCall71891122018-10-23 11:51:53 +0000169 P.character = lspLength(LineSoFar);
Sam McCalla4962cc2018-04-27 11:59:28 +0000170 }
Marc-Andre Laperle63a10982018-02-21 02:39:08 +0000171 return P;
172}
173
Ilya Biryukov43998782019-01-31 21:30:05 +0000174bool isValidFileRange(const SourceManager &Mgr, SourceRange R) {
175 if (!R.getBegin().isValid() || !R.getEnd().isValid())
176 return false;
177
178 FileID BeginFID;
179 size_t BeginOffset = 0;
180 std::tie(BeginFID, BeginOffset) = Mgr.getDecomposedLoc(R.getBegin());
181
182 FileID EndFID;
183 size_t EndOffset = 0;
184 std::tie(EndFID, EndOffset) = Mgr.getDecomposedLoc(R.getEnd());
185
186 return BeginFID.isValid() && BeginFID == EndFID && BeginOffset <= EndOffset;
187}
188
189bool halfOpenRangeContains(const SourceManager &Mgr, SourceRange R,
190 SourceLocation L) {
191 assert(isValidFileRange(Mgr, R));
192
193 FileID BeginFID;
194 size_t BeginOffset = 0;
195 std::tie(BeginFID, BeginOffset) = Mgr.getDecomposedLoc(R.getBegin());
196 size_t EndOffset = Mgr.getFileOffset(R.getEnd());
197
198 FileID LFid;
199 size_t LOffset;
200 std::tie(LFid, LOffset) = Mgr.getDecomposedLoc(L);
201 return BeginFID == LFid && BeginOffset <= LOffset && LOffset < EndOffset;
202}
203
204bool halfOpenRangeTouches(const SourceManager &Mgr, SourceRange R,
205 SourceLocation L) {
206 return L == R.getEnd() || halfOpenRangeContains(Mgr, R, L);
207}
208
209llvm::Optional<SourceRange> toHalfOpenFileRange(const SourceManager &Mgr,
210 const LangOptions &LangOpts,
211 SourceRange R) {
212 auto Begin = Mgr.getFileLoc(R.getBegin());
213 if (Begin.isInvalid())
214 return llvm::None;
215 auto End = Mgr.getFileLoc(R.getEnd());
216 if (End.isInvalid())
217 return llvm::None;
218 End = Lexer::getLocForEndOfToken(End, 0, Mgr, LangOpts);
219
220 SourceRange Result(Begin, End);
221 if (!isValidFileRange(Mgr, Result))
222 return llvm::None;
223 return Result;
224}
225
226llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) {
227 assert(isValidFileRange(SM, R));
228 bool Invalid = false;
229 auto *Buf = SM.getBuffer(SM.getFileID(R.getBegin()), &Invalid);
230 assert(!Invalid);
231
232 size_t BeginOffset = SM.getFileOffset(R.getBegin());
233 size_t EndOffset = SM.getFileOffset(R.getEnd());
234 return Buf->getBuffer().substr(BeginOffset, EndOffset - BeginOffset);
235}
236
Ilya Biryukovcce67a32019-01-29 14:17:36 +0000237llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM,
238 Position P) {
239 llvm::StringRef Code = SM.getBuffer(SM.getMainFileID())->getBuffer();
240 auto Offset =
241 positionToOffset(Code, P, /*AllowColumnBeyondLineLength=*/false);
242 if (!Offset)
243 return Offset.takeError();
244 return SM.getLocForStartOfFile(SM.getMainFileID()).getLocWithOffset(*Offset);
245}
246
Ilya Biryukov71028b82018-03-12 15:28:22 +0000247Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) {
248 // Clang is 1-based, LSP uses 0-based indexes.
249 Position Begin = sourceLocToPosition(SM, R.getBegin());
250 Position End = sourceLocToPosition(SM, R.getEnd());
251
252 return {Begin, End};
253}
254
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000255std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code,
Sam McCalla4962cc2018-04-27 11:59:28 +0000256 size_t Offset) {
257 Offset = std::min(Code.size(), Offset);
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000258 llvm::StringRef Before = Code.substr(0, Offset);
Sam McCalla4962cc2018-04-27 11:59:28 +0000259 int Lines = Before.count('\n');
260 size_t PrevNL = Before.rfind('\n');
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000261 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1);
Sam McCalla4962cc2018-04-27 11:59:28 +0000262 return {Lines + 1, Offset - StartOfLine + 1};
263}
264
Ilya Biryukov43998782019-01-31 21:30:05 +0000265std::pair<StringRef, StringRef> splitQualifiedName(StringRef QName) {
Marc-Andre Laperleb387b6e2018-04-23 20:00:52 +0000266 size_t Pos = QName.rfind("::");
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000267 if (Pos == llvm::StringRef::npos)
268 return {llvm::StringRef(), QName};
Marc-Andre Laperleb387b6e2018-04-23 20:00:52 +0000269 return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)};
270}
271
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000272TextEdit replacementToEdit(llvm::StringRef Code,
273 const tooling::Replacement &R) {
Eric Liu9133ecd2018-05-11 12:12:08 +0000274 Range ReplacementRange = {
275 offsetToPosition(Code, R.getOffset()),
276 offsetToPosition(Code, R.getOffset() + R.getLength())};
277 return {ReplacementRange, R.getReplacementText()};
278}
279
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000280std::vector<TextEdit> replacementsToEdits(llvm::StringRef Code,
Eric Liu9133ecd2018-05-11 12:12:08 +0000281 const tooling::Replacements &Repls) {
282 std::vector<TextEdit> Edits;
283 for (const auto &R : Repls)
284 Edits.push_back(replacementToEdit(Code, R));
285 return Edits;
286}
287
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000288llvm::Optional<std::string> getCanonicalPath(const FileEntry *F,
289 const SourceManager &SourceMgr) {
Kadir Cetinkayadd677932018-12-19 10:46:21 +0000290 if (!F)
291 return None;
Simon Marchi25f1f732018-08-10 22:27:53 +0000292
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000293 llvm::SmallString<128> FilePath = F->getName();
294 if (!llvm::sys::path::is_absolute(FilePath)) {
Kadir Cetinkayadd677932018-12-19 10:46:21 +0000295 if (auto EC =
Duncan P. N. Exon Smithdb8a7422019-03-26 22:32:06 +0000296 SourceMgr.getFileManager().getVirtualFileSystem().makeAbsolute(
Kadir Cetinkayadd677932018-12-19 10:46:21 +0000297 FilePath)) {
298 elog("Could not turn relative path '{0}' to absolute: {1}", FilePath,
299 EC.message());
Sam McCallc008af62018-10-20 15:30:37 +0000300 return None;
Marc-Andre Laperle1be69702018-07-05 19:35:01 +0000301 }
302 }
Simon Marchi25f1f732018-08-10 22:27:53 +0000303
Kadir Cetinkayadd677932018-12-19 10:46:21 +0000304 // Handle the symbolic link path case where the current working directory
305 // (getCurrentWorkingDirectory) is a symlink./ We always want to the real
306 // file path (instead of the symlink path) for the C++ symbols.
307 //
308 // Consider the following example:
309 //
310 // src dir: /project/src/foo.h
311 // current working directory (symlink): /tmp/build -> /project/src/
312 //
313 // The file path of Symbol is "/project/src/foo.h" instead of
314 // "/tmp/build/foo.h"
315 if (const DirectoryEntry *Dir = SourceMgr.getFileManager().getDirectory(
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000316 llvm::sys::path::parent_path(FilePath))) {
317 llvm::SmallString<128> RealPath;
318 llvm::StringRef DirName = SourceMgr.getFileManager().getCanonicalName(Dir);
319 llvm::sys::path::append(RealPath, DirName,
320 llvm::sys::path::filename(FilePath));
Kadir Cetinkayadd677932018-12-19 10:46:21 +0000321 return RealPath.str().str();
Simon Marchi25f1f732018-08-10 22:27:53 +0000322 }
323
Kadir Cetinkayadd677932018-12-19 10:46:21 +0000324 return FilePath.str().str();
Marc-Andre Laperle1be69702018-07-05 19:35:01 +0000325}
326
Kadir Cetinkaya2f84d912018-08-08 08:59:29 +0000327TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M,
328 const LangOptions &L) {
329 TextEdit Result;
330 Result.range =
331 halfOpenToRange(M, Lexer::makeFileCharRange(FixIt.RemoveRange, M, L));
332 Result.newText = FixIt.CodeToInsert;
333 return Result;
334}
335
Haojian Wuaa3ed5a2019-01-25 15:14:03 +0000336bool isRangeConsecutive(const Range &Left, const Range &Right) {
Kadir Cetinkayaa9c9d002018-08-13 08:23:01 +0000337 return Left.end.line == Right.start.line &&
338 Left.end.character == Right.start.character;
339}
340
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000341FileDigest digest(llvm::StringRef Content) {
Kadir Cetinkayad08eab42018-11-27 16:08:53 +0000342 return llvm::SHA1::hash({(const uint8_t *)Content.data(), Content.size()});
343}
344
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000345llvm::Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) {
Kadir Cetinkayad08eab42018-11-27 16:08:53 +0000346 bool Invalid = false;
Ilya Biryukovf2001aa2019-01-07 15:45:19 +0000347 llvm::StringRef Content = SM.getBufferData(FID, &Invalid);
Kadir Cetinkayad08eab42018-11-27 16:08:53 +0000348 if (Invalid)
349 return None;
350 return digest(Content);
351}
352
Eric Liudd662772019-01-28 14:01:55 +0000353format::FormatStyle getFormatStyleForFile(llvm::StringRef File,
354 llvm::StringRef Content,
355 llvm::vfs::FileSystem *FS) {
356 auto Style = format::getStyle(format::DefaultFormatStyle, File,
357 format::DefaultFallbackStyle, Content, FS);
358 if (!Style) {
359 log("getStyle() failed for file {0}: {1}. Fallback is LLVM style.", File,
360 Style.takeError());
361 Style = format::getLLVMStyle();
362 }
363 return *Style;
364}
365
Haojian Wu12e194c2019-02-06 15:24:50 +0000366llvm::Expected<tooling::Replacements>
367cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces,
368 const format::FormatStyle &Style) {
369 auto CleanReplaces = cleanupAroundReplacements(Code, Replaces, Style);
370 if (!CleanReplaces)
371 return CleanReplaces;
372 return formatReplacements(Code, std::move(*CleanReplaces), Style);
373}
374
Sam McCallb536a2a2017-12-19 12:23:48 +0000375} // namespace clangd
376} // namespace clang