[clangd] Support UTF-32 (i.e. codepoint) offsets.
Summary:
(Changes to the UTF-8/UTF-16 code paths here are NFC, i.e. no functional change:
they only move things around to make the three cases more symmetrical.)
Reviewers: ilya-biryukov
Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D59927
llvm-svn: 357173
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index cdafaf9..656ab1d 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Path.h"
namespace clang {
@@ -30,6 +31,8 @@
// Returns true if CB returned true, false if we hit the end of string.
template <typename Callback>
static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
+ // A codepoint takes two UTF-16 code units if it's astral (outside BMP).
+ // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
for (size_t I = 0; I < U8.size();) {
unsigned char C = static_cast<unsigned char>(U8[I]);
if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
@@ -53,46 +56,75 @@
return false;
}
-// Returns the offset into the string that matches \p Units UTF-16 code units.
-// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
-// to UTF-8, and returns the length in bytes.
-static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) {
+// Returns the byte offset into \p U8 that corresponds to an offset of
+// \p Units code units in the encoding \p Enc.
+// Conceptually, this converts to the encoding, truncates to \p Units code
+// units, converts back to UTF-8, and returns the length in bytes.
+static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc,
+ bool &Valid) {
+ Valid = Units >= 0;
+ if (Units <= 0)
+ return 0;
size_t Result = 0;
- Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
- Result += U8Len;
- U16Units -= U16Len;
- return U16Units <= 0;
- });
- if (U16Units < 0) // Offset was into the middle of a surrogate pair.
- Valid = false;
+ switch (Enc) {
+ case OffsetEncoding::UTF8:
+ Result = Units;
+ break;
+ case OffsetEncoding::UTF16:
+ Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+ Result += U8Len;
+ Units -= U16Len;
+ return Units <= 0;
+ });
+ if (Units < 0) // Offset in the middle of a surrogate pair.
+ Valid = false;
+ break;
+ case OffsetEncoding::UTF32:
+ Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+ Result += U8Len;
+ Units--;
+ return Units <= 0;
+ });
+ break;
+ case OffsetEncoding::UnsupportedEncoding:
+ llvm_unreachable("unsupported encoding");
+ }
// Don't return an out-of-range index if we overran.
- return std::min(Result, U8.size());
+ if (Result > U8.size()) {
+ Valid = false;
+ return U8.size();
+ }
+ return Result;
}
Key<OffsetEncoding> kCurrentOffsetEncoding;
-static bool useUTF16ForLSP() {
+static OffsetEncoding lspEncoding() {
auto *Enc = Context::current().get(kCurrentOffsetEncoding);
- switch (Enc ? *Enc : OffsetEncoding::UTF16) {
- case OffsetEncoding::UTF16:
- return true;
- case OffsetEncoding::UTF8:
- return false;
- case OffsetEncoding::UnsupportedEncoding:
- llvm_unreachable("cannot use an unsupported encoding");
- }
+ return Enc ? *Enc : OffsetEncoding::UTF16;
}
// Like most strings in clangd, the input is UTF-8 encoded.
size_t lspLength(llvm::StringRef Code) {
- if (!useUTF16ForLSP())
- return Code.size();
- // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
- // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
size_t Count = 0;
- iterateCodepoints(Code, [&](int U8Len, int U16Len) {
- Count += U16Len;
- return false;
- });
+ switch (lspEncoding()) {
+ case OffsetEncoding::UTF8:
+ Count = Code.size();
+ break;
+ case OffsetEncoding::UTF16:
+ iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+ Count += U16Len;
+ return false;
+ });
+ break;
+ case OffsetEncoding::UTF32:
+ iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+ ++Count;
+ return false;
+ });
+ break;
+ case OffsetEncoding::UnsupportedEncoding:
+ llvm_unreachable("unsupported encoding");
+ }
return Count;
}
@@ -118,28 +150,15 @@
StringRef Line =
Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
- if (!useUTF16ForLSP()) {
- // Bounds-checking only.
- if (P.character > int(Line.size())) {
- if (AllowColumnsBeyondLineLength)
- return StartOfLine + Line.size();
- else
- return llvm::make_error<llvm::StringError>(
- llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character,
- P.line),
- llvm::errc::invalid_argument);
- }
- return StartOfLine + P.character;
- }
- // P.character is in UTF-16 code units, so we have to transcode.
+ // P.character is in LSP-encoding units (UTF-8/16/32); convert to bytes.
bool Valid;
- size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid);
+ size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);
if (!Valid && !AllowColumnsBeyondLineLength)
return llvm::make_error<llvm::StringError>(
- llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
- P.line),
+ llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(),
+ P.character, P.line),
llvm::errc::invalid_argument);
- return StartOfLine + ByteOffsetInLine;
+ return StartOfLine + ByteInLine;
}
Position offsetToPosition(llvm::StringRef Code, size_t Offset) {