[clangd] Support UTF-32 (i.e. codepoint) offsets.
Summary:
(Changes to UTF-8/UTF-16 here are NFC, i.e. no functional change: things are
only moved around to make the cases more symmetrical.)
Reviewers: ilya-biryukov
Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D59927
llvm-svn: 357173
diff --git a/clang-tools-extra/unittests/clangd/SourceCodeTests.cpp b/clang-tools-extra/unittests/clangd/SourceCodeTests.cpp
index e09f42b..e38eaa3 100644
--- a/clang-tools-extra/unittests/clangd/SourceCodeTests.cpp
+++ b/clang-tools-extra/unittests/clangd/SourceCodeTests.cpp
@@ -58,6 +58,15 @@
EXPECT_EQ(lspLength("¥"), 2UL);
// astral
EXPECT_EQ(lspLength("😂"), 4UL);
+
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+ EXPECT_EQ(lspLength(""), 0UL);
+ EXPECT_EQ(lspLength("ascii"), 5UL);
+ // BMP
+ EXPECT_EQ(lspLength("↓"), 1UL);
+ EXPECT_EQ(lspLength("¥"), 1UL);
+ // astral
+ EXPECT_EQ(lspLength("😂"), 1UL);
}
// The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
@@ -131,6 +140,63 @@
EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
+ // Codepoint (UTF-32) offsets are similar, except near astral characters.
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+ // line out of bounds
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
+ // first line
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, -1)),
+ llvm::Failed()); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 0)),
+ llvm::HasValue(0)); // first character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 3)),
+ llvm::HasValue(3)); // middle character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 6)),
+ llvm::HasValue(6)); // last character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7)),
+ llvm::HasValue(7)); // the newline itself
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7), false),
+ llvm::HasValue(7));
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8)),
+ llvm::HasValue(7)); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8), false),
+ llvm::Failed()); // out of range
+ // middle line
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, -1)),
+ llvm::Failed()); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 0)),
+ llvm::HasValue(8)); // first character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3)),
+ llvm::HasValue(11)); // middle character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3), false),
+ llvm::HasValue(11));
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 6)),
+ llvm::HasValue(16)); // last character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 7)),
+ llvm::HasValue(17)); // the newline itself
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8)),
+ llvm::HasValue(17)); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8), false),
+ llvm::Failed()); // out of range
+ // last line
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, -1)),
+ llvm::Failed()); // out of range
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 0)),
+ llvm::HasValue(18)); // first character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 4)),
+ llvm::HasValue(22)); // before astral character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 5), false),
+ llvm::HasValue(26)); // after astral character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 7)),
+ llvm::HasValue(28)); // last character
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 8)),
+ llvm::HasValue(29)); // EOF
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 9), false),
+ llvm::Failed()); // out of range
+ // line out of bounds
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
+ EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
+
// Test UTF-8, where transformations are trivial.
WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
@@ -169,6 +235,27 @@
EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 9)) << "EOF";
EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 9)) << "out of bounds";
+ // Codepoint (UTF-32) offsets are similar, except near astral characters.
+ WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+ EXPECT_THAT(offsetToPosition(File, 0), Pos(0, 0)) << "start of file";
+ EXPECT_THAT(offsetToPosition(File, 3), Pos(0, 3)) << "in first line";
+ EXPECT_THAT(offsetToPosition(File, 6), Pos(0, 6)) << "end of first line";
+ EXPECT_THAT(offsetToPosition(File, 7), Pos(0, 7)) << "first newline";
+ EXPECT_THAT(offsetToPosition(File, 8), Pos(1, 0)) << "start of second line";
+ EXPECT_THAT(offsetToPosition(File, 12), Pos(1, 4)) << "before BMP char";
+ EXPECT_THAT(offsetToPosition(File, 13), Pos(1, 5)) << "in BMP char";
+ EXPECT_THAT(offsetToPosition(File, 15), Pos(1, 5)) << "after BMP char";
+ EXPECT_THAT(offsetToPosition(File, 16), Pos(1, 6)) << "end of second line";
+ EXPECT_THAT(offsetToPosition(File, 17), Pos(1, 7)) << "second newline";
+ EXPECT_THAT(offsetToPosition(File, 18), Pos(2, 0)) << "start of last line";
+ EXPECT_THAT(offsetToPosition(File, 21), Pos(2, 3)) << "in last line";
+ EXPECT_THAT(offsetToPosition(File, 22), Pos(2, 4)) << "before astral char";
+ EXPECT_THAT(offsetToPosition(File, 24), Pos(2, 5)) << "in astral char";
+ EXPECT_THAT(offsetToPosition(File, 26), Pos(2, 5)) << "after astral char";
+ EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 7)) << "end of last line";
+ EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 8)) << "EOF";
+ EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 8)) << "out of bounds";
+
WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
for (Line L : FileLines) {
for (unsigned I = 0; I <= L.Length; ++I)