[clangd] Use xxhash instead of SHA1 for background index file digests.

Summary:
Currently SHA1 is about 10% of our CPU, this patch reduces it to ~1%.

xxhash is a well-defined (stable) non-cryptographic hash optimized for
fast checksums (like crc32).
Collisions shouldn't be a problem, despite the reduced length:
 - for actual file content (used to invalidate bg index shards), there
   are only two versions that can collide (new shard and old shard).
 - for file paths in bg index shard filenames, we would need 2^32 files
   with the same filename to expect a collision. Imperfect hashing may
   reduce this a bit but it's well beyond what's plausible.

This will invalidate shards on disk (as usual; I bumped the version),
but this time the filenames are changing so the old files will stick
around :-( So this is more expensive than the usual bump, but would be
good to land before the v9 branch when everyone will start using bg index.

Reviewers: kadircet

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64306

llvm-svn: 365311
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index 9fcaed4..5c715ba 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/xxhash.h"
 #include <algorithm>
 
 namespace clang {
@@ -376,7 +377,13 @@
 }
 
 FileDigest digest(llvm::StringRef Content) {
-  return llvm::SHA1::hash({(const uint8_t *)Content.data(), Content.size()});
+  uint64_t Hash{llvm::xxHash64(Content)};
+  FileDigest Result;
+  for (unsigned I = 0; I < Result.size(); ++I) {
+    Result[I] = uint8_t(Hash);
+    Hash >>= 8;
+  }
+  return Result;
 }
 
 llvm::Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) {
diff --git a/clang-tools-extra/clangd/SourceCode.h b/clang-tools-extra/clangd/SourceCode.h
index efa33f6..9532a31 100644
--- a/clang-tools-extra/clangd/SourceCode.h
+++ b/clang-tools-extra/clangd/SourceCode.h
@@ -22,7 +22,6 @@
 #include "clang/Tooling/Core/Replacement.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/SHA1.h"
 
 namespace clang {
 class SourceManager;
@@ -32,7 +31,7 @@
 // We tend to generate digests for source codes in a lot of different places.
 // This represents the type for those digests to prevent us hard coding details
 // of hashing function at every place that needs to store this information.
-using FileDigest = decltype(llvm::SHA1::hash({}));
+using FileDigest = std::array<uint8_t, 8>;
 FileDigest digest(StringRef Content);
 Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID);
 
diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp
index 7c89d1a..7f25241 100644
--- a/clang-tools-extra/clangd/index/Background.cpp
+++ b/clang-tools-extra/clangd/index/Background.cpp
@@ -32,7 +32,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/SHA1.h"
 #include "llvm/Support/Threading.h"
 
 #include <atomic>
diff --git a/clang-tools-extra/clangd/index/Background.h b/clang-tools-extra/clangd/index/Background.h
index e3d833e..a8f53c9 100644
--- a/clang-tools-extra/clangd/index/Background.h
+++ b/clang-tools-extra/clangd/index/Background.h
@@ -19,7 +19,6 @@
 #include "index/Serialization.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/Support/SHA1.h"
 #include "llvm/Support/Threading.h"
 #include <atomic>
 #include <condition_variable>
diff --git a/clang-tools-extra/clangd/index/BackgroundIndexStorage.cpp b/clang-tools-extra/clangd/index/BackgroundIndexStorage.cpp
index 42db1dc..80246b9 100644
--- a/clang-tools-extra/clangd/index/BackgroundIndexStorage.cpp
+++ b/clang-tools-extra/clangd/index/BackgroundIndexStorage.cpp
@@ -13,18 +13,11 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/SHA1.h"
 
 namespace clang {
 namespace clangd {
 namespace {
 
-using FileDigest = decltype(llvm::SHA1::hash({}));
-
-static FileDigest digest(StringRef Content) {
-  return llvm::SHA1::hash({(const uint8_t *)Content.data(), Content.size()});
-}
-
 std::string getShardPathFromFilePath(llvm::StringRef ShardRoot,
                                      llvm::StringRef FilePath) {
   llvm::SmallString<128> ShardRootSS(ShardRoot);
diff --git a/clang-tools-extra/clangd/index/Serialization.cpp b/clang-tools-extra/clangd/index/Serialization.cpp
index 5b3c7cf..188a5cc 100644
--- a/clang-tools-extra/clangd/index/Serialization.cpp
+++ b/clang-tools-extra/clangd/index/Serialization.cpp
@@ -444,7 +444,7 @@
 // The current versioning scheme is simple - non-current versions are rejected.
 // If you make a breaking change, bump this version number to invalidate stored
 // data. Later we may want to support some backward compatibility.
-constexpr static uint32_t Version = 11;
+constexpr static uint32_t Version = 12;
 
 llvm::Expected<IndexFileIn> readRIFF(llvm::StringRef Data) {
   auto RIFF = riff::readFile(Data);
diff --git a/clang-tools-extra/clangd/unittests/SerializationTests.cpp b/clang-tools-extra/clangd/unittests/SerializationTests.cpp
index b76ea82..4abb6f3 100644
--- a/clang-tools-extra/clangd/unittests/SerializationTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SerializationTests.cpp
@@ -10,7 +10,6 @@
 #include "index/Index.h"
 #include "index/Serialization.h"
 #include "clang/Tooling/CompilationDatabase.h"
-#include "llvm/Support/SHA1.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -208,9 +207,7 @@
 
   std::string TestContent("TestContent");
   IncludeGraphNode IGN;
-  IGN.Digest =
-      llvm::SHA1::hash({reinterpret_cast<const uint8_t *>(TestContent.data()),
-                        TestContent.size()});
+  IGN.Digest = digest(TestContent);
   IGN.DirectIncludes = {"inc1", "inc2"};
   IGN.URI = "URI";
   IGN.Flags |= IncludeGraphNode::SourceFlag::IsTU;