Merge remote-tracking branch 'goog/androidx-platform-dev' into sc-dev am: 46325e158d am: 191864b1de

Original change: https://googleplex-android-review.googlesource.com/c/platform/external/icing/+/14106922

Change-Id: Ib26ba5b1fd95f694fc3768279221b9cbeb1feff2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a740924..70f6852 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,7 +45,7 @@
 # Compile libandroidicu
 set(ICU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../icu/libandroidicu")
 set(ICU_TARGET_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/icu-target")
-add_subdirectory(${ICU_SOURCE_DIR} ${ICU_TARGET_BINARY_DIR})
+add_subdirectory("${ICU_SOURCE_DIR}/static_shim" ${ICU_TARGET_BINARY_DIR})
 
 # Glob Icing proto sources. Results look like this: icing/proto/document.proto
 file(
diff --git a/icing/file/destructible-file.h b/icing/file/destructible-file.h
new file mode 100644
index 0000000..006dcb4
--- /dev/null
+++ b/icing/file/destructible-file.h
@@ -0,0 +1,72 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_DESTRUCTIBLE_FILE_H_
+#define ICING_FILE_DESTRUCTIBLE_FILE_H_
+
+#include <unistd.h>
+
+#include <string>
+
+#include "icing/file/filesystem.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+// A convenient RAII class which will open the specified file path for write and
+// delete the underlying file upon destruction.
+class DestructibleFile {
+ public:
+  explicit DestructibleFile(const std::string& filepath,
+                            const Filesystem* filesystem)
+      : filesystem_(filesystem), filepath_(filepath) {
+    fd_ = filesystem_->OpenForWrite(filepath_.c_str());
+  }
+
+  DestructibleFile(const DestructibleFile&) = delete;
+  DestructibleFile(DestructibleFile&& other) : filesystem_(nullptr), fd_(-1) {
+    *this = std::move(other);
+  }
+
+  DestructibleFile& operator=(const DestructibleFile&) = delete;
+  DestructibleFile& operator=(DestructibleFile&& other) {
+    std::swap(fd_, other.fd_);
+    std::swap(filesystem_, other.filesystem_);
+    std::swap(filepath_, other.filepath_);
+    return *this;
+  }
+
+  ~DestructibleFile() {
+    if (is_valid()) {
+      close(fd_);
+      if (!filesystem_->DeleteFile(filepath_.c_str())) {
+        ICING_VLOG(1) << "Failed to delete file " << filepath_;
+      }
+    }
+  }
+
+  bool is_valid() const { return fd_ >= 0; }
+  int get_fd() const { return fd_; }
+
+ private:
+  const Filesystem* filesystem_;
+  std::string filepath_;
+  int fd_;
+};
+
+}  // namespace lib
+}  // namespace icing
+
+#endif  // ICING_FILE_DESTRUCTIBLE_FILE_H_
diff --git a/icing/file/destructible-file_test.cc b/icing/file/destructible-file_test.cc
new file mode 100644
index 0000000..61316d1
--- /dev/null
+++ b/icing/file/destructible-file_test.cc
@@ -0,0 +1,117 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/destructible-file.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+TEST(DestructibleFileTest, DeletesFileProperly) {
+  Filesystem filesystem;
+  std::string filepath1 = GetTestTempDir() + "/file1";
+
+  {
+    // 1. Create the file
+    ScopedFd sfd(filesystem.OpenForWrite(filepath1.c_str()));
+    ASSERT_TRUE(sfd.is_valid());
+    int i = 127;
+    ASSERT_TRUE(filesystem.Write(sfd.get(), &i, sizeof(i)));
+  }
+
+  {
+    // 2. Open with a Destructible file.
+    DestructibleFile destructible(filepath1, &filesystem);
+    ASSERT_TRUE(destructible.is_valid());
+  }
+
+  // 3. Ensure that the file doesn't exist.
+  EXPECT_FALSE(filesystem.FileExists(filepath1.c_str()));
+}
+
+TEST(DestructibleFileTest, MoveAssignDeletesFileProperly) {
+  Filesystem filesystem;
+  std::string filepath1 = GetTestTempDir() + "/file1";
+  std::string filepath2 = GetTestTempDir() + "/file2";
+
+  // 1. Create file1
+  DestructibleFile destructible1(filepath1, &filesystem);
+  ASSERT_TRUE(destructible1.is_valid());
+  int i = 127;
+  ASSERT_TRUE(filesystem.Write(destructible1.get_fd(), &i, sizeof(i)));
+
+  {
+    // 2. Create file2
+    DestructibleFile destructible2(filepath2, &filesystem);
+    ASSERT_TRUE(destructible2.is_valid());
+    i = 458;
+    ASSERT_TRUE(filesystem.Write(destructible2.get_fd(), &i, sizeof(i)));
+
+    // Move assign destructible2 into destructible1
+    destructible1 = std::move(destructible2);
+  }
+
+  // 3. file1 shouldn't exist because it was destroyed when destructible1 was
+  // move assigned to.
+  EXPECT_FALSE(filesystem.FileExists(filepath1.c_str()));
+
+  // 4. file2 should still exist because it moved into destructible1 from
+  // destructible2.
+  EXPECT_TRUE(filesystem.FileExists(filepath2.c_str()));
+}
+
+TEST(DestructibleFileTest, MoveConstructionDeletesFileProperly) {
+  Filesystem filesystem;
+  std::string filepath1 = GetTestTempDir() + "/file1";
+
+  // 1. Create destructible1, it'll be reconstructed soon anyways.
+  std::unique_ptr<DestructibleFile> destructible1;
+  {
+    // 2. Create file1
+    DestructibleFile destructible2(filepath1, &filesystem);
+    ASSERT_TRUE(destructible2.is_valid());
+    int i = 458;
+    ASSERT_TRUE(filesystem.Write(destructible2.get_fd(), &i, sizeof(i)));
+
+    // Move construct destructible1 from destructible2
+    destructible1 =
+        std::make_unique<DestructibleFile>(std::move(destructible2));
+  }
+
+  // 3. file1 should still exist because it moved into destructible1 from
+  // destructible2.
+  ASSERT_TRUE(destructible1->is_valid());
+  EXPECT_TRUE(filesystem.FileExists(filepath1.c_str()));
+
+  {
+    // 4. Move construct destructible3 from destructible1
+    DestructibleFile destructible3(std::move(*destructible1));
+    ASSERT_TRUE(destructible3.is_valid());
+  }
+
+  // 5. file1 shouldn't exist because it was destroyed when destructible3 was
+  // destroyed.
+  EXPECT_FALSE(filesystem.FileExists(filepath1.c_str()));
+}
+
+}  // namespace
+
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 1d5b689..9ccd81b 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -70,6 +70,7 @@
 #include "icing/file/filesystem.h"
 #include "icing/file/memory-mapped-file.h"
 #include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/platform.h"
 #include "icing/portable/zlib.h"
 #include "icing/util/crc32.h"
 #include "icing/util/data-loss.h"
@@ -422,7 +423,8 @@
   static constexpr int kDeflateCompressionLevel = 3;
 
   // Chunks of the file to mmap at a time, so we don't mmap the entire file.
-  static constexpr int kMmapChunkSize = 4 * 1024;
+  // Only used on 32-bit devices
+  static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
 
   ScopedFd fd_;
   const Filesystem* const filesystem_;
@@ -631,6 +633,14 @@
         file_path.c_str(), static_cast<long long>(start)));
   }
 
+  if (end < start) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Ending checksum offset of file '%s' must be greater than start "
+        "'%lld', was '%lld'",
+        file_path.c_str(), static_cast<long long>(start),
+        static_cast<long long>(end)));
+  }
+
   int64_t file_size = filesystem->GetFileSize(file_path.c_str());
   if (end > file_size) {
     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
@@ -640,17 +650,41 @@
         static_cast<long long>(end)));
   }
 
-  for (int i = start; i < end; i += kMmapChunkSize) {
-    // Don't read past the file size.
-    int next_chunk_size = kMmapChunkSize;
-    if ((i + kMmapChunkSize) >= end) {
-      next_chunk_size = end - i;
+  Architecture architecture = GetArchitecture();
+  switch (architecture) {
+    case Architecture::BIT_64: {
+      // Don't mmap in chunks here since mmapping can be harmful on 64-bit
+      // devices where mmap/munmap calls need the mmap write semaphore, which
+      // blocks mmap/munmap/mprotect and all page faults from executing while
+      // they run. On 64-bit devices, this doesn't actually load into memory, it
+      // just makes the file faultable. So the whole file should be ok.
+      // b/185822878.
+      ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
+      auto mmap_str = std::string_view(mmapped_file.region(), end - start);
+      new_crc.Append(mmap_str);
+      break;
     }
+    case Architecture::BIT_32:
+      [[fallthrough]];
+    case Architecture::UNKNOWN: {
+      // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
+      // much memory at once. If we're unknown, then also chunk it because we're
+      // not sure what the device can handle.
+      for (int i = start; i < end; i += kMmapChunkSize) {
+        // Don't read past the file size.
+        int next_chunk_size = kMmapChunkSize;
+        if ((i + kMmapChunkSize) >= end) {
+          next_chunk_size = end - i;
+        }
 
-    ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
+        ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
 
-    auto mmap_str = std::string_view(mmapped_file.region(), next_chunk_size);
-    new_crc.Append(mmap_str);
+        auto mmap_str =
+            std::string_view(mmapped_file.region(), next_chunk_size);
+        new_crc.Append(mmap_str);
+      }
+      break;
+    }
   }
 
   return new_crc;
@@ -670,7 +704,8 @@
         static_cast<long long>(proto_size), header_->max_proto_size));
   }
 
-  // At this point, we've guaranteed that proto_size is under kMaxProtoSize (see
+  // At this point, we've guaranteed that proto_size is under kMaxProtoSize
+  // (see
   // ::Create), so we can safely store it in an int.
   int final_size = 0;
 
@@ -735,8 +770,8 @@
   MemoryMappedFile mmapped_file(*filesystem_, file_path_,
                                 MemoryMappedFile::Strategy::READ_ONLY);
   if (file_offset >= file_size) {
-    // file_size points to the next byte to write at, so subtract one to get the
-    // inclusive, actual size of file.
+    // file_size points to the next byte to write at, so subtract one to get
+    // the inclusive, actual size of file.
     return absl_ports::OutOfRangeError(
         IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
                                       "out of range of the file size, %lld",
@@ -778,8 +813,8 @@
     int64_t file_offset) {
   int64_t file_size = filesystem_->GetFileSize(fd_.get());
   if (file_offset >= file_size) {
-    // file_size points to the next byte to write at, so subtract one to get the
-    // inclusive, actual size of file.
+    // file_size points to the next byte to write at, so subtract one to get
+    // the inclusive, actual size of file.
     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
         "Trying to erase data at a location, %lld, "
         "out of range of the file size, %lld",
@@ -798,12 +833,12 @@
   ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
                                            GetProtoSize(metadata)));
 
-  // We need to update the crc checksum if the erased area is before the rewind
-  // position.
+  // We need to update the crc checksum if the erased area is before the
+  // rewind position.
   if (file_offset + sizeof(metadata) < header_->rewind_offset) {
     // We need to calculate [original string xor 0s].
-    // The xored string is the same as the original string because 0 xor 0 = 0,
-    // 1 xor 0 = 1.
+    // The xored string is the same as the original string because 0 xor 0 =
+    // 0, 1 xor 0 = 1.
     const std::string_view xored_str(mmapped_file.region(),
                                      mmapped_file.region_size());
 
@@ -896,7 +931,8 @@
 template <typename ProtoT>
 typename FileBackedProtoLog<ProtoT>::Iterator
 FileBackedProtoLog<ProtoT>::GetIterator() {
-  return Iterator(*filesystem_, file_path_, /*initial_offset=*/sizeof(Header));
+  return Iterator(*filesystem_, file_path_,
+                  /*initial_offset=*/sizeof(Header));
 }
 
 template <typename ProtoT>
diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc
index 26e0fb0..766cc64 100644
--- a/icing/file/file-backed-proto-log_benchmark.cc
+++ b/icing/file/file-backed-proto-log_benchmark.cc
@@ -164,6 +164,48 @@
                               // 16MiB, and we need some extra space for the
                               // rest of the document properties
 
+static void BM_ComputeChecksum(benchmark::State& state) {
+  const Filesystem filesystem;
+  const std::string file_path = GetTestTempDir() + "/proto.log";
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Make sure it doesn't already exist.
+  filesystem.DeleteFile(file_path.c_str());
+
+  auto proto_log =
+      FileBackedProtoLog<DocumentProto>::Create(
+          &filesystem, file_path,
+          FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
+          .ValueOrDie()
+          .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // Make each document 1KiB
+  int string_length = 1024;
+  std::default_random_engine random;
+  const std::string rand_str =
+      RandomString(kAlNumAlphabet, string_length, &random);
+
+  auto document_properties = document.add_properties();
+  document_properties->set_name("string property");
+  document_properties->add_string_values(rand_str);
+
+  int num_docs = state.range(0);
+  for (int i = 0; i < num_docs; ++i) {
+    ICING_ASSERT_OK(proto_log->WriteProto(document));
+  }
+
+  for (auto _ : state) {
+    testing::DoNotOptimize(proto_log->ComputeChecksum());
+  }
+
+  // Cleanup after ourselves
+  filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
+
 }  // namespace
 }  // namespace lib
 }  // namespace icing
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
new file mode 100644
index 0000000..95c3949
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -0,0 +1,1173 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// File-backed log of protos with append-only writes and position based reads.
+//
+// There should only be one instance of a PortableFileBackedProtoLog of the same
+// file at a time; using multiple instances at the same time may lead to
+// undefined behavior.
+//
+// The entire checksum is computed on initialization to verify the contents are
+// valid. On failure, the log will be truncated to the last verified state when
+// PersistToDisk() was called. If the log cannot successfully restore the last
+// state due to disk corruption or some other inconsistency, then the entire log
+// will be lost.
+//
+// Each proto written to the file will have a metadata written just before it.
+// The metadata consists of
+//   {
+//     1 bytes of kProtoMagic;
+//     3 bytes of the proto size
+//     n bytes of the proto itself
+//   }
+//
+// All metadata is written in a portable format, encoded with htonl before
+// writing to file and decoded with ntohl when reading from file.
+//
+// Example usage:
+//   ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+//       PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+//       file_path_,
+//                                                  options));
+//   auto proto_log = create_result.proto_log;
+//
+//   Document document;
+//   document.set_namespace("com.google.android.example");
+//   document.set_uri("www.google.com");
+//
+//   int64_t document_offset = proto_log->WriteProto(document));
+//   Document same_document = proto_log->ReadProto(document_offset));
+//   proto_log->PersistToDisk();
+
+#ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
+#define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include <google/protobuf/io/gzip_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/platform.h"
+#include "icing/portable/zlib.h"
+#include "icing/util/bit-util.h"
+#include "icing/util/crc32.h"
+#include "icing/util/data-loss.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Number of bytes we reserve for the heading at the beginning of the proto log.
+// We reserve this so the header can grow without running into the contents of
+// the proto log, triggering an unnecessary migration of the data.
+constexpr int kHeaderReservedBytes = 256;
+
+bool IsEmptyBuffer(const char* buffer, int size) {
+  return std::all_of(buffer, buffer + size,
+                     [](const char byte) { return byte == 0; });
+}
+
+// Helper function to get stored proto size from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
+
+// Helper function to get stored proto magic from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
+
+}  // namespace
+
+template <typename ProtoT>
+class PortableFileBackedProtoLog {
+ public:
+  struct Options {
+    // Whether to compress each proto before writing to the proto log.
+    bool compress;
+
+    // Byte-size limit for each proto written to the store. This does not
+    // include the bytes needed for the metadata of each proto.
+    //
+    // NOTE: Currently, we only support protos up to 16MiB. We store the proto
+    // size in 3 bytes within the metadata.
+    //
+    // NOTE: This limit is only enforced for future writes. If the store
+    // previously had a higher limit, then reading older entries could return
+    // larger protos.
+    //
+    // NOTE: The max_proto_size is the upper limit for input protos into the
+    // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
+    // to a smaller size, ProtoLog will not accept it. Protos that result in a
+    // compressed size larger than max_proto_size are also not accepted.
+    const int32_t max_proto_size;
+
+    // Must specify values for options.
+    Options() = delete;
+    explicit Options(bool compress_in,
+                     const int32_t max_proto_size_in = kMaxProtoSize)
+        : compress(compress_in), max_proto_size(max_proto_size_in) {}
+  };
+
+  // Header stored at the beginning of the file before the rest of the log
+  // contents. Stores metadata on the log.
+  class Header {
+   public:
+    static constexpr int32_t kMagic = 0xf4c6f67a;
+
+    static constexpr int32_t kFileFormatVersion = 0;
+
+    uint32_t CalculateHeaderChecksum() const {
+      Crc32 crc;
+
+      // Get a string_view of all the fields of the Header, excluding the
+      // magic_nbytes and header_checksum_nbytes
+      std::string_view header_str(reinterpret_cast<const char*>(this) +
+                                      offsetof(Header, header_checksum_nbytes) +
+                                      sizeof(header_checksum_nbytes),
+                                  sizeof(Header) - sizeof(magic_nbytes) -
+                                      sizeof(header_checksum_nbytes));
+      crc.Append(header_str);
+      return crc.Get();
+    }
+
+    int32_t GetMagic() const { return gntohl(magic_nbytes); }
+
+    void SetMagic(int32_t magic_in) { magic_nbytes = ghtonl(magic_in); }
+
+    int32_t GetFileFormatVersion() const {
+      return gntohl(file_format_version_nbytes);
+    }
+
+    void SetFileFormatVersion(int32_t file_format_version_in) {
+      file_format_version_nbytes = ghtonl(file_format_version_in);
+    }
+
+    int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes); }
+
+    void SetMaxProtoSize(int32_t max_proto_size_in) {
+      max_proto_size_nbytes = ghtonl(max_proto_size_in);
+    }
+
+    int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes); }
+
+    void SetLogChecksum(int32_t log_checksum_in) {
+      log_checksum_nbytes = ghtonl(log_checksum_in);
+    }
+
+    int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes); }
+
+    void SetRewindOffset(int64_t rewind_offset_in) {
+      rewind_offset_nbytes = ghtonll(rewind_offset_in);
+    }
+
+    int32_t GetHeaderChecksum() const { return gntohl(header_checksum_nbytes); }
+
+    void SetHeaderChecksum(int32_t header_checksum_in) {
+      header_checksum_nbytes = ghtonl(header_checksum_in);
+    }
+
+    bool GetCompressFlag() const {
+      uint16_t host_order_flags = gntohs(flags_nbytes);
+      return bit_util::BitfieldGet(host_order_flags, kCompressBit, /*len=*/1);
+    }
+
+    void SetCompressFlag(bool compress) {
+      uint16_t host_order_flags = gntohs(flags_nbytes);
+      bit_util::BitfieldSet(compress, kCompressBit,
+                            /*len=*/1, &host_order_flags);
+      flags_nbytes = ghtons(host_order_flags);
+    }
+
+   private:
+    // The least-significant bit offset at which the compress flag is stored in
+    // 'flags_nbytes'. Represents whether the protos in the log are compressed
+    // or not.
+    static constexpr int32_t kCompressBit = 0;
+
+    // Holds the magic as a quick sanity check against file corruption.
+    //
+    // Field is in network-byte order.
+    int32_t magic_nbytes = ghtonl(kMagic);
+
+    // Must be at the beginning after kMagic. Contains the crc checksum of
+    // the following fields.
+    //
+    // Field is in network-byte order.
+    uint32_t header_checksum_nbytes = 0;
+
+    // Last known good offset at which the log and its checksum were updated.
+    // If we crash between writing to the log and updating the checksum, we can
+    // try to rewind the log to this offset and verify the checksum is still
+    // valid instead of throwing away the entire log.
+    //
+    // Field is in network-byte order.
+    int64_t rewind_offset_nbytes = ghtonll(kHeaderReservedBytes);
+
+    // Version number tracking how we serialize the file to disk. If we change
+    // how/what we write to disk, this version should be updated and this class
+    // should handle a migration.
+    //
+    // Currently at kFileFormatVersion.
+    //
+    // Field is in network-byte order.
+    int32_t file_format_version_nbytes = 0;
+
+    // The maximum proto size that can be written to the log.
+    //
+    // Field is in network-byte order.
+    int32_t max_proto_size_nbytes = 0;
+
+    // Checksum of the log elements, doesn't include the header fields.
+    //
+    // Field is in network-byte order.
+    uint32_t log_checksum_nbytes = 0;
+
+    // Bits are used to hold various flags.
+    //   Lowest bit is whether the protos are compressed or not.
+    //
+    // Field is in network-byte order.
+    uint16_t flags_nbytes = 0;
+
+    // NOTE: New fields should *almost always* be added to the end here. Since
+    // this class may have already been written to disk, appending fields
+    // increases the chances that changes are backwards-compatible.
+  };
+  static_assert(sizeof(Header) <= kHeaderReservedBytes,
+                "Header has grown past our reserved bytes!");
+
+  struct CreateResult {
+    // A successfully initialized log.
+    std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
+
+    // The data status after initializing from a previous state. Data loss can
+    // happen if the file is corrupted or some previously added data was
+    // unpersisted. This may be used to signal that any derived data off of the
+    // proto log may need to be regenerated.
+    DataLoss data_loss;
+
+    bool has_data_loss() {
+      return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
+    }
+  };
+
+  // Factory method to create, initialize, and return a
+  // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
+  //
+  // If on re-initialization the log detects disk corruption or some previously
+  // added data was unpersisted, the log will rewind to the last-good state. The
+  // log saves these checkpointed "good" states when PersistToDisk() is called
+  // or the log is safely destructed. If the log rewinds successfully to the
+  // last-good state, then the returned CreateResult.data_loss indicates
+  // whether it has a data loss and what kind of data loss it is (partial or
+  // complete) so that any derived data may know that it needs to be updated. If
+  // the log re-initializes successfully without any data loss,
+  // CreateResult.data_loss will be NONE.
+  //
+  // Params:
+  //   filesystem: Handles system level calls
+  //   file_path: Path of the underlying file. Directory of the file should
+  //   already exist
+  //   options: Configuration options for the proto log
+  //
+  // Returns:
+  //   PortableFileBackedProtoLog::CreateResult on success
+  //   INVALID_ARGUMENT on an invalid option
+  //   INTERNAL_ERROR on IO error
+  static libtextclassifier3::StatusOr<CreateResult> Create(
+      const Filesystem* filesystem, const std::string& file_path,
+      const Options& options);
+
+  // Not copyable
+  PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
+  PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
+      delete;
+
+  // This will update the checksum of the log as well.
+  ~PortableFileBackedProtoLog();
+
+  // Writes the serialized proto to the underlying file. Writes are applied
+  // directly to the underlying file. Users do not need to sync the file after
+  // writing.
+  //
+  // Returns:
+  //   Offset of the newly appended proto in file on success
+  //   INVALID_ARGUMENT if proto is too large, as decided by
+  //     Options.max_proto_size
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
+
+  // Reads out a proto located at file_offset from the file.
+  //
+  // Returns:
+  //   A proto on success
+  //   NOT_FOUND if the proto at the given offset has been erased
+  //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+
+  // Erases the data of a proto located at file_offset from the file.
+  //
+  // Returns:
+  //   OK on success
+  //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status EraseProto(int64_t file_offset);
+
+  // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+  // block size.
+  //
+  // Returns:
+  //   Disk usage on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+  // Returns the file size of all the elements held in the log. File size is in
+  // bytes. This excludes the size of any internal metadata of the log, e.g. the
+  // log's header.
+  //
+  // Returns:
+  //   File size on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
+  // An iterator helping to find offsets of all the protos in file.
+  // Example usage:
+  //
+  // while (iterator.Advance().ok()) {
+  //   int64_t offset = iterator.GetOffset();
+  //   // Do something
+  // }
+  class Iterator {
+   public:
+    Iterator(const Filesystem& filesystem, const std::string& file_path,
+             int64_t initial_offset);
+
+    // Advances to the position of next proto whether it has been erased or not.
+    //
+    // Returns:
+    //   OK on success
+    //   OUT_OF_RANGE_ERROR if it reaches the end
+    //   INTERNAL_ERROR on IO error
+    libtextclassifier3::Status Advance();
+
+    // Returns the file offset of current proto.
+    int64_t GetOffset();
+
+   private:
+    static constexpr int64_t kInvalidOffset = -1;
+    // Used to read proto metadata
+    MemoryMappedFile mmapped_file_;
+    // Offset of first proto
+    int64_t initial_offset_;
+    int64_t current_offset_;
+    int64_t file_size_;
+  };
+
+  // Returns an iterator of current proto log. The caller needs to keep the
+  // proto log unchanged while using the iterator, otherwise unexpected
+  // behaviors could happen.
+  Iterator GetIterator();
+
+  // Persists all changes since initialization or the last call to
+  // PersistToDisk(). Any changes that aren't persisted may be lost if the
+  // system fails to close safely.
+  //
+  // Example use case:
+  //
+  //   Document document;
+  //   document.set_namespace("com.google.android.example");
+  //   document.set_uri("www.google.com");
+  //
+  //   {
+  //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+  //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+  //         file_path,
+  //                                                    options));
+  //     auto proto_log = std::move(create_result.proto_log);
+  //
+  //     int64_t document_offset = proto_log->WriteProto(document));
+  //
+  //     // We lose the document here since it wasn't persisted.
+  //     // *SYSTEM CRASH*
+  //   }
+  //
+  //   {
+  //     // Can still successfully create after a crash since the log can
+  //     // rewind/truncate to recover into a previously good state
+  //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+  //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+  //         file_path,
+  //                                                    options));
+  //     auto proto_log = std::move(create_result.proto_log);
+  //
+  //     // Lost the proto since we didn't PersistToDisk before the crash
+  //     proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
+  //
+  //     int64_t document_offset = proto_log->WriteProto(document));
+  //
+  //     // Persisted this time, so we should be ok.
+  //     ICING_ASSERT_OK(proto_log->PersistToDisk());
+  //   }
+  //
+  //   {
+  //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+  //         PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+  //         file_path,
+  //                                                    options));
+  //     auto proto_log = std::move(create_result.proto_log);
+  //
+  //     // SUCCESS
+  //     Document same_document = proto_log->ReadProto(document_offset));
+  //   }
+  //
+  // NOTE: Since all protos are already written to the file directly, this
+  // just updates the checksum and rewind position. Without these updates,
+  // future initializations will truncate the file and discard unpersisted
+  // changes.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status PersistToDisk();
+
+  // Calculates the checksum of the log contents. Excludes the header content.
+  //
+  // Returns:
+  //   Crc of the log content
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ private:
+  // Object can only be instantiated via the ::Create factory.
+  PortableFileBackedProtoLog(const Filesystem* filesystem,
+                             const std::string& file_path,
+                             std::unique_ptr<Header> header);
+
+  // Initializes a new proto log.
+  //
+  // Returns:
+  //   std::unique_ptr<CreateResult> on success
+  //   INTERNAL_ERROR on IO error
+  static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
+      const Filesystem* filesystem, const std::string& file_path,
+      const Options& options);
+
+  // Verifies that the existing proto log is in a good state. If not in a good
+  // state, then the proto log may be truncated to the last good state and
+  // content will be lost.
+  //
+  // Returns:
+  //   std::unique_ptr<CreateResult> on success
+  //   INTERNAL_ERROR on IO error or internal inconsistencies in the file
+  //   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
+  //     instances
+  static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
+      const Filesystem* filesystem, const std::string& file_path,
+      const Options& options, int64_t file_size);
+
+  // Takes an initial checksum and updates it with the content between `start`
+  // and `end` offsets in the file.
+  //
+  // Returns:
+  //   Crc of the content between `start`, inclusive, and `end`, exclusive.
+  //   INTERNAL_ERROR on IO error
+  //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
+  static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
+      const Filesystem* filesystem, const std::string& file_path,
+      Crc32 initial_crc, int64_t start, int64_t end);
+
+  // Reads out the metadata of a proto located at file_offset from the file.
+  // Metadata will be returned in host byte order endianness.
+  //
+  // Returns:
+  //   Proto's metadata on success
+  //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
+  //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
+  static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
+      MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+
+  // Writes metadata of a proto to the fd. Takes in a host byte order endianness
+  // metadata and converts it into a portable metadata before writing.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on any IO errors
+  static libtextclassifier3::Status WriteProtoMetadata(
+      const Filesystem* filesystem, int fd, int32_t host_order_metadata);
+
+  // Magic number added in front of every proto. Used when reading out protos
+  // as a first check for corruption in each entry in the file. Even if there is
+  // a corruption, the best we can do is roll back to our last recovery point
+  // and throw away un-flushed data. We can discard/reuse this byte if needed so
+  // that we have 4 bytes to store the size of protos, and increase the size of
+  // protos we support.
+  static constexpr uint8_t kProtoMagic = 0x5C;
+
+  // Our internal max for protos.
+  //
+  // WARNING: Changing this to a larger number may invalidate our assumption
+  // that that proto size can safely be stored in the last 3 bytes of the proto
+  // header.
+  static constexpr int kMaxProtoSize = (1 << 24) - 1;  // 16MiB
+  static_assert(kMaxProtoSize <= 0x00FFFFFF,
+                "kMaxProtoSize doesn't fit in 3 bytes");
+
+  // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
+  static constexpr int kDeflateCompressionLevel = 3;
+
+  // Chunks of the file to mmap at a time, so we don't mmap the entire file.
+  // Only used on 32-bit devices
+  static constexpr int kMmapChunkSize = 4 * 1024 * 1024;  // 4MiB
+
+  ScopedFd fd_;
+  const Filesystem* const filesystem_;
+  const std::string file_path_;
+  std::unique_ptr<Header> header_;
+};
+
+template <typename ProtoT>
+constexpr uint8_t PortableFileBackedProtoLog<ProtoT>::kProtoMagic;
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
+    const Filesystem* filesystem, const std::string& file_path,
+    std::unique_ptr<Header> header)
+    : filesystem_(filesystem),
+      file_path_(file_path),
+      header_(std::move(header)) {
+  fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
+}
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
+  if (!PersistToDisk().ok()) {
+    ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
+                          "PortableFileBackedProtoLog: "
+                       << file_path_;
+  }
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+    typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
+                                           const std::string& file_path,
+                                           const Options& options) {
+  if (options.max_proto_size <= 0) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "options.max_proto_size must be greater than 0, was %d",
+        options.max_proto_size));
+  }
+
+  // Since we store the proto_size in 3 bytes, we can only support protos of up
+  // to 16MiB.
+  if (options.max_proto_size > kMaxProtoSize) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "options.max_proto_size must be under 16MiB, was %d",
+        options.max_proto_size));
+  }
+
+  if (!filesystem->FileExists(file_path.c_str())) {
+    return InitializeNewFile(filesystem, file_path, options);
+  }
+
+  int64_t file_size = filesystem->GetFileSize(file_path.c_str());
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Bad file size '", file_path, "'"));
+  }
+
+  if (file_size == 0) {
+    return InitializeNewFile(filesystem, file_path, options);
+  }
+
+  return InitializeExistingFile(filesystem, file_path, options, file_size);
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+    typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
+    const Filesystem* filesystem, const std::string& file_path,
+    const Options& options) {
+  // Grow to the minimum reserved bytes for the header.
+  if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to initialize file size: ", file_path));
+  }
+
+  // Create the header
+  std::unique_ptr<Header> header = std::make_unique<Header>();
+  header->SetCompressFlag(options.compress);
+  header->SetMaxProtoSize(options.max_proto_size);
+  header->SetHeaderChecksum(header->CalculateHeaderChecksum());
+
+  if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write header for file: ", file_path));
+  }
+
+  CreateResult create_result = {
+      std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
+          new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
+                                                 std::move(header))),
+      /*data_loss=*/DataLoss::NONE};
+
+  return create_result;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+    typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
+    const Filesystem* filesystem, const std::string& file_path,
+    const Options& options, int64_t file_size) {
+  if (file_size < kHeaderReservedBytes) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("File header too short for: ", file_path));
+  }
+
+  std::unique_ptr<Header> header = std::make_unique<Header>();
+  if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
+                         /*offset=*/0)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to read header for file: ", file_path));
+  }
+
+  // Make sure the header is still valid before we use any of its values. This
+  // is covered by the header_checksum check below, but this is a quick check
+  // that can save us from an extra crc computation.
+  if (header->GetMagic() != Header::kMagic) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
+  }
+
+  if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header checksum for: ", file_path));
+  }
+
+  if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
+    // If this changes, we might need to handle a migration rather than throwing
+    // an error.
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header file format version: ", file_path));
+  }
+
+  if (header->GetCompressFlag() != options.compress) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Inconsistent compress option, expected %d, actual %d",
+        header->GetCompressFlag(), options.compress));
+  }
+
+  if (header->GetMaxProtoSize() > options.max_proto_size) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Max proto size cannot be smaller than previous "
+        "instantiations, previous size %d, wanted size %d",
+        header->GetMaxProtoSize(), options.max_proto_size));
+  }
+  header->SetMaxProtoSize(options.max_proto_size);
+
+  DataLoss data_loss = DataLoss::NONE;
+  ICING_ASSIGN_OR_RETURN(
+      Crc32 calculated_log_checksum,
+      ComputeChecksum(filesystem, file_path, Crc32(),
+                      /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+
+  // Double check that the log checksum is the same as the one that was
+  // persisted last time. If not, we start recovery logic.
+  if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
+    // Need to rewind the proto log since the checksums don't match.
+    // Worst case, we have to rewind the entire log back to just the header
+    int64_t last_known_good = kHeaderReservedBytes;
+
+    // Calculate the checksum of the log contents just up to the last rewind
+    // offset point. This will be valid if we just appended contents to the log
+    // without updating the checksum, and we can rewind back to this point
+    // safely.
+    ICING_ASSIGN_OR_RETURN(calculated_log_checksum,
+                           ComputeChecksum(filesystem, file_path, Crc32(),
+                                           /*start=*/kHeaderReservedBytes,
+                                           /*end=*/header->GetRewindOffset()));
+    if (header->GetLogChecksum() == calculated_log_checksum.Get()) {
+      // Check if it matches our last rewind state. If so, this becomes our last
+      // good state and we can safely truncate and recover from here.
+      last_known_good = header->GetRewindOffset();
+      data_loss = DataLoss::PARTIAL;
+    } else {
+      // Otherwise, we're going to truncate the entire log and this resets the
+      // checksum to an empty log state.
+      header->SetLogChecksum(0);
+      data_loss = DataLoss::COMPLETE;
+    }
+
+    if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
+      return absl_ports::InternalError(
+          absl_ports::StrCat("Error truncating file: ", file_path));
+    }
+
+    ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
+                    << last_known_good;
+  }
+
+  CreateResult create_result = {
+      std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
+          new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
+                                                 std::move(header))),
+      data_loss};
+
+  return create_result;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
+    const Filesystem* filesystem, const std::string& file_path,
+    Crc32 initial_crc, int64_t start, int64_t end) {
+  auto mmapped_file = MemoryMappedFile(*filesystem, file_path,
+                                       MemoryMappedFile::Strategy::READ_ONLY);
+  Crc32 new_crc(initial_crc.Get());
+
+  if (start < 0) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Starting checksum offset of file '%s' must be greater than 0, was "
+        "%lld",
+        file_path.c_str(), static_cast<long long>(start)));
+  }
+
+  if (end < start) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Ending checksum offset of file '%s' must be greater than start "
+        "'%lld', was '%lld'",
+        file_path.c_str(), static_cast<long long>(start),
+        static_cast<long long>(end)));
+  }
+
+  int64_t file_size = filesystem->GetFileSize(file_path.c_str());
+  if (end > file_size) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Ending checksum offset of file '%s' must be within "
+        "file size of %lld, was %lld",
+        file_path.c_str(), static_cast<long long>(file_size),
+        static_cast<long long>(end)));
+  }
+
+  Architecture architecture = GetArchitecture();
+  switch (architecture) {
+    case Architecture::BIT_64: {
+      // Don't mmap in chunks here since mmapping can be harmful on 64-bit
+      // devices where mmap/munmap calls need the mmap write semaphore, which
+      // blocks mmap/munmap/mprotect and all page faults from executing while
+      // they run. On 64-bit devices, this doesn't actually load into memory, it
+      // just makes the file faultable. So the whole file should be ok.
+      // b/185822878.
+      ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
+      auto mmap_str = std::string_view(mmapped_file.region(), end - start);
+      new_crc.Append(mmap_str);
+      break;
+    }
+    case Architecture::BIT_32:
+      [[fallthrough]];
+    case Architecture::UNKNOWN: {
+      // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
+      // much memory at once. If we're unknown, then also chunk it because we're
+      // not sure what the device can handle.
+      for (int i = start; i < end; i += kMmapChunkSize) {
+        // Don't read past the file size.
+        int next_chunk_size = kMmapChunkSize;
+        if ((i + kMmapChunkSize) >= end) {
+          next_chunk_size = end - i;
+        }
+
+        ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
+
+        auto mmap_str =
+            std::string_view(mmapped_file.region(), next_chunk_size);
+        new_crc.Append(mmap_str);
+      }
+      break;
+    }
+  }
+
+  return new_crc;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
+  int64_t proto_size = proto.ByteSizeLong();
+  int32_t host_order_metadata;
+  int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
+
+  if (proto_size > header_->GetMaxProtoSize()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "proto_size, %lld, was too large to write. Max is %d",
+        static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
+  }
+
+  // At this point, we've guaranteed that proto_size is under kMaxProtoSize
+  // (see
+  // ::Create), so we can safely store it in an int.
+  int final_size = 0;
+
+  std::string proto_str;
+  google::protobuf::io::StringOutputStream proto_stream(&proto_str);
+
+  if (header_->GetCompressFlag()) {
+    google::protobuf::io::GzipOutputStream::Options options;
+    options.format = google::protobuf::io::GzipOutputStream::ZLIB;
+    options.compression_level = kDeflateCompressionLevel;
+
+    google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
+                                                                  options);
+
+    bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
+                   compressing_stream.Close();
+
+    if (!success) {
+      return absl_ports::InternalError("Error compressing proto.");
+    }
+
+    final_size = proto_str.size();
+
+    // In case the compressed proto is larger than the original proto, we also
+    // can't write it.
+    if (final_size > header_->GetMaxProtoSize()) {
+      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+          "Compressed proto size, %d, was greater than "
+          "max_proto_size, %d",
+          final_size, header_->GetMaxProtoSize()));
+    }
+  } else {
+    // Serialize the proto directly into the write buffer at an offset of the
+    // metadata.
+    proto.SerializeToZeroCopyStream(&proto_stream);
+    final_size = proto_str.size();
+  }
+
+  // 1st byte for magic, next 3 bytes for proto size.
+  host_order_metadata = (kProtoMagic << 24) | final_size;
+
+  // Actually write metadata, has to be done after we know the possibly
+  // compressed proto size
+  ICING_RETURN_IF_ERROR(
+      WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
+
+  // Write the serialized proto
+  if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write proto to: ", file_path_));
+  }
+
+  return current_position;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<ProtoT>
+PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
+  int64_t file_size = filesystem_->GetFileSize(fd_.get());
+  MemoryMappedFile mmapped_file(*filesystem_, file_path_,
+                                MemoryMappedFile::Strategy::READ_ONLY);
+  if (file_offset >= file_size) {
+    // file_size points to the next byte to write at, so subtract one to get
+    // the inclusive, actual size of file.
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+                                      "out of range of the file size, %lld",
+                                      static_cast<long long>(file_offset),
+                                      static_cast<long long>(file_size - 1)));
+  }
+
+  // Read out the metadata
+  ICING_ASSIGN_OR_RETURN(
+      int32_t metadata,
+      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+  // Copy out however many bytes it says the proto is
+  int stored_size = GetProtoSize(metadata);
+
+  ICING_RETURN_IF_ERROR(
+      mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+
+  if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+    return absl_ports::NotFoundError("The proto data has been erased.");
+  }
+
+  google::protobuf::io::ArrayInputStream proto_stream(
+      mmapped_file.mutable_region(), stored_size);
+
+  // Deserialize proto
+  ProtoT proto;
+  if (header_->GetCompressFlag()) {
+    google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
+    proto.ParseFromZeroCopyStream(&decompress_stream);
+  } else {
+    proto.ParseFromZeroCopyStream(&proto_stream);
+  }
+
+  return proto;
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
+    int64_t file_offset) {
+  int64_t file_size = filesystem_->GetFileSize(fd_.get());
+  if (file_offset >= file_size) {
+    // file_size points to the next byte to write at, so subtract one to get
+    // the inclusive, actual size of file.
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "Trying to erase data at a location, %lld, "
+        "out of range of the file size, %lld",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size - 1)));
+  }
+
+  MemoryMappedFile mmapped_file(
+      *filesystem_, file_path_,
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+
+  // Read out the metadata
+  ICING_ASSIGN_OR_RETURN(
+      int32_t metadata,
+      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+  ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
+                                           GetProtoSize(metadata)));
+
+  // We need to update the crc checksum if the erased area is before the
+  // rewind position.
+  if (file_offset + sizeof(metadata) < header_->GetRewindOffset()) {
+    // We need to calculate [original string xor 0s].
+    // The xored string is the same as the original string because 0 xor 0 =
+    // 0, 1 xor 0 = 1.
+    const std::string_view xored_str(mmapped_file.region(),
+                                     mmapped_file.region_size());
+
+    Crc32 crc(header_->GetLogChecksum());
+    ICING_ASSIGN_OR_RETURN(
+        uint32_t new_crc,
+        crc.UpdateWithXor(xored_str,
+                          /*full_data_size=*/header_->GetRewindOffset() -
+                              kHeaderReservedBytes,
+                          /*position=*/file_offset + sizeof(metadata) -
+                              kHeaderReservedBytes));
+
+    header_->SetLogChecksum(new_crc);
+    header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+
+    if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+                             sizeof(Header))) {
+      return absl_ports::InternalError(
+          absl_ports::StrCat("Failed to update header to: ", file_path_));
+    }
+  }
+
+  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
+  int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
+  if (size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError("Failed to get disk usage of proto log");
+  }
+  return size;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
+  int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+  if (total_file_size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        "Failed to get file size of elments in the proto log");
+  }
+  return total_file_size - kHeaderReservedBytes;
+}
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
+    const Filesystem& filesystem, const std::string& file_path,
+    int64_t initial_offset)
+    : mmapped_file_(filesystem, file_path,
+                    MemoryMappedFile::Strategy::READ_ONLY),
+      initial_offset_(initial_offset),
+      current_offset_(kInvalidOffset),
+      file_size_(filesystem.GetFileSize(file_path.c_str())) {
+  if (file_size_ == Filesystem::kBadFileSize) {
+    // Fails all Advance() calls
+    file_size_ = 0;
+  }
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status
+PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
+  if (current_offset_ == kInvalidOffset) {
+    // First Advance() call
+    current_offset_ = initial_offset_;
+  } else {
+    // Jumps to the next proto position
+    ICING_ASSIGN_OR_RETURN(
+        int32_t metadata,
+        ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
+    current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
+  }
+
+  if (current_offset_ < file_size_) {
+    return libtextclassifier3::Status::OK;
+  } else {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "The next proto offset, %lld, is out of file range [0, %lld)",
+        static_cast<long long>(current_offset_),
+        static_cast<long long>(file_size_)));
+  }
+}
+
+template <typename ProtoT>
+int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
+  return current_offset_;
+}
+
+template <typename ProtoT>
+typename PortableFileBackedProtoLog<ProtoT>::Iterator
+PortableFileBackedProtoLog<ProtoT>::GetIterator() {
+  return Iterator(*filesystem_, file_path_,
+                  /*initial_offset=*/kHeaderReservedBytes);
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int32_t>
+PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
+    MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
+  // Checks file_offset
+  if (file_offset >= file_size) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "offset, %lld, is out of file range [0, %lld)",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size)));
+  }
+  int32_t portable_metadata;
+  int metadata_size = sizeof(portable_metadata);
+  if (file_offset + metadata_size >= file_size) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Wrong metadata offset %lld, metadata doesn't fit in "
+        "with file range [0, %lld)",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size)));
+  }
+
+  // Reads metadata
+  ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
+  memcpy(&portable_metadata, mmapped_file->region(), metadata_size);
+
+  // Need to switch it back to host order endianness after reading from disk.
+  int32_t host_order_metadata = gntohl(portable_metadata);
+
+  // Checks magic number
+  uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
+  if (stored_k_proto_magic != kProtoMagic) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
+        stored_k_proto_magic));
+  }
+
+  return host_order_metadata;
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status
+PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
+    const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
+  // Convert it into portable endian format before writing to disk
+  int32_t portable_metadata = ghtonl(host_order_metadata);
+  int portable_metadata_size = sizeof(portable_metadata);
+
+  // Write metadata
+  if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write proto metadata."));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
+  int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+  if (file_size == header_->GetRewindOffset()) {
+    // No new protos appended, don't need to update the checksum.
+    return libtextclassifier3::Status::OK;
+  }
+
+  int64_t new_content_size = file_size - header_->GetRewindOffset();
+  Crc32 crc;
+  if (new_content_size < 0) {
+    // File shrunk, recalculate the entire checksum.
+    ICING_ASSIGN_OR_RETURN(
+        crc,
+        ComputeChecksum(filesystem_, file_path_, Crc32(),
+                        /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+  } else {
+    // Append new changes to the existing checksum.
+    ICING_ASSIGN_OR_RETURN(
+        crc, ComputeChecksum(filesystem_, file_path_,
+                             Crc32(header_->GetLogChecksum()),
+                             header_->GetRewindOffset(), file_size));
+  }
+
+  header_->SetLogChecksum(crc.Get());
+  header_->SetRewindOffset(file_size);
+  header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+
+  if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+                           sizeof(Header)) ||
+      !filesystem_->DataSync(fd_.get())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to update header to: ", file_path_));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() {
+  return PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
+      filesystem_, file_path_, Crc32(), /*start=*/kHeaderReservedBytes,
+      /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
+}
+
+}  // namespace lib
+}  // namespace icing
+
+#endif  // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
new file mode 100644
index 0000000..b1dfe12
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -0,0 +1,211 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <random>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+
+// go/microbenchmarks
+//
+// To build and run on a local machine:
+//   $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+//   icing/file:portable-file-backed-proto-log_benchmark
+//
+//   $ blaze-bin/icing/file/portable-file-backed-proto-log_benchmark
+//   --benchmarks=all
+//
+//
+// To build and run on an Android device (must be connected and rooted):
+//   $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+//   --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+//   icing/file:portable-file-backed-proto-log_benchmark
+//
+//   $ adb root
+//
+//   $ adb push
+//   blaze-bin/icing/file/portable-file-backed-proto-log_benchmark
+//   /data/local/tmp/
+//
+//   $ adb shell /data/local/tmp/portable-file-backed-proto-log-benchmark
+//   --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+static void BM_Write(benchmark::State& state) {
+  const Filesystem filesystem;
+  int string_length = state.range(0);
+  const std::string file_path = IcingStringUtil::StringPrintf(
+      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Make sure it doesn't already exist.
+  filesystem.DeleteFile(file_path.c_str());
+
+  auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+                       &filesystem, file_path,
+                       PortableFileBackedProtoLog<DocumentProto>::Options(
+                           compress, max_proto_size))
+                       .ValueOrDie()
+                       .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  std::default_random_engine random;
+  const std::string rand_str =
+      RandomString(kAlNumAlphabet, string_length, &random);
+
+  auto document_properties = document.add_properties();
+  document_properties->set_name("string property");
+  document_properties->add_string_values(rand_str);
+
+  for (auto _ : state) {
+    testing::DoNotOptimize(proto_log->WriteProto(document));
+  }
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          string_length);
+
+  // Cleanup after ourselves
+  filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Write)
+    ->Arg(1)
+    ->Arg(32)
+    ->Arg(512)
+    ->Arg(1024)
+    ->Arg(4 * 1024)
+    ->Arg(8 * 1024)
+    ->Arg(16 * 1024)
+    ->Arg(32 * 1024)
+    ->Arg(256 * 1024)
+    ->Arg(2 * 1024 * 1024)
+    ->Arg(8 * 1024 * 1024)
+    ->Arg(15 * 1024 * 1024);  // We do 15MiB here since our max proto size is
+                              // 16MiB, and we need some extra space for the
+                              // rest of the document properties
+
+static void BM_Read(benchmark::State& state) {
+  const Filesystem filesystem;
+  int string_length = state.range(0);
+  const std::string file_path = IcingStringUtil::StringPrintf(
+      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Make sure it doesn't already exist.
+  filesystem.DeleteFile(file_path.c_str());
+
+  auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+                       &filesystem, file_path,
+                       PortableFileBackedProtoLog<DocumentProto>::Options(
+                           compress, max_proto_size))
+                       .ValueOrDie()
+                       .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  std::default_random_engine random;
+  const std::string rand_str =
+      RandomString(kAlNumAlphabet, string_length, &random);
+
+  auto document_properties = document.add_properties();
+  document_properties->set_name("string property");
+  document_properties->add_string_values(rand_str);
+
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
+                             proto_log->WriteProto(document));
+
+  for (auto _ : state) {
+    testing::DoNotOptimize(proto_log->ReadProto(write_offset));
+  }
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          string_length);
+
+  // Cleanup after ourselves
+  filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Read)
+    ->Arg(1)
+    ->Arg(32)
+    ->Arg(512)
+    ->Arg(1024)
+    ->Arg(4 * 1024)
+    ->Arg(8 * 1024)
+    ->Arg(16 * 1024)
+    ->Arg(32 * 1024)
+    ->Arg(256 * 1024)
+    ->Arg(2 * 1024 * 1024)
+    ->Arg(8 * 1024 * 1024)
+    ->Arg(15 * 1024 * 1024);  // We do 15MiB here since our max proto size is
+                              // 16MiB, and we need some extra space for the
+                              // rest of the document properties
+
+static void BM_ComputeChecksum(benchmark::State& state) {
+  const Filesystem filesystem;
+  const std::string file_path = GetTestTempDir() + "/proto.log";
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Make sure it doesn't already exist.
+  filesystem.DeleteFile(file_path.c_str());
+
+  auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+                       &filesystem, file_path,
+                       PortableFileBackedProtoLog<DocumentProto>::Options(
+                           compress, max_proto_size))
+                       .ValueOrDie()
+                       .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // Make each document 1KiB
+  int string_length = 1024;
+  std::default_random_engine random;
+  const std::string rand_str =
+      RandomString(kAlNumAlphabet, string_length, &random);
+
+  auto document_properties = document.add_properties();
+  document_properties->set_name("string property");
+  document_properties->add_string_values(rand_str);
+
+  int num_docs = state.range(0);
+  for (int i = 0; i < num_docs; ++i) {
+    ICING_ASSERT_OK(proto_log->WriteProto(document));
+  }
+
+  for (auto _ : state) {
+    testing::DoNotOptimize(proto_log->ComputeChecksum());
+  }
+
+  // Cleanup after ourselves
+  filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
+
+}  // namespace
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
new file mode 100644
index 0000000..dfb67aa
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -0,0 +1,727 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/portable-file-backed-proto-log.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::A;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::Not;
+using ::testing::NotNull;
+using ::testing::Pair;
+using ::testing::Return;
+
+class PortableFileBackedProtoLogTest : public ::testing::Test {
+ protected:
+  // Adds a user-defined default construct because a const member variable may
+  // make the compiler accidentally delete the default constructor.
+  // https://stackoverflow.com/a/47368753
+  PortableFileBackedProtoLogTest() {}
+
+  void SetUp() override {
+    file_path_ = GetTestTempDir() + "/proto_log";
+    filesystem_.DeleteFile(file_path_.c_str());
+  }
+
+  void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
+
+  const Filesystem filesystem_;
+  std::string file_path_;
+  bool compress_ = true;
+  int64_t max_proto_size_ = 256 * 1024;  // 256 KiB
+};
+
+TEST_F(PortableFileBackedProtoLogTest, Initialize) {
+  // max_proto_size must be greater than 0
+  int invalid_max_proto_size = 0;
+  ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+                  &filesystem_, file_path_,
+                  PortableFileBackedProtoLog<DocumentProto>::Options(
+                      compress_, invalid_max_proto_size)),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size_)));
+  EXPECT_THAT(create_result.proto_log, NotNull());
+  EXPECT_FALSE(create_result.has_data_loss());
+
+  // Can't recreate the same file with different options.
+  ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+                  &filesystem_, file_path_,
+                  PortableFileBackedProtoLog<DocumentProto>::Options(
+                      !compress_, max_proto_size_)),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReservedSpaceForHeader) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size_)));
+
+  // With no protos written yet, the log should be minimum the size of the
+  // reserved header space.
+  ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), kHeaderReservedBytes);
+}
+
+TEST_F(PortableFileBackedProtoLogTest, WriteProtoTooLarge) {
+  int max_proto_size = 1;
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size)));
+  auto proto_log = std::move(create_result.proto_log);
+  ASSERT_FALSE(create_result.has_data_loss());
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // Proto is too large for the max_proto_size_in
+  ASSERT_THAT(proto_log->WriteProto(document),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size_)));
+  auto proto_log = std::move(create_result.proto_log);
+  ASSERT_FALSE(create_result.has_data_loss());
+
+  // Write a proto
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
+                             proto_log->WriteProto(document));
+
+  // The 4 bytes of metadata that just doesn't have the same kProtoMagic
+  // specified in file-backed-proto-log.h
+  uint32_t wrong_magic = 0x7E000000;
+
+  // Sanity check that we opened the file correctly
+  int fd = filesystem_.OpenForWrite(file_path_.c_str());
+  ASSERT_GT(fd, 0);
+
+  // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of
+  // a proto entry.
+  filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic));
+
+  ASSERT_THAT(proto_log->ReadProto(file_offset),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReadWriteUncompressedProto) {
+  int last_offset;
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/false, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Write the first proto
+    DocumentProto document1 =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+                               proto_log->WriteProto(document1));
+
+    int document1_offset = written_position;
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document1)));
+
+    // Write a second proto that's close to the max size. Leave some room for
+    // the rest of the proto properties.
+    std::string long_str(max_proto_size_ - 1024, 'a');
+    DocumentProto document2 = DocumentBuilder()
+                                  .SetKey("namespace2", "uri2")
+                                  .AddStringProperty("long_str", long_str)
+                                  .Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(written_position,
+                               proto_log->WriteProto(document2));
+
+    int document2_offset = written_position;
+    last_offset = written_position;
+    ASSERT_GT(document2_offset, document1_offset);
+
+    // Check the second proto
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+  }
+
+  {
+    // Make a new proto_log with the same file_path, and make sure we
+    // can still write to the same underlying file.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/false, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Write a third proto
+    DocumentProto document3 =
+        DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+                IsOkAndHolds(Gt(last_offset)));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReadWriteCompressedProto) {
+  int last_offset;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/true, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Write the first proto
+    DocumentProto document1 =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+                               proto_log->WriteProto(document1));
+
+    int document1_offset = written_position;
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document1)));
+
+    // Write a second proto that's close to the max size. Leave some room for
+    // the rest of the proto properties.
+    std::string long_str(max_proto_size_ - 1024, 'a');
+    DocumentProto document2 = DocumentBuilder()
+                                  .SetKey("namespace2", "uri2")
+                                  .AddStringProperty("long_str", long_str)
+                                  .Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(written_position,
+                               proto_log->WriteProto(document2));
+
+    int document2_offset = written_position;
+    last_offset = written_position;
+    ASSERT_GT(document2_offset, document1_offset);
+
+    // Check the second proto
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+  }
+
+  {
+    // Make a new proto_log with the same file_path, and make sure we
+    // can still write to the same underlying file.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/true, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Write a third proto
+    DocumentProto document3 =
+        DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+                IsOkAndHolds(Gt(last_offset)));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, CorruptHeader) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.has_data_loss());
+  }
+
+  int corrupt_value = 24;
+
+  // Offset after the kMagic and the header_checksum.
+  int offset_after_checksum = 8;
+  filesystem_.PWrite(file_path_.c_str(), offset_after_checksum, &corrupt_value,
+                     sizeof(corrupt_value));
+
+  {
+    // Reinitialize the same proto_log
+    ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+                    &filesystem_, file_path_,
+                    PortableFileBackedProtoLog<DocumentProto>::Options(
+                        compress_, max_proto_size_)),
+                StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+                         HasSubstr("Invalid header checksum")));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DifferentMagic) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.has_data_loss());
+
+    // Corrupt the magic that's stored at the beginning of the header.
+    int invalid_magic = -1;
+    filesystem_.PWrite(file_path_.c_str(), /*offset=*/0, &invalid_magic,
+                       sizeof(invalid_magic));
+  }
+
+  {
+    // Reinitialize the same proto_log
+    ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+                    &filesystem_, file_path_,
+                    PortableFileBackedProtoLog<DocumentProto>::Options(
+                        compress_, max_proto_size_)),
+                StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+                         HasSubstr("Invalid header kMagic")));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, CorruptContent) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.has_data_loss());
+
+    DocumentProto document =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write and persist an document.
+    ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
+                               proto_log->WriteProto(document));
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+    // "Corrupt" the content written in the log.
+    document.set_uri("invalid");
+    std::string serialized_document = document.SerializeAsString();
+    filesystem_.PWrite(file_path_.c_str(), document_offset,
+                       serialized_document.data(), serialized_document.size());
+  }
+
+  {
+    // We can recover, but we have data loss.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_TRUE(create_result.has_data_loss());
+    ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+
+    // Lost everything in the log since the rewind position doesn't help if
+    // there's been data corruption within the persisted region
+    ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
+              kHeaderReservedBytes);
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, PersistToDisk) {
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace1", "uri1").Build();
+  DocumentProto document2 =
+      DocumentBuilder().SetKey("namespace2", "uri2").Build();
+  int document1_offset, document2_offset;
+  int log_size;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Write and persist the first proto
+    ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
+                               proto_log->WriteProto(document1));
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+    // Write, but don't explicitly persist the second proto
+    ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+                               proto_log->WriteProto(document2));
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(document1_offset),
+                IsOkAndHolds(EqualsProto(document1)));
+    ASSERT_THAT(proto_log->ReadProto(document2_offset),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    log_size = filesystem_.GetFileSize(file_path_.c_str());
+    ASSERT_GT(log_size, 0);
+  }
+
+  {
+    // The header rewind position and checksum aren't updated in this "system
+    // crash" scenario.
+
+    std::string bad_proto =
+        "some incomplete proto that we didn't finish writing before the "
+        "system crashed";
+    filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
+                       bad_proto.size());
+
+    // Double check that we actually wrote something to the underlying file
+    ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
+  }
+
+  {
+    // We can recover, but we have data loss
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_TRUE(create_result.has_data_loss());
+    ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
+
+    // Check that everything was persisted across instances
+    ASSERT_THAT(proto_log->ReadProto(document1_offset),
+                IsOkAndHolds(EqualsProto(document1)));
+    ASSERT_THAT(proto_log->ReadProto(document2_offset),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    // We correctly rewound to the last good state.
+    ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, Iterator) {
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace", "uri1").Build();
+  DocumentProto document2 =
+      DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size_)));
+  auto proto_log = std::move(create_result.proto_log);
+  ASSERT_FALSE(create_result.has_data_loss());
+
+  {
+    // Empty iterator
+    auto iterator = proto_log->GetIterator();
+    ASSERT_THAT(iterator.Advance(),
+                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  }
+
+  {
+    // Iterates through some documents
+    ICING_ASSERT_OK(proto_log->WriteProto(document1));
+    ICING_ASSERT_OK(proto_log->WriteProto(document2));
+    auto iterator = proto_log->GetIterator();
+    // 1st proto
+    ICING_ASSERT_OK(iterator.Advance());
+    ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+                IsOkAndHolds(EqualsProto(document1)));
+    // 2nd proto
+    ICING_ASSERT_OK(iterator.Advance());
+    ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+                IsOkAndHolds(EqualsProto(document2)));
+    // Tries to advance
+    ASSERT_THAT(iterator.Advance(),
+                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  }
+
+  {
+    // Iterator with bad filesystem
+    MockFilesystem mock_filesystem;
+    ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
+        .WillByDefault(Return(Filesystem::kBadFileSize));
+    PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
+        mock_filesystem, file_path_, /*initial_offset=*/0);
+    ASSERT_THAT(bad_iterator.Advance(),
+                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ComputeChecksum) {
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+  Crc32 checksum;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    ICING_EXPECT_OK(proto_log->WriteProto(document));
+
+    ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
+
+    // Calling it twice with no changes should get us the same checksum
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+  }
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Checksum should be consistent across instances
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+    // PersistToDisk shouldn't affect the checksum value
+    ICING_EXPECT_OK(proto_log->PersistToDisk());
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+    // Check that modifying the log leads to a different checksum
+    ICING_EXPECT_OK(proto_log->WriteProto(document));
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldSetZero) {
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace", "uri1").Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size_)));
+  auto proto_log = std::move(create_result.proto_log);
+  ASSERT_FALSE(create_result.has_data_loss());
+
+  // Writes and erases proto
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+                             proto_log->WriteProto(document1));
+  ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+  // Checks if the erased area is set to 0.
+  int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+  MemoryMappedFile mmapped_file(filesystem_, file_path_,
+                                MemoryMappedFile::Strategy::READ_ONLY);
+
+  // document1_offset + sizeof(int) is the start byte of the proto where
+  // sizeof(int) is the size of the proto metadata.
+  mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
+  for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
+    ASSERT_THAT(mmapped_file.region()[i], Eq(0));
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace", "uri1").Build();
+  DocumentProto document2 =
+      DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size_)));
+  auto proto_log = std::move(create_result.proto_log);
+  ASSERT_FALSE(create_result.has_data_loss());
+
+  // Writes 2 protos
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+                             proto_log->WriteProto(document1));
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
+                             proto_log->WriteProto(document2));
+
+  // Erases the first proto
+  ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+  // The first proto has been erased.
+  ASSERT_THAT(proto_log->ReadProto(document1_offset),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  // The second proto should be returned.
+  ASSERT_THAT(proto_log->ReadProto(document2_offset),
+              IsOkAndHolds(EqualsProto(document2)));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace", "uri1").Build();
+  DocumentProto document2 =
+      DocumentBuilder().SetKey("namespace", "uri2").Build();
+  DocumentProto document3 =
+      DocumentBuilder().SetKey("namespace", "uri3").Build();
+  DocumentProto document4 =
+      DocumentBuilder().SetKey("namespace", "uri4").Build();
+
+  int64_t document2_offset;
+  int64_t document3_offset;
+
+  {
+    // Erase data after the rewind position. This won't update the checksum
+    // immediately.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Writes 3 protos
+    ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+                               proto_log->WriteProto(document1));
+    ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+                               proto_log->WriteProto(document2));
+    ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+                               proto_log->WriteProto(document3));
+
+    // Erases the 1st proto, checksum won't be updated immediately because the
+    // rewind position is 0.
+    ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+    EXPECT_THAT(proto_log->ComputeChecksum(),
+                IsOkAndHolds(Eq(Crc32(2175574628))));
+  }  // New checksum is updated in destructor.
+
+  {
+    // Erase data before the rewind position. This will update the checksum
+    // immediately.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Erases the 2nd proto that is now before the rewind position. Checksum
+    // is updated.
+    ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
+
+    EXPECT_THAT(proto_log->ComputeChecksum(),
+                IsOkAndHolds(Eq(Crc32(790877774))));
+  }
+
+  {
+    // Append data and erase data before the rewind position. This will update
+    // the checksum twice: in EraseProto() and destructor.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // Append a new document which is after the rewind position.
+    ICING_ASSERT_OK(proto_log->WriteProto(document4));
+
+    // Erases the 3rd proto that is now before the rewind position. Checksum
+    // is updated.
+    ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
+
+    EXPECT_THAT(proto_log->ComputeChecksum(),
+                IsOkAndHolds(Eq(Crc32(2344803210))));
+  }  // Checksum is updated with the newly appended document.
+
+  {
+    // A successful creation means that the checksum matches.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.has_data_loss());
+  }
+}
+
+}  // namespace
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 44241bc..e9865e4 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -27,6 +27,7 @@
 #include "icing/absl_ports/canonical_errors.h"
 #include "icing/absl_ports/mutex.h"
 #include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-file.h"
 #include "icing/file/file-backed-proto.h"
 #include "icing/file/filesystem.h"
 #include "icing/index/hit/doc-hit-info.h"
@@ -75,8 +76,7 @@
 constexpr std::string_view kDocumentSubfolderName = "document_dir";
 constexpr std::string_view kIndexSubfolderName = "index_dir";
 constexpr std::string_view kSchemaSubfolderName = "schema_dir";
-constexpr std::string_view kIcingSearchEngineHeaderFilename =
-    "icing_search_engine_header";
+constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker";
 constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
 
 libtextclassifier3::Status ValidateOptions(
@@ -137,10 +137,6 @@
   return index_processor_options;
 }
 
-std::string MakeHeaderFilename(const std::string& base_dir) {
-  return absl_ports::StrCat(base_dir, "/", kIcingSearchEngineHeaderFilename);
-}
-
 // Document store files are in a standalone subfolder for easier file
 // management. We can delete and recreate the subfolder and not touch/affect
 // anything else.
@@ -168,6 +164,9 @@
 std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
   return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
 }
+std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
+}
 
 void TransformStatus(const libtextclassifier3::Status& internal_status,
                      StatusProto* status_proto) {
@@ -298,69 +297,6 @@
   }
 
   libtextclassifier3::Status status = InitializeMembers(initialize_stats);
-  if (!status.ok()) {
-    TransformStatus(status, result_status);
-    initialize_stats->set_latency_ms(
-        initialize_timer->GetElapsedMilliseconds());
-    return result_proto;
-  }
-
-  // Even if each subcomponent initialized fine independently, we need to
-  // check if they're consistent with each other.
-  if (!CheckConsistency().ok()) {
-    // The total checksum doesn't match the stored value, it could be one of the
-    // following cases:
-    // 1. Icing is initialized the first time in this directory.
-    // 2. Non-checksumed changes have been made to some files.
-    if (index_->last_added_document_id() == kInvalidDocumentId &&
-        document_store_->last_added_document_id() == kInvalidDocumentId &&
-        absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
-      // First time initialize. Not recovering but creating all the files.
-      // We need to explicitly clear the recovery-related fields because some
-      // sub-components may not be able to tell if the storage is being
-      // initialized the first time or has lost some files. Sub-components may
-      // already have set these fields in earlier steps.
-      *initialize_stats = InitializeStatsProto();
-      status = RegenerateDerivedFiles();
-    } else {
-      ICING_VLOG(1)
-          << "IcingSearchEngine in inconsistent state, regenerating all "
-             "derived data";
-      // Total checksum mismatch may not be the root cause of document store
-      // recovery. Preserve the root cause that was set by the document store.
-      bool should_log_document_store_recovery_cause =
-          initialize_stats->document_store_recovery_cause() ==
-          InitializeStatsProto::NONE;
-      if (should_log_document_store_recovery_cause) {
-        initialize_stats->set_document_store_recovery_cause(
-            InitializeStatsProto::TOTAL_CHECKSUM_MISMATCH);
-      }
-      initialize_stats->set_index_restoration_cause(
-          InitializeStatsProto::TOTAL_CHECKSUM_MISMATCH);
-      status = RegenerateDerivedFiles(initialize_stats,
-                                      should_log_document_store_recovery_cause);
-    }
-  } else {
-    DocumentId last_stored_document_id =
-        document_store_->last_added_document_id();
-    DocumentId last_indexed_document_id = index_->last_added_document_id();
-    if (last_stored_document_id != last_indexed_document_id) {
-      if (last_stored_document_id == kInvalidDocumentId) {
-        // Document store is empty but index is not. Reset the index.
-        status = index_->Reset();
-      } else {
-        // Index is inconsistent with the document store, we need to restore the
-        // index.
-        initialize_stats->set_index_restoration_cause(
-            InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH);
-        std::unique_ptr<Timer> index_restore_timer = clock_->GetNewTimer();
-        status = RestoreIndexIfNeeded();
-        initialize_stats->set_index_restoration_latency_ms(
-            index_restore_timer->GetElapsedMilliseconds());
-      }
-    }
-  }
-
   if (status.ok() || absl_ports::IsDataLoss(status)) {
     initialized_ = true;
   }
@@ -374,10 +310,6 @@
   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
   ICING_RETURN_IF_ERROR(InitializeOptions());
   ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
-  ICING_RETURN_IF_ERROR(InitializeDocumentStore(initialize_stats));
-
-  result_state_manager_ = std::make_unique<ResultStateManager>(
-      performance_configuration_.max_num_total_hits, *document_store_);
 
   // TODO(b/156383798) : Resolve how to specify the locale.
   language_segmenter_factory::SegmenterOptions segmenter_options(
@@ -388,9 +320,75 @@
   TC3_ASSIGN_OR_RETURN(normalizer_,
                        normalizer_factory::Create(options_.max_token_length()));
 
-  ICING_RETURN_IF_ERROR(InitializeIndex(initialize_stats));
+  std::string marker_filepath =
+      MakeSetSchemaMarkerFilePath(options_.base_dir());
+  libtextclassifier3::Status status;
+  if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
+    // The schema was either lost or never set before. Wipe out the doc store
+    // and index directories and initialize them from scratch.
+    const std::string doc_store_dir =
+        MakeDocumentDirectoryPath(options_.base_dir());
+    const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
+    if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) ||
+        !filesystem_->DeleteDirectoryRecursively(index_dir.c_str())) {
+      return absl_ports::InternalError(absl_ports::StrCat(
+          "Could not delete directories: ", index_dir, " and ", doc_store_dir));
+    }
+    ICING_RETURN_IF_ERROR(InitializeDocumentStore(
+        /*force_recovery_and_revalidate_documents=*/false, initialize_stats));
+    status = InitializeIndex(initialize_stats);
+  } else if (filesystem_->FileExists(marker_filepath.c_str())) {
+    // If the marker file is still around then something wonky happened when we
+    // last tried to set the schema.
+    ICING_RETURN_IF_ERROR(InitializeDocumentStore(
+        /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
+    initialize_stats->set_document_store_recovery_cause(
+        InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
 
-  return libtextclassifier3::Status::OK;
+    // We're going to need to build the index from scratch. So just delete its
+    // files now.
+    const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
+    Index::Options index_options(index_dir, options_.index_merge_size());
+    if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
+        !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
+      return absl_ports::InternalError(
+          absl_ports::StrCat("Could not recreate directory: ", index_dir));
+    }
+    ICING_ASSIGN_OR_RETURN(index_,
+                           Index::Create(index_options, filesystem_.get(),
+                                         icing_filesystem_.get()));
+
+    std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
+    IndexRestorationResult restore_result = RestoreIndexIfNeeded();
+    status = std::move(restore_result.status);
+    // DATA_LOSS means that we have successfully initialized and re-added
+    // content to the index. Some indexed content was lost, but otherwise the
+    // index is in a valid state and can be queried.
+    if (!status.ok() && !absl_ports::IsDataLoss(status)) {
+      return status;
+    }
+
+    // Delete the marker file to indicate that everything is now in sync with
+    // whatever changes were made to the schema.
+    filesystem_->DeleteFile(marker_filepath.c_str());
+
+    initialize_stats->set_index_restoration_latency_ms(
+        restore_timer->GetElapsedMilliseconds());
+    initialize_stats->set_index_restoration_cause(
+        InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
+  } else {
+    ICING_RETURN_IF_ERROR(InitializeDocumentStore(
+        /*force_recovery_and_revalidate_documents=*/false, initialize_stats));
+    status = InitializeIndex(initialize_stats);
+    if (!status.ok() && !absl_ports::IsDataLoss(status)) {
+      return status;
+    }
+  }
+
+  result_state_manager_ = std::make_unique<ResultStateManager>(
+      performance_configuration_.max_num_total_hits, *document_store_);
+
+  return status;
 }
 
 libtextclassifier3::Status IcingSearchEngine::InitializeOptions() {
@@ -424,6 +422,7 @@
 }
 
 libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore(
+    bool force_recovery_and_revalidate_documents,
     InitializeStatsProto* initialize_stats) {
   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
 
@@ -436,8 +435,9 @@
   }
   ICING_ASSIGN_OR_RETURN(
       DocumentStore::CreateResult create_result,
-      DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(),
-                            schema_store_.get(), initialize_stats));
+      DocumentStore::Create(
+          filesystem_.get(), document_dir, clock_.get(), schema_store_.get(),
+          force_recovery_and_revalidate_documents, initialize_stats));
   document_store_ = std::move(create_result.document_store);
 
   return libtextclassifier3::Status::OK;
@@ -455,6 +455,7 @@
   }
   Index::Options index_options(index_dir, options_.index_merge_size());
 
+  InitializeStatsProto::RecoveryCause recovery_cause;
   auto index_or =
       Index::Create(index_options, filesystem_.get(), icing_filesystem_.get());
   if (!index_or.ok()) {
@@ -464,88 +465,28 @@
           absl_ports::StrCat("Could not recreate directory: ", index_dir));
     }
 
-    initialize_stats->set_index_restoration_cause(
-        InitializeStatsProto::IO_ERROR);
+    recovery_cause = InitializeStatsProto::IO_ERROR;
 
     // Try recreating it from scratch and re-indexing everything.
     ICING_ASSIGN_OR_RETURN(index_,
                            Index::Create(index_options, filesystem_.get(),
                                          icing_filesystem_.get()));
-
-    std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
-    ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded());
-    initialize_stats->set_index_restoration_latency_ms(
-        restore_timer->GetElapsedMilliseconds());
   } else {
     // Index was created fine.
     index_ = std::move(index_or).ValueOrDie();
+    // If a recover does have to happen, then it must be because the index is
+    // out of sync with the document store.
+    recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
   }
 
-  return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status IcingSearchEngine::CheckConsistency() {
-  if (!HeaderExists()) {
-    // Without a header file, we have no checksum and can't even detect
-    // inconsistencies
-    return absl_ports::NotFoundError("No header file found.");
-  }
-
-  // Header does exist, verify that the header looks fine.
-  IcingSearchEngine::Header header;
-  if (!filesystem_->Read(MakeHeaderFilename(options_.base_dir()).c_str(),
-                         &header, sizeof(header))) {
-    return absl_ports::InternalError(absl_ports::StrCat(
-        "Couldn't read: ", MakeHeaderFilename(options_.base_dir())));
-  }
-
-  if (header.magic != IcingSearchEngine::Header::kMagic) {
-    return absl_ports::InternalError(
-        absl_ports::StrCat("Invalid header kMagic for file: ",
-                           MakeHeaderFilename(options_.base_dir())));
-  }
-
-  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
-  if (checksum.Get() != header.checksum) {
-    return absl_ports::InternalError(
-        "IcingSearchEngine checksum doesn't match");
-  }
-
-  return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles(
-    InitializeStatsProto* initialize_stats, bool log_document_store_stats) {
-  // Measure the latency of the data recovery. The cause of the recovery should
-  // be logged by the caller.
-  std::unique_ptr<Timer> timer = clock_->GetNewTimer();
-  ICING_RETURN_IF_ERROR(
-      document_store_->UpdateSchemaStore(schema_store_.get()));
-  if (initialize_stats != nullptr && log_document_store_stats) {
-    initialize_stats->set_document_store_recovery_latency_ms(
-        timer->GetElapsedMilliseconds());
-  }
-  // Restart timer.
-  timer = clock_->GetNewTimer();
-  ICING_RETURN_IF_ERROR(index_->Reset());
-  ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded());
-  if (initialize_stats != nullptr) {
+  std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
+  IndexRestorationResult restore_result = RestoreIndexIfNeeded();
+  if (restore_result.needed_restoration) {
     initialize_stats->set_index_restoration_latency_ms(
-        timer->GetElapsedMilliseconds());
+        restore_timer->GetElapsedMilliseconds());
+    initialize_stats->set_index_restoration_cause(recovery_cause);
   }
-
-  const std::string header_file =
-      MakeHeaderFilename(options_.base_dir().c_str());
-  if (HeaderExists()) {
-    if (!filesystem_->DeleteFile(header_file.c_str())) {
-      return absl_ports::InternalError(
-          absl_ports::StrCat("Unable to delete file: ", header_file));
-    }
-  }
-  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
-  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
-
-  return libtextclassifier3::Status::OK;
+  return restore_result.status;
 }
 
 SetSchemaResultProto IcingSearchEngine::SetSchema(
@@ -580,6 +521,15 @@
   }
   bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
 
+  std::string marker_filepath =
+      MakeSetSchemaMarkerFilePath(options_.base_dir());
+  // Create the marker file indicating that we are going to apply a schema
+  // change. No need to write anything to the marker file - its existence is the
+  // only thing that matters. The marker file is used to indicate if we
+  // encountered a crash or a power loss while updating the schema and other
+  // files. So set it up to be deleted as long as we return from this function.
+  DestructibleFile marker_file(marker_filepath, filesystem_.get());
+
   auto set_schema_result_or = schema_store_->SetSchema(
       std::move(new_schema), ignore_errors_and_delete_documents);
   if (!set_schema_result_or.ok()) {
@@ -627,8 +577,12 @@
         return result_proto;
       }
 
-      status = RestoreIndexIfNeeded();
-      if (!status.ok()) {
+      IndexRestorationResult restore_result = RestoreIndexIfNeeded();
+      // DATA_LOSS means that we have successfully re-added content to the
+      // index. Some indexed content was lost, but otherwise the index is in a
+      // valid state and can be queried.
+      if (!restore_result.status.ok() &&
+          !absl_ports::IsDataLoss(restore_result.status)) {
         TransformStatus(status, result_status);
         return result_proto;
       }
@@ -639,6 +593,7 @@
     result_status->set_code(StatusProto::FAILED_PRECONDITION);
     result_status->set_message("Schema is incompatible.");
   }
+
   return result_proto;
 }
 
@@ -1095,14 +1050,18 @@
     return result_proto;
   }
 
-  libtextclassifier3::Status index_restoration_status = RestoreIndexIfNeeded();
+  IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
   optimize_stats->set_index_restoration_latency_ms(
       optimize_index_timer->GetElapsedMilliseconds());
-  if (!index_restoration_status.ok()) {
+  // DATA_LOSS means that we have successfully re-added content to the index.
+  // Some indexed content was lost, but otherwise the index is in a valid state
+  // and can be queried.
+  if (!index_restoration_status.status.ok() &&
+      !absl_ports::IsDataLoss(index_restoration_status.status)) {
     status = absl_ports::Annotate(
         absl_ports::InternalError(
             "Failed to reindex documents after optimization."),
-        index_restoration_status.error_message());
+        index_restoration_status.status.error_message());
 
     TransformStatus(status, result_status);
     return result_proto;
@@ -1240,72 +1199,6 @@
   ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(PersistType::FULL));
   ICING_RETURN_IF_ERROR(index_->PersistToDisk());
 
-  // Update the combined checksum and write to header file.
-  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
-  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
-
-  return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::StatusOr<Crc32> IcingSearchEngine::ComputeChecksum() {
-  Crc32 total_checksum;
-  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
-  // that can support error logging.
-  auto checksum_or = schema_store_->ComputeChecksum();
-  if (!checksum_or.ok()) {
-    ICING_LOG(ERROR) << checksum_or.status().error_message()
-                     << "Failed to compute checksum of SchemaStore";
-    return checksum_or.status();
-  }
-
-  Crc32 schema_store_checksum = std::move(checksum_or).ValueOrDie();
-
-  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
-  // that can support error logging.
-  checksum_or = document_store_->ComputeChecksum();
-  if (!checksum_or.ok()) {
-    ICING_LOG(ERROR) << checksum_or.status().error_message()
-                     << "Failed to compute checksum of DocumentStore";
-    return checksum_or.status();
-  }
-  Crc32 document_store_checksum = std::move(checksum_or).ValueOrDie();
-
-  total_checksum.Append(std::to_string(document_store_checksum.Get()));
-  total_checksum.Append(std::to_string(schema_store_checksum.Get()));
-
-  return total_checksum;
-}
-
-bool IcingSearchEngine::HeaderExists() {
-  if (!filesystem_->FileExists(
-          MakeHeaderFilename(options_.base_dir()).c_str())) {
-    return false;
-  }
-
-  int64_t file_size =
-      filesystem_->GetFileSize(MakeHeaderFilename(options_.base_dir()).c_str());
-
-  // If it's been truncated to size 0 before, we consider it to be a new file
-  return file_size != 0 && file_size != Filesystem::kBadFileSize;
-}
-
-libtextclassifier3::Status IcingSearchEngine::UpdateHeader(
-    const Crc32& checksum) {
-  // Write the header
-  IcingSearchEngine::Header header;
-  header.magic = IcingSearchEngine::Header::kMagic;
-  header.checksum = checksum.Get();
-
-  // This should overwrite the header.
-  ScopedFd sfd(filesystem_->OpenForWrite(
-      MakeHeaderFilename(options_.base_dir()).c_str()));
-  if (!sfd.is_valid() ||
-      !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
-      !filesystem_->DataSync(sfd.get())) {
-    return absl_ports::InternalError(
-        absl_ports::StrCat("Failed to write IcingSearchEngine header: ",
-                           MakeHeaderFilename(options_.base_dir())));
-  }
   return libtextclassifier3::Status::OK;
 }
 
@@ -1323,6 +1216,7 @@
   }
 
   QueryStatsProto* query_stats = result_proto.mutable_query_stats();
+  query_stats->set_query_length(search_spec.query().length());
   std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
 
   libtextclassifier3::Status status = ValidateResultSpec(result_spec);
@@ -1650,19 +1544,20 @@
   return libtextclassifier3::Status::OK;
 }
 
-libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() {
+IcingSearchEngine::IndexRestorationResult
+IcingSearchEngine::RestoreIndexIfNeeded() {
   DocumentId last_stored_document_id =
       document_store_->last_added_document_id();
   DocumentId last_indexed_document_id = index_->last_added_document_id();
 
   if (last_stored_document_id == last_indexed_document_id) {
     // No need to recover.
-    return libtextclassifier3::Status::OK;
+    return {libtextclassifier3::Status::OK, false};
   }
 
   if (last_stored_document_id == kInvalidDocumentId) {
     // Document store is empty but index is not. Reset the index.
-    return index_->Reset();
+    return {index_->Reset(), false};
   }
 
   // TruncateTo ensures that the index does not hold any data that is not
@@ -1671,17 +1566,29 @@
   // lost documents. If the index does not contain any hits for documents with
   // document id greater than last_stored_document_id, then TruncateTo will have
   // no effect.
-  ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id));
+  auto status = index_->TruncateTo(last_stored_document_id);
+  if (!status.ok()) {
+    return {status, false};
+  }
+  // Last indexed document id may have changed thanks to TruncateTo.
+  last_indexed_document_id = index_->last_added_document_id();
   DocumentId first_document_to_reindex =
       (last_indexed_document_id != kInvalidDocumentId)
           ? index_->last_added_document_id() + 1
           : kMinDocumentId;
+  if (first_document_to_reindex > last_stored_document_id) {
+    // Nothing to restore. Just return.
+    return {libtextclassifier3::Status::OK, false};
+  }
 
-  ICING_ASSIGN_OR_RETURN(
-      std::unique_ptr<IndexProcessor> index_processor,
-      IndexProcessor::Create(normalizer_.get(), index_.get(),
-                             CreateIndexProcessorOptions(options_),
-                             clock_.get()));
+  auto index_processor_or = IndexProcessor::Create(
+      normalizer_.get(), index_.get(), CreateIndexProcessorOptions(options_),
+      clock_.get());
+  if (!index_processor_or.ok()) {
+    return {index_processor_or.status(), true};
+  }
+  std::unique_ptr<IndexProcessor> index_processor =
+      std::move(index_processor_or).ValueOrDie();
 
   ICING_VLOG(1) << "Restoring index by replaying documents from document id "
                 << first_document_to_reindex << " to document id "
@@ -1699,7 +1606,7 @@
         continue;
       } else {
         // Returns other errors
-        return document_or.status();
+        return {document_or.status(), true};
       }
     }
     DocumentProto document(std::move(document_or).ValueOrDie());
@@ -1709,7 +1616,7 @@
                                   language_segmenter_.get(),
                                   std::move(document));
     if (!tokenized_document_or.ok()) {
-      return tokenized_document_or.status();
+      return {tokenized_document_or.status(), true};
     }
     TokenizedDocument tokenized_document(
         std::move(tokenized_document_or).ValueOrDie());
@@ -1719,7 +1626,7 @@
     if (!status.ok()) {
       if (!absl_ports::IsDataLoss(status)) {
         // Real error. Stop recovering and pass it up.
-        return status;
+        return {status, true};
       }
       // Just a data loss. Keep trying to add the remaining docs, but report the
       // data loss when we're done.
@@ -1727,7 +1634,7 @@
     }
   }
 
-  return overall_status;
+  return {overall_status, true};
 }
 
 libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index fa1e0c8..3dc7e29 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -53,16 +53,6 @@
 // TODO(cassiewang) Top-level comments and links to design-doc.
 class IcingSearchEngine {
  public:
-  struct Header {
-    static constexpr int32_t kMagic = 0x6e650d0a;
-
-    // Holds the magic as a quick sanity check against file corruption.
-    int32_t magic;
-
-    // Checksum of the IcingSearchEngine's sub-component's checksums.
-    uint32_t checksum;
-  };
-
   // Note: It is only required to provide a pointer to a valid instance of
   // JniCache if this instance needs to perform reverse-jni calls. Users on
   // Linux and iOS should always provide a nullptr.
@@ -508,11 +498,15 @@
   // Do any initialization/recovery necessary to create a DocumentStore
   // instance.
   //
+  // See comments on DocumentStore::Create for explanation of
+  // force_recovery_and_revalidate_documents.
+  //
   // Returns:
   //   OK on success
   //   FAILED_PRECONDITION if initialize_stats is null
   //   INTERNAL on I/O error
   libtextclassifier3::Status InitializeDocumentStore(
+      bool force_recovery_and_revalidate_documents,
       InitializeStatsProto* initialize_stats)
       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
@@ -577,29 +571,19 @@
   // call Index::Reset first.
   //
   // Returns:
-  //   OK on success
+  //   On success, OK and a bool indicating whether or not restoration was
+  //     needed.
+  //   DATA_LOSS, if an error during index merging caused us to lose indexed
+  //     data in the main index. Despite the data loss, this is still considered
+  //     a successful run and needed_restoration will be set to true.
   //   RESOURCE_EXHAUSTED if the index fills up before finishing indexing
   //   NOT_FOUND if some Document's schema type is not in the SchemaStore
   //   INTERNAL_ERROR on any IO errors
-  libtextclassifier3::Status RestoreIndexIfNeeded()
-      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Computes the combined checksum of the IcingSearchEngine - includes all its
-  // subcomponents
-  //
-  // Returns:
-  //   Combined checksum on success
-  //   INTERNAL_ERROR on compute error
-  libtextclassifier3::StatusOr<Crc32> ComputeChecksum()
-      ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Checks if the header exists already. This does not create the header file
-  // if it doesn't exist.
-  bool HeaderExists() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Update, replace and persist the header file. Creates the header file if it
-  // doesn't exist.
-  libtextclassifier3::Status UpdateHeader(const Crc32& checksum)
+  struct IndexRestorationResult {
+    libtextclassifier3::Status status;
+    bool needed_restoration;
+  };
+  IndexRestorationResult RestoreIndexIfNeeded()
       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // If we lost the schema during a previous failure, it may "look" the same as
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 9f8b7fa..c1de0f0 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -30,6 +30,7 @@
 #include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/legacy/index/icing-mock-filesystem.h"
 #include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
 #include "icing/proto/document.pb.h"
 #include "icing/proto/initialize.pb.h"
 #include "icing/proto/optimize.pb.h"
@@ -44,7 +45,6 @@
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
 #include "icing/testing/jni-test-helpers.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/random-string.h"
 #include "icing/testing/snippet-helpers.h"
 #include "icing/testing/test-data.h"
@@ -92,11 +92,16 @@
     PropertyConfigProto_Cardinality_Code_OPTIONAL;
 constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
     PropertyConfigProto_Cardinality_Code_REQUIRED;
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
+    PropertyConfigProto_Cardinality_Code_REPEATED;
 
 constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
     StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE =
+    StringIndexingConfig_TokenizerType_Code_NONE;
 
 constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN;
 
 // For mocking purpose, we allow tests to provide a custom Filesystem.
 class TestIcingSearchEngine : public IcingSearchEngine {
@@ -3384,125 +3389,6 @@
               ProtoIsOk());
 }
 
-TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderMagic) {
-  SearchSpecProto search_spec;
-  search_spec.set_query("message");
-  search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
-  SearchResultProto expected_search_result_proto;
-  expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-  *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
-      CreateMessageDocument("namespace", "uri");
-
-  GetResultProto expected_get_result_proto;
-  expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
-  *expected_get_result_proto.mutable_document() =
-      CreateMessageDocument("namespace", "uri");
-
-  {
-    // Basic initialization/setup
-    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
-    EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
-    EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
-    EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
-                ProtoIsOk());
-    EXPECT_THAT(
-        icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
-        EqualsProto(expected_get_result_proto));
-    SearchResultProto search_result_proto =
-        icing.Search(search_spec, GetDefaultScoringSpec(),
-                     ResultSpecProto::default_instance());
-    EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
-                                         expected_search_result_proto));
-  }  // This should shut down IcingSearchEngine and persist anything it needs to
-
-  // Change the header's magic value
-  int32_t invalid_magic = 1;  // Anything that's not the actual kMagic value.
-  filesystem()->PWrite(GetHeaderFilename().c_str(),
-                       offsetof(IcingSearchEngine::Header, magic),
-                       &invalid_magic, sizeof(invalid_magic));
-
-  // We should be able to recover from this and access all our previous data
-  IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
-  EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
-
-  // Checks that DocumentLog is still ok
-  EXPECT_THAT(
-      icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
-      EqualsProto(expected_get_result_proto));
-
-  // Checks that the index is still ok so we can search over it
-  SearchResultProto search_result_proto =
-      icing.Search(search_spec, GetDefaultScoringSpec(),
-                   ResultSpecProto::default_instance());
-  EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
-                                       expected_search_result_proto));
-
-  // Checks that Schema is still since it'll be needed to validate the document
-  EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
-              ProtoIsOk());
-}
-
-TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderChecksum) {
-  SearchSpecProto search_spec;
-  search_spec.set_query("message");
-  search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
-  SearchResultProto expected_search_result_proto;
-  expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-  *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
-      CreateMessageDocument("namespace", "uri");
-
-  GetResultProto expected_get_result_proto;
-  expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
-  *expected_get_result_proto.mutable_document() =
-      CreateMessageDocument("namespace", "uri");
-
-  {
-    // Basic initialization/setup
-    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
-    EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
-    EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
-    EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
-                ProtoIsOk());
-    EXPECT_THAT(
-        icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
-        EqualsProto(expected_get_result_proto));
-    SearchResultProto search_result_proto =
-        icing.Search(search_spec, GetDefaultScoringSpec(),
-                     ResultSpecProto::default_instance());
-    EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
-                                         expected_search_result_proto));
-  }  // This should shut down IcingSearchEngine and persist anything it needs to
-
-  // Change the header's checksum value
-  uint32_t invalid_checksum =
-      1;  // Anything that's not the actual checksum value
-  filesystem()->PWrite(GetHeaderFilename().c_str(),
-                       offsetof(IcingSearchEngine::Header, checksum),
-                       &invalid_checksum, sizeof(invalid_checksum));
-
-  // We should be able to recover from this and access all our previous data
-  IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
-  EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
-
-  // Checks that DocumentLog is still ok
-  EXPECT_THAT(
-      icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
-      EqualsProto(expected_get_result_proto));
-
-  // Checks that the index is still ok so we can search over it
-  SearchResultProto search_result_proto =
-      icing.Search(search_spec, GetDefaultScoringSpec(),
-                   ResultSpecProto::default_instance());
-  EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
-                                       expected_search_result_proto));
-
-  // Checks that Schema is still since it'll be needed to validate the document
-  EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
-              ProtoIsOk());
-}
-
 TEST_F(IcingSearchEngineTest, UnableToRecoverFromCorruptSchema) {
   {
     // Basic initialization/setup
@@ -3574,9 +3460,10 @@
           .SetCreationTimestampMs(kDefaultCreationTimestampMs)
           .Build();
 
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
   {
     // Initializes folder and schema
-    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+    IcingSearchEngine icing(options, GetTestJniCache());
     EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
 
     SchemaProto schema;
@@ -3650,6 +3537,13 @@
     property->mutable_string_indexing_config()->set_tokenizer_type(
         StringIndexingConfig::TokenizerType::PLAIN);
 
+    // Write the marker file
+    std::string marker_filepath =
+        absl_ports::StrCat(options.base_dir(), "/set_schema_marker");
+    ScopedFd sfd(filesystem()->OpenForWrite(marker_filepath.c_str()));
+    ASSERT_TRUE(sfd.is_valid());
+
+    // Write the new schema
     FakeClock fake_clock;
     ICING_ASSERT_OK_AND_ASSIGN(
         std::unique_ptr<SchemaStore> schema_store,
@@ -5217,7 +5111,7 @@
   EXPECT_THAT(icing.SetSchema(empty_schema).status(), ProtoIsOk());
 }
 
-TEST_F(IcingSearchEngineTest, ResetAbortedError) {
+TEST_F(IcingSearchEngineTest, ResetDeleteFailureCausesAbortedError) {
   auto mock_filesystem = std::make_unique<MockFilesystem>();
 
   // This fails IcingSearchEngine::Reset(). But since we didn't actually delete
@@ -5251,22 +5145,27 @@
               ProtoIsOk());
 }
 
-TEST_F(IcingSearchEngineTest, ResetInternalError) {
+TEST_F(IcingSearchEngineTest, ResetCreateFailureCausesInternalError) {
   auto mock_filesystem = std::make_unique<MockFilesystem>();
 
-  // Let all other calls succeed.
-  EXPECT_CALL(*mock_filesystem, Write(Matcher<const char*>(_), _, _))
+  // Let all other delete directory calls succeed.
+  EXPECT_CALL(*mock_filesystem,
+              DeleteDirectoryRecursively(Matcher<const char*>(_)))
       .WillRepeatedly(Return(true));
 
-  // This prevents IcingSearchEngine from creating a DocumentStore instance on
-  // reinitialization
-  const std::string document_log_path =
-      GetTestBaseDir() + "/document_dir/document_log";
+  // This prevents IcingSearchEngine from deleting our base dir when resetting
+  EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(Matcher<const char*>(
+                                    StrEq(GetTestBaseDir().c_str()))))
+      .WillOnce(Return(false));
+
+  // The first call will show our base directory had 100 bytes, but after we
+  // falied to delete, we lost those 100 bytes. So this will be reported as an
+  // INTERNAL error since data was lost.
   EXPECT_CALL(
       *mock_filesystem,
-      Write(Matcher<const char*>(StrEq(document_log_path.c_str())), _, _))
-      .WillOnce(Return(true))
-      .WillOnce(Return(false));
+      GetDiskUsage(Matcher<const char*>(StrEq(GetTestBaseDir().c_str()))))
+      .WillOnce(Return(100))
+      .WillOnce(Return(0));
 
   TestIcingSearchEngine icing(GetDefaultIcingOptions(),
                               std::move(mock_filesystem),
@@ -5830,6 +5729,88 @@
   }
 }
 
+TEST_F(IcingSearchEngineTest,
+       DocumentWithNoIndexedContentDoesntCauseRestoreIndex) {
+  // 1. Create an index with a single document in it that has no indexed
+  // content.
+  {
+    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+    ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // Set a schema for a single type that has no indexed properties.
+    SchemaProto schema =
+        SchemaBuilder()
+            .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+                PropertyConfigBuilder()
+                    .SetName("unindexedField")
+                    .SetDataTypeString(MATCH_NONE, TOKENIZER_NONE)
+                    .SetCardinality(CARDINALITY_REQUIRED)))
+            .Build();
+    ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+    // Add a document that contains no indexed content.
+    DocumentProto document =
+        DocumentBuilder()
+            .SetKey("icing", "fake_type/0")
+            .SetSchema("Message")
+            .AddStringProperty("unindexedField",
+                               "Don't you dare search over this!")
+            .Build();
+    EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+  }
+
+  // 2. Create the index again. This should NOT trigger a recovery of any kind.
+  {
+    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+    InitializeResultProto init_result = icing.Initialize();
+    EXPECT_THAT(init_result.status(), ProtoIsOk());
+    EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+                Eq(InitializeStatsProto::NO_DATA_LOSS));
+    EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+                Eq(InitializeStatsProto::NONE));
+    EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+                Eq(InitializeStatsProto::NONE));
+    EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+                Eq(InitializeStatsProto::NONE));
+  }
+}
+
+TEST_F(IcingSearchEngineTest,
+       DocumentWithNoValidIndexedContentDoesntCauseRestoreIndex) {
+  // 1. Create an index with a single document in it that has no valid indexed
+  // tokens in its content.
+  {
+    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+    ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // Set a schema for a single type that has no indexed properties.
+    ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+    // Add a document that contains no valid indexed content - just punctuation.
+    DocumentProto document = DocumentBuilder()
+                                 .SetKey("icing", "fake_type/0")
+                                 .SetSchema("Message")
+                                 .AddStringProperty("body", "?...!")
+                                 .Build();
+    EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+  }
+
+  // 2. Create the index again. This should NOT trigger a recovery of any kind.
+  {
+    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+    InitializeResultProto init_result = icing.Initialize();
+    EXPECT_THAT(init_result.status(), ProtoIsOk());
+    EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+                Eq(InitializeStatsProto::NO_DATA_LOSS));
+    EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+                Eq(InitializeStatsProto::NONE));
+    EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+                Eq(InitializeStatsProto::NONE));
+    EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+                Eq(InitializeStatsProto::NONE));
+  }
+}
+
 TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) {
   DocumentProto document = DocumentBuilder()
                                .SetKey("icing", "fake_type/0")
@@ -6111,14 +6092,14 @@
     EXPECT_THAT(
         initialize_result_proto.initialize_stats().document_store_data_status(),
         Eq(InitializeStatsProto::COMPLETE_LOSS));
-    // The complete rewind of ground truth causes the mismatch of total
-    // checksum, so index should be restored.
+    // The complete rewind of ground truth causes us to clear the index, but
+    // that's not considered a restoration.
     EXPECT_THAT(
         initialize_result_proto.initialize_stats().index_restoration_cause(),
-        Eq(InitializeStatsProto::TOTAL_CHECKSUM_MISMATCH));
+        Eq(InitializeStatsProto::NONE));
     EXPECT_THAT(initialize_result_proto.initialize_stats()
                     .index_restoration_latency_ms(),
-                Eq(10));
+                Eq(0));
     EXPECT_THAT(initialize_result_proto.initialize_stats()
                     .schema_store_recovery_cause(),
                 Eq(InitializeStatsProto::NONE));
@@ -6185,26 +6166,51 @@
 }
 
 TEST_F(IcingSearchEngineTest,
-       InitializeShouldLogRecoveryCauseTotalChecksumMismatch) {
+       InitializeShouldLogRecoveryCauseSchemaChangesOutofSync) {
   DocumentProto document = DocumentBuilder()
                                .SetKey("icing", "fake_type/0")
                                .SetSchema("Message")
                                .AddStringProperty("body", "message body")
                                .Build();
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
   {
     // Initialize and put one document.
-    IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+    IcingSearchEngine icing(options, GetTestJniCache());
     ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
     ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
     ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
   }
 
   {
-    // Change the header's checksum value to a random value.
-    uint32_t invalid_checksum = 1;
-    filesystem()->PWrite(GetHeaderFilename().c_str(),
-                         offsetof(IcingSearchEngine::Header, checksum),
-                         &invalid_checksum, sizeof(invalid_checksum));
+    // Simulate a schema change where power is lost after the schema is written.
+    SchemaProto new_schema =
+        SchemaBuilder()
+            .AddType(
+                SchemaTypeConfigBuilder()
+                    .SetType("Message")
+                    .AddProperty(
+                        PropertyConfigBuilder()
+                            .SetName("body")
+                            .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+                            .SetCardinality(CARDINALITY_REQUIRED))
+                    .AddProperty(
+                        PropertyConfigBuilder()
+                            .SetName("subject")
+                            .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+                            .SetCardinality(CARDINALITY_OPTIONAL)))
+            .Build();
+    // Write the marker file
+    std::string marker_filepath =
+        absl_ports::StrCat(options.base_dir(), "/set_schema_marker");
+    ScopedFd sfd(filesystem()->OpenForWrite(marker_filepath.c_str()));
+    ASSERT_TRUE(sfd.is_valid());
+
+    // Write the new schema
+    FakeClock fake_clock;
+    ICING_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<SchemaStore> schema_store,
+        SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock));
+    ICING_EXPECT_OK(schema_store->SetSchema(new_schema));
   }
 
   {
@@ -6219,13 +6225,13 @@
     EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
     EXPECT_THAT(
         initialize_result_proto.initialize_stats().index_restoration_cause(),
-        Eq(InitializeStatsProto::TOTAL_CHECKSUM_MISMATCH));
+        Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
     EXPECT_THAT(initialize_result_proto.initialize_stats()
                     .index_restoration_latency_ms(),
                 Eq(10));
     EXPECT_THAT(initialize_result_proto.initialize_stats()
                     .document_store_recovery_cause(),
-                Eq(InitializeStatsProto::TOTAL_CHECKSUM_MISMATCH));
+                Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
     EXPECT_THAT(initialize_result_proto.initialize_stats()
                     .document_store_recovery_latency_ms(),
                 Eq(10));
@@ -6239,6 +6245,39 @@
                     .schema_store_recovery_latency_ms(),
                 Eq(0));
   }
+
+  {
+    // No recovery should be needed.
+    auto fake_clock = std::make_unique<FakeClock>();
+    fake_clock->SetTimerElapsedMilliseconds(10);
+    TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+                                std::make_unique<Filesystem>(),
+                                std::make_unique<IcingFilesystem>(),
+                                std::move(fake_clock), GetTestJniCache());
+    InitializeResultProto initialize_result_proto = icing.Initialize();
+    EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+    EXPECT_THAT(
+        initialize_result_proto.initialize_stats().index_restoration_cause(),
+        Eq(InitializeStatsProto::NONE));
+    EXPECT_THAT(initialize_result_proto.initialize_stats()
+                    .index_restoration_latency_ms(),
+                Eq(0));
+    EXPECT_THAT(initialize_result_proto.initialize_stats()
+                    .document_store_recovery_cause(),
+                Eq(InitializeStatsProto::NONE));
+    EXPECT_THAT(initialize_result_proto.initialize_stats()
+                    .document_store_recovery_latency_ms(),
+                Eq(0));
+    EXPECT_THAT(
+        initialize_result_proto.initialize_stats().document_store_data_status(),
+        Eq(InitializeStatsProto::NO_DATA_LOSS));
+    EXPECT_THAT(initialize_result_proto.initialize_stats()
+                    .schema_store_recovery_cause(),
+                Eq(InitializeStatsProto::NONE));
+    EXPECT_THAT(initialize_result_proto.initialize_stats()
+                    .schema_store_recovery_latency_ms(),
+                Eq(0));
+  }
 }
 
 TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCauseIndexIOError) {
@@ -6841,6 +6880,7 @@
 
   // Check the stats
   QueryStatsProto exp_stats;
+  exp_stats.set_query_length(7);
   exp_stats.set_num_terms(1);
   exp_stats.set_num_namespaces_filtered(1);
   exp_stats.set_num_schema_types_filtered(1);
@@ -6984,6 +7024,164 @@
   EXPECT_THAT(result.storage_info().total_storage_size(), Ge(0));
 }
 
+TEST_F(IcingSearchEngineTest, SnippetErrorTest) {
+  IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+  ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+  SchemaProto schema =
+      SchemaBuilder()
+          .AddType(SchemaTypeConfigBuilder().SetType("Generic").AddProperty(
+              PropertyConfigBuilder()
+                  .SetName("subject")
+                  .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+                  .SetCardinality(CARDINALITY_REPEATED)))
+          .Build();
+  ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+  DocumentProto document1 =
+      DocumentBuilder()
+          .SetKey("namespace", "uri1")
+          .SetScore(10)
+          .SetSchema("Generic")
+          .AddStringProperty("subject", "I like cats", "I like dogs",
+                             "I like birds", "I like fish")
+          .Build();
+  DocumentProto document2 =
+      DocumentBuilder()
+          .SetKey("namespace", "uri2")
+          .SetScore(20)
+          .SetSchema("Generic")
+          .AddStringProperty("subject", "I like red", "I like green",
+                             "I like blue", "I like yellow")
+          .Build();
+  DocumentProto document3 =
+      DocumentBuilder()
+          .SetKey("namespace", "uri3")
+          .SetScore(5)
+          .SetSchema("Generic")
+          .AddStringProperty("subject", "I like cupcakes", "I like donuts",
+                             "I like eclairs", "I like froyo")
+          .Build();
+  ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+  ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+  ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+  SearchSpecProto search_spec;
+  search_spec.add_schema_type_filters("Generic");
+  search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+  search_spec.set_query("like");
+  ScoringSpecProto scoring_spec;
+  scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+  ResultSpecProto result_spec;
+  result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
+  result_spec.mutable_snippet_spec()->set_num_matches_per_property(3);
+  result_spec.mutable_snippet_spec()->set_max_window_bytes(4);
+  SearchResultProto search_results =
+      icing.Search(search_spec, scoring_spec, result_spec);
+
+  ASSERT_THAT(search_results.results(), SizeIs(3));
+  const SearchResultProto::ResultProto* result = &search_results.results(0);
+  EXPECT_THAT(result->document().uri(), Eq("uri2"));
+  ASSERT_THAT(result->snippet().entries(), SizeIs(3));
+  const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+  EXPECT_THAT(entry->property_name(), "subject[0]");
+  std::string_view content = GetString(&result->document(), "subject[0]");
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+  entry = &result->snippet().entries(1);
+  EXPECT_THAT(entry->property_name(), "subject[1]");
+  content = GetString(&result->document(), "subject[1]");
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+  entry = &result->snippet().entries(2);
+  EXPECT_THAT(entry->property_name(), "subject[2]");
+  content = GetString(&result->document(), "subject[2]");
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+  result = &search_results.results(1);
+  EXPECT_THAT(result->document().uri(), Eq("uri1"));
+  ASSERT_THAT(result->snippet().entries(), SizeIs(3));
+  entry = &result->snippet().entries(0);
+  EXPECT_THAT(entry->property_name(), "subject[0]");
+  content = GetString(&result->document(), "subject[0]");
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+  entry = &result->snippet().entries(1);
+  ASSERT_THAT(entry->property_name(), "subject[1]");
+  content = GetString(&result->document(), "subject[1]");
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+  entry = &result->snippet().entries(2);
+  ASSERT_THAT(entry->property_name(), "subject[2]");
+  content = GetString(&result->document(), "subject[2]");
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+  result = &search_results.results(2);
+  ASSERT_THAT(result->document().uri(), Eq("uri3"));
+  ASSERT_THAT(result->snippet().entries(), IsEmpty());
+}
+
+TEST_F(IcingSearchEngineTest, CJKSnippetTest) {
+  IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+  ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+  ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+  // String:     "我每天走路去上班。"
+  //              ^ ^  ^   ^^
+  // UTF8 idx:    0 3  9  15 18
+  // UTF16 idx:   0 1  3   5 6
+  // Breaks into segments: "我", "每天", "走路", "去", "上班"
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace", "uri1")
+                               .SetSchema("Message")
+                               .AddStringProperty("body", kChinese)
+                               .Build();
+  ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+  // Search and request snippet matching but no windowing.
+  SearchSpecProto search_spec;
+  search_spec.set_query("走");
+  search_spec.set_term_match_type(MATCH_PREFIX);
+
+  ResultSpecProto result_spec;
+  result_spec.mutable_snippet_spec()->set_num_to_snippet(
+      std::numeric_limits<int>::max());
+  result_spec.mutable_snippet_spec()->set_num_matches_per_property(
+      std::numeric_limits<int>::max());
+
+  // Search and make sure that we got a single successful result
+  SearchResultProto search_results = icing.Search(
+      search_spec, ScoringSpecProto::default_instance(), result_spec);
+  ASSERT_THAT(search_results.status(), ProtoIsOk());
+  ASSERT_THAT(search_results.results(), SizeIs(1));
+  const SearchResultProto::ResultProto* result = &search_results.results(0);
+  EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+  // Ensure that one and only one property was matched and it was "body"
+  ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("body"));
+
+  // Get the content for "subject" and see what the match is.
+  std::string_view content = GetString(&result->document(), "body");
+  ASSERT_THAT(content, Eq(kChinese));
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(9));
+  EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(6));
+  std::string_view match =
+      content.substr(match_proto.exact_match_byte_position(),
+                     match_proto.exact_match_byte_length());
+  ASSERT_THAT(match, Eq("走路"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
 }  // namespace
 }  // namespace lib
 }  // namespace icing
diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc
index 2a5a0d9..887e6e4 100644
--- a/icing/index/hit/hit.cc
+++ b/icing/index/hit/hit.cc
@@ -67,9 +67,10 @@
                         &temp_value);
   bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value);
   bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency,
-                        kHasTermFrequency, 1, &temp_value);
-  bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, 1, &temp_value);
-  bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, 1, &temp_value);
+                        kHasTermFrequency, /*len=*/1, &temp_value);
+  bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, /*len=*/1, &temp_value);
+  bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection,
+                        /*len=*/1, &temp_value);
   value_ = temp_value;
 }
 
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 09dda41..6d8632f 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -64,6 +64,7 @@
         "DocumentId %d must be greater than last added document_id %d",
         document_id, index_->last_added_document_id()));
   }
+  index_->set_last_added_document_id(document_id);
   uint32_t num_tokens = 0;
   libtextclassifier3::Status overall_status;
   for (const TokenizedSection& section : tokenized_document.sections()) {
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index b7ec09e..8a6a9f5 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -36,6 +36,7 @@
 #include "icing/index/term-property-id.h"
 #include "icing/legacy/index/icing-filesystem.h"
 #include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/portable/platform.h"
 #include "icing/proto/document.pb.h"
 #include "icing/proto/schema.pb.h"
 #include "icing/proto/term.pb.h"
@@ -47,7 +48,6 @@
 #include "icing/store/document-id.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/tokenization/language-segmenter-factory.h"
@@ -261,7 +261,23 @@
                                 document));
   EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
               IsOk());
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+TEST_F(IndexProcessorTest, NoValidContent) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "fake_type/1")
+          .SetSchema(std::string(kFakeType))
+          .AddStringProperty(std::string(kExactProperty), "?...!")
+          .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexProcessorTest, OneDoc) {
diff --git a/icing/index/index.h b/icing/index/index.h
index b7021ca..eab5be8 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -127,6 +127,16 @@
     return main_index_->last_added_document_id();
   }
 
+  // Sets last_added_document_id to document_id so long as document_id >
+  // last_added_document_id()
+  void set_last_added_document_id(DocumentId document_id) {
+    DocumentId lite_document_id = lite_index_->last_added_document_id();
+    if (lite_document_id == kInvalidDocumentId ||
+        document_id >= lite_document_id) {
+      lite_index_->set_last_added_document_id(document_id);
+    }
+  }
+
   // Returns debug information for the index in out.
   // verbosity <= 0, simplest debug information - just the lexicons and lite
   //                 index.
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index de4edf8..16593ef 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -153,8 +153,6 @@
       index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
   EXPECT_THAT(itr->Advance(),
               StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
 }
 
 TEST_F(IndexTest, EmptyIndexAfterMerge) {
@@ -172,8 +170,6 @@
       index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
   EXPECT_THAT(itr->Advance(),
               StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
 }
 
 TEST_F(IndexTest, AdvancePastEnd) {
@@ -238,8 +234,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) {
@@ -256,8 +250,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, SingleHitMultiTermIndex) {
@@ -273,8 +265,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) {
@@ -292,8 +282,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, NoHitMultiTermIndex) {
@@ -308,7 +296,6 @@
       index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
   EXPECT_THAT(itr->Advance(),
               StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) {
@@ -325,7 +312,6 @@
       index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
   EXPECT_THAT(itr->Advance(),
               StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, MultiHitMultiTermIndex) {
@@ -352,7 +338,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
 }
 
 TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) {
@@ -381,7 +366,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
 }
 
 TEST_F(IndexTest, MultiHitSectionRestrict) {
@@ -402,8 +386,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) {
@@ -426,8 +408,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, SingleHitDedupeIndex) {
@@ -449,8 +429,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, PrefixHit) {
@@ -465,8 +443,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, PrefixHitAfterMerge) {
@@ -483,8 +459,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, MultiPrefixHit) {
@@ -506,8 +480,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, MultiPrefixHitAfterMerge) {
@@ -531,8 +503,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, NoExactHitInPrefixQuery) {
@@ -552,7 +522,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId1, std::vector<SectionId>{kSectionId3})));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) {
@@ -574,7 +543,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId1, std::vector<SectionId>{kSectionId3})));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, PrefixHitDedupe) {
@@ -590,7 +558,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, PrefixHitDedupeAfterMerge) {
@@ -608,7 +575,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, PrefixToString) {
@@ -705,9 +671,11 @@
 
   std::default_random_engine random;
   std::vector<std::string> query_terms;
+  std::string prefix = "prefix";
   for (int i = 0; i < 2600; ++i) {
     constexpr int kTokenSize = 5;
-    query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random));
+    query_terms.push_back(prefix +
+                          RandomString(kAlNumAlphabet, kTokenSize, &random));
   }
 
   DocumentId document_id = 0;
@@ -716,7 +684,7 @@
   while (status.ok()) {
     for (int i = 0; i < 100; ++i) {
       Index::Editor edit =
-          index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY,
+          index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX,
                        /*namespace_id=*/0);
       size_t idx = uniform(random);
       status = edit.BufferTerm(query_terms.at(idx).c_str());
@@ -733,11 +701,14 @@
 
   // Adding more hits should fail.
   Index::Editor edit =
-      index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY,
+      index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX,
                    /*namespace_id=*/0);
-  EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
-  EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
-  EXPECT_THAT(edit.BufferTerm("baz"), IsOk());
+  std::string term = prefix + "foo";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+  term = prefix + "bar";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+  term = prefix + "baz";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(),
               StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
 
@@ -745,12 +716,17 @@
     ICING_ASSERT_OK_AND_ASSIGN(
         std::unique_ptr<DocHitInfoIterator> itr,
         index_->GetIterator(query_terms.at(i).c_str(), kSectionIdMaskAll,
-                            TermMatchType::EXACT_ONLY));
+                            TermMatchType::PREFIX));
     // Each query term should contain at least one hit - there may have been
     // other hits for this term that were added.
     EXPECT_THAT(itr->Advance(), IsOk());
   }
-  EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> last_itr,
+      index_->GetIterator(prefix.c_str(), kSectionIdMaskAll,
+                          TermMatchType::PREFIX));
+  EXPECT_THAT(last_itr->Advance(), IsOk());
+  EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1));
 }
 
 TEST_F(IndexTest, FullIndexMerge) {
@@ -761,9 +737,11 @@
 
   std::default_random_engine random;
   std::vector<std::string> query_terms;
+  std::string prefix = "prefix";
   for (int i = 0; i < 2600; ++i) {
     constexpr int kTokenSize = 5;
-    query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random));
+    query_terms.push_back(prefix +
+                          RandomString(kAlNumAlphabet, kTokenSize, &random));
   }
 
   DocumentId document_id = 0;
@@ -772,7 +750,7 @@
   while (status.ok()) {
     for (int i = 0; i < 100; ++i) {
       Index::Editor edit =
-          index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY,
+          index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX,
                        /*namespace_id=*/0);
       size_t idx = uniform(random);
       status = edit.BufferTerm(query_terms.at(idx).c_str());
@@ -791,30 +769,45 @@
 
   // Adding more hits should fail.
   Index::Editor edit =
-      index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY,
+      index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX,
                    /*namespace_id=*/0);
-  EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
-  EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
-  EXPECT_THAT(edit.BufferTerm("baz"), IsOk());
+  std::string term = prefix + "foo";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+  term = prefix + "bar";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+  term = prefix + "baz";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(),
               StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> last_itr,
+      index_->GetIterator(prefix.c_str(), kSectionIdMaskAll,
+                          TermMatchType::PREFIX));
+  EXPECT_THAT(last_itr->Advance(), IsOk());
+  EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1));
 
   // After merging with the main index. Adding more hits should succeed now.
   ICING_ASSERT_OK(index_->Merge());
-  edit =
-      index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, 0);
-  EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
-  EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
-  EXPECT_THAT(edit.BufferTerm("baz"), IsOk());
+  edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, 0);
+  prefix + "foo";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+  term = prefix + "bar";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+  term = prefix + "baz";
+  EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<DocHitInfoIterator> itr,
-      index_->GetIterator("bar", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+      index_->GetIterator(prefix + "bar", kSectionIdMaskAll,
+                          TermMatchType::EXACT_ONLY));
   // We know that "bar" should have at least one hit because we just added it!
   EXPECT_THAT(itr->Advance(), IsOk());
   EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(document_id + 1));
-  EXPECT_THAT(index_->last_added_document_id(), Eq(document_id + 1));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      last_itr, index_->GetIterator(prefix.c_str(), kSectionIdMaskAll,
+                                    TermMatchType::PREFIX));
+  EXPECT_THAT(last_itr->Advance(), IsOk());
+  EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id + 1));
 }
 
 TEST_F(IndexTest, IndexCreateIOFailure) {
@@ -883,8 +876,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, IndexPersistenceAfterMerge) {
@@ -912,8 +903,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, InvalidHitBufferSize) {
@@ -1280,8 +1269,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
 }
 
 TEST_F(IndexTest, PrefixResultsFromLiteAndMain) {
@@ -1314,8 +1301,6 @@
           EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
 }
 
 TEST_F(IndexTest, GetDebugInfo) {
@@ -1422,8 +1407,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId3})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
 }
 
 TEST_F(IndexTest, BackfillingNewTermsSucceeds) {
@@ -1478,8 +1461,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId3));
 }
 
 TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) {
@@ -1527,8 +1508,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) {
@@ -1544,6 +1523,7 @@
                                     TermMatchType::PREFIX, /*namespace_id=*/0);
   ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+  index_->set_last_added_document_id(kDocumentId0);
   ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id()));
   // Clipping to invalid should have no effect.
   ICING_ASSERT_OK_AND_ASSIGN(
@@ -1565,6 +1545,7 @@
                       /*namespace_id=*/0);
   ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+  index_->set_last_added_document_id(kDocumentId1);
 
   // Clipping to invalid should still have no effect even if both indices have
   // hits.
@@ -1576,8 +1557,6 @@
       ElementsAre(
           EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
           EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 }
 
 TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) {
@@ -1586,6 +1565,7 @@
                                     TermMatchType::PREFIX, /*namespace_id=*/0);
   ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+  index_->set_last_added_document_id(kDocumentId0);
 
   ICING_ASSERT_OK(index_->Merge());
 
@@ -1594,6 +1574,7 @@
                       /*namespace_id=*/0);
   ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+  index_->set_last_added_document_id(kDocumentId1);
 
   EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk());
 
@@ -1604,8 +1585,6 @@
   EXPECT_THAT(GetHits(std::move(itr)),
               ElementsAre(EqualsDocHitInfo(
                   kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 }
 
 TEST_F(IndexTest, TruncateToThrowsOutBothIndices) {
@@ -1614,10 +1593,12 @@
                                     TermMatchType::PREFIX, /*namespace_id=*/0);
   ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+  index_->set_last_added_document_id(kDocumentId0);
   edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX,
                       /*namespace_id=*/0);
   ASSERT_THAT(edit.BufferTerm("foul"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+  index_->set_last_added_document_id(kDocumentId1);
 
   ICING_ASSERT_OK(index_->Merge());
 
@@ -1626,6 +1607,7 @@
                       /*namespace_id=*/0);
   ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+  index_->set_last_added_document_id(kDocumentId2);
 
   EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk());
 
@@ -1634,8 +1616,6 @@
       std::unique_ptr<DocHitInfoIterator> itr,
       index_->GetIterator("f", kSectionIdMaskAll, TermMatchType::PREFIX));
   EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
-
-  EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
 }
 
 TEST_F(IndexTest, IndexStorageInfoProto) {
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index 69138e1..fb23934 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -310,8 +310,6 @@
     return absl_ports::ResourceExhaustedError("Hit buffer is full!");
   }
 
-  header_->set_last_added_docid(hit.document_id());
-
   TermIdHitPair term_id_hit_pair(term_id, hit);
   uint32_t cur_size = header_->cur_size();
   TermIdHitPair::Value* valp =
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index 90c6fbc..b134aba 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -225,6 +225,9 @@
   DocumentId last_added_document_id() const {
     return header_->last_added_docid();
   }
+  void set_last_added_document_id(DocumentId document_id) const {
+    header_->set_last_added_docid(document_id);
+  }
 
   const IcingDynamicTrie& lexicon() const { return lexicon_; }
 
diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc
index 58eb8bf..9b75db6 100644
--- a/icing/jni/jni-cache.cc
+++ b/icing/jni/jni-cache.cc
@@ -14,6 +14,8 @@
 
 #include "icing/jni/jni-cache.h"
 
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+
 #include "icing/text_classifier/lib3/utils/java/jni-base.h"
 #include "icing/text_classifier/lib3/utils/java/jni-helper.h"
 #include "icing/absl_ports/canonical_errors.h"
@@ -214,3 +216,5 @@
 
 }  // namespace lib
 }  // namespace icing
+
+#endif  // ICING_REVERSE_JNI_SEGMENTATION
diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h
index a5f16c7..3faaed6 100644
--- a/icing/jni/jni-cache.h
+++ b/icing/jni/jni-cache.h
@@ -15,6 +15,16 @@
 #ifndef ICING_JNI_JNI_CACHE_H_
 #define ICING_JNI_JNI_CACHE_H_
 
+#ifndef ICING_REVERSE_JNI_SEGMENTATION
+namespace icing {
+namespace lib {
+
+class JniCache {};  // Declare an empty class definition for non-Android builds.
+
+}  // namespace lib
+}  // namespace icing
+#else  // ICING_REVERSE_JNI_SEGMENTATION
+
 #include <jni.h>
 
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -75,4 +85,6 @@
 }  // namespace lib
 }  // namespace icing
 
+#endif  // !ICING_REVERSE_JNI_SEGMENTATION
+
 #endif  // ICING_JNI_JNI_CACHE_H_
diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/jni/reverse-jni-break-iterator.cc
deleted file mode 100644
index 1a8a799..0000000
--- a/icing/jni/reverse-jni-break-iterator.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/jni/reverse-jni-break-iterator.h"
-
-#include <jni.h>
-#include <math.h>
-
-#include <cassert>
-#include <cctype>
-#include <map>
-
-#include "icing/jni/jni-cache.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/text_classifier/lib3/utils/java/jni-base.h"
-#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-// Chosen based on results in go/reverse-jni-benchmarks
-static constexpr int kBatchSize = 100;
-}  // namespace
-
-// -----------------------------------------------------------------------------
-// Implementations that call out to JVM. Behold the beauty.
-// -----------------------------------------------------------------------------
-libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
-ReverseJniBreakIterator::Create(const JniCache* jni_cache,
-                                std::string_view text,
-                                std::string_view locale) {
-  if (jni_cache == nullptr) {
-    return absl_ports::InvalidArgumentError(
-        "Create must be called with a valid JniCache pointer!");
-  }
-
-  ICING_ASSIGN_OR_RETURN(
-      libtextclassifier3::ScopedLocalRef<jstring> java_text,
-      jni_cache->ConvertToJavaString(text.data(), text.length()));
-  if (java_text.get() == nullptr) {
-    return absl_ports::AbortedError("Failed to create Java String from input.");
-  }
-
-  ICING_ASSIGN_OR_RETURN(
-      libtextclassifier3::ScopedLocalRef<jstring> java_locale_string,
-      jni_cache->ConvertToJavaString(locale.data(), locale.length()));
-  if (java_locale_string.get() == nullptr) {
-    return absl_ports::AbortedError(
-        "Failed to create Java String from locale.");
-  }
-
-  JNIEnv* jenv = jni_cache->GetEnv();
-  ICING_ASSIGN_OR_RETURN(
-      libtextclassifier3::ScopedLocalRef<jobject> java_locale,
-      libtextclassifier3::JniHelper::NewObject(
-          jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor,
-          java_locale_string.get()));
-  if (java_locale.get() == nullptr) {
-    return absl_ports::AbortedError(
-        "Failed to create Java Locale from locale.");
-  }
-
-  ICING_ASSIGN_OR_RETURN(
-      libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher,
-      libtextclassifier3::JniHelper::NewObject(
-          jenv, jni_cache->breakiterator_class.get(),
-          jni_cache->breakiterator_constructor, java_locale.get()));
-  libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher =
-      libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv,
-                                        jni_cache->jvm);
-  if (iterator_batcher.get() == nullptr) {
-    return absl_ports::AbortedError(
-        "Failed to create Java BreakIteratorBatcher.");
-  }
-
-  ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod(
-      jenv, iterator_batcher.get(), jni_cache->breakiterator_settext,
-      java_text.get()));
-  return std::unique_ptr<ReverseJniBreakIterator>(
-      new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher)));
-}
-
-ReverseJniBreakIterator::ReverseJniBreakIterator(
-    const JniCache* jni_cache,
-    libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher)
-    : jni_cache_(jni_cache),
-      iterator_batcher_(std::move(iterator_batcher)),
-      is_done_(false),
-      is_almost_done_(false) {}
-
-int ReverseJniBreakIterator::Next() {
-  if (is_done_) {
-    return ReverseJniBreakIterator::kDone;
-  }
-  if (break_indices_cache_.empty()) {
-    if (FetchNextBatch() == ReverseJniBreakIterator::kDone) {
-      // Either there were no more results or an error occurred. Either way,
-      // mark ourselves as done and return.
-      is_done_ = true;
-      return ReverseJniBreakIterator::kDone;
-    }
-    is_almost_done_ = break_indices_cache_.size() < kBatchSize;
-  }
-  int break_index = break_indices_cache_.front();
-  break_indices_cache_.pop();
-  is_done_ = is_almost_done_ && break_indices_cache_.empty();
-  return break_index;
-}
-
-int ReverseJniBreakIterator::First() {
-  const int first_index = jni_cache_->GetEnv()->CallIntMethod(
-      iterator_batcher_.get(), jni_cache_->breakiterator_first);
-  if (jni_cache_->ExceptionCheckAndClear()) {
-    return ReverseJniBreakIterator::kDone;
-  }
-  ClearCache();
-  return first_index;
-}
-
-int ReverseJniBreakIterator::Preceding(int offset) {
-  const int preceding_index = jni_cache_->GetEnv()->CallIntMethod(
-      iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset);
-  if (jni_cache_->ExceptionCheckAndClear()) {
-    return ReverseJniBreakIterator::kDone;
-  }
-  ClearCache();
-  return preceding_index;
-}
-
-int ReverseJniBreakIterator::Following(int offset) {
-  const int following_index = jni_cache_->GetEnv()->CallIntMethod(
-      iterator_batcher_.get(), jni_cache_->breakiterator_following, offset);
-  if (jni_cache_->ExceptionCheckAndClear()) {
-    return ReverseJniBreakIterator::kDone;
-  }
-  ClearCache();
-  return following_index;
-}
-
-int ReverseJniBreakIterator::FetchNextBatch() {
-  ICING_ASSIGN_OR_RETURN(
-      libtextclassifier3::ScopedLocalRef<jintArray> break_indices,
-      libtextclassifier3::JniHelper::CallObjectMethod<jintArray>(
-          jni_cache_->GetEnv(), iterator_batcher_.get(),
-          jni_cache_->breakiterator_next, kBatchSize),
-      ReverseJniBreakIterator::kDone);
-  if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) {
-    return ReverseJniBreakIterator::kDone;
-  }
-  jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get());
-  if (num_indices == 0) {
-    return ReverseJniBreakIterator::kDone;
-  }
-  jint* break_indices_arr =
-      static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical(
-          break_indices.get(), nullptr));
-  for (int i = 0; i < num_indices; ++i) {
-    break_indices_cache_.push(break_indices_arr[i]);
-  }
-  jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(),
-                                                      break_indices_arr,
-                                                      /*mode=*/0);
-  return num_indices;
-}
-
-void ReverseJniBreakIterator::ClearCache() {
-  break_indices_cache_ = std::queue<int>();
-  is_done_ = false;
-  is_almost_done_ = false;
-}
-
-}  // namespace lib
-}  // namespace icing
diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/jni/reverse-jni-break-iterator.h
deleted file mode 100644
index c1f05f4..0000000
--- a/icing/jni/reverse-jni-break-iterator.h
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
-#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
-
-#include <jni.h>
-
-#include <queue>
-#include <string>
-
-#include "icing/jni/jni-cache.h"
-#include "icing/text_classifier/lib3/utils/java/jni-base.h"
-
-namespace icing {
-namespace lib {
-
-// A class that handles the cross-JNI interactions with BreakIteratorBatcher and
-// hides the batching element to provide an interface akin to
-// java.text.BreakIterator.
-//
-// Example:
-// std::string text = "我每天走路去上班。";
-// ASSERT_THAT(text, SizeIs(27));
-// std::unique_ptr<ReverseJniBreakIterator> itr =
-//     ReverseJniBreakIterator::Create(jni_cache, text, locale);
-// std::vector<int> nexts;
-// int next = itr->Next();
-// while (next != ReverseJniBreakIterator::kDone) {
-//   nexts.push_back(next);
-//   next = itr->Next();
-// }
-// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8));
-class ReverseJniBreakIterator {
- public:
-  static constexpr int kDone = -1;
-
-  // Creates a ReverseJniBreakiterator with the given text and locale.
-  //
-  // Returns:
-  //   A ReverseJniBreakIterator on success
-  //   INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer
-  //   INTERNAL if unable to create any of the required Java objects
-  static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
-  Create(const JniCache* jni_cache, std::string_view text,
-         std::string_view locale);
-
-  // Returns the UTF-16 boundary following the current boundary. If the current
-  // boundary is the last text boundary, it returns
-  // ReverseJniBreakIterator::kDONE.
-  //
-  // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8
-  // boundary. Callers interested in the UTF-8 boundary are required to maintain
-  // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries.
-  int Next();
-
-  // Returns the first UTF-16 boundary. The iterator's current position is set
-  // to the first text boundary and any cached data is cleared.
-  int First();
-
-  // Returns the position of the first UTF-16 boundary preceding the UTF-16
-  // offset. If there is no boundary preceding the specified offset, then
-  // ReverseJniBreakIterator::kDone is returned.
-  //
-  // The iterator's current position is set to the segment whose boundary was
-  // returned and any cached data is cleared.
-  int Preceding(int offset);
-
-  // Returns the position of the first UTF-16 boundary following the UTF-16
-  // offset. If there is no boundary following the specified offset, then
-  // ReverseJniBreakIterator::kDone is returned.
-  //
-  // The iterator's current position is set to the segment whose boundary
-  // was returned and any cached data is cleared.
-  int Following(int offset);
-
- private:
-  ReverseJniBreakIterator(
-      const JniCache* jni_cache,
-      libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher);
-
-  // Fetches the results of up to kBatchSize next calls and stores them in
-  // break_indices_cache_. Returns the number of results or kDone if no more
-  // results could be fetched.
-  int FetchNextBatch();
-
-  // Empties the cache and sets is_done_ and is_almost_done_ to false.
-  void ClearCache();
-
-  // Keeps track of references to Java classes and methods. Does NOT own.
-  const JniCache* jni_cache_;
-
-  // The reference to the actual instance of BreakIteratorBatcher that
-  // this class interacts with.
-  libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_;
-
-  // The cache holding the most recent batch of return values from
-  // BreakIteratorBatcher#next.
-  std::queue<int> break_indices_cache_;
-
-  bool is_done_;
-
-  // The last batch was incomplete (< kBatchSize results were returned). The
-  // next call to BreakIteratorBatcher#next is guaranteed to return an
-  // empty array. Once the results from the last batch are evicted from
-  // break_indices_cache, ReverseJniBreakIterator will transition to is_done_.
-  bool is_almost_done_;
-};
-
-}  // namespace lib
-}  // namespace icing
-
-#endif  // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
diff --git a/icing/portable/endian.h b/icing/portable/endian.h
new file mode 100644
index 0000000..42f6c02
--- /dev/null
+++ b/icing/portable/endian.h
@@ -0,0 +1,206 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Utility functions that depend on bytesex. We define htonll and ntohll,
+// as well as "Google" versions of all the standards: ghtonl, ghtons, and
+// so on. These functions do exactly the same as their standard variants,
+// but don't require including the dangerous netinet/in.h.
+
+#ifndef ICING_PORTABLE_ENDIAN_H_
+#define ICING_PORTABLE_ENDIAN_H_
+
+#include <cstdint>
+
+// IS_LITTLE_ENDIAN, IS_BIG_ENDIAN
+#if defined OS_LINUX || defined OS_ANDROID || defined(__ANDROID__)
+// _BIG_ENDIAN
+#include <endian.h>
+
+#elif defined(__APPLE__)
+
+// BIG_ENDIAN
+#include <machine/endian.h>  // NOLINT(build/include)
+
+/* Let's try and follow the Linux convention */
+#define __BYTE_ORDER BYTE_ORDER
+#define __LITTLE_ENDIAN LITTLE_ENDIAN
+#define __BIG_ENDIAN BIG_ENDIAN
+
+#endif  // operating system
+
+// defines __BYTE_ORDER for MSVC
+#ifdef COMPILER_MSVC
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#else  // COMPILER_MSVC
+
+// define the macros IS_LITTLE_ENDIAN or IS_BIG_ENDIAN
+// using the above endian definitions from endian.h if
+// endian.h was included
+#ifdef __BYTE_ORDER
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#endif  // __BYTE_ORDER == __LITTLE_ENDIAN
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IS_BIG_ENDIAN
+#endif  // __BYTE_ORDER == __BIG_ENDIAN
+
+#else  // __BYTE_ORDER
+
+#if defined(__LITTLE_ENDIAN__)
+#define IS_LITTLE_ENDIAN
+#elif defined(__BIG_ENDIAN__)
+#define IS_BIG_ENDIAN
+#endif  // __LITTLE_ENDIAN__ or __BIG_ENDIAN__
+
+#endif  // __BYTE_ORDER
+#endif  // COMPILER_MSVC
+
+// byte swap functions (bswap_16, bswap_32, bswap_64).
+// byte swap functions reverse the order of bytes, e.g.
+//   byteswap of 102030405060708 = 807060504030201
+//   byteswap of 1020304 = 4030201
+
+// The following guarantees declaration of the byte swap functions
+#ifdef COMPILER_MSVC
+#include <stdlib.h>  // NOLINT(build/include)
+
+#define bswap_16(x) _byteswap_ushort(x)
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+
+#define bswap_16(x) OSSwapInt16(x)
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#elif defined(__GLIBC__) || defined(__BIONIC__) || defined(__ASYLO__)
+#include <byteswap.h>  // IWYU pragma: export
+
+#else  // built-in byteswap functions
+
+static inline uint16 bswap_16(uint16 x) {
+#ifdef __cplusplus
+  return static_cast<uint16>(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));
+#else   // __cplusplus
+  return (uint16)(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));  // NOLINT
+#endif  // __cplusplus
+}
+#define bswap_16(x) bswap_16(x)
+static inline uint32 bswap_32(uint32 x) {
+  return (((x & 0xFF) << 24) | ((x & 0xFF00) << 8) | ((x & 0xFF0000) >> 8) |
+          ((x & 0xFF000000) >> 24));
+}
+#define bswap_32(x) bswap_32(x)
+static inline uint64 bswap_64(uint64 x) {
+  return (((x & (uint64_t)0xFF) << 56) | ((x & (uint64_t)0xFF00) << 40) |
+          ((x & (uint64_t)0xFF0000) << 24) | ((x & (uint64_t)0xFF000000) << 8) |
+          ((x & (uint64_t)0xFF00000000) >> 8) |
+          ((x & (uint64_t)0xFF0000000000) >> 24) |
+          ((x & (uint64_t)0xFF000000000000) >> 40) |
+          ((x & (uint64_t)0xFF00000000000000) >> 56));
+}
+#define bswap_64(x) bswap_64(x)
+
+#endif  // end byteswap functions
+
+// Use compiler byte-swapping intrinsics if they are available.  32-bit
+// and 64-bit versions are available in Clang and GCC as of GCC 4.3.0.
+// The 16-bit version is available in Clang and GCC only as of GCC 4.8.0.
+// For simplicity, we enable them all only for GCC 4.8.0 or later.
+#if defined(__clang__) || \
+    (defined(__GNUC__) && \
+     ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ >= 5))
+
+inline uint64_t gbswap_64(uint64_t host_int) {
+  return __builtin_bswap64(host_int);
+}
+inline uint32_t gbswap_32(uint32_t host_int) {
+  return __builtin_bswap32(host_int);
+}
+inline uint16_t gbswap_16(uint16_t host_int) {
+  return __builtin_bswap16(host_int);
+}
+
+#else  // intrinsics available
+
+inline uint64 gbswap_64(uint64 host_int) {
+#if defined(__GNUC__) && defined(__x86_64__) && \
+    !(defined(__APPLE__) && defined(__MACH__))
+  // Adapted from /usr/include/byteswap.h.  Not available on Mac.
+  if (__builtin_constant_p(host_int)) {
+    return __bswap_constant_64(host_int);
+  } else {
+    uint64 result;
+    __asm__("bswap %0" : "=r"(result) : "0"(host_int));
+    return result;
+  }
+#elif defined(bswap_64)
+  return bswap_64(host_int);
+#else   // bswap_64
+  return static_cast<uint64>(bswap_32(static_cast<uint32>(host_int >> 32))) |
+         (static_cast<uint64>(bswap_32(static_cast<uint32>(host_int))) << 32);
+#endif  // bswap_64
+}
+inline uint32 gbswap_32(uint32 host_int) { return bswap_32(host_int); }
+inline uint16 gbswap_16(uint16 host_int) { return bswap_16(host_int); }
+
+#endif  // intrinsics available
+
+#ifdef IS_LITTLE_ENDIAN
+
+// Definitions for ntohl etc. that don't require us to include
+// netinet/in.h. We wrap gbswap_32 and gbswap_16 in functions rather
+// than just #defining them because in debug mode, gcc doesn't
+// correctly handle the (rather involved) definitions of bswap_32.
+// gcc guarantees that inline functions are as fast as macros, so
+// this isn't a performance hit.
+inline uint16_t ghtons(uint16_t x) { return gbswap_16(x); }
+inline uint32_t ghtonl(uint32_t x) { return gbswap_32(x); }
+inline uint64_t ghtonll(uint64_t x) { return gbswap_64(x); }
+
+#elif defined IS_BIG_ENDIAN
+
+// These definitions are simpler on big-endian machines
+// These are functions instead of macros to avoid self-assignment warnings
+// on calls such as "i = ghtnol(i);".  This also provides type checking.
+inline uint16 ghtons(uint16 x) { return x; }
+inline uint32 ghtonl(uint32 x) { return x; }
+inline uint64 ghtonll(uint64 x) { return x; }
+
+#else  // bytesex
+#error \
+    "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined"  // NOLINT
+#endif  // bytesex
+
+#ifndef htonll
+// With the rise of 64-bit, some systems are beginning to define this.
+#define htonll(x) ghtonll(x)
+#endif  // htonll
+
+// ntoh* and hton* are the same thing for any size and bytesex,
+// since the function is an involution, i.e., its own inverse.
+inline uint16_t gntohs(uint16_t x) { return ghtons(x); }
+inline uint32_t gntohl(uint32_t x) { return ghtonl(x); }
+inline uint64_t gntohll(uint64_t x) { return ghtonll(x); }
+
+#ifndef ntohll
+#define ntohll(x) htonll(x)
+#endif  // ntohll
+
+#endif  // ICING_PORTABLE_ENDIAN_H_
diff --git a/icing/testing/platform.h b/icing/portable/platform.h
similarity index 74%
rename from icing/testing/platform.h
rename to icing/portable/platform.h
index ad612d5..8712835 100644
--- a/icing/testing/platform.h
+++ b/icing/portable/platform.h
@@ -12,11 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef ICING_TESTING_PLATFORM_H_
-#define ICING_TESTING_PLATFORM_H_
+#ifndef ICING_PORTABLE_PLATFORM_H_
+#define ICING_PORTABLE_PLATFORM_H_
 
-// This file is meant to hold util functions for tests that help the test
-// determine which platform-specific configuration it may be running in.
 namespace icing {
 namespace lib {
 
@@ -52,7 +50,27 @@
   return false;
 }
 
+enum Architecture {
+  UNKNOWN,
+  BIT_32,
+  BIT_64,
+};
+
+// Returns which architecture we're running on.
+//
+// Architecture macros pulled from
+// https://developer.android.com/ndk/guides/cpu-features
+inline Architecture GetArchitecture() {
+#if defined(__arm__) || defined(__i386__)
+  return BIT_32;
+#elif defined(__aarch64__) || defined(__x86_64__)
+  return BIT_64;
+#else
+  return UNKNOWN;
+#endif
+}
+
 }  // namespace lib
 }  // namespace icing
 
-#endif  // ICING_TESTING_PLATFORM_H_
+#endif  // ICING_PORTABLE_PLATFORM_H_
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index 0f49f4d..daeb479 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -29,6 +29,7 @@
 #include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
 #include "icing/index/iterator/doc-hit-info-iterator.h"
 #include "icing/legacy/index/icing-filesystem.h"
+#include "icing/portable/platform.h"
 #include "icing/proto/schema.pb.h"
 #include "icing/proto/search.pb.h"
 #include "icing/proto/term.pb.h"
@@ -40,7 +41,6 @@
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
 #include "icing/testing/jni-test-helpers.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 8d61dd9..1c9684d 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -24,6 +24,7 @@
 #include "icing/file/mock-filesystem.h"
 #include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
 #include "icing/proto/document.pb.h"
 #include "icing/proto/schema.pb.h"
 #include "icing/proto/search.pb.h"
@@ -35,7 +36,6 @@
 #include "icing/store/document-id.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/snippet-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 0510d55..dc9f8be 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -39,6 +39,7 @@
 #include "icing/tokenization/tokenizer-factory.h"
 #include "icing/tokenization/tokenizer.h"
 #include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
 #include "icing/util/i18n-utils.h"
 #include "icing/util/status-macros.h"
 
@@ -155,20 +156,17 @@
   }
 }
 
-// Returns true if token matches any of the terms in query terms according to
-// the provided match type.
+// Finds the start position of a valid token that is after
+// window_start_min_exclusive
 //
 // Returns:
 //   the position of the window start if successful
 //   INTERNAL_ERROR - if a tokenizer error is encountered
 libtextclassifier3::StatusOr<int> DetermineWindowStart(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
-    std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
-  int window_start_min = (match_mid - snippet_spec.max_window_bytes() / 2) - 1;
-  if (window_start_min < 0) {
-    return 0;
-  }
-  if (!iterator->ResetToTokenAfter(window_start_min)) {
+    std::string_view value, int window_start_min_exclusive,
+    Tokenizer::Iterator* iterator) {
+  if (!iterator->ResetToTokenAfter(window_start_min_exclusive)) {
     return absl_ports::InternalError(
         "Couldn't reset tokenizer to determine snippet window!");
   }
@@ -196,17 +194,16 @@
   return window_end_exclusive;
 }
 
+// Finds the end position of a valid token that is before the
+// window_end_max_exclusive.
+//
 // Returns:
 //   the position of the window end if successful
 //   INTERNAL_ERROR - if a tokenizer error is encountered
 libtextclassifier3::StatusOr<int> DetermineWindowEnd(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
-    std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
-  int window_end_max_exclusive =
-      match_mid + snippet_spec.max_window_bytes() / 2;
-  if (window_end_max_exclusive >= value.length()) {
-    return value.length();
-  }
+    std::string_view value, int window_end_max_exclusive,
+    Tokenizer::Iterator* iterator) {
   if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) {
     return absl_ports::InternalError(
         "Couldn't reset tokenizer to determine snippet window!");
@@ -222,31 +219,109 @@
   std::string_view section_subcontent;
 };
 
+// Creates a snippet match proto for the match pointed to by the iterator and
+// char_iterator
+//
+// Returns:
+//   the position of the window start if successful
+//   INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
+//     in an invalid state
+//   ABORTED_ERROR - if an invalid utf-8 sequence is encountered
 libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
-    const SectionData& value, Tokenizer::Iterator* iterator) {
+    const SectionData& value, Tokenizer::Iterator* iterator,
+    const CharacterIterator& char_iterator) {
   SnippetMatchProto snippet_match;
   Token match = iterator->GetToken();
-  int match_pos = match.text.data() - value.section_subcontent.data();
+  int match_pos = char_iterator.utf8_index();
+
+  // When finding boundaries,  we have a few cases:
+  //
+  // Case 1:
+  //   If we have an odd length match an odd length window, the window surrounds
+  //   the match perfectly.
+  //     match  = "bar" in "foo bar baz"
+  //     window =              |---|
+  //
+  // Case 2:
+  //   If we have an even length match with an even length window, the window
+  //   surrounds the match perfectly.
+  //     match  = "baar" in "foo baar baz"
+  //     window =               |----|
+  //
+  // Case 3:
+  //   If we have an odd length match with an even length window, we allocate
+  //   that extra window byte to the beginning.
+  //     match  = "bar" in "foo bar baz"
+  //     window =             |----|
+  //
+  // Case 4:
+  //   If we have an even length match with an odd length window, we allocate
+  //   that extra window byte to the end.
+  //     match  = "baar" in "foo baar baz"
+  //     window =               |-----|
+  //
+  // We have do +1/-1 below to get the math to match up.
   int match_mid = match_pos + match.text.length() / 2;
+  int window_start_min_exclusive =
+      (match_mid - snippet_spec.max_window_bytes() / 2) - 1;
+  int window_end_max_exclusive =
+      match_mid + (snippet_spec.max_window_bytes() + 1) / 2;
 
-  snippet_match.set_exact_match_position(match_pos);
-  snippet_match.set_exact_match_bytes(match.text.length());
+  snippet_match.set_exact_match_byte_position(match_pos);
+  snippet_match.set_exact_match_utf16_position(char_iterator.utf16_index());
 
-  if (snippet_spec.max_window_bytes() > match.text.length()) {
+  // Create character iterators to find the beginning and end of the window.
+  CharacterIterator forward_char_iterator(char_iterator);
+  CharacterIterator backwards_char_iterator(char_iterator);
+
+  if (!backwards_char_iterator.AdvanceToUtf8(match_pos + match.text.length())) {
+    return absl_ports::AbortedError("Could not retrieve valid utf8 character!");
+  }
+  snippet_match.set_exact_match_byte_length(match.text.length());
+  snippet_match.set_exact_match_utf16_length(
+      backwards_char_iterator.utf16_index() - char_iterator.utf16_index());
+
+  // Only include windows if it'll at least include the matched text. Otherwise,
+  // it'll just be an empty string anyways.
+  if (snippet_spec.max_window_bytes() >= match.text.length()) {
     // Find the beginning of the window.
-    ICING_ASSIGN_OR_RETURN(
-        int window_start,
-        DetermineWindowStart(snippet_spec, value.section_subcontent, match_mid,
-                             iterator));
-    snippet_match.set_window_position(window_start);
+    int window_start;
+    int window_start_utf16;
+    if (window_start_min_exclusive < 0) {
+      window_start = 0;
+      window_start_utf16 = 0;
+    } else {
+      ICING_ASSIGN_OR_RETURN(
+          window_start,
+          DetermineWindowStart(snippet_spec, value.section_subcontent,
+                               window_start_min_exclusive, iterator));
+      if (!forward_char_iterator.RewindToUtf8(window_start)) {
+        return absl_ports::AbortedError(
+            "Could not retrieve valid utf8 character!");
+      }
+      window_start_utf16 = forward_char_iterator.utf16_index();
+    }
+    snippet_match.set_window_byte_position(window_start);
+    snippet_match.set_window_utf16_position(window_start_utf16);
 
     // Find the end of the window.
-    ICING_ASSIGN_OR_RETURN(
-        int window_end_exclusive,
-        DetermineWindowEnd(snippet_spec, value.section_subcontent, match_mid,
-                           iterator));
-    snippet_match.set_window_bytes(window_end_exclusive - window_start);
+    int window_end_exclusive;
+    if (window_end_max_exclusive >= value.section_subcontent.length()) {
+      window_end_exclusive = value.section_subcontent.length();
+    } else {
+      ICING_ASSIGN_OR_RETURN(
+          window_end_exclusive,
+          DetermineWindowEnd(snippet_spec, value.section_subcontent,
+                             window_end_max_exclusive, iterator));
+    }
+    if (!backwards_char_iterator.AdvanceToUtf8(window_end_exclusive)) {
+      return absl_ports::AbortedError(
+          "Could not retrieve valid utf8 character!");
+    }
+    snippet_match.set_window_byte_length(window_end_exclusive - window_start);
+    snippet_match.set_window_utf16_length(
+        backwards_char_iterator.utf16_index() - window_start_utf16);
 
     // DetermineWindowStart/End may change the position of the iterator. So,
     // reset the iterator back to the original position.
@@ -292,16 +367,38 @@
     std::string_view value = current_property->string_values(i);
     std::unique_ptr<Tokenizer::Iterator> iterator =
         tokenizer->Tokenize(value).ValueOrDie();
+    CharacterIterator char_iterator(value);
     while (iterator->Advance()) {
       Token token = iterator->GetToken();
       if (matcher->Matches(token)) {
-        // If there was an error while retrieving the match, the tokenizer
-        // iterator is probably in an invalid state. There's nothing we can do
-        // here, so just return.
+        if (!char_iterator.AdvanceToUtf8(token.text.data() - value.data())) {
+          // We can't get the char_iterator to a valid position, so there's no
+          // way for us to provide valid utf-16 indices. There's nothing more we
+          // can do here, so just return whatever we've built up so far.
+          if (!snippet_entry.snippet_matches().empty()) {
+            *snippet_proto->add_entries() = std::move(snippet_entry);
+          }
+          return;
+        }
         SectionData data = {property_path, value};
-        SnippetMatchProto match =
-            RetrieveMatch(match_options->snippet_spec, data, iterator.get())
-                .ValueOrDie();
+        auto match_or = RetrieveMatch(match_options->snippet_spec, data,
+                                      iterator.get(), char_iterator);
+        if (!match_or.ok()) {
+          if (absl_ports::IsAborted(match_or.status())) {
+            // Only an aborted. We can't get this match, but we might be able to
+            // retrieve others. Just continue.
+            continue;
+          } else {
+            // Probably an internal error. The tokenizer iterator is probably in
+            // an invalid state. There's nothing more we can do here, so just
+            // return whatever we've built up so far.
+            if (!snippet_entry.snippet_matches().empty()) {
+              *snippet_proto->add_entries() = std::move(snippet_entry);
+            }
+            return;
+          }
+        }
+        SnippetMatchProto match = std::move(match_or).ValueOrDie();
         snippet_entry.mutable_snippet_matches()->Add(std::move(match));
         if (--match_options->max_matches_remaining <= 0) {
           *snippet_proto->add_entries() = std::move(snippet_entry);
@@ -405,8 +502,7 @@
 
     MatchOptions match_options = {snippet_spec};
     match_options.max_matches_remaining =
-        std::min(snippet_spec.num_to_snippet() - snippet_proto.entries_size(),
-                 snippet_spec.num_matches_per_property());
+        snippet_spec.num_matches_per_property();
 
     // Determine the section name and match type.
     auto section_metadata_or =
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 1cf4e5a..c052a9e 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -24,6 +24,7 @@
 #include "icing/file/mock-filesystem.h"
 #include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
 #include "icing/proto/document.pb.h"
 #include "icing/proto/schema.pb.h"
 #include "icing/proto/search.pb.h"
@@ -36,7 +37,6 @@
 #include "icing/store/key-mapper.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/snippet-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
@@ -184,6 +184,58 @@
   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
 }
 
+TEST_F(SnippetRetrieverTest,
+       SnippetingWindowMaxWindowSizeEqualToMatch_OddLengthMatch) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "counting")
+          .AddStringProperty("body", "one two three four.... five")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+  // Window starts at the beginning of "three" and at the exact end of
+  // "three". len=5, orig_window= "three"
+  snippet_spec_.set_max_window_bytes(5);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+  EXPECT_THAT(snippet.entries(), SizeIs(1));
+  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("three"));
+}
+
+TEST_F(SnippetRetrieverTest,
+       SnippetingWindowMaxWindowSizeEqualToMatch_EvenLengthMatch) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "counting")
+          .AddStringProperty("body", "one two three four.... five")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};
+
+  // Window starts at the beginning of "four" and at the exact end of
+  // "four". len=4, orig_window= "four"
+  snippet_spec_.set_max_window_bytes(4);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+  EXPECT_THAT(snippet.entries(), SizeIs(1));
+  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("four"));
+}
+
 TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
   DocumentProto document =
       DocumentBuilder()
@@ -1082,6 +1134,201 @@
                   "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
 }
 
+TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
+  // String:     "我每天走路去上班。"
+  //              ^ ^  ^   ^^
+  // UTF8 idx:    0 3  9  15 18
+  // UTF16 idx:   0 1  3   5 6
+  // Breaks into segments: "我", "每天", "走路", "去", "上班"
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kChinese)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "body"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
+TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
+  language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE);
+  ICING_ASSERT_OK_AND_ASSIGN(
+      language_segmenter_,
+      language_segmenter_factory::Create(std::move(options)));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      snippet_retriever_,
+      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+                               normalizer_.get()));
+
+  // String:     "我每天走路去上班。"
+  //              ^ ^  ^   ^^
+  // UTF8 idx:    0 3  9  15 18
+  // UTF16 idx:   0 1  3   5 6
+  // Breaks into segments: "我", "每天", "走路", "去", "上班"
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kChinese)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+  // Set a twenty byte window. This will produce a window like this:
+  // String:     "我每天走路去上班。"
+  //                ^       ^
+  // UTF8 idx:      3       18
+  // UTF16 idx:     1       6
+  snippet_spec_.set_max_window_bytes(20);
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "body"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetWindows(content, *entry), ElementsAre("每天走路去"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.window_utf16_position(), Eq(1));
+  EXPECT_THAT(match_proto.window_utf16_length(), Eq(5));
+}
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
+  // The following string has four-byte UTF-8 characters. Most importantly, it
+  // is also two code units in UTF-16.
+  // String:     "𐀀𐀁 𐀂𐀃 𐀄"
+  //              ^  ^  ^
+  // UTF8 idx:    0  9  18
+  // UTF16 idx:   0  5  10
+  // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+  constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kText)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "body"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
+  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
+}
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
+  // The following string has four-byte UTF-8 characters. Most importantly, it
+  // is also two code units in UTF-16.
+  // String:     "𐀀𐀁 𐀂𐀃 𐀄"
+  //              ^  ^  ^
+  // UTF8 idx:    0  9  18
+  // UTF16 idx:   0  5  10
+  // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+  constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kText)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+  // Set a twenty byte window. This will produce a window like this:
+  // String:     "𐀀𐀁 𐀂𐀃 𐀄"
+  //                 ^   ^
+  // UTF8 idx:       9   22
+  // UTF16 idx:      5   12
+  snippet_spec_.set_max_window_bytes(20);
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "body"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetWindows(content, *entry), ElementsAre("𐀂𐀃 𐀄"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
+  EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
+}
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc
index 15d9a19..3dcc5a9 100644
--- a/icing/schema/section-manager_test.cc
+++ b/icing/schema/section-manager_test.cc
@@ -20,7 +20,6 @@
 #include "gtest/gtest.h"
 #include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/proto/schema.proto.h"
 #include "icing/proto/schema.pb.h"
 #include "icing/proto/term.pb.h"
 #include "icing/schema/schema-util.h"
diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc
index b7e1b92..a4734b4 100644
--- a/icing/scoring/scorer.cc
+++ b/icing/scoring/scorer.cc
@@ -89,6 +89,7 @@
     if (!query_it) {
       return default_score_;
     }
+
     return static_cast<double>(
         bm25f_calculator_->ComputeScore(query_it, hit_info, default_score_));
   }
@@ -122,11 +123,11 @@
       case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT:
         return usage_scores.usage_type3_count;
       case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP:
-        return usage_scores.usage_type1_last_used_timestamp_s;
+        return usage_scores.usage_type1_last_used_timestamp_s * 1000.0;
       case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP:
-        return usage_scores.usage_type2_last_used_timestamp_s;
+        return usage_scores.usage_type2_last_used_timestamp_s * 1000.0;
       case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP:
-        return usage_scores.usage_type3_last_used_timestamp_s;
+        return usage_scores.usage_type3_last_used_timestamp_s * 1000.0;
       default:
         // This shouldn't happen if this scorer is used correctly.
         return default_score_;
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
index 31bdd15..8b89514 100644
--- a/icing/scoring/scorer_test.cc
+++ b/icing/scoring/scorer_test.cc
@@ -95,6 +95,10 @@
 
   const FakeClock& fake_clock2() { return fake_clock2_; }
 
+  void SetFakeClock1Time(int64_t new_time) {
+    fake_clock1_.SetSystemTimeMilliseconds(new_time);
+  }
+
  private:
   const std::string test_dir_;
   const std::string doc_store_dir_;
@@ -123,7 +127,7 @@
               StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
 }
 
-TEST_F(ScorerTest, ShouldGetDefaultScore) {
+TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentDoesntExist) {
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<Scorer> scorer,
       Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
@@ -135,6 +139,66 @@
   EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
 }
 
+TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsDeleted) {
+  // Creates a test document with a provided score
+  DocumentProto test_document = DocumentBuilder()
+                                    .SetKey("icing", "email/1")
+                                    .SetSchema("email")
+                                    .AddStringProperty("subject", "subject foo")
+                                    .SetScore(42)
+                                    .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store()->Put(test_document));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Scorer> scorer,
+      Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+                     /*default_score=*/10, document_store()));
+
+  DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+  // The document's score is returned
+  EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42));
+
+  // Delete the document and check that the caller-provided default score is
+  // returned
+  EXPECT_THAT(document_store()->Delete(document_id), IsOk());
+  EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
+}
+
+TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsExpired) {
+  // Creates a test document with a provided score
+  int64_t creation_time = fake_clock1().GetSystemTimeMilliseconds();
+  int64_t ttl = 100;
+  DocumentProto test_document = DocumentBuilder()
+                                    .SetKey("icing", "email/1")
+                                    .SetSchema("email")
+                                    .AddStringProperty("subject", "subject foo")
+                                    .SetScore(42)
+                                    .SetCreationTimestampMs(creation_time)
+                                    .SetTtlMs(ttl)
+                                    .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store()->Put(test_document));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Scorer> scorer,
+      Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+                     /*default_score=*/10, document_store()));
+
+  DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+  // The document's score is returned since the document hasn't expired yet.
+  EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42));
+
+  // Expire the document and check that the caller-provided default score is
+  // returned
+  SetFakeClock1Time(creation_time + ttl + 10);
+  EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
+}
+
 TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) {
   // Creates a test document with the default document score 0
   DocumentProto test_document =
@@ -397,7 +461,7 @@
       /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
       UsageReport::USAGE_TYPE1);
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time1));
-  EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(1));
+  EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(1000));
   EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
   EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
 
@@ -406,7 +470,7 @@
       /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000,
       UsageReport::USAGE_TYPE1);
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time5));
-  EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5));
+  EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5000));
   EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
   EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
 
@@ -415,7 +479,7 @@
       /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/3000,
       UsageReport::USAGE_TYPE1);
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time3));
-  EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5));
+  EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5000));
   EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
   EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
 }
@@ -458,7 +522,7 @@
       UsageReport::USAGE_TYPE2);
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time1));
   EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
-  EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(1));
+  EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(1000));
   EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
 
   // Report usage with timestamp = 5000ms, score should be updated.
@@ -467,7 +531,7 @@
       UsageReport::USAGE_TYPE2);
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time5));
   EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
-  EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5));
+  EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5000));
   EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
 
   // Report usage with timestamp = 3000ms, score should not be updated.
@@ -476,7 +540,7 @@
       UsageReport::USAGE_TYPE2);
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time3));
   EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
-  EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5));
+  EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5000));
   EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
 }
 
@@ -519,7 +583,7 @@
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time1));
   EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
   EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
-  EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(1));
+  EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(1000));
 
   // Report usage with timestamp = 5000ms, score should be updated.
   UsageReport usage_report_type3_time5 = CreateUsageReport(
@@ -528,7 +592,7 @@
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time5));
   EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
   EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
-  EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5));
+  EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5000));
 
   // Report usage with timestamp = 3000ms, score should not be updated.
   UsageReport usage_report_type3_time3 = CreateUsageReport(
@@ -537,7 +601,7 @@
   ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time3));
   EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
   EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
-  EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5));
+  EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5000));
 }
 
 TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) {
@@ -565,6 +629,37 @@
   EXPECT_THAT(scorer->GetScore(docHitInfo3), Eq(111));
 }
 
+TEST_F(ScorerTest, ShouldScaleUsageTimestampScoreForMaxTimestamp) {
+  DocumentProto test_document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+          .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store()->Put(test_document));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Scorer> scorer1,
+      Scorer::Create(
+          ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP,
+          /*default_score=*/0, document_store()));
+  DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+  // Create usage report for the maximum allowable timestamp.
+  UsageReport usage_report_type1 = CreateUsageReport(
+      /*name_space=*/"icing", /*uri=*/"email/1",
+      /*timestamp_ms=*/std::numeric_limits<uint32_t>::max() * 1000.0,
+      UsageReport::USAGE_TYPE1);
+
+  double max_int_usage_timestamp_score =
+      std::numeric_limits<uint32_t>::max() * 1000.0;
+  ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1));
+  EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(max_int_usage_timestamp_score));
+}
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc
index 5e251eb..125e2a7 100644
--- a/icing/scoring/scoring-processor_test.cc
+++ b/icing/scoring/scoring-processor_test.cc
@@ -610,9 +610,9 @@
   DocHitInfo doc_hit_info2(document_id2);
   DocHitInfo doc_hit_info3(document_id3);
   ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
-                                         /*score=*/1);
+                                         /*score=*/1000);
   ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
-                                         /*score=*/5);
+                                         /*score=*/5000);
   ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone,
                                          /*score=*/0);
 
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 2436571..5f478fa 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -85,33 +85,6 @@
   return document_wrapper;
 }
 
-DocumentWrapper CreateDocumentTombstone(std::string_view document_namespace,
-                                        std::string_view document_uri) {
-  DocumentWrapper document_wrapper;
-  document_wrapper.set_deleted(true);
-  DocumentProto* document = document_wrapper.mutable_document();
-  document->set_namespace_(std::string(document_namespace));
-  document->set_uri(std::string(document_uri));
-  return document_wrapper;
-}
-
-DocumentWrapper CreateNamespaceTombstone(std::string_view document_namespace) {
-  DocumentWrapper document_wrapper;
-  document_wrapper.set_deleted(true);
-  DocumentProto* document = document_wrapper.mutable_document();
-  document->set_namespace_(std::string(document_namespace));
-  return document_wrapper;
-}
-
-DocumentWrapper CreateSchemaTypeTombstone(
-    std::string_view document_schema_type) {
-  DocumentWrapper document_wrapper;
-  document_wrapper.set_deleted(true);
-  DocumentProto* document = document_wrapper.mutable_document();
-  document->set_schema(std::string(document_schema_type));
-  return document_wrapper;
-}
-
 std::string MakeHeaderFilename(const std::string& base_dir) {
   return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
 }
@@ -229,6 +202,7 @@
 libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
     const Filesystem* filesystem, const std::string& base_dir,
     const Clock* clock, const SchemaStore* schema_store,
+    bool force_recovery_and_revalidate_documents,
     InitializeStatsProto* initialize_stats) {
   ICING_RETURN_ERROR_IF_NULL(filesystem);
   ICING_RETURN_ERROR_IF_NULL(clock);
@@ -236,8 +210,10 @@
 
   auto document_store = std::unique_ptr<DocumentStore>(
       new DocumentStore(filesystem, base_dir, clock, schema_store));
-  ICING_ASSIGN_OR_RETURN(DataLoss data_loss,
-                         document_store->Initialize(initialize_stats));
+  ICING_ASSIGN_OR_RETURN(
+      DataLoss data_loss,
+      document_store->Initialize(force_recovery_and_revalidate_documents,
+                                 initialize_stats));
 
   CreateResult create_result;
   create_result.document_store = std::move(document_store);
@@ -246,6 +222,7 @@
 }
 
 libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
+    bool force_recovery_and_revalidate_documents,
     InitializeStatsProto* initialize_stats) {
   auto create_result_or = FileBackedProtoLog<DocumentWrapper>::Create(
       filesystem_, MakeDocumentLogFilename(base_dir_),
@@ -262,10 +239,11 @@
       std::move(create_result_or).ValueOrDie();
   document_log_ = std::move(create_result.proto_log);
 
-  if (create_result.has_data_loss()) {
-    ICING_LOG(WARNING)
-        << "Data loss in document log, regenerating derived files.";
-    if (initialize_stats != nullptr) {
+  if (force_recovery_and_revalidate_documents ||
+      create_result.has_data_loss()) {
+    if (create_result.has_data_loss() && initialize_stats != nullptr) {
+      ICING_LOG(WARNING)
+          << "Data loss in document log, regenerating derived files.";
       initialize_stats->set_document_store_recovery_cause(
           InitializeStatsProto::DATA_LOSS);
 
@@ -280,7 +258,8 @@
       }
     }
     std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
-    libtextclassifier3::Status status = RegenerateDerivedFiles();
+    libtextclassifier3::Status status =
+        RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
     if (initialize_stats != nullptr) {
       initialize_stats->set_document_store_recovery_latency_ms(
           document_recovery_timer->GetElapsedMilliseconds());
@@ -295,13 +274,12 @@
       ICING_VLOG(1)
           << "Couldn't find derived files or failed to initialize them, "
              "regenerating derived files for DocumentStore.";
-      if (initialize_stats != nullptr) {
+      std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
+      libtextclassifier3::Status status = RegenerateDerivedFiles(
+          /*force_recovery_and_revalidate_documents*/ false);
+      if (initialize_stats != nullptr && num_documents() > 0) {
         initialize_stats->set_document_store_recovery_cause(
             InitializeStatsProto::IO_ERROR);
-      }
-      std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
-      libtextclassifier3::Status status = RegenerateDerivedFiles();
-      if (initialize_stats != nullptr) {
         initialize_stats->set_document_store_recovery_latency_ms(
             document_recovery_timer->GetElapsedMilliseconds());
       }
@@ -407,7 +385,8 @@
   return libtextclassifier3::Status::OK;
 }
 
-libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
+libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
+    bool revalidate_documents) {
   ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
   ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
   ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
@@ -441,148 +420,80 @@
 
     DocumentWrapper document_wrapper =
         std::move(document_wrapper_or).ValueOrDie();
-    if (document_wrapper.deleted()) {
-      if (!document_wrapper.document().uri().empty()) {
-        // Individual document deletion.
-        auto document_id_or =
-            GetDocumentId(document_wrapper.document().namespace_(),
-                          document_wrapper.document().uri());
-        // Updates document_id mapper with deletion
-        if (document_id_or.ok()) {
-          ICING_RETURN_IF_ERROR(document_id_mapper_->Set(
-              document_id_or.ValueOrDie(), kDocDeletedFlag));
-        } else if (!absl_ports::IsNotFound(document_id_or.status())) {
-          // Real error
-          return absl_ports::Annotate(
-              document_id_or.status(),
-              absl_ports::StrCat("Failed to find document id. namespace: ",
-                                 document_wrapper.document().namespace_(),
-                                 ", uri: ", document_wrapper.document().uri()));
-        }
-      } else if (!document_wrapper.document().namespace_().empty()) {
-        // Namespace deletion.
-        ICING_ASSIGN_OR_RETURN(
-            NamespaceId namespace_id,
-            namespace_mapper_->Get(document_wrapper.document().namespace_()));
-        // Tombstone indicates it's a soft delete.
-        ICING_RETURN_IF_ERROR(BatchDelete(namespace_id, kInvalidSchemaTypeId,
-                                          /*soft_delete=*/true));
-      } else if (!document_wrapper.document().schema().empty()) {
-        // SchemaType deletion.
-        auto schema_type_id_or = schema_store_->GetSchemaTypeId(
-            document_wrapper.document().schema());
-
-        if (schema_type_id_or.ok()) {
-          // Tombstone indicates it's a soft delete.
-          ICING_RETURN_IF_ERROR(BatchDelete(kInvalidNamespaceId,
-                                            schema_type_id_or.ValueOrDie(),
-                                            /*soft_delete=*/true));
-        } else {
-          // The deleted schema type doesn't have a SchemaTypeId we can refer
-          // to in the FilterCache.
-          //
-          // TODO(cassiewang): We could avoid reading out all the documents.
-          // When we see a schema type doesn't have a SchemaTypeId, assign the
-          // unknown schema type a unique, temporary SchemaTypeId and store
-          // that in the FilterCache. Then, when we see the schema type
-          // tombstone here, we can look up its temporary SchemaTypeId and
-          // just iterate through the FilterCache to mark those documents as
-          // deleted.
-          int size = document_id_mapper_->num_elements();
-          for (DocumentId document_id = 0; document_id < size; document_id++) {
-            auto document_or = Get(document_id);
-            if (absl_ports::IsNotFound(document_or.status())) {
-              // Skip nonexistent documents
-              continue;
-            } else if (!document_or.ok()) {
-              // Real error, pass up
-              return absl_ports::Annotate(
-                  document_or.status(),
-                  IcingStringUtil::StringPrintf(
-                      "Failed to retrieve Document for DocumentId %d",
-                      document_id));
-            }
-
-            // Guaranteed to have a document now.
-            DocumentProto document = document_or.ValueOrDie();
-
-            if (document.schema() == document_wrapper.document().schema()) {
-              ICING_RETURN_IF_ERROR(
-                  document_id_mapper_->Set(document_id, kDocDeletedFlag));
-            }
-          }
-        }
-      } else {
-        return absl_ports::InternalError(
-            "Encountered an invalid tombstone during recovery!");
+    // Revalidate that this document is still compatible if requested.
+    if (revalidate_documents) {
+      if (!document_validator_.Validate(document_wrapper.document()).ok()) {
+        // Document is no longer valid with the current schema. Mark as
+        // deleted
+        DocumentId new_document_id = document_id_mapper_->num_elements();
+        ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
+        ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
+        continue;
       }
-    } else {
-      // Updates key mapper and document_id mapper with the new document
-      DocumentId new_document_id = document_id_mapper_->num_elements();
-      ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
-          MakeFingerprint(document_wrapper.document().namespace_(),
-                          document_wrapper.document().uri()),
-          new_document_id));
-      ICING_RETURN_IF_ERROR(
-          document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
-
-      SchemaTypeId schema_type_id;
-      auto schema_type_id_or =
-          schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
-      if (absl_ports::IsNotFound(schema_type_id_or.status())) {
-        // Didn't find a SchemaTypeId. This means that the DocumentStore and
-        // the SchemaStore are out of sync. But DocumentStore can't do
-        // anything about it so just ignore this for now. This should be
-        // detected/handled by the owner of DocumentStore. Set it to some
-        // arbitrary invalid value for now, it'll get updated to the correct
-        // ID later.
-        schema_type_id = -1;
-      } else if (!schema_type_id_or.ok()) {
-        // Real error. Pass it up
-        return schema_type_id_or.status();
-      } else {
-        // We're guaranteed that SchemaTypeId is valid now
-        schema_type_id = schema_type_id_or.ValueOrDie();
-      }
-
-      ICING_ASSIGN_OR_RETURN(
-          NamespaceId namespace_id,
-          namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
-                                      namespace_mapper_->num_keys()));
-
-      // Update corpus maps
-      std::string corpus =
-          MakeFingerprint(document_wrapper.document().namespace_(),
-                          document_wrapper.document().schema());
-      ICING_ASSIGN_OR_RETURN(
-          CorpusId corpusId,
-          corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
-
-      ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
-                             GetCorpusAssociatedScoreDataToUpdate(corpusId));
-      scoring_data.AddDocument(
-          document_wrapper.document().internal_fields().length_in_tokens());
-
-      ICING_RETURN_IF_ERROR(
-          UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
-
-      ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
-          new_document_id,
-          DocumentAssociatedScoreData(
-              corpusId, document_wrapper.document().score(),
-              document_wrapper.document().creation_timestamp_ms(),
-              document_wrapper.document()
-                  .internal_fields()
-                  .length_in_tokens())));
-
-      int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
-          document_wrapper.document().creation_timestamp_ms(),
-          document_wrapper.document().ttl_ms());
-
-      ICING_RETURN_IF_ERROR(UpdateFilterCache(
-          new_document_id, DocumentFilterData(namespace_id, schema_type_id,
-                                              expiration_timestamp_ms)));
     }
+    // Updates key mapper and document_id mapper with the new document
+    DocumentId new_document_id = document_id_mapper_->num_elements();
+    ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
+        MakeFingerprint(document_wrapper.document().namespace_(),
+                        document_wrapper.document().uri()),
+        new_document_id));
+    ICING_RETURN_IF_ERROR(
+        document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
+
+    SchemaTypeId schema_type_id;
+    auto schema_type_id_or =
+        schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
+    if (absl_ports::IsNotFound(schema_type_id_or.status())) {
+      // Didn't find a SchemaTypeId. This means that the DocumentStore and
+      // the SchemaStore are out of sync. But DocumentStore can't do
+      // anything about it so just ignore this for now. This should be
+      // detected/handled by the owner of DocumentStore. Set it to some
+      // arbitrary invalid value for now, it'll get updated to the correct
+      // ID later.
+      schema_type_id = -1;
+    } else if (!schema_type_id_or.ok()) {
+      // Real error. Pass it up
+      return schema_type_id_or.status();
+    } else {
+      // We're guaranteed that SchemaTypeId is valid now
+      schema_type_id = schema_type_id_or.ValueOrDie();
+    }
+
+    ICING_ASSIGN_OR_RETURN(
+        NamespaceId namespace_id,
+        namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
+                                    namespace_mapper_->num_keys()));
+
+    // Update corpus maps
+    std::string corpus =
+        MakeFingerprint(document_wrapper.document().namespace_(),
+                        document_wrapper.document().schema());
+    ICING_ASSIGN_OR_RETURN(
+        CorpusId corpusId,
+        corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
+
+    ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
+                           GetCorpusAssociatedScoreDataToUpdate(corpusId));
+    scoring_data.AddDocument(
+        document_wrapper.document().internal_fields().length_in_tokens());
+
+    ICING_RETURN_IF_ERROR(
+        UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
+
+    ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+        new_document_id,
+        DocumentAssociatedScoreData(
+            corpusId, document_wrapper.document().score(),
+            document_wrapper.document().creation_timestamp_ms(),
+            document_wrapper.document().internal_fields().length_in_tokens())));
+
+    int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
+        document_wrapper.document().creation_timestamp_ms(),
+        document_wrapper.document().ttl_ms());
+
+    ICING_RETURN_IF_ERROR(UpdateFilterCache(
+        new_document_id, DocumentFilterData(namespace_id, schema_type_id,
+                                            expiration_timestamp_ms)));
     iterator_status = iterator.Advance();
   }
 
@@ -920,18 +831,20 @@
                                           expiration_timestamp_ms)));
 
   if (old_document_id_or.ok()) {
+    // The old document exists, copy over the usage scores and delete the old
+    // document.
     DocumentId old_document_id = old_document_id_or.ValueOrDie();
-    auto offset_or = DoesDocumentExistAndGetFileOffset(old_document_id);
 
-    if (offset_or.ok()) {
-      // The old document exists, copy over the usage scores.
-      ICING_RETURN_IF_ERROR(
-          usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
-                                         /*to_document_id=*/new_document_id));
+    ICING_RETURN_IF_ERROR(
+        usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
+                                       /*to_document_id=*/new_document_id));
 
-      // Hard delete the old document.
-      ICING_RETURN_IF_ERROR(
-          HardDelete(old_document_id, offset_or.ValueOrDie()));
+    // Delete the old document. It's fine if it's not found since it might have
+    // been deleted previously.
+    auto delete_status = Delete(old_document_id);
+    if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
+      // Real error, pass it up.
+      return delete_status;
     }
   }
 
@@ -973,8 +886,16 @@
 
 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
     DocumentId document_id, bool clear_internal_fields) const {
-  ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
-                         DoesDocumentExistAndGetFileOffset(document_id));
+  ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
+
+  auto document_log_offset_or = document_id_mapper_->Get(document_id);
+  if (!document_log_offset_or.ok()) {
+    // Since we've just checked that our document_id is valid a few lines
+    // above, there's no reason this should fail and an error should never
+    // happen.
+    return absl_ports::InternalError("Failed to find document offset.");
+  }
+  int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
 
   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
   // that can support error logging.
@@ -1025,7 +946,7 @@
     }
     const DocumentFilterData* data = status_or_data.ValueOrDie();
 
-    if (DoesDocumentExist(document_id)) {
+    if (InternalDoesDocumentExist(document_id)) {
       existing_namespace_ids.insert(data->namespace_id());
     }
   }
@@ -1038,45 +959,78 @@
   return existing_namespaces;
 }
 
-libtextclassifier3::StatusOr<int64_t>
-DocumentStore::DoesDocumentExistAndGetFileOffset(DocumentId document_id) const {
+bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
   if (!IsDocumentIdValid(document_id)) {
-    return absl_ports::InvalidArgumentError(
-        IcingStringUtil::StringPrintf("DocumentId %d is invalid", document_id));
+    return false;
   }
 
-  auto file_offset_or = document_id_mapper_->Get(document_id);
-
-  bool deleted =
-      file_offset_or.ok() && *file_offset_or.ValueOrDie() == kDocDeletedFlag;
-  if (deleted || absl_ports::IsOutOfRange(file_offset_or.status())) {
-    // Document has been deleted or doesn't exist
-    return absl_ports::NotFoundError(
-        IcingStringUtil::StringPrintf("Document %d not found", document_id));
+  if (document_id >= document_id_mapper_->num_elements()) {
+    // Somehow got an validly constructed document_id that the document store
+    // doesn't know about
+    return false;
   }
 
-  ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
-                         filter_cache_->Get(document_id));
-  if (clock_.GetSystemTimeMilliseconds() >=
-      filter_data->expiration_timestamp_ms()) {
-    // Past the expiration time, so also return NOT FOUND since it *shouldn't*
-    // exist anymore.
-    return absl_ports::NotFoundError(
-        IcingStringUtil::StringPrintf("Document %d not found", document_id));
-  }
-
-  ICING_RETURN_IF_ERROR(file_offset_or.status());
-  return *file_offset_or.ValueOrDie();
+  return InternalDoesDocumentExist(document_id);
 }
 
-bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
-  // If we can successfully get the document log offset, the document exists.
-  return DoesDocumentExistAndGetFileOffset(document_id).ok();
+libtextclassifier3::Status DocumentStore::DoesDocumentExistWithStatus(
+    DocumentId document_id) const {
+  if (!IsDocumentIdValid(document_id)) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Document id '%d' invalid.", document_id));
+  }
+
+  if (document_id >= document_id_mapper_->num_elements()) {
+    // Somehow got a validly constructed document_id that the document store
+    // doesn't know about.
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Unknown document id '%d'.", document_id));
+  }
+
+  if (!InternalDoesDocumentExist(document_id)) {
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Document id '%d' doesn't exist", document_id));
+  };
+  return libtextclassifier3::Status::OK;
+}
+
+bool DocumentStore::InternalDoesDocumentExist(DocumentId document_id) const {
+  return !IsDeleted(document_id) && !IsExpired(document_id);
+}
+
+bool DocumentStore::IsDeleted(DocumentId document_id) const {
+  auto file_offset_or = document_id_mapper_->Get(document_id);
+  if (!file_offset_or.ok()) {
+    // This would only happen if document_id is out of range of the
+    // document_id_mapper, meaning we got some invalid document_id. Callers
+    // should already have checked that their document_id is valid or used
+    // DoesDocumentExist(WithStatus). Regardless, return true since the
+    // document doesn't exist.
+    return true;
+  }
+  int64_t file_offset = *file_offset_or.ValueOrDie();
+  return file_offset == kDocDeletedFlag;
+}
+
+bool DocumentStore::IsExpired(DocumentId document_id) const {
+  auto filter_data_or = filter_cache_->Get(document_id);
+  if (!filter_data_or.ok()) {
+    // This would only happen if document_id is out of range of the
+    // filter_cache, meaning we got some invalid document_id. Callers should
+    // already have checked that their document_id is valid or used
+    // DoesDocumentExist(WithStatus). Regardless, return true since the
+    // document doesn't exist.
+    return true;
+  }
+  const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
+
+  // Check if it's past the expiration time
+  return clock_.GetSystemTimeMilliseconds() >=
+         filter_data->expiration_timestamp_ms();
 }
 
 libtextclassifier3::Status DocumentStore::Delete(
-    const std::string_view name_space, const std::string_view uri,
-    bool soft_delete) {
+    const std::string_view name_space, const std::string_view uri) {
   // Try to get the DocumentId first
   auto document_id_or = GetDocumentId(name_space, uri);
   if (!document_id_or.ok()) {
@@ -1085,69 +1039,18 @@
         absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
                            ", uri: ", uri));
   }
-
-  // Check if the DocumentId's Document still exists.
-  DocumentId document_id = document_id_or.ValueOrDie();
-  auto file_offset_or = DoesDocumentExistAndGetFileOffset(document_id);
-  if (!file_offset_or.ok()) {
-    return absl_ports::Annotate(
-        file_offset_or.status(),
-        absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
-                           ", uri: ", uri));
-  }
-
-  if (soft_delete) {
-    return SoftDelete(name_space, uri, document_id);
-  } else {
-    return HardDelete(document_id, file_offset_or.ValueOrDie());
-  }
+  return Delete(document_id_or.ValueOrDie());
 }
 
-libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
-                                                 bool soft_delete) {
-  // Copy out the document to get namespace and uri.
-  ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
-                         DoesDocumentExistAndGetFileOffset(document_id));
+libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id) {
+  ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
 
-  if (soft_delete) {
-    auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
-    if (!document_wrapper_or.ok()) {
-      ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
-                       << "Failed to read from document log";
-      return document_wrapper_or.status();
-    }
-    DocumentWrapper document_wrapper =
-        std::move(document_wrapper_or).ValueOrDie();
-
-    return SoftDelete(document_wrapper.document().namespace_(),
-                      document_wrapper.document().uri(), document_id);
-  } else {
-    return HardDelete(document_id, document_log_offset);
+  auto document_log_offset_or = document_id_mapper_->Get(document_id);
+  if (!document_log_offset_or.ok()) {
+    return absl_ports::InternalError("Failed to find document offset.");
   }
-}
+  int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
 
-// TODO(b/169969469): Consider removing SoftDelete().
-libtextclassifier3::Status DocumentStore::SoftDelete(
-    std::string_view name_space, std::string_view uri, DocumentId document_id) {
-  // Update ground truth first.
-  // Mark the document as deleted by appending a tombstone of it and actually
-  // remove it from file later in Optimize()
-  // TODO(b/144458732): Implement a more robust version of
-  // ICING_RETURN_IF_ERROR that can support error logging.
-  libtextclassifier3::Status status =
-      document_log_->WriteProto(CreateDocumentTombstone(name_space, uri))
-          .status();
-  if (!status.ok()) {
-    return absl_ports::Annotate(
-        status, absl_ports::StrCat("Failed to delete Document. namespace:",
-                                   name_space, ", uri: ", uri));
-  }
-
-  return document_id_mapper_->Set(document_id, kDocDeletedFlag);
-}
-
-libtextclassifier3::Status DocumentStore::HardDelete(
-    DocumentId document_id, int64_t document_log_offset) {
   // Erases document proto.
   ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
   return ClearDerivedData(document_id);
@@ -1165,6 +1068,11 @@
 
 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
+  if (!DoesDocumentExist(document_id)) {
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Can't get usage scores, document id '%d' doesn't exist", document_id));
+  }
+
   auto score_data_or = score_cache_->GetCopy(document_id);
   if (!score_data_or.ok()) {
     ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
@@ -1228,6 +1136,10 @@
 
 libtextclassifier3::StatusOr<UsageStore::UsageScores>
 DocumentStore::GetUsageScores(DocumentId document_id) const {
+  if (!DoesDocumentExist(document_id)) {
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Can't get usage scores, document id '%d' doesn't exist", document_id));
+  }
   return usage_store_->GetUsageScores(document_id);
 }
 
@@ -1236,11 +1148,22 @@
   ICING_ASSIGN_OR_RETURN(DocumentId document_id,
                          GetDocumentId(usage_report.document_namespace(),
                                        usage_report.document_uri()));
+  // We can use the internal version here because we got our document_id from
+  // our internal data structures. We would have thrown some error if the
+  // namespace and/or uri were incorrect.
+  if (!InternalDoesDocumentExist(document_id)) {
+    // Document was probably deleted or expired.
+    return absl_ports::NotFoundError(absl_ports::StrCat(
+        "Couldn't report usage on a nonexistent document: (namespace: '",
+        usage_report.document_namespace(), "', uri: '",
+        usage_report.document_uri(), "')"));
+  }
+
   return usage_store_->AddUsageReport(usage_report, document_id);
 }
 
 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
-    std::string_view name_space, bool soft_delete) {
+    std::string_view name_space) {
   DeleteByGroupResult result;
   auto namespace_id_or = namespace_mapper_->Get(name_space);
   if (!namespace_id_or.ok()) {
@@ -1250,26 +1173,7 @@
     return result;
   }
   NamespaceId namespace_id = namespace_id_or.ValueOrDie();
-
-  if (soft_delete) {
-    // To delete an entire namespace, we append a tombstone that only contains
-    // the deleted bit and the name of the deleted namespace.
-    // TODO(b/144458732): Implement a more robust version of
-    // ICING_RETURN_IF_ERROR that can support error logging.
-    libtextclassifier3::Status status =
-        document_log_->WriteProto(CreateNamespaceTombstone(name_space))
-            .status();
-    if (!status.ok()) {
-      ICING_LOG(ERROR) << status.error_message()
-                       << "Failed to delete namespace. namespace = "
-                       << name_space;
-      result.status = std::move(status);
-      return result;
-    }
-  }
-
-  auto num_deleted_or =
-      BatchDelete(namespace_id, kInvalidSchemaTypeId, soft_delete);
+  auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
   if (!num_deleted_or.ok()) {
     result.status = std::move(num_deleted_or).status();
     return result;
@@ -1288,7 +1192,7 @@
 }
 
 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
-    std::string_view schema_type, bool soft_delete) {
+    std::string_view schema_type) {
   DeleteByGroupResult result;
   auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
   if (!schema_type_id_or.ok()) {
@@ -1299,26 +1203,7 @@
     return result;
   }
   SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
-
-  if (soft_delete) {
-    // To soft-delete an entire schema type, we append a tombstone that only
-    // contains the deleted bit and the name of the deleted schema type.
-    // TODO(b/144458732): Implement a more robust version of
-    // ICING_RETURN_IF_ERROR that can support error logging.
-    libtextclassifier3::Status status =
-        document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
-            .status();
-    if (!status.ok()) {
-      ICING_LOG(ERROR) << status.error_message()
-                       << "Failed to delete schema_type. schema_type = "
-                       << schema_type;
-      result.status = std::move(status);
-      return result;
-    }
-  }
-
-  auto num_deleted_or =
-      BatchDelete(kInvalidNamespaceId, schema_type_id, soft_delete);
+  auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
   if (!num_deleted_or.ok()) {
     result.status = std::move(num_deleted_or).status();
     return result;
@@ -1335,7 +1220,7 @@
 }
 
 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
-    NamespaceId namespace_id, SchemaTypeId schema_type_id, bool soft_delete) {
+    NamespaceId namespace_id, SchemaTypeId schema_type_id) {
   // Tracks if there were any existing documents with this namespace that we
   // will mark as deleted.
   int num_updated_documents = 0;
@@ -1367,31 +1252,16 @@
       continue;
     }
 
-    // The document has the desired namespace and schema type, it either exists
-    // or has been soft-deleted / expired.
-    if (soft_delete) {
-      if (DoesDocumentExist(document_id)) {
-        ++num_updated_documents;
-      }
-
-      // docid_mapper_->Set can only fail if document_id is < 0
-      // or >= docid_mapper_->num_elements. So the only possible way to get an
-      // error here would be if filter_cache_->num_elements >
-      // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
-      ICING_RETURN_IF_ERROR(
-          document_id_mapper_->Set(document_id, kDocDeletedFlag));
-    } else {
-      // Hard delete.
-      libtextclassifier3::Status delete_status =
-          Delete(document_id, /*soft_delete=*/false);
-      if (absl_ports::IsNotFound(delete_status)) {
-        continue;
-      } else if (!delete_status.ok()) {
-        // Real error, pass up.
-        return delete_status;
-      }
-      ++num_updated_documents;
+    // The document has the desired namespace and schema type, it either
+    // exists or has expired.
+    libtextclassifier3::Status delete_status = Delete(document_id);
+    if (absl_ports::IsNotFound(delete_status)) {
+      continue;
+    } else if (!delete_status.ok()) {
+      // Real error, pass up.
+      return delete_status;
     }
+    ++num_updated_documents;
   }
 
   return num_updated_documents;
@@ -1459,15 +1329,9 @@
   for (DocumentId document_id = 0;
        document_id < document_id_mapper_->num_elements(); ++document_id) {
     // Check if it's deleted first.
-    auto location_or = document_id_mapper_->Get(document_id);
-    if (!location_or.ok()) {
-      ICING_VLOG(1) << "Error trying to get document offsets for document "
-                       "store storage info counts.";
-      continue;
-    }
-    if (*location_or.ValueOrDie() == kDocDeletedFlag) {
-      // We don't have the namespace id of hard deleted documents anymore, so we
-      // can't add to our namespace storage info.
+    if (IsDeleted(document_id)) {
+      // We don't have the namespace id of hard deleted documents anymore, so
+      // we can't add to our namespace storage info.
       ++total_num_deleted;
       continue;
     }
@@ -1505,23 +1369,7 @@
     UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
 
     // Update our stats
-    if (DoesDocumentExist(document_id)) {
-      ++total_num_alive;
-      namespace_storage_info.set_num_alive_documents(
-          namespace_storage_info.num_alive_documents() + 1);
-      if (usage_scores.usage_type1_count > 0) {
-        namespace_storage_info.set_num_alive_documents_usage_type1(
-            namespace_storage_info.num_alive_documents_usage_type1() + 1);
-      }
-      if (usage_scores.usage_type2_count > 0) {
-        namespace_storage_info.set_num_alive_documents_usage_type2(
-            namespace_storage_info.num_alive_documents_usage_type2() + 1);
-      }
-      if (usage_scores.usage_type3_count > 0) {
-        namespace_storage_info.set_num_alive_documents_usage_type3(
-            namespace_storage_info.num_alive_documents_usage_type3() + 1);
-      }
-    } else {
+    if (IsExpired(document_id)) {
       ++total_num_expired;
       namespace_storage_info.set_num_expired_documents(
           namespace_storage_info.num_expired_documents() + 1);
@@ -1537,6 +1385,22 @@
         namespace_storage_info.set_num_expired_documents_usage_type3(
             namespace_storage_info.num_expired_documents_usage_type3() + 1);
       }
+    } else {
+      ++total_num_alive;
+      namespace_storage_info.set_num_alive_documents(
+          namespace_storage_info.num_alive_documents() + 1);
+      if (usage_scores.usage_type1_count > 0) {
+        namespace_storage_info.set_num_alive_documents_usage_type1(
+            namespace_storage_info.num_alive_documents_usage_type1() + 1);
+      }
+      if (usage_scores.usage_type2_count > 0) {
+        namespace_storage_info.set_num_alive_documents_usage_type2(
+            namespace_storage_info.num_alive_documents_usage_type2() + 1);
+      }
+      if (usage_scores.usage_type3_count > 0) {
+        namespace_storage_info.set_num_alive_documents_usage_type3(
+            namespace_storage_info.num_alive_documents_usage_type3() + 1);
+      }
     }
   }
 
@@ -1617,50 +1481,19 @@
   schema_store_ = schema_store;
   document_validator_.UpdateSchemaStore(schema_store);
 
-  // Append a tombstone for each deleted schema type. This way, we don't have
-  // to read out each document, check if the schema type has been deleted, and
-  // append a tombstone per-document.
-  for (const auto& schema_type :
-       set_schema_result.schema_types_deleted_by_name) {
-    // TODO(b/144458732): Implement a more robust version of
-    // ICING_RETURN_IF_ERROR that can support error logging.
-    libtextclassifier3::Status status =
-        document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
-            .status();
-    if (!status.ok()) {
-      ICING_LOG(ERROR) << status.error_message()
-                       << "Failed to delete schema_type. schema_type = "
-                       << schema_type;
-      return status;
-    }
-  }
-
   int size = document_id_mapper_->num_elements();
   for (DocumentId document_id = 0; document_id < size; document_id++) {
-    auto exists_or = DoesDocumentExistAndGetFileOffset(document_id);
-    if (absl_ports::IsNotFound(exists_or.status())) {
+    if (!InternalDoesDocumentExist(document_id)) {
       // Skip nonexistent documents
       continue;
-    } else if (!exists_or.ok()) {
-      // Real error, pass up
-      return absl_ports::Annotate(
-          exists_or.status(),
-          IcingStringUtil::StringPrintf("Failed to retrieve DocumentId %d",
-                                        document_id));
     }
 
     // Guaranteed that the document exists now.
     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
                            filter_cache_->Get(document_id));
 
-    if (set_schema_result.schema_types_deleted_by_id.count(
-            filter_data->schema_type_id()) != 0) {
-      // We already created a tombstone for this deleted type. Just update the
-      // derived files now.
-      ICING_RETURN_IF_ERROR(
-          document_id_mapper_->Set(document_id, kDocDeletedFlag));
-      continue;
-    }
+    bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
+                               filter_data->schema_type_id()) != 0;
 
     // Check if we need to update the FilterCache entry for this document. It
     // may have been assigned a different SchemaTypeId in the new SchemaStore.
@@ -1684,17 +1517,17 @@
         filter_cache_->mutable_array()[document_id].set_schema_type_id(
             schema_type_id);
       }
-
       if (revalidate_document) {
-        if (!document_validator_.Validate(document).ok()) {
-          // Document is no longer valid with the new SchemaStore. Mark as
-          // deleted
-          auto delete_status = Delete(document.namespace_(), document.uri());
-          if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
-            // Real error, pass up
-            return delete_status;
-          }
-        }
+        delete_document = !document_validator_.Validate(document).ok();
+      }
+    }
+
+    if (delete_document) {
+      // Document is no longer valid with the new SchemaStore. Mark as deleted
+      auto delete_status = Delete(document_id);
+      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
+        // Real error, pass up
+        return delete_status;
       }
     }
   }
@@ -1729,11 +1562,9 @@
   for (DocumentId document_id = 0; document_id < size; document_id++) {
     auto document_or = Get(document_id, /*clear_internal_fields=*/false);
     if (absl_ports::IsNotFound(document_or.status())) {
-      // Don't optimize nonexistent documents, but collect stats
-      auto location_or = document_id_mapper_->Get(document_id);
-      if (location_or.ok() && *location_or.ValueOrDie() == kDocDeletedFlag) {
+      if (IsDeleted(document_id)) {
         ++num_deleted;
-      } else {
+      } else if (IsExpired(document_id)) {
         ++num_expired;
       }
       continue;
@@ -1776,6 +1607,7 @@
     // Copy over usage scores.
     ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
                            usage_store_->GetUsageScores(document_id));
+
     DocumentId new_document_id = new_document_id_or.ValueOrDie();
     ICING_RETURN_IF_ERROR(
         new_doc_store->SetUsageScores(new_document_id, usage_scores));
@@ -1797,7 +1629,7 @@
   int32_t num_documents = document_id_mapper_->num_elements();
   for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
        ++document_id) {
-    if (!DoesDocumentExist(document_id)) {
+    if (!InternalDoesDocumentExist(document_id)) {
       ++optimize_info.optimizable_docs;
     }
 
@@ -1835,10 +1667,10 @@
   ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
                          document_key_mapper_->GetElementsSize());
 
-  // We don't include the namespace_mapper or the corpus_mapper because it's not
-  // clear if we could recover any space even if Optimize were called. Deleting
-  // 100s of documents could still leave a few documents of a namespace, and
-  // then there would be no change.
+  // We don't include the namespace_mapper or the corpus_mapper because it's
+  // not clear if we could recover any space even if Optimize were called.
+  // Deleting 100s of documents could still leave a few documents of a
+  // namespace, and then there would be no change.
 
   int64_t total_size = document_log_file_size + document_key_mapper_size +
                        document_id_mapper_file_size + score_cache_file_size +
@@ -1868,8 +1700,8 @@
 libtextclassifier3::Status DocumentStore::ClearDerivedData(
     DocumentId document_id) {
   // We intentionally leave the data in key_mapper_ because locating that data
-  // requires fetching namespace and uri. Leaving data in key_mapper_ should be
-  // fine because the data is hashed.
+  // requires fetching namespace and uri. Leaving data in key_mapper_ should
+  // be fine because the data is hashed.
 
   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
 
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 533b240..9e1b3ec 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -109,6 +109,11 @@
   // previously initialized with this directory, it will reload the files saved
   // by the last instance.
   //
+  // force_recovery_and_revalidate_documents=true will pre-emptively throw out
+  // the derived files and validate each document while recreating them. This
+  // can be used to indicate that the schema (and type ids) may have changed and
+  // those changes might not have been applied to the document store.
+  //
   // If initialize_stats is present, the fields related to DocumentStore will be
   // populated.
   //
@@ -125,6 +130,7 @@
   static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
       const Filesystem* filesystem, const std::string& base_dir,
       const Clock* clock, const SchemaStore* schema_store,
+      bool force_recovery_and_revalidate_documents = false,
       InitializeStatsProto* initialize_stats = nullptr);
 
   // Returns the maximum DocumentId that the DocumentStore has assigned. If
@@ -192,18 +198,21 @@
   // Check if a document exists. Existence means it hasn't been deleted and it
   // hasn't expired yet.
   //
+  // NOTE: This should be used when callers don't care about error messages,
+  // expect documents to be deleted/not found, or in frequently called code
+  // paths that could cause performance issues. A signficant amount of CPU
+  // cycles can be saved if we don't construct strings and create new Status
+  // objects on the heap. See b/185822483.
+  //
   // Returns:
   //   boolean whether a document exists or not
   bool DoesDocumentExist(DocumentId document_id) const;
 
   // Deletes the document identified by the given namespace and uri. The
-  // document proto will be marked as deleted if 'soft_delete' is true,
-  // otherwise the document proto will be erased immediately.
+  // document proto will be erased immediately.
   //
   // NOTE:
-  // 1. The soft deletion uses less CPU power, it can be applied on
-  //    non-sensitive data.
-  // 2. Space is not reclaimed for deleted documents until Optimize() is
+  //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
@@ -211,26 +220,20 @@
   //   NOT_FOUND if no document exists with namespace, uri
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::Status Delete(std::string_view name_space,
-                                    std::string_view uri,
-                                    bool soft_delete = false);
+                                    std::string_view uri);
 
-  // Deletes the document identified by the given document_id. The
-  // document proto will be marked as deleted if 'soft_delete' is true,
-  // otherwise the document proto will be erased immediately.
+  // Deletes the document identified by the given document_id. The document
+  // proto will be erased immediately.
   //
   // NOTE:
-  // 1. If possible, please use the other method Delete(name_space, uri,
-  //    soft_delete) for soft deletes because we need namespace and uri to
-  //    perform soft deletes.
-  // 2. Space is not reclaimed for deleted documents until Optimize() is
+  //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
   //   OK on success
   //   INTERNAL_ERROR on IO error
   //   INVALID_ARGUMENT if document_id is invalid.
-  libtextclassifier3::Status Delete(DocumentId document_id,
-                                    bool soft_delete = false);
+  libtextclassifier3::Status Delete(DocumentId document_id);
 
   // Returns the NamespaceId of the string namespace
   //
@@ -253,16 +256,9 @@
   // Returns the DocumentAssociatedScoreData of the document specified by the
   // DocumentId.
   //
-  // NOTE: This does not check if the document exists and will return the
-  // DocumentFilterData of the document even if it has been deleted. Users
-  // should check DoesDocumentExist(document_id) if they only want existing
-  // documents' DocumentFilterData.
-  //
   // Returns:
   //   DocumentAssociatedScoreData on success
-  //   OUT_OF_RANGE if document_id is negative or exceeds previously seen
-  //                DocumentIds
-  //   NOT_FOUND if no score data is found
+  //   NOT_FOUND if the document or the score data is not found
   libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
   GetDocumentAssociatedScoreData(DocumentId document_id) const;
 
@@ -299,8 +295,8 @@
   //
   // Returns:
   //   UsageScores on success
+  //   NOT_FOUND if document_id no longer exists.
   //   INVALID_ARGUMENT if document_id is invalid
-  //   INTERNAL_ERROR on I/O errors
   libtextclassifier3::StatusOr<UsageStore::UsageScores> GetUsageScores(
       DocumentId document_id) const;
 
@@ -314,38 +310,30 @@
   libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);
 
   // Deletes all documents belonging to the given namespace. The documents will
-  // be marked as deleted if 'soft_delete' is true, otherwise they will be
-  // erased immediately.
+  // be erased immediately.
   //
   // NOTE:
-  // 1. The soft deletion uses less CPU power, it can be applied on
-  //    non-sensitive data.
-  // 2. Space is not reclaimed for deleted documents until Optimize() is
+  //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
   //   OK on success
   //   NOT_FOUND if namespace doesn't exist
   //   INTERNAL_ERROR on IO error
-  DeleteByGroupResult DeleteByNamespace(std::string_view name_space,
-                                        bool soft_delete = false);
+  DeleteByGroupResult DeleteByNamespace(std::string_view name_space);
 
   // Deletes all documents belonging to the given schema type. The documents
-  // will be marked as deleted if 'soft_delete' is true, otherwise they will be
-  // erased immediately.
+  // will be erased immediately.
   //
   // NOTE:
-  // 1. The soft deletion uses less CPU power, it can be applied on
-  //    non-sensitive data.
-  // 2. Space is not reclaimed for deleted documents until Optimize() is
+  //    Space is not reclaimed for deleted documents until Optimize() is
   //    called.
   //
   // Returns:
   //   OK on success
   //   NOT_FOUND if schema_type doesn't exist
   //   INTERNAL_ERROR on IO error
-  DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type,
-                                         bool soft_delete = false);
+  DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);
 
   // Syncs all the data and metadata changes to disk.
   //
@@ -508,6 +496,7 @@
   bool initialized_ = false;
 
   libtextclassifier3::StatusOr<DataLoss> Initialize(
+      bool force_recovery_and_revalidate_documents,
       InitializeStatsProto* initialize_stats);
 
   // Creates sub-components and verifies the integrity of each sub-component.
@@ -518,6 +507,9 @@
 
   // Re-generates all files derived from the ground truth: the document log.
   //
+  // revalidate_documents=true will also cause each document to be revalidated
+  // the schema as it is read out of the document log.
+  //
   // NOTE: if this function fails, the only thing we can do is to retry it until
   // it succeeds or prevent the initialization of a DocumentStore. The
   // DocumentStore object wouldn't work reliably if this fails.
@@ -528,7 +520,7 @@
   //   document_id
   //      mapper.
   //   3. Create header and store the updated combined checksum
-  libtextclassifier3::Status RegenerateDerivedFiles();
+  libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);
 
   // Resets the unique_ptr to the document_key_mapper, deletes the underlying
   // file, and re-creates a new instance of the document_key_mapper .
@@ -591,9 +583,8 @@
   // Helper function to do batch deletes. Documents with the given
   // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
   // to specify the namespace or schema type, pass in kInvalidNamespaceId or
-  // kInvalidSchemaTypeId. The document protos will be marked as deleted if
-  // 'soft_delete' is true, otherwise the document protos with their derived
-  // data will be erased / cleared immediately.
+  // kInvalidSchemaTypeId. The document protos with their derived data will be
+  // erased / cleared immediately.
   //
   // NOTE: Space is not reclaimed in the derived files until Optimize() is
   // called.
@@ -602,28 +593,7 @@
   //   Number of documents that were actually updated to be deleted
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
-                                                SchemaTypeId schema_type_id,
-                                                bool soft_delete);
-
-  // Marks the document identified by the given name_space, uri and document_id
-  // as deleted, to be removed later during Optimize().
-  //
-  // Returns:
-  //   OK on success
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::Status SoftDelete(std::string_view name_space,
-                                        std::string_view uri,
-                                        DocumentId document_id);
-
-  // Erases the document at the given document_log_offset from the document_log
-  // and clears the derived data identified by the given document_id. The space
-  // will be reclaimed later during Optimize().
-  //
-  // Returns:
-  //   OK on success
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::Status HardDelete(DocumentId document_id,
-                                        int64_t document_log_offset);
+                                                SchemaTypeId schema_type_id);
 
   // Helper method to find a DocumentId that is associated with the given
   // namespace and uri.
@@ -654,22 +624,46 @@
   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
 
-  // Helper method to validate the document id and return the file offset of the
-  // associated document in document_log_.
-  //
-  // This can be a more informative call than just DoesDocumentExist because it
-  // can return more status errors on whether the Document actually doesn't
-  // exist or if there was an internal error while accessing files.
+  // Check if a document exists. Existence means it hasn't been deleted and it
+  // hasn't expired yet.
   //
   // Returns:
-  //   The file offset on success
+  //   OK if the document exists
   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
   //                    maximum value
   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
   //   INTERNAL_ERROR on IO error
-  libtextclassifier3::StatusOr<int64_t> DoesDocumentExistAndGetFileOffset(
+  libtextclassifier3::Status DoesDocumentExistWithStatus(
       DocumentId document_id) const;
 
+  // Check if a document exists. Existence means it hasn't been deleted and it
+  // hasn't expired yet.
+  //
+  // This is for internal-use only because we assume that the document_id is
+  // already valid. If you're unsure if the document_id is valid, use
+  // DoesDocumentExist(document_id) instead, which will perform those additional
+  // checks.
+  //
+  // Returns:
+  //   boolean whether a document exists or not
+  bool InternalDoesDocumentExist(DocumentId document_id) const;
+
+  // Checks if a document has been deleted
+  //
+  // This is for internal-use only because we assume that the document_id is
+  // already valid. If you're unsure if the document_id is valid, use
+  // DoesDocumentExist(document_id) instead, which will perform those additional
+  // checks.
+  bool IsDeleted(DocumentId document_id) const;
+
+  // Checks if a document has expired.
+  //
+  // This is for internal-use only because we assume that the document_id is
+  // already valid. If you're unsure if the document_id is valid, use
+  // DoesDocumentExist(document_id) instead, which will perform those additional
+  // checks.
+  bool IsExpired(DocumentId document_id) const;
+
   // Updates the entry in the score cache for document_id.
   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
       DocumentId document_id, const DocumentAssociatedScoreData& score_data);
diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc
new file mode 100644
index 0000000..f68e115
--- /dev/null
+++ b/icing/store/document-store_benchmark.cc
@@ -0,0 +1,174 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/clock.h"
+
+// Run on a Linux workstation:
+//    $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+//    //icing/store:document-store_benchmark
+//
+//    $ blaze-bin/icing/store/document-store_benchmark
+//    --benchmarks=all --benchmark_memory_usage
+//
+// Run on an Android device:
+//    $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+//    --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+//    //icing/store:document-store_benchmark
+//
+//    $ adb push blaze-bin/icing/store/document-store_benchmark
+//    /data/local/tmp/
+//
+//    $ adb shell /data/local/tmp/document-store_benchmark
+//    --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+    PropertyConfigProto_Cardinality_Code_OPTIONAL;
+
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+    StringIndexingConfig_TokenizerType_Code_PLAIN;
+
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+
+class DestructibleDirectory {
+ public:
+  explicit DestructibleDirectory(const Filesystem& filesystem,
+                                 const std::string& dir)
+      : filesystem_(filesystem), dir_(dir) {
+    filesystem_.CreateDirectoryRecursively(dir_.c_str());
+  }
+  ~DestructibleDirectory() {
+    filesystem_.DeleteDirectoryRecursively(dir_.c_str());
+  }
+
+ private:
+  Filesystem filesystem_;
+  std::string dir_;
+};
+
+DocumentProto CreateDocument(const std::string namespace_,
+                             const std::string uri) {
+  return DocumentBuilder()
+      .SetKey(namespace_, uri)
+      .SetSchema("email")
+      .AddStringProperty("subject", "subject foo")
+      .AddStringProperty("body", "body bar")
+      .Build();
+}
+
+SchemaProto CreateSchema() {
+  return SchemaBuilder()
+      .AddType(
+          SchemaTypeConfigBuilder()
+              .SetType("email")
+              .AddProperty(PropertyConfigBuilder()
+                               .SetName("subject")
+                               .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                               .SetCardinality(CARDINALITY_OPTIONAL))
+              .AddProperty(PropertyConfigBuilder()
+                               .SetName("body")
+                               .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                               .SetCardinality(CARDINALITY_OPTIONAL)))
+      .Build();
+}
+
+std::unique_ptr<SchemaStore> CreateSchemaStore(Filesystem filesystem,
+                                               const std::string directory,
+                                               const Clock* clock) {
+  const std::string schema_store_dir = directory + "/schema";
+  filesystem.CreateDirectoryRecursively(schema_store_dir.data());
+  std::unique_ptr<SchemaStore> schema_store =
+      SchemaStore::Create(&filesystem, schema_store_dir, clock).ValueOrDie();
+
+  auto set_schema_status = schema_store->SetSchema(CreateSchema());
+  if (!set_schema_status.ok()) {
+    ICING_LOG(ERROR) << set_schema_status.status().error_message();
+  }
+
+  return schema_store;
+}
+
+void BM_DoesDocumentExistBenchmark(benchmark::State& state) {
+  Filesystem filesystem;
+  Clock clock;
+
+  std::string directory = GetTestTempDir() + "/icing";
+  DestructibleDirectory ddir(filesystem, directory);
+
+  std::string document_store_dir = directory + "/store";
+  std::unique_ptr<SchemaStore> schema_store =
+      CreateSchemaStore(filesystem, directory, &clock);
+
+  filesystem.CreateDirectoryRecursively(document_store_dir.data());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentStore::CreateResult create_result,
+      DocumentStore::Create(&filesystem, document_store_dir, &clock,
+                            schema_store.get()));
+  std::unique_ptr<DocumentStore> document_store =
+      std::move(create_result.document_store);
+
+  int max_document_id = 300000;
+  for (int i = 0; i < max_document_id; ++i) {
+    // Put and delete a lot of documents to fill up our derived files with
+    // stuff.
+    ICING_ASSERT_OK(document_store->Put(
+        CreateDocument("namespace", /*uri=*/std::to_string(i))));
+    document_store->Delete("namespace", /*uri=*/std::to_string(i));
+  }
+
+  std::default_random_engine random;
+  std::uniform_int_distribution<> dist(1, max_document_id);
+  for (auto s : state) {
+    // Check random document ids to see if they exist. Hopefully to simulate
+    // page faulting in different sections of our mmapped derived files.
+    int document_id = dist(random);
+    benchmark::DoNotOptimize(document_store->DoesDocumentExist(document_id));
+  }
+}
+BENCHMARK(BM_DoesDocumentExistBenchmark);
+
+}  // namespace
+
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 7b04a76..b37c6de 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -19,6 +19,7 @@
 #include <memory>
 #include <string>
 
+#include "icing/text_classifier/lib3/utils/base/status.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "icing/absl_ports/str_cat.h"
@@ -29,6 +30,7 @@
 #include "icing/file/mock-filesystem.h"
 #include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
 #include "icing/proto/document.pb.h"
 #include "icing/proto/schema.pb.h"
 #include "icing/proto/storage.pb.h"
@@ -41,7 +43,6 @@
 #include "icing/store/namespace-id.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/tokenization/language-segmenter-factory.h"
@@ -90,6 +91,9 @@
 
 constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
 
+constexpr PropertyConfigProto_DataType_Code TYPE_INT =
+    PropertyConfigProto_DataType_Code_INT64;
+
 UsageReport CreateUsageReport(std::string name_space, std::string uri,
                               int64 timestamp_ms,
                               UsageReport::UsageType usage_type) {
@@ -181,6 +185,19 @@
     filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
   }
 
+  void CorruptDocStoreHeaderChecksumFile() {
+    // Change the DocStore's header combined checksum so that it won't match the
+    // recalculated checksum on initialization. This will force a regeneration
+    // of derived files from ground truth.
+    const std::string header_file =
+        absl_ports::StrCat(document_store_dir_, "/document_store_header");
+    DocumentStore::Header header;
+    header.magic = DocumentStore::Header::kMagic;
+    header.checksum = 10;  // Arbitrary garbage checksum
+    filesystem_.DeleteFile(header_file.c_str());
+    filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+  }
+
   const Filesystem filesystem_;
   const std::string test_dir_;
   FakeClock fake_clock_;
@@ -310,7 +327,7 @@
   EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1)));
 }
 
-TEST_F(DocumentStoreTest, IsDocumentExisting) {
+TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -342,7 +359,7 @@
               IsFalse());
 }
 
-TEST_F(DocumentStoreTest, GetSoftDeletedDocumentNotFound) {
+TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -356,29 +373,7 @@
       IsOkAndHolds(EqualsProto(test_document1_)));
 
   ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
-                                         test_document1_.uri(),
-                                         /*soft_delete=*/true));
-  EXPECT_THAT(
-      document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
-      StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
-TEST_F(DocumentStoreTest, GetHardDeletedDocumentNotFound) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> document_store =
-      std::move(create_result.document_store);
-
-  ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
-  EXPECT_THAT(
-      document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
-      IsOkAndHolds(EqualsProto(test_document1_)));
-
-  ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
-                                         test_document1_.uri(),
-                                         /*soft_delete=*/false));
+                                         test_document1_.uri()));
   EXPECT_THAT(
       document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
       StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -488,7 +483,7 @@
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) {
+TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -519,7 +514,7 @@
   // DELETE namespace.1. document1 and document 4 should be deleted. document2
   // and document3 should still be retrievable.
   DocumentStore::DeleteByGroupResult group_result =
-      doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/true);
+      doc_store->DeleteByNamespace("namespace.1");
   EXPECT_THAT(group_result.status, IsOk());
   EXPECT_THAT(group_result.num_docs_deleted, Eq(2));
   EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
@@ -532,51 +527,7 @@
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(DocumentStoreTest, HardDeleteByNamespaceOk) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> doc_store =
-      std::move(create_result.document_store);
-
-  DocumentProto document1 = test_document1_;
-  document1.set_namespace_("namespace.1");
-  document1.set_uri("uri1");
-  ICING_ASSERT_OK(doc_store->Put(document1));
-
-  DocumentProto document2 = test_document1_;
-  document2.set_namespace_("namespace.2");
-  document2.set_uri("uri1");
-  ICING_ASSERT_OK(doc_store->Put(document2));
-
-  DocumentProto document3 = test_document1_;
-  document3.set_namespace_("namespace.3");
-  document3.set_uri("uri1");
-  ICING_ASSERT_OK(doc_store->Put(document3));
-
-  DocumentProto document4 = test_document1_;
-  document4.set_namespace_("namespace.1");
-  document4.set_uri("uri2");
-  ICING_ASSERT_OK(doc_store->Put(document4));
-
-  // DELETE namespace.1. document1 and document 4 should be deleted. document2
-  // and document3 should still be retrievable.
-  DocumentStore::DeleteByGroupResult group_result =
-      doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/false);
-  EXPECT_THAT(group_result.status, IsOk());
-  EXPECT_THAT(group_result.num_docs_deleted, Eq(2));
-  EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-  EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
-              IsOkAndHolds(EqualsProto(document2)));
-  EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()),
-              IsOkAndHolds(EqualsProto(document3)));
-  EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
-TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNonexistentNamespaceNotFound) {
+TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -589,10 +540,7 @@
   int64_t document_log_size_before = filesystem_.GetFileSize(
       absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
 
-  EXPECT_THAT(doc_store
-                  ->DeleteByNamespace("nonexistent_namespace",
-                                      /*soft_delete=*/true)
-                  .status,
+  EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace").status,
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 
   int64_t document_log_size_after = filesystem_.GetFileSize(
@@ -600,31 +548,7 @@
   EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
 }
 
-TEST_F(DocumentStoreTest, HardDeleteByNamespaceNonexistentNamespaceNotFound) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> doc_store =
-      std::move(create_result.document_store);
-
-  // Validates that deleting something non-existing won't append anything to
-  // ground truth
-  int64_t document_log_size_before = filesystem_.GetFileSize(
-      absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
-
-  EXPECT_THAT(doc_store
-                  ->DeleteByNamespace("nonexistent_namespace",
-                                      /*soft_delete=*/false)
-                  .status,
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
-  int64_t document_log_size_after = filesystem_.GetFileSize(
-      absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
-  EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
-}
-
-TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNoExistingDocumentsNotFound) {
+TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -639,33 +563,9 @@
   // At this point, there are no existing documents with the namespace, even
   // though Icing's derived files know about this namespace. We should still
   // return NOT_FOUND since nothing existing has this namespace.
-  EXPECT_THAT(document_store
-                  ->DeleteByNamespace(test_document1_.namespace_(),
-                                      /*soft_delete=*/true)
-                  .status,
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
-TEST_F(DocumentStoreTest, HardDeleteByNamespaceNoExistingDocumentsNotFound) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> document_store =
-      std::move(create_result.document_store);
-
-  ICING_EXPECT_OK(document_store->Put(test_document1_));
-  ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
-                                         test_document1_.uri()));
-
-  // At this point, there are no existing documents with the namespace, even
-  // though Icing's derived files know about this namespace. We should still
-  // return NOT_FOUND since nothing existing has this namespace.
-  EXPECT_THAT(document_store
-                  ->DeleteByNamespace(test_document1_.namespace_(),
-                                      /*soft_delete=*/false)
-                  .status,
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(
+      document_store->DeleteByNamespace(test_document1_.namespace_()).status,
+      StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
 TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
@@ -710,17 +610,7 @@
         absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
   }  // Destructors should update checksum and persist all data to file.
 
-  // Change the DocStore's header combined checksum so that it won't match the
-  // recalculated checksum on initialization. This will force a regeneration of
-  // derived files from ground truth.
-  const std::string header_file =
-      absl_ports::StrCat(document_store_dir_, "/document_store_header");
-  DocumentStore::Header header;
-  header.magic = DocumentStore::Header::kMagic;
-  header.checksum = 10;  // Arbitrary garbage checksum
-  filesystem_.DeleteFile(header_file.c_str());
-  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
-
+  CorruptDocStoreHeaderChecksumFile();
   // Successfully recover from a corrupt derived file issue.
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
@@ -744,92 +634,7 @@
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeOk) {
-  SchemaProto schema =
-      SchemaBuilder()
-          .AddType(SchemaTypeConfigBuilder().SetType("email"))
-          .AddType(SchemaTypeConfigBuilder().SetType("message"))
-          .AddType(SchemaTypeConfigBuilder().SetType("person"))
-          .Build();
-  std::string schema_store_dir = schema_store_dir_ + "_custom";
-  filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
-  filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
-  ICING_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<SchemaStore> schema_store,
-      SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
-
-  ICING_ASSERT_OK(schema_store->SetSchema(schema));
-
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store.get()));
-  std::unique_ptr<DocumentStore> document_store =
-      std::move(create_result.document_store);
-
-  DocumentProto email_document_1 = DocumentBuilder()
-                                       .SetKey("namespace1", "1")
-                                       .SetSchema("email")
-                                       .SetCreationTimestampMs(1)
-                                       .Build();
-  ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_1_document_id,
-                             document_store->Put(email_document_1));
-
-  DocumentProto email_document_2 = DocumentBuilder()
-                                       .SetKey("namespace2", "2")
-                                       .SetSchema("email")
-                                       .SetCreationTimestampMs(1)
-                                       .Build();
-  ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_2_document_id,
-                             document_store->Put(email_document_2));
-
-  DocumentProto message_document = DocumentBuilder()
-                                       .SetKey("namespace", "3")
-                                       .SetSchema("message")
-                                       .SetCreationTimestampMs(1)
-                                       .Build();
-  ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
-                             document_store->Put(message_document));
-
-  DocumentProto person_document = DocumentBuilder()
-                                      .SetKey("namespace", "4")
-                                      .SetSchema("person")
-                                      .SetCreationTimestampMs(1)
-                                      .Build();
-  ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id,
-                             document_store->Put(person_document));
-
-  // Delete the "email" type and ensure that it works across both
-  // email_document's namespaces. And that other documents aren't affected.
-  DocumentStore::DeleteByGroupResult group_result =
-      document_store->DeleteBySchemaType("email", /*soft_delete=*/true);
-  EXPECT_THAT(group_result.status, IsOk());
-  EXPECT_THAT(group_result.num_docs_deleted, Eq(2));
-  EXPECT_THAT(document_store->Get(email_1_document_id),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-  EXPECT_THAT(document_store->Get(email_2_document_id),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-  EXPECT_THAT(document_store->Get(message_document_id),
-              IsOkAndHolds(EqualsProto(message_document)));
-  EXPECT_THAT(document_store->Get(person_document_id),
-              IsOkAndHolds(EqualsProto(person_document)));
-
-  // Delete the "message" type and check that other documents aren't affected
-  group_result =
-      document_store->DeleteBySchemaType("message", /*soft_delete=*/true);
-  EXPECT_THAT(group_result.status, IsOk());
-  EXPECT_THAT(group_result.num_docs_deleted, Eq(1));
-  EXPECT_THAT(document_store->Get(email_1_document_id),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-  EXPECT_THAT(document_store->Get(email_2_document_id),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-  EXPECT_THAT(document_store->Get(message_document_id),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-  EXPECT_THAT(document_store->Get(person_document_id),
-              IsOkAndHolds(EqualsProto(person_document)));
-}
-
-TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) {
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
   SchemaProto schema =
       SchemaBuilder()
           .AddType(SchemaTypeConfigBuilder().SetType("email"))
@@ -888,7 +693,7 @@
   // Delete the "email" type and ensure that it works across both
   // email_document's namespaces. And that other documents aren't affected.
   DocumentStore::DeleteByGroupResult group_result =
-      document_store->DeleteBySchemaType("email", /*soft_delete=*/true);
+      document_store->DeleteBySchemaType("email");
   EXPECT_THAT(group_result.status, IsOk());
   EXPECT_THAT(group_result.num_docs_deleted, Eq(2));
   EXPECT_THAT(document_store->Get(email_1_document_id),
@@ -901,8 +706,7 @@
               IsOkAndHolds(EqualsProto(person_document)));
 
   // Delete the "message" type and check that other documents aren't affected
-  group_result =
-      document_store->DeleteBySchemaType("message", /*soft_delete=*/true);
+  group_result = document_store->DeleteBySchemaType("message");
   EXPECT_THAT(group_result.status, IsOk());
   EXPECT_THAT(group_result.num_docs_deleted, Eq(1));
   EXPECT_THAT(document_store->Get(email_1_document_id),
@@ -915,7 +719,7 @@
               IsOkAndHolds(EqualsProto(person_document)));
 }
 
-TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -928,10 +732,7 @@
   int64_t document_log_size_before = filesystem_.GetFileSize(
       absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
 
-  EXPECT_THAT(document_store
-                  ->DeleteBySchemaType("nonexistent_type",
-                                       /*soft_delete=*/true)
-                  .status,
+  EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type").status,
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 
   int64_t document_log_size_after = filesystem_.GetFileSize(
@@ -940,32 +741,7 @@
   EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
 }
 
-TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> document_store =
-      std::move(create_result.document_store);
-
-  // Validates that deleting something non-existing won't append anything to
-  // ground truth
-  int64_t document_log_size_before = filesystem_.GetFileSize(
-      absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
-
-  EXPECT_THAT(document_store
-                  ->DeleteBySchemaType("nonexistent_type",
-                                       /*soft_delete=*/false)
-                  .status,
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
-  int64_t document_log_size_after = filesystem_.GetFileSize(
-      absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
-
-  EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
-}
-
-TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNoExistingDocumentsNotFound) {
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -977,30 +753,9 @@
   ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
                                          test_document1_.uri()));
 
-  EXPECT_THAT(document_store
-                  ->DeleteBySchemaType(test_document1_.schema(),
-                                       /*soft_delete=*/true)
-                  .status,
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
-TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNoExistingDocumentsNotFound) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> document_store =
-      std::move(create_result.document_store);
-
-  ICING_EXPECT_OK(document_store->Put(test_document1_));
-  ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
-                                         test_document1_.uri()));
-
-  EXPECT_THAT(document_store
-                  ->DeleteBySchemaType(test_document1_.schema(),
-                                       /*soft_delete=*/false)
-                  .status,
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(
+      document_store->DeleteBySchemaType(test_document1_.schema()).status,
+      StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
 TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
@@ -1057,17 +812,7 @@
         absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
   }  // Destructors should update checksum and persist all data to file.
 
-  // Change the DocumentStore's header combined checksum so that it won't match
-  // the recalculated checksum on initialization. This will force a regeneration
-  // of derived files from ground truth.
-  const std::string header_file =
-      absl_ports::StrCat(document_store_dir_, "/document_store_header");
-  DocumentStore::Header header;
-  header.magic = DocumentStore::Header::kMagic;
-  header.checksum = 10;  // Arbitrary garbage checksum
-  filesystem_.DeleteFile(header_file.c_str());
-  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
-
+  CorruptDocStoreHeaderChecksumFile();
   // Successfully recover from a corrupt derived file issue.
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
@@ -1087,6 +832,19 @@
               IsOkAndHolds(EqualsProto(message_document)));
 }
 
+TEST_F(DocumentStoreTest, PutDeleteThenPut) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentStore::CreateResult create_result,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  std::unique_ptr<DocumentStore> doc_store =
+      std::move(create_result.document_store);
+  ICING_EXPECT_OK(doc_store->Put(test_document1_));
+  ICING_EXPECT_OK(
+      doc_store->Delete(test_document1_.namespace_(), test_document1_.uri()));
+  ICING_EXPECT_OK(doc_store->Put(test_document1_));
+}
+
 TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
   SchemaProto schema =
       SchemaBuilder()
@@ -1146,16 +904,7 @@
         absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
   }  // Destructors should update checksum and persist all data to file.
 
-  // Change the DocumentStore's header combined checksum so that it won't match
-  // the recalculated checksum on initialization. This will force a regeneration
-  // of derived files from ground truth.
-  const std::string header_file =
-      absl_ports::StrCat(document_store_dir_, "/document_store_header");
-  DocumentStore::Header header;
-  header.magic = DocumentStore::Header::kMagic;
-  header.checksum = 10;  // Arbitrary garbage checksum
-  filesystem_.DeleteFile(header_file.c_str());
-  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+  CorruptDocStoreHeaderChecksumFile();
 
   SchemaProto new_schema =
       SchemaBuilder()
@@ -1484,17 +1233,7 @@
                 IsOkAndHolds(EqualsProto(test_document2_)));
   }
 
-  // Change the DocStore's header combined checksum so that it won't match the
-  // recalculated checksum on initialization. This will force a regeneration of
-  // derived files from ground truth.
-  const std::string header_file =
-      absl_ports::StrCat(document_store_dir_, "/document_store_header");
-  DocumentStore::Header header;
-  header.magic = DocumentStore::Header::kMagic;
-  header.checksum = 10;  // Arbitrary garbage checksum
-  filesystem_.DeleteFile(header_file.c_str());
-  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
-
+  CorruptDocStoreHeaderChecksumFile();
   // Successfully recover from a corrupt derived file issue.
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
@@ -1900,7 +1639,7 @@
           /*length_in_tokens=*/7)));
 }
 
-TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) {
+TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1909,10 +1648,10 @@
       std::move(create_result.document_store);
 
   EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(/*document_id=*/0),
-              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) {
+TEST_F(DocumentStoreTest, DeleteClearsFilterCache) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1930,59 +1669,13 @@
           /*schema_type_id=*/0,
           /*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
 
-  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
-  // Associated entry of the deleted document is removed.
-  EXPECT_THAT(doc_store->GetDocumentFilterData(document_id).status(), IsOk());
-}
-
-TEST_F(DocumentStoreTest, HardDeleteClearsFilterCache) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> doc_store =
-      std::move(create_result.document_store);
-
-  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
-                             doc_store->Put(test_document1_));
-
-  EXPECT_THAT(
-      doc_store->GetDocumentFilterData(document_id),
-      IsOkAndHolds(DocumentFilterData(
-          /*namespace_id=*/0,
-          /*schema_type_id=*/0,
-          /*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
-
-  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
   // Associated entry of the deleted document is removed.
   EXPECT_THAT(doc_store->GetDocumentFilterData(document_id),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> doc_store =
-      std::move(create_result.document_store);
-
-  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
-                             doc_store->Put(test_document1_, /*num_tokens=*/4));
-
-  EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
-              IsOkAndHolds(DocumentAssociatedScoreData(
-                  /*corpus_id=*/0, /*document_score=*/document1_score_,
-                  /*creation_timestamp_ms=*/document1_creation_timestamp_,
-                  /*length_in_tokens=*/4)));
-
-  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
-  // Associated entry of the deleted document is removed.
-  EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id).status(),
-              IsOk());
-}
-
-TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) {
+TEST_F(DocumentStoreTest, DeleteClearsScoreCache) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -2000,13 +1693,13 @@
                   /*creation_timestamp_ms=*/document1_creation_timestamp_,
                   /*length_in_tokens=*/4)));
 
-  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
   // Associated entry of the deleted document is removed.
   EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(DocumentStoreTest, SoftDeleteDoesNotClearUsageScores) {
+TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -2028,15 +1721,21 @@
   ASSERT_THAT(doc_store->GetUsageScores(document_id),
               IsOkAndHolds(expected_scores));
 
-  // Soft delete the document.
-  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
+  // Delete the document.
+  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
 
-  // The scores should be the same.
+  // Can't report or get usage scores on the deleted document
+  ASSERT_THAT(
+      doc_store->ReportUsage(usage_report_type1),
+      StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+               HasSubstr("Couldn't report usage on a nonexistent document")));
+
   ASSERT_THAT(doc_store->GetUsageScores(document_id),
-              IsOkAndHolds(expected_scores));
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+                       HasSubstr("Can't get usage scores")));
 }
 
-TEST_F(DocumentStoreTest, HardDeleteShouldClearUsageScores) {
+TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -2044,8 +1743,20 @@
   std::unique_ptr<DocumentStore> doc_store =
       std::move(create_result.document_store);
 
-  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
-                             doc_store->Put(test_document1_));
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("icing", "email/1")
+                               .SetSchema("email")
+                               .AddStringProperty("subject", "subject foo")
+                               .AddStringProperty("body", "body bar")
+                               .SetScore(document1_score_)
+                               .SetCreationTimestampMs(10)
+                               .SetTtlMs(100)
+                               .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+
+  // Some arbitrary time before the document's creation time (10) + ttl (100)
+  fake_clock_.SetSystemTimeMilliseconds(109);
 
   // Report usage with type 1.
   UsageReport usage_report_type1 = CreateUsageReport(
@@ -2058,13 +1769,18 @@
   ASSERT_THAT(doc_store->GetUsageScores(document_id),
               IsOkAndHolds(expected_scores));
 
-  // Hard delete the document.
-  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+  // Some arbitrary time past the document's creation time (10) + ttl (100)
+  fake_clock_.SetSystemTimeMilliseconds(200);
 
-  // The scores should be cleared.
-  expected_scores.usage_type1_count = 0;
+  // Can't report or get usage scores on the expired document
+  ASSERT_THAT(
+      doc_store->ReportUsage(usage_report_type1),
+      StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+               HasSubstr("Couldn't report usage on a nonexistent document")));
+
   ASSERT_THAT(doc_store->GetUsageScores(document_id),
-              IsOkAndHolds(expected_scores));
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+                       HasSubstr("Can't get usage scores")));
 }
 
 TEST_F(DocumentStoreTest,
@@ -2356,16 +2072,7 @@
     message_expiration_timestamp = message_data.expiration_timestamp_ms();
   }  // Everything destructs and commits changes to file
 
-  // Change the DocumentStore's header combined checksum so that it won't match
-  // the recalculated checksum on initialization. This will force a regeneration
-  // of derived files from ground truth.
-  const std::string header_file =
-      absl_ports::StrCat(document_store_dir_, "/document_store_header");
-  DocumentStore::Header header;
-  header.magic = DocumentStore::Header::kMagic;
-  header.checksum = 10;  // Arbitrary garbage checksum
-  filesystem_.DeleteFile(header_file.c_str());
-  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+  CorruptDocStoreHeaderChecksumFile();
 
   // Change the schema so that we don't know of the Document's type anymore.
   // Since we can't set backwards incompatible changes, we do some file-level
@@ -3155,17 +2862,7 @@
                 IsOkAndHolds(expected_scores));
   }
 
-  // Change the DocStore's header combined checksum so that it won't match the
-  // recalculated checksum on initialization. This will force a regeneration of
-  // derived files from ground truth.
-  const std::string header_file =
-      absl_ports::StrCat(document_store_dir_, "/document_store_header");
-  DocumentStore::Header header;
-  header.magic = DocumentStore::Header::kMagic;
-  header.checksum = 10;  // Arbitrary garbage checksum
-  filesystem_.DeleteFile(header_file.c_str());
-  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
-
+  CorruptDocStoreHeaderChecksumFile();
   // Successfully recover from a corrupt derived file issue.
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
@@ -3264,45 +2961,6 @@
               IsOkAndHolds(expected_scores));
 }
 
-TEST_F(DocumentStoreTest,
-       UsageScoresShouldNotBeCopiedOverFromOldSoftDeletedDocs) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentStore::CreateResult create_result,
-      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get()));
-  std::unique_ptr<DocumentStore> document_store =
-      std::move(create_result.document_store);
-
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentId document_id,
-      document_store->Put(DocumentProto(test_document1_)));
-
-  // Report usage with type 1.
-  UsageReport usage_report_type1 = CreateUsageReport(
-      /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
-      UsageReport::USAGE_TYPE1);
-  ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1));
-
-  UsageStore::UsageScores expected_scores;
-  ++expected_scores.usage_type1_count;
-  ASSERT_THAT(document_store->GetUsageScores(document_id),
-              IsOkAndHolds(expected_scores));
-
-  // Soft delete the doc.
-  ICING_ASSERT_OK(document_store->Delete(document_id, /*soft_delete=*/true));
-
-  // Put the same document.
-  ICING_ASSERT_OK_AND_ASSIGN(
-      DocumentId updated_document_id,
-      document_store->Put(DocumentProto(test_document1_)));
-  // We should get a different document id.
-  ASSERT_THAT(updated_document_id, Not(Eq(document_id)));
-
-  // Usage scores should be cleared.
-  EXPECT_THAT(document_store->GetUsageScores(updated_document_id),
-              IsOkAndHolds(UsageStore::UsageScores()));
-}
-
 TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
@@ -3431,6 +3089,15 @@
   ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
 }
 
+// TODO(b/185845269) Re-enable this test by copying over a full valid set of
+// document store files. Right now this test only includes the score_cache and
+// the document store header.
+//
+// This causes a problem now because this cl changes behavior to not consider an
+// InitializeDerivedFiles failure to be a recovery if there is nothing to
+// recover because the doocument store is empty.
+#define DISABLE_BACKWARDS_COMPAT_TEST
+#ifndef DISABLE_BACKWARDS_COMPAT_TEST
 TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
   // The directory testdata/v0/document_store contains only the scoring_cache
   // and the document_store_header (holding the crc for the scoring_cache). If
@@ -3471,7 +3138,9 @@
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
-                            schema_store_.get(), &initialize_stats));
+                            schema_store_.get(),
+                            /*force_recovery_and_revalidate_documents=*/false,
+                            &initialize_stats));
   std::unique_ptr<DocumentStore> doc_store =
       std::move(create_result.document_store);
   // The store_cache trigger regeneration because its element size is
@@ -3479,6 +3148,7 @@
   // score_cache).
   EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause());
 }
+#endif  // DISABLE_BACKWARDS_COMPAT_TEST
 
 TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) {
   ICING_ASSERT_OK_AND_ASSIGN(
@@ -3589,6 +3259,399 @@
               Eq(0));
 }
 
+TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) {
+  // Start fresh and set the schema with one type.
+  filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+  SchemaTypeConfigProto email_type_config =
+      SchemaTypeConfigBuilder()
+          .SetType("email")
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("subject")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("body")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .Build();
+  SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+  ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+  // The typeid for "email" should be 0.
+  ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+
+  DocumentId docid = kInvalidDocumentId;
+  {
+    // Create the document store the first time and add an email document.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store.get()));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    DocumentProto doc =
+        DocumentBuilder()
+            .SetKey("icing", "email/1")
+            .SetSchema("email")
+            .AddStringProperty("subject", "subject foo")
+            .AddStringProperty("body", "body bar")
+            .SetScore(document1_score_)
+            .SetCreationTimestampMs(
+                document1_creation_timestamp_)  // A random timestamp
+            .SetTtlMs(document1_ttl_)
+            .Build();
+    ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(doc));
+    ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data,
+                               doc_store->GetDocumentFilterData(docid));
+
+    ASSERT_THAT(filter_data.schema_type_id(), Eq(0));
+  }
+
+  // Add another type to the schema before the email type.
+  schema =
+      SchemaBuilder()
+          .AddType(SchemaTypeConfigBuilder()
+                       .SetType("alarm")
+                       .AddProperty(
+                           PropertyConfigBuilder()
+                               .SetName("name")
+                               .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                               .SetCardinality(CARDINALITY_OPTIONAL))
+                       .AddProperty(PropertyConfigBuilder()
+                                        .SetName("time")
+                                        .SetDataType(TYPE_INT)
+                                        .SetCardinality(CARDINALITY_OPTIONAL)))
+          .AddType(email_type_config)
+          .Build();
+  ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+  // Adding a new type should cause ids to be reassigned. Ids are assigned in
+  // order of appearance so 'alarm' should be 0 and 'email' should be 1.
+  ASSERT_THAT(schema_store->GetSchemaTypeId("alarm"), IsOkAndHolds(0));
+  ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(1));
+
+  {
+    // Create the document store the second time and force recovery
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(
+            &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
+            /*force_recovery_and_revalidate_documents=*/true));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    // Ensure that the type id of the email document has been correctly updated.
+    ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data,
+                               doc_store->GetDocumentFilterData(docid));
+    ASSERT_THAT(filter_data.schema_type_id(), Eq(1));
+  }
+}
+
+TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) {
+  // Start fresh and set the schema with one type.
+  filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+  SchemaTypeConfigProto email_type_config =
+      SchemaTypeConfigBuilder()
+          .SetType("email")
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("subject")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("body")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .Build();
+  SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+  ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+  // The typeid for "email" should be 0.
+  ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+
+  DocumentId docid = kInvalidDocumentId;
+  {
+    // Create the document store the first time and add an email document.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store.get()));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    DocumentProto doc =
+        DocumentBuilder()
+            .SetKey("icing", "email/1")
+            .SetSchema("email")
+            .AddStringProperty("subject", "subject foo")
+            .AddStringProperty("body", "body bar")
+            .SetScore(document1_score_)
+            .SetCreationTimestampMs(
+                document1_creation_timestamp_)  // A random timestamp
+            .SetTtlMs(document1_ttl_)
+            .Build();
+    ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(doc));
+    ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data,
+                               doc_store->GetDocumentFilterData(docid));
+
+    ASSERT_THAT(filter_data.schema_type_id(), Eq(0));
+  }
+
+  // Add another type to the schema.
+  schema =
+      SchemaBuilder()
+          .AddType(SchemaTypeConfigBuilder()
+                       .SetType("alarm")
+                       .AddProperty(
+                           PropertyConfigBuilder()
+                               .SetName("name")
+                               .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                               .SetCardinality(CARDINALITY_OPTIONAL))
+                       .AddProperty(PropertyConfigBuilder()
+                                        .SetName("time")
+                                        .SetDataType(TYPE_INT)
+                                        .SetCardinality(CARDINALITY_OPTIONAL)))
+          .AddType(email_type_config)
+          .Build();
+  ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+  // Adding a new type should cause ids to be reassigned. Ids are assigned in
+  // order of appearance so 'alarm' should be 0 and 'email' should be 1.
+  ASSERT_THAT(schema_store->GetSchemaTypeId("alarm"), IsOkAndHolds(0));
+  ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(1));
+
+  {
+    // Create the document store the second time. Don't force recovery.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(
+            &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
+            /*force_recovery_and_revalidate_documents=*/false));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    // Check that the type id of the email document has not been updated.
+    ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data,
+                               doc_store->GetDocumentFilterData(docid));
+    ASSERT_THAT(filter_data.schema_type_id(), Eq(0));
+  }
+}
+
+TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) {
+  // Start fresh and set the schema with one type.
+  filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+  SchemaTypeConfigProto email_type_config =
+      SchemaTypeConfigBuilder()
+          .SetType("email")
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("subject")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("body")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .Build();
+  SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+  ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+
+  DocumentId docid = kInvalidDocumentId;
+  DocumentProto docWithBody =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .AddStringProperty("body", "body bar")
+          .SetScore(document1_score_)
+          .SetCreationTimestampMs(
+              document1_creation_timestamp_)  // A random timestamp
+          .SetTtlMs(document1_ttl_)
+          .Build();
+  DocumentProto docWithoutBody =
+      DocumentBuilder()
+          .SetKey("icing", "email/2")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .SetScore(document1_score_)
+          .SetCreationTimestampMs(
+              document1_creation_timestamp_)  // A random timestamp
+          .SetTtlMs(document1_ttl_)
+          .Build();
+
+  {
+    // Create the document store the first time and add two email documents: one
+    // that has the 'body' section and one that doesn't.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store.get()));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody));
+    ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody));
+
+    ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+                IsOkAndHolds(EqualsProto(docWithBody)));
+    ASSERT_THAT(
+        doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+        IsOkAndHolds(EqualsProto(docWithoutBody)));
+  }
+
+  // Delete the 'body' property from the 'email' type, making all pre-existing
+  // documents with the 'body' property invalid.
+  email_type_config =
+      SchemaTypeConfigBuilder()
+          .SetType("email")
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("subject")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .Build();
+  schema = SchemaBuilder().AddType(email_type_config).Build();
+  ASSERT_THAT(schema_store->SetSchema(
+                  schema, /*ignore_errors_and_delete_documents=*/true),
+              IsOk());
+
+  {
+    // Create the document store the second time and force recovery
+    CorruptDocStoreHeaderChecksumFile();
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(
+            &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
+            /*force_recovery_and_revalidate_documents=*/true));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+                StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+    ASSERT_THAT(
+        doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+        IsOkAndHolds(EqualsProto(docWithoutBody)));
+  }
+}
+
+TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) {
+  // Start fresh and set the schema with one type.
+  filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+  SchemaTypeConfigProto email_type_config =
+      SchemaTypeConfigBuilder()
+          .SetType("email")
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("subject")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("body")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .Build();
+  SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+  ASSERT_THAT(schema_store->SetSchema(schema), IsOk());
+
+  DocumentId docid = kInvalidDocumentId;
+  DocumentProto docWithBody =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .AddStringProperty("body", "body bar")
+          .SetScore(document1_score_)
+          .SetCreationTimestampMs(
+              document1_creation_timestamp_)  // A random timestamp
+          .SetTtlMs(document1_ttl_)
+          .Build();
+  DocumentProto docWithoutBody =
+      DocumentBuilder()
+          .SetKey("icing", "email/2")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .SetScore(document1_score_)
+          .SetCreationTimestampMs(
+              document1_creation_timestamp_)  // A random timestamp
+          .SetTtlMs(document1_ttl_)
+          .Build();
+
+  {
+    // Create the document store the first time and add two email documents: one
+    // that has the 'body' section and one that doesn't.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store.get()));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody));
+    ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody));
+
+    ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+                IsOkAndHolds(EqualsProto(docWithBody)));
+    ASSERT_THAT(
+        doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+        IsOkAndHolds(EqualsProto(docWithoutBody)));
+  }
+
+  // Delete the 'body' property from the 'email' type, making all pre-existing
+  // documents with the 'body' property invalid.
+  email_type_config =
+      SchemaTypeConfigBuilder()
+          .SetType("email")
+          .AddProperty(PropertyConfigBuilder()
+                           .SetName("subject")
+                           .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                           .SetCardinality(CARDINALITY_OPTIONAL))
+          .Build();
+  schema = SchemaBuilder().AddType(email_type_config).Build();
+  ASSERT_THAT(schema_store->SetSchema(
+                  schema, /*ignore_errors_and_delete_documents=*/true),
+              IsOk());
+
+  {
+    // Corrupt the document store header checksum so that we will perform
+    // recovery, but without revalidation.
+    CorruptDocStoreHeaderChecksumFile();
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentStore::CreateResult create_result,
+        DocumentStore::Create(
+            &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
+            /*force_recovery_and_revalidate_documents=*/false));
+    std::unique_ptr<DocumentStore> doc_store =
+        std::move(create_result.document_store);
+
+    ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+                IsOkAndHolds(EqualsProto(docWithBody)));
+    ASSERT_THAT(
+        doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+        IsOkAndHolds(EqualsProto(docWithoutBody)));
+  }
+}
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h
index bbc8084..8d8bdf2 100644
--- a/icing/testing/common-matchers.h
+++ b/icing/testing/common-matchers.h
@@ -25,7 +25,6 @@
 #include "icing/absl_ports/str_join.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/legacy/core/icing-string-util.h"
-#include "icing/proto/search.proto.h"
 #include "icing/proto/search.pb.h"
 #include "icing/schema/schema-store.h"
 #include "icing/schema/section.h"
diff --git a/icing/testing/jni-test-helpers.h b/icing/testing/jni-test-helpers.h
index adc469a..67a98c3 100644
--- a/icing/testing/jni-test-helpers.h
+++ b/icing/testing/jni-test-helpers.h
@@ -15,6 +15,8 @@
 #ifndef ICING_TESTING_JNI_TEST_HELPERS_H_
 #define ICING_TESTING_JNI_TEST_HELPERS_H_
 
+#include <memory>
+
 #include "icing/jni/jni-cache.h"
 
 #ifdef ICING_REVERSE_JNI_SEGMENTATION
diff --git a/icing/testing/schema-generator.h b/icing/testing/schema-generator.h
index 78430cc..12133f5 100644
--- a/icing/testing/schema-generator.h
+++ b/icing/testing/schema-generator.h
@@ -18,7 +18,6 @@
 #include <random>
 #include <string>
 
-#include "icing/proto/schema.proto.h"
 #include "icing/proto/schema.pb.h"
 
 namespace icing {
diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc
index 6a017ef..cfd20c2 100644
--- a/icing/testing/snippet-helpers.cc
+++ b/icing/testing/snippet-helpers.cc
@@ -61,8 +61,8 @@
     std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
   std::vector<std::string_view> windows;
   for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
-    windows.push_back(
-        content.substr(match.window_position(), match.window_bytes()));
+    windows.push_back(content.substr(match.window_byte_position(),
+                                     match.window_byte_length()));
   }
   return windows;
 }
@@ -71,8 +71,8 @@
     std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
   std::vector<std::string_view> matches;
   for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
-    matches.push_back(content.substr(match.exact_match_position(),
-                                     match.exact_match_bytes()));
+    matches.push_back(content.substr(match.exact_match_byte_position(),
+                                     match.exact_match_byte_length()));
   }
   return matches;
 }
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index e60c168..cae3eee 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,11 +18,7 @@
 #include <memory>
 #include <string_view>
 
-#ifdef __ANDROID__
 #include "icing/jni/jni-cache.h"
-#else   // __ANDROID__
-class JniCache;  // forward declaration to let non-Android builds work.
-#endif  // __ANDROID__
 
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/tokenization/language-segmenter.h"
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index 2b1911e..317da04 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -16,8 +16,8 @@
 #include "gtest/gtest.h"
 #include "icing/absl_ports/str_cat.h"
 #include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/portable/platform.h"
 #include "icing/testing/common-matchers.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/language-segmenter.h"
@@ -143,8 +143,7 @@
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(LanguageSegmenterIteratorTest,
-       ResetToTermEndingBeforeWithZeroNotFound) {
+TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithZeroNotFound) {
   language_segmenter_factory::SegmenterOptions options(ULOC_US);
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index f578567..2fb9750 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -19,9 +19,9 @@
 #include "gmock/gmock.h"
 #include "icing/absl_ports/str_cat.h"
 #include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/portable/platform.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/icu-i18n-test-utils.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/tokenizer-factory.h"
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index e1a666b..500efa0 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -17,8 +17,8 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/portable/platform.h"
 #include "icing/testing/common-matchers.h"
-#include "icing/testing/platform.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/tokenizer-factory.h"
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
similarity index 85%
rename from icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
rename to icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
index 8392363..5f5202c 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
@@ -21,12 +21,12 @@
 JNIEnv* g_jenv = nullptr;
 
 extern "C" JNIEXPORT jboolean JNICALL
-Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain(
-    JNIEnv* env, jclass ignored) {
+Java_icing_jni_ReverseJniLanguageSegmenterJniTest_testsMain(JNIEnv* env,
+                                                            jclass ignored) {
   g_jenv = env;
 
   std::vector<char*> my_argv;
-  char arg[] = "reverse-jni-language-segmenter-test-lib";
+  char arg[] = "jni-test-lib";
   my_argv.push_back(arg);
   int argc = 1;
   char** argv = &(my_argv[0]);
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
deleted file mode 100644
index 64b68ec..0000000
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
-#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
-
-#include <jni.h>
-
-#include "icing/jni/jni-cache.h"
-#include "gtest/gtest.h"
-
-extern JNIEnv* g_jenv;
-
-namespace icing {
-namespace lib {
-
-namespace test_internal {
-
-class ReverseJniLanguageSegmenterTest
-    : public testing::TestWithParam<const char*> {
- protected:
-  ReverseJniLanguageSegmenterTest()
-      : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {}
-
-  static std::string GetLocale() { return GetParam(); }
-
-  std::unique_ptr<JniCache> jni_cache_;
-};
-
-}  // namespace test_internal
-
-}  // namespace lib
-}  // namespace icing
-
-#endif  // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
similarity index 99%
rename from icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
rename to icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 2c268ff..72c3180 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -12,17 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h"
+#include <jni.h>
 
 #include <memory>
 #include <string_view>
 
+#include "icing/jni/jni-cache.h"
 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "gmock/gmock.h"
 #include "icing/absl_ports/str_cat.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/language-segmenter.h"
 #include "unicode/uloc.h"
@@ -120,6 +122,14 @@
   return terms;
 }
 
+class ReverseJniLanguageSegmenterTest
+    : public testing::TestWithParam<const char*> {
+ protected:
+  static std::string GetLocale() { return GetParam(); }
+
+  std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
+};
+
 }  // namespace
 
 TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) {
diff --git a/icing/tools/document-store-dump.cc b/icing/tools/document-store-dump.cc
deleted file mode 100644
index 45c9bf5..0000000
--- a/icing/tools/document-store-dump.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tools/document-store-dump.h"
-
-#include <cinttypes>
-
-#include "icing/absl_ports/str_cat.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-void AppendDocumentProto(DocId document_id, const Document& doc,
-                         std::string* output) {
-  absl_ports::StrAppend(
-      output, IcingStringUtil::StringPrintf(
-                  "Document {\n   document_id: %d\n  corpus_id: %d\n  uri: "
-                  "'%s'\n  score: %d\n  created_timestamp_ms: %" PRIu64 "\n",
-                  static_cast<int>(document_id), doc.corpus_id(),
-                  doc.uri().c_str(), static_cast<int>(doc.score()),
-                  static_cast<int64_t>(doc.created_timestamp_ms())));
-  for (const auto& section : doc.sections()) {
-    absl_ports::StrAppend(
-        output, IcingStringUtil::StringPrintf(
-                    "  section {\n    id: %d\n    indexed_length: "
-                    "%d\n    content: '%s'\n    snippet: '%s'\n",
-                    static_cast<int>(section.id()),
-                    static_cast<int>(section.indexed_length()),
-                    section.content().c_str(), section.snippet().c_str()));
-    for (int64_t extracted_number : section.extracted_numbers()) {
-      absl_ports::StrAppend(output, IcingStringUtil::StringPrintf(
-                                        "    extracted_numbers: %" PRId64 "\n",
-                                        extracted_number));
-    }
-    for (const std::string& annotation_token : section.annotation_tokens()) {
-      absl_ports::StrAppend(
-          output, IcingStringUtil::StringPrintf("    annotation_tokens: '%s'\n",
-                                                annotation_token.c_str()));
-    }
-    std::string indexed = (section.config().indexed()) ? "true" : "false";
-    std::string index_prefixes =
-        (section.config().index_prefixes()) ? "true" : "false";
-    absl_ports::StrAppend(
-        output,
-        IcingStringUtil::StringPrintf(
-            "    config {\n      name: '%s'\n      indexed: %s\n      "
-            "tokenizer: %d\n      weight: %d\n      index_prefixes: %s\n      "
-            "subsection_separator: '%s'\n",
-            section.config().name().c_str(), indexed.c_str(),
-            section.config().tokenizer(),
-            static_cast<int>(section.config().weight()), index_prefixes.c_str(),
-            section.config().subsection_separator().c_str()));
-    for (const auto& variant_generator :
-         section.config().variant_generators()) {
-      absl_ports::StrAppend(
-          output, IcingStringUtil::StringPrintf(
-                      "      variant_generators: %d\n", variant_generator));
-    }
-    absl_ports::StrAppend(
-        output,
-        IcingStringUtil::StringPrintf(
-            "      common_term_legacy_hit_score: %d\n      "
-            "rfc822_host_name_term_legacy_hit_score: %d\n      "
-            "semantic_property: '%s'\n      universal_section_id: %d\n      "
-            "omnibox_section_type: %d\n      st_section_type: %d\n    }\n  }\n",
-            section.config().common_term_legacy_hit_score(),
-            section.config().rfc822_host_name_term_legacy_hit_score(),
-            section.config().semantic_property().c_str(),
-            section.config().universal_section_id(),
-            section.config().omnibox_section_type(),
-            section.config().st_section_type()));
-  }
-  for (const auto& language : doc.languages()) {
-    std::string used_classifier =
-        (language.used_classifier()) ? "true" : "false";
-    absl_ports::StrAppend(
-        output, IcingStringUtil::StringPrintf(
-                    "  languages {\n    language: %d\n    score: %d\n    "
-                    "used_classifier: %s\n  }\n",
-                    language.language(), static_cast<int>(language.score()),
-                    used_classifier.c_str()));
-  }
-  absl_ports::StrAppend(
-      output, IcingStringUtil::StringPrintf(
-                  " ANNOTATIONS PRINTING NOT IMPLEMENTED YET IN ICING-TOOL\n"));
-}
-
-}  // namespace
-
-std::string GetDocumentStoreDump(const DocumentStore& document_store) {
-  std::string output;
-  for (DocId document_id = 0; document_id < document_store.num_documents();
-       document_id++) {
-    Document doc;
-    if (!document_store.ReadDocument(document_id, &doc)) {
-      ICING_LOG(FATAL) << "Failed to read document";
-    }
-
-    AppendDocumentProto(document_id, doc, &output);
-  }
-  return output;
-}
-
-}  // namespace lib
-}  // namespace icing
diff --git a/icing/tools/document-store-dump.h b/icing/tools/document-store-dump.h
deleted file mode 100644
index 023b301..0000000
--- a/icing/tools/document-store-dump.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
-#define ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
-
-#include <string>
-
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
-
-namespace icing {
-namespace lib {
-
-// Utility function for dumping the complete document store content.
-// This provides a human-readable representation of the document store, mainly
-// provided for easier understandability for developers.
-// The output of this class should only be available on cmdline-tool-level
-// (with root access), or unit tests. In other words it should not be possible
-// to trigger this on a release key device, for data protection reasons.
-std::string GetDocumentStoreDump(const DocumentStore& document_store);
-
-}  // namespace lib
-}  // namespace icing
-#endif  // ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
diff --git a/icing/tools/icing-tool.cc b/icing/tools/icing-tool.cc
deleted file mode 100644
index 72a11e9..0000000
--- a/icing/tools/icing-tool.cc
+++ /dev/null
@@ -1,306 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Copyright 2012 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-//
-// A tool to debug the native index.
-
-#include <getopt.h>
-#include <unistd.h>
-
-#include <string>
-
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/core/string-util.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/doc-property-filter.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/dynamic-trie.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/filesystem.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/mobstore.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/native-index-impl.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/tools/document-store-dump.h"
-#include "icing/util/logging.h"
-
-using std::vector;
-using ::wireless_android_play_playlog::icing::IndexRestorationStats;
-
-namespace icing {
-namespace lib {
-
-// 256KB for debugging.
-const size_t kMaxDocumentSizeForDebugging = 1u << 18;
-// Dump dynamic trie stats and contents.
-void ProcessDynamicTrie(const char* filename) {
-  Filesystem filesystem;
-  DynamicTrie trie(filename, DynamicTrie::RuntimeOptions(), &filesystem);
-  if (!trie.Init()) {
-    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Opening trie %s failed",
-                                                      filename);
-    return;
-  }
-
-  std::string out;
-  trie.GetDebugInfo(true, &out);
-  printf("Stats:\n%s", out.c_str());
-
-  std::ostringstream contents;
-  vector<std::string> keys;
-  trie.DumpTrie(&contents, &keys);
-  printf("Contents:\n%s", contents.str().c_str());
-}
-
-NativeIndexImpl* MakeIndex(const char* root_dir) {
-  NativeConfig native_config;
-  native_config.set_max_document_size(kMaxDocumentSizeForDebugging);
-  FlashIndexOptions flash_index_options(
-      NativeIndexImpl::GetNativeIndexDir(root_dir));
-  NativeIndexImpl* ni =
-      new NativeIndexImpl(root_dir, native_config, flash_index_options);
-  InitStatus init_status;
-  if (!ni->Init(&init_status)) {
-    ICING_LOG(FATAL) << "Failed to initialize legacy native index impl";
-  }
-
-  IndexRestorationStats unused;
-  ni->RestoreIndex(IndexRequestSpec::default_instance(), &unused);
-  return ni;
-}
-
-void RunQuery(NativeIndexImpl* ni, const std::string& query, int start,
-              int num_results) {
-  // Pull out corpusids and uris.
-  QueryRequestSpec spec;
-  spec.set_no_corpus_filter(true);
-  spec.set_want_uris(true);
-  spec.set_scoring_verbosity_level(1);
-  spec.set_prefix_match(true);
-
-  QueryResponse response;
-  ni->ExecuteQuery(query, spec, 10000, start, num_results, &response);
-
-  ICING_VLOG(1) << IcingStringUtil::StringPrintf(
-      "Query [%s] num results %u", query.c_str(), response.num_results());
-
-  for (int i = 0, uri_offset = 0; i < response.num_results(); i++) {
-    ICING_VLOG(1) << IcingStringUtil::StringPrintf(
-        "%d: (cid=%u) uri %.*s", i, response.corpus_ids(i),
-        response.uri_lengths(i), response.uri_buffer().data() + uri_offset);
-    uri_offset += response.uri_lengths(i);
-  }
-}
-
-void RunSuggest(NativeIndexImpl* ni, const std::string& prefix,
-                int num_results) {
-  SuggestionResponse results;
-  ni->Suggest(prefix, num_results, vector<CorpusId>(), &results);
-
-  ICING_VLOG(1) << IcingStringUtil::StringPrintf(
-      "Query [%s] num results %zu", prefix.c_str(),
-      static_cast<size_t>(results.suggestions_size()));
-
-  for (size_t i = 0; i < results.suggestions_size(); i++) {
-    ICING_VLOG(1) << IcingStringUtil::StringPrintf(
-        "Sugg: [%s] display text [%s]", results.suggestions(i).query().c_str(),
-        results.suggestions(i).display_text().c_str());
-  }
-}
-
-int IcingTool(int argc, char** argv) {
-  auto file_storage = CreatePosixFileStorage();
-  enum Options {
-    OPT_FILENAME,
-    OPT_OP,
-    OPT_QUERY,
-    NUM_OPT,
-  };
-  static const option kOptions[NUM_OPT + 1] = {
-      {"filename", 1, nullptr, 0},
-      {"op", 1, nullptr, 0},
-      {"query", 1, nullptr, 0},
-      {nullptr, 0, nullptr, 0},
-  };
-  const char* opt_values[NUM_OPT];
-  memset(opt_values, 0, sizeof(opt_values));
-
-  while (true) {
-    int opt_idx = -1;
-    int ret = getopt_long(argc, argv, "", kOptions, &opt_idx);
-    if (ret != 0) break;
-
-    if (opt_idx >= 0 && opt_idx < NUM_OPT) {
-      opt_values[opt_idx] = optarg;
-    }
-  }
-
-  if (!opt_values[OPT_OP]) {
-    ICING_LOG(ERROR) << "No op specified";
-    return -1;
-  }
-
-  if (!opt_values[OPT_FILENAME]) {
-    ICING_LOG(ERROR) << "No filename specified";
-    return -1;
-  }
-  if (!strncmp(
-          opt_values[OPT_FILENAME],
-          "/data/data/com.google.android.gms/files/AppDataSearch",
-          strlen("/data/data/com.google.android.gms/files/AppDataSearch"))) {
-    ICING_LOG(ERROR)
-        << "Should not read directly from the file in gmscore - "
-           "icing-tool also commits writes as side-effects which corrupts "
-           "the index on concurrent modification";
-    return -1;
-  }
-
-  const char* op = opt_values[OPT_OP];
-  DocumentStore::Options options(file_storage.get(),
-                                 kMaxDocumentSizeForDebugging);
-  if (!strcmp(op, "dyntrie")) {
-    std::string full_file_path =
-        absl_ports::StrCat(opt_values[OPT_FILENAME], "/idx.lexicon");
-    ProcessDynamicTrie(full_file_path.c_str());
-  } else if (!strcmp(op, "verify")) {
-    std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
-    ni->CheckVerify();
-  } else if (!strcmp(op, "query")) {
-    if (opt_values[OPT_QUERY] == nullptr) {
-      ICING_LOG(FATAL) << "Opt value is null";
-    }
-
-    std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
-    RunQuery(ni.get(), opt_values[OPT_QUERY], 0, 100);
-  } else if (!strcmp(op, "suggest")) {
-    if (opt_values[OPT_QUERY] == nullptr) {
-      ICING_LOG(FATAL) << "Opt value is null";
-    }
-
-    std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
-    RunSuggest(ni.get(), opt_values[OPT_QUERY], 100);
-  } else if (!strcmp(op, "dump-all-docs")) {
-    DocumentStore ds(opt_values[OPT_FILENAME], options);
-    if (!ds.Init()) {
-      ICING_LOG(FATAL) << "Legacy document store failed to initialize";
-    }
-
-    printf(
-        "------ Document Store Dump Start ------\n"
-        "%s\n"
-        "------ Document Store Dump End ------\n",
-        GetDocumentStoreDump(ds).c_str());
-  } else if (!strcmp(op, "dump-uris")) {
-    CorpusId corpus_id = kInvalidCorpusId;
-    if (opt_values[OPT_QUERY]) {
-      // Query is corpus id.
-      corpus_id = atoi(opt_values[OPT_QUERY]);  // NOLINT
-    }
-    DocumentStore ds(opt_values[OPT_FILENAME], options);
-    if (!ds.Init()) {
-      ICING_LOG(FATAL) << "Legacy document store failed to initialize";
-    }
-
-    DocPropertyFilter dpf;
-    ds.AddDeletedTagFilter(&dpf);
-
-    // Dump with format "<corpusid> <uri> <tagname>*".
-    int filtered = 0;
-    vector<std::string> tagnames;
-    for (DocId document_id = 0; document_id < ds.num_documents();
-         document_id++) {
-      Document doc;
-      if (!ds.ReadDocument(document_id, &doc)) {
-        ICING_LOG(FATAL) << "Failed to read document.";
-      }
-
-      if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
-        filtered++;
-        continue;
-      }
-      if (dpf.Match(0, document_id)) {
-        filtered++;
-        continue;
-      }
-
-      tagnames.clear();
-      ds.GetAllSetUserTagNames(document_id, &tagnames);
-
-      printf("%d %s %s\n", doc.corpus_id(), doc.uri().c_str(),
-             StringUtil::JoinStrings("/", tagnames).c_str());
-    }
-    ICING_VLOG(1) << IcingStringUtil::StringPrintf(
-        "Processed %u filtered %d", ds.num_documents(), filtered);
-  } else if (!strcmp(op, "dump-docs")) {
-    std::string out_filename = opt_values[OPT_FILENAME];
-    out_filename.append("/docs-dump");
-    CorpusId corpus_id = kInvalidCorpusId;
-    if (opt_values[OPT_QUERY]) {
-      // Query is corpus id.
-      corpus_id = atoi(opt_values[OPT_QUERY]);  // NOLINT
-      out_filename.push_back('.');
-      out_filename.append(opt_values[OPT_QUERY]);
-    }
-    DocumentStore ds(opt_values[OPT_FILENAME], options);
-    if (!ds.Init()) {
-      ICING_LOG(FATAL) << "Legacy document store failed to initialize";
-    }
-
-    DocPropertyFilter dpf;
-    ds.AddDeletedTagFilter(&dpf);
-
-    // Dump with format (<32-bit length><serialized content>)*.
-    FILE* fp = fopen(out_filename.c_str(), "w");
-    int filtered = 0;
-    for (DocId document_id = 0; document_id < ds.num_documents();
-         document_id++) {
-      Document doc;
-      if (!ds.ReadDocument(document_id, &doc)) {
-        ICING_LOG(FATAL) << "Failed to read document.";
-      }
-
-      if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
-        filtered++;
-        continue;
-      }
-      if (dpf.Match(0, document_id)) {
-        filtered++;
-        continue;
-      }
-
-      std::string serialized = doc.SerializeAsString();
-      uint32_t length = serialized.size();
-      if (fwrite(&length, 1, sizeof(length), fp) != sizeof(length)) {
-        ICING_LOG(FATAL) << "Failed to write length information to file";
-      }
-
-      if (fwrite(serialized.data(), 1, serialized.size(), fp) !=
-          serialized.size()) {
-        ICING_LOG(FATAL) << "Failed to write document to file";
-      }
-    }
-    ICING_VLOG(1) << IcingStringUtil::StringPrintf(
-        "Processed %u filtered %d", ds.num_documents(), filtered);
-    fclose(fp);
-  } else {
-    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unknown op %s", op);
-    return -1;
-  }
-
-  return 0;
-}
-
-}  // namespace lib
-}  // namespace icing
-
-int main(int argc, char** argv) { return icing::lib::IcingTool(argc, argv); }
diff --git a/icing/util/bit-util.h b/icing/util/bit-util.h
index e2bb817..7ca20b4 100644
--- a/icing/util/bit-util.h
+++ b/icing/util/bit-util.h
@@ -24,19 +24,18 @@
 
 // Manipulating bit fields.
 //
-// x       value containing the bit field(s)
-// offset  offset of bit field in x
-// len     len of bit field in x
+// value       value containing the bit field(s)
+// lsb_offset  offset of bit field in value, starting from the least significant
+//             bit. for example, the '1' in '0100' has a lsb_offset of 2
+// len         len of bit field in value
 //
 // REQUIREMENTS
 //
-// - x an unsigned integer <= 64 bits
-// - offset + len <= sizeof(x) * 8
+// - value is an unsigned integer <= 64 bits
+// - lsb_offset + len <= sizeof(value) * 8
 //
 // There is no error checking so you will get garbage if you don't
 // ensure the above.
-//
-// To set a value, use BITFIELD_CLEAR then BITFIELD_OR.
 
 // Shifting by more than the word length is undefined (on ARM it has the
 // intended effect, but on Intel it shifts by % word length), so check the
@@ -44,20 +43,65 @@
 inline uint64_t BitfieldMask(uint32_t len) {
   return ((len == 0) ? 0U : ((~uint64_t{0}) >> (64 - (len))));
 }
-inline uint64_t BitfieldGet(uint64_t mask, uint32_t lsb_offset, uint32_t len) {
-  return ((mask) >> (lsb_offset)) & BitfieldMask(len);
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+                          uint8_t* value_out) {
+  *value_out &= ~(BitfieldMask(len) << lsb_offset);
 }
-inline void BitfieldSet(uint32_t value, uint32_t lsb_offset, uint32_t len,
-                        uint32_t* mask) {
-  // We conservatively mask val at len so x won't be corrupted if val >=
-  // 1 << len.
-  *mask |= (uint64_t{value} & BitfieldMask(len)) << (lsb_offset);
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+                          uint16_t* value_out) {
+  *value_out &= ~(BitfieldMask(len) << lsb_offset);
 }
-inline void BitfieldSet(uint64_t value, uint32_t lsb_offset, uint32_t len,
-                        uint64_t* mask) {
-  // We conservatively mask val at len so x won't be corrupted if val >=
-  // 1 << len.
-  *mask |= (value & BitfieldMask(len)) << (lsb_offset);
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+                          uint32_t* value_out) {
+  *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+                          uint64_t* value_out) {
+  *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline uint64_t BitfieldGet(uint64_t value, uint32_t lsb_offset, uint32_t len) {
+  return ((value) >> (lsb_offset)) & BitfieldMask(len);
+}
+
+inline void BitfieldSet(uint8_t new_value, uint32_t lsb_offset, uint32_t len,
+                        uint8_t* value_out) {
+  BitfieldClear(lsb_offset, len, value_out);
+
+  // We conservatively mask new_value at len so value won't be corrupted if
+  // new_value >= (1 << len).
+  *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
+}
+
+inline void BitfieldSet(uint16_t new_value, uint32_t lsb_offset, uint32_t len,
+                        uint16_t* value_out) {
+  BitfieldClear(lsb_offset, len, value_out);
+
+  // We conservatively mask new_value at len so value won't be corrupted if
+  // new_value >= (1 << len).
+  *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
+}
+
+inline void BitfieldSet(uint32_t new_value, uint32_t lsb_offset, uint32_t len,
+                        uint32_t* value_out) {
+  BitfieldClear(lsb_offset, len, value_out);
+
+  // We conservatively mask new_value at len so value won't be corrupted if
+  // new_value >= (1 << len).
+  *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
+}
+
+inline void BitfieldSet(uint64_t new_value, uint32_t lsb_offset, uint32_t len,
+                        uint64_t* value_out) {
+  BitfieldClear(lsb_offset, len, value_out);
+
+  // We conservatively mask new_value at len so value won't be corrupted if
+  // new_value >= (1 << len).
+  *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
 }
 
 }  // namespace bit_util
diff --git a/icing/util/bit-util_test.cc b/icing/util/bit-util_test.cc
new file mode 100644
index 0000000..3b86a21
--- /dev/null
+++ b/icing/util/bit-util_test.cc
@@ -0,0 +1,145 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/bit-util.h"
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(BitUtilTest, BitfieldMask) {
+  // Check that we can handle up to uint8_t's
+  EXPECT_THAT(bit_util::BitfieldMask(/*len=*/0), Eq(0b0));
+  EXPECT_THAT(bit_util::BitfieldMask(/*len=*/1), Eq(0b01));
+
+  // Check that we can handle up to uint32_t's
+  EXPECT_THAT(bit_util::BitfieldMask(/*len=*/16), Eq(0b01111111111111111));
+
+  // Check that we can handle up to uint64_t's
+  EXPECT_THAT(
+      bit_util::BitfieldMask(/*len=*/63),
+      Eq(0b0111111111111111111111111111111111111111111111111111111111111111));
+}
+
+TEST(BitUtilTest, BitfieldClear) {
+  // Check that we can handle up to uint8_t's
+  uint8_t value_8 = 0b0;
+  bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8);
+  EXPECT_THAT(value_8, Eq(0b0));
+
+  value_8 = 0b01;
+  bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8);
+  EXPECT_THAT(value_8, Eq(0b00));
+
+  value_8 = 0b011;
+  bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/1, &value_8);
+  EXPECT_THAT(value_8, Eq(0b001));
+
+  value_8 = 0b011;
+  bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/2, &value_8);
+  EXPECT_THAT(value_8, Eq(0b000));
+
+  value_8 = 0b0110;
+  bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/2, &value_8);
+  EXPECT_THAT(value_8, Eq(0b0000));
+
+  // Check that we can handle up to uint32_t's
+  uint32_t value_32 = 0b010000000000000000000000;
+  bit_util::BitfieldClear(/*lsb_offset=*/22, /*len=*/1, &value_32);
+  EXPECT_THAT(value_32, Eq(0b0));
+
+  // Check that we can handle up to uint64_t's
+  uint64_t value_64 = 0b0100000000000000000000000000000000000;
+  bit_util::BitfieldClear(/*lsb_offset=*/35, /*len=*/1, &value_64);
+  EXPECT_THAT(value_64, Eq(0b0));
+}
+
+TEST(BitUtilTest, BitfieldGet) {
+  // Get something in the uint8_t range
+  EXPECT_THAT(bit_util::BitfieldGet(0b0, /*lsb_offset=*/0, /*len=*/1), Eq(0b0));
+  EXPECT_THAT(bit_util::BitfieldGet(0b01, /*lsb_offset=*/0, /*len=*/1),
+              Eq(0b01));
+  EXPECT_THAT(bit_util::BitfieldGet(0b010, /*lsb_offset=*/1, /*len=*/1),
+              Eq(0b01));
+  EXPECT_THAT(bit_util::BitfieldGet(0b001, /*lsb_offset=*/1, /*len=*/1),
+              Eq(0b0));
+  EXPECT_THAT(bit_util::BitfieldGet(0b011, /*lsb_offset=*/0, /*len=*/2),
+              Eq(0b011));
+  EXPECT_THAT(bit_util::BitfieldGet(0b0110, /*lsb_offset=*/1, /*len=*/2),
+              Eq(0b011));
+  EXPECT_THAT(bit_util::BitfieldGet(0b0101, /*lsb_offset=*/0, /*len=*/3),
+              Eq(0b0101));
+
+  // Get something in the uint32_t range
+  EXPECT_THAT(
+      bit_util::BitfieldGet(0b01000000000000, /*lsb_offset=*/12, /*len=*/1),
+      Eq(0b01));
+
+  // Get something in the uint64_t range
+  EXPECT_THAT(bit_util::BitfieldGet(0b010000000000000000000000000000000000,
+                                    /*lsb_offset=*/34, /*len=*/1),
+              Eq(0b01));
+}
+
+TEST(BitUtilTest, BitfieldSet) {
+  // Set something in the uint8_t range
+  uint8_t value_8 = 0b0;
+  bit_util::BitfieldSet(0b0, /*lsb_offset=*/0, /*len=*/1, &value_8);
+  EXPECT_THAT(value_8, Eq(0b0));
+
+  value_8 = 0b01;
+  bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8);
+  EXPECT_THAT(value_8, Eq(0b01));
+
+  value_8 = 0b00;
+  bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8);
+  EXPECT_THAT(value_8, Eq(0b01));
+
+  value_8 = 0b00;
+  bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8);
+  EXPECT_THAT(value_8, Eq(0b011));
+
+  value_8 = 0b01;
+  bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8);
+  EXPECT_THAT(value_8, Eq(0b011));
+
+  value_8 = 0b01;
+  bit_util::BitfieldSet(0b01, /*lsb_offset=*/1, /*len=*/1, &value_8);
+  EXPECT_THAT(value_8, Eq(0b011));
+
+  value_8 = 0b0001;
+  bit_util::BitfieldSet(0b011, /*lsb_offset=*/1, /*len=*/2, &value_8);
+  EXPECT_THAT(value_8, Eq(0b0111));
+
+  // Set something in the uint32_t range
+  uint32_t value_32 = 0b0;
+  bit_util::BitfieldSet(0b01, /*lsb_offset=*/16, /*len=*/1, &value_32);
+  EXPECT_THAT(value_32, Eq(0b010000000000000000));
+
+  // Set something in the uint64_t range
+  uint64_t value_64 = 0b0;
+  bit_util::BitfieldSet(0b01, /*lsb_offset=*/34, /*len=*/1, &value_64);
+  EXPECT_THAT(value_64, Eq(0b010000000000000000000000000000000000));
+}
+
+}  // namespace
+}  // namespace lib
+}  // namespace icing
diff --git a/java/Android.bp b/java/Android.bp
index 7daeb0a..ef417ba 100644
--- a/java/Android.bp
+++ b/java/Android.bp
@@ -25,9 +25,12 @@
     name: "libicing-java",
     srcs: ["src/**/*.java"],
     static_libs: [
-        "androidx.annotation_annotation",
         "icing-java-proto-lite",
         "libprotobuf-java-lite",
     ],
+    libs: [
+        "androidx.annotation_annotation",
+    ],
+    sdk_version: "current",
     apex_available: ["com.android.appsearch"],
 }
diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java
index 3d25908..1f5fb51 100644
--- a/java/src/com/google/android/icing/IcingSearchEngine.java
+++ b/java/src/com/google/android/icing/IcingSearchEngine.java
@@ -53,9 +53,11 @@
  *
  * <p>If this instance has been closed, the instance is no longer usable.
  *
+ * <p>Keep this class to be non-Final so that it can be mocked in AppSearch.
+ *
  * <p>NOTE: This class is NOT thread-safe.
  */
-public final class IcingSearchEngine implements Closeable {
+public class IcingSearchEngine implements Closeable {
 
   private static final String TAG = "IcingSearchEngine";
   private static final ExtensionRegistryLite EXTENSION_REGISTRY_LITE =
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index bca0223..2019033 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -45,6 +45,8 @@
 import com.google.android.icing.proto.SearchResultProto;
 import com.google.android.icing.proto.SearchSpecProto;
 import com.google.android.icing.proto.SetSchemaResultProto;
+import com.google.android.icing.proto.SnippetMatchProto;
+import com.google.android.icing.proto.SnippetProto;
 import com.google.android.icing.proto.StatusProto;
 import com.google.android.icing.proto.StorageInfoResultProto;
 import com.google.android.icing.proto.StringIndexingConfig;
@@ -486,6 +488,140 @@
     assertStatusOk(reportUsageResultProto.getStatus());
   }
 
+  @Test
+  public void testCJKTSnippets() throws Exception {
+    assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+    SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+    assertStatusOk(
+        icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+    // String:     "我每天走路去上班。"
+    //              ^ ^  ^   ^^
+    // UTF16 idx:   0 1  3   5 6
+    // Breaks into segments: "我", "每天", "走路", "去", "上班"
+    String chinese = "我每天走路去上班。";
+    assertThat(chinese.length()).isEqualTo(9);
+    DocumentProto emailDocument1 =
+        createEmailDocument("namespace", "uri1").toBuilder()
+            .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese))
+            .build();
+    assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+    // Search and request snippet matching but no windowing.
+    SearchSpecProto searchSpec =
+        SearchSpecProto.newBuilder()
+            .setQuery("每")
+            .setTermMatchType(TermMatchType.Code.PREFIX)
+            .build();
+    ResultSpecProto resultSpecProto =
+        ResultSpecProto.newBuilder()
+            .setSnippetSpec(
+                ResultSpecProto.SnippetSpecProto.newBuilder()
+                    .setNumToSnippet(Integer.MAX_VALUE)
+                    .setNumMatchesPerProperty(Integer.MAX_VALUE))
+            .build();
+
+    // Search and make sure that we got a single successful results
+    SearchResultProto searchResultProto =
+        icingSearchEngine.search(
+            searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+    assertStatusOk(searchResultProto.getStatus());
+    assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+    // Ensure that one and only one property was matched and it was "subject"
+    SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+    assertThat(snippetProto.getEntriesList()).hasSize(1);
+    SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+    assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+    // Get the content for "subject" and see what the match is.
+    DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+    assertThat(resultDocument.getPropertiesList()).hasSize(1);
+    PropertyProto subjectProperty = resultDocument.getProperties(0);
+    assertThat(subjectProperty.getName()).isEqualTo("subject");
+    assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+    String content = subjectProperty.getStringValues(0);
+
+    // Ensure that there is one and only one match within "subject"
+    assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+    SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+    int matchStart = matchProto.getExactMatchUtf16Position();
+    int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+    assertThat(matchStart).isEqualTo(1);
+    assertThat(matchEnd).isEqualTo(3);
+    String match = content.substring(matchStart, matchEnd);
+    assertThat(match).isEqualTo("每天");
+  }
+
+  @Test
+  public void testUtf16MultiByteSnippets() throws Exception {
+    assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+    SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+    assertStatusOk(
+        icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+    // String:    "𐀀𐀁 𐀂𐀃 𐀄"
+    //             ^  ^  ^
+    // UTF16 idx:  0  5  10
+    // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+    String text = "𐀀𐀁 𐀂𐀃 𐀄";
+    assertThat(text.length()).isEqualTo(12);
+    DocumentProto emailDocument1 =
+        createEmailDocument("namespace", "uri1").toBuilder()
+            .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(text))
+            .build();
+    assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+    // Search and request snippet matching but no windowing.
+    SearchSpecProto searchSpec =
+        SearchSpecProto.newBuilder()
+            .setQuery("𐀂")
+            .setTermMatchType(TermMatchType.Code.PREFIX)
+            .build();
+    ResultSpecProto resultSpecProto =
+        ResultSpecProto.newBuilder()
+            .setSnippetSpec(
+                ResultSpecProto.SnippetSpecProto.newBuilder()
+                    .setNumToSnippet(Integer.MAX_VALUE)
+                    .setNumMatchesPerProperty(Integer.MAX_VALUE))
+            .build();
+
+    // Search and make sure that we got a single successful results
+    SearchResultProto searchResultProto =
+        icingSearchEngine.search(
+            searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+    assertStatusOk(searchResultProto.getStatus());
+    assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+    // Ensure that one and only one property was matched and it was "subject"
+    SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+    assertThat(snippetProto.getEntriesList()).hasSize(1);
+    SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+    assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+    // Get the content for "subject" and see what the match is.
+    DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+    assertThat(resultDocument.getPropertiesList()).hasSize(1);
+    PropertyProto subjectProperty = resultDocument.getProperties(0);
+    assertThat(subjectProperty.getName()).isEqualTo("subject");
+    assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+    String content = subjectProperty.getStringValues(0);
+
+    // Ensure that there is one and only one match within "subject"
+    assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+    SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+    int matchStart = matchProto.getExactMatchUtf16Position();
+    int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+    assertThat(matchStart).isEqualTo(5);
+    assertThat(matchEnd).isEqualTo(9);
+    String match = content.substring(matchStart, matchEnd);
+    assertThat(match).isEqualTo("𐀂𐀃");
+  }
+
   private static void assertStatusOk(StatusProto status) {
     assertWithMessage(status.getMessage()).that(status.getCode()).isEqualTo(StatusProto.Code.OK);
   }
diff --git a/proto/icing/proto/document_wrapper.proto b/proto/icing/proto/document_wrapper.proto
index e8eb992..929ee33 100644
--- a/proto/icing/proto/document_wrapper.proto
+++ b/proto/icing/proto/document_wrapper.proto
@@ -20,7 +20,6 @@
 
 option java_package = "com.google.android.icing.proto";
 option java_multiple_files = true;
-
 option objc_class_prefix = "ICNG";
 
 // DocumentWrapper as a wrapper of the user-facing DocumentProto is meant to
@@ -30,6 +29,5 @@
 message DocumentWrapper {
   optional DocumentProto document = 1;
 
-  // Indicates if the document is marked as deleted
-  optional bool deleted = 2;
+  reserved 2;
 }
diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto
index a9780b5..29f7f80 100644
--- a/proto/icing/proto/logging.proto
+++ b/proto/icing/proto/logging.proto
@@ -40,8 +40,9 @@
     // Data in index is inconsistent with ground truth.
     INCONSISTENT_WITH_GROUND_TRUTH = 2;
 
-    // Total checksum of all the components does not match.
-    TOTAL_CHECKSUM_MISMATCH = 3;
+    // Changes were made to the schema, but possibly not fully applied to the
+    // document store and the index - requiring a recovery.
+    SCHEMA_CHANGES_OUT_OF_SYNC = 3;
 
     // Random I/O errors.
     IO_ERROR = 4;
@@ -49,13 +50,13 @@
 
   // Possible recovery causes for document store:
   // - DATA_LOSS
-  // - TOTAL_CHECKSUM_MISMATCH
+  // - SCHEMA_CHANGES_OUT_OF_SYNC
   // - IO_ERROR
   optional RecoveryCause document_store_recovery_cause = 2;
 
   // Possible recovery causes for index:
   // - INCONSISTENT_WITH_GROUND_TRUTH
-  // - TOTAL_CHECKSUM_MISMATCH
+  // - SCHEMA_CHANGES_OUT_OF_SYNC
   // - IO_ERROR
   optional RecoveryCause index_restoration_cause = 3;
 
@@ -125,8 +126,11 @@
 
 // Stats of the top-level function IcingSearchEngine::Search() and
 // IcingSearchEngine::GetNextPage().
-// Next tag: 16
+// Next tag: 17
 message QueryStatsProto {
+  // The UTF-8 length of the query string
+  optional int32 query_length = 16;
+
   // Number of terms in the query string.
   optional int32 num_terms = 1;
 
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index 4e48ad7..66fdbe6 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -136,18 +136,29 @@
 }
 
 // The representation of a single match within a DocumentProto property.
-// Next tag: 6
+// Next tag: 10
 message SnippetMatchProto {
-  // The position and length within the matched string at which the exact
-  // match begins.
-  optional int32 exact_match_position = 2;
+  // The index of the byte in the string at which the match begins and the
+  // length in bytes of the match.
+  optional int32 exact_match_byte_position = 2;
+  optional int32 exact_match_byte_length = 3;
 
-  optional int32 exact_match_bytes = 3;
+  // The index of the UTF-16 code unit in the string at which the match begins
+  // and the length in UTF-16 code units of the match. This is for use with
+  // UTF-16 encoded strings like Java.lang.String.
+  optional int32 exact_match_utf16_position = 6;
+  optional int32 exact_match_utf16_length = 7;
 
-  // The position and length of the suggested snippet window.
-  optional int32 window_position = 4;
+  // The index of the byte in the string at which the suggested snippet window
+  // begins and the length in bytes of the window.
+  optional int32 window_byte_position = 4;
+  optional int32 window_byte_length = 5;
 
-  optional int32 window_bytes = 5;
+  // The index of the UTF-16 code unit in the string at which the suggested
+  // snippet window begins and the length in UTF-16 code units of the window.
+  // This is for use with UTF-16 encoded strings like Java.lang.String.
+  optional int32 window_utf16_position = 8;
+  optional int32 window_utf16_length = 9;
 
   reserved 1;
 }
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 4421ac5..4069810 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=367066667)
+set(synced_AOSP_CL_number=375495869)