Pull upstream changes.
Change-Id: I44831fdadcdb67f2e19570a35cb4c76faf8397f9
diff --git a/icing/absl_ports/annotate.cc b/icing/absl_ports/annotate.cc
index d283e13..dfe5566 100644
--- a/icing/absl_ports/annotate.cc
+++ b/icing/absl_ports/annotate.cc
@@ -33,7 +33,7 @@
std::string new_msg =
(!s.error_message().empty())
- ? absl_ports::StrCat(s.error_message(), kErrorSeparator, msg)
+ ? absl_ports::StrCat(msg, kErrorSeparator, s.error_message())
: std::string(msg);
return libtextclassifier3::Status(s.CanonicalCode(), new_msg);
}
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 62943b8..95511ac 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -78,6 +78,23 @@
namespace icing {
namespace lib {
+namespace {
+
+bool IsEmptyBuffer(const char* buffer, int size) {
+ return std::all_of(buffer, buffer + size,
+ [](const char byte) { return byte == 0; });
+}
+
+// Helper function to get stored proto size from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
+
+// Helper function to get stored proto magic from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
+
+} // namespace
+
template <typename ProtoT>
class FileBackedProtoLog {
public:
@@ -206,10 +223,19 @@
//
// Returns:
// A proto on success
+ // NOT_FOUND if the proto at the given offset has been erased
// OUT_OF_RANGE_ERROR if file_offset exceeds file size
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+ // Erases the data of a proto located at file_offset from the file.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status EraseProto(int64_t file_offset);
+
// Calculates and returns the disk usage in bytes. Rounds up to the nearest
// block size.
//
@@ -239,7 +265,7 @@
Iterator(const Filesystem& filesystem, const std::string& file_path,
int64_t initial_offset);
- // Advances to the position of next proto.
+ // Advances to the position of the next proto, whether it has been erased or not.
//
// Returns:
// OK on success
@@ -716,10 +742,15 @@
int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
// Copy out however many bytes it says the proto is
- int stored_size = metadata & 0x00FFFFFF;
+ int stored_size = GetProtoSize(metadata);
ICING_RETURN_IF_ERROR(
mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+
+ if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+ return absl_ports::NotFoundError("The proto data has been erased.");
+ }
+
google::protobuf::io::ArrayInputStream proto_stream(
mmapped_file.mutable_region(), stored_size);
@@ -736,6 +767,62 @@
}
template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
+ int64_t file_offset) {
+ int64_t file_size = filesystem_->GetFileSize(fd_.get());
+ if (file_offset >= file_size) {
+ // file_size is the total size of the file, i.e. one past the last valid
+ // offset, so subtract one to report the largest readable offset.
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Trying to erase data at a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+
+ MemoryMappedFile mmapped_file(
+ *filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+
+ // Read out the metadata
+ ICING_ASSIGN_OR_RETURN(
+ int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
+ GetProtoSize(metadata)));
+
+ // We need to update the crc checksum if the erased area is before the rewind
+ // position.
+ if (file_offset + sizeof(metadata) < header_->rewind_offset) {
+ // We need to calculate [original string xor 0s].
+ // The xored string is the same as the original string because 0 xor 0 = 0,
+ // 1 xor 0 = 1.
+ const std::string_view xored_str(mmapped_file.region(),
+ mmapped_file.region_size());
+
+ Crc32 crc(header_->log_checksum);
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t new_crc,
+ crc.UpdateWithXor(
+ xored_str,
+ /*full_data_size=*/header_->rewind_offset - sizeof(Header),
+ /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
+
+ header_->log_checksum = new_crc;
+ header_->header_checksum = header_->CalculateHeaderChecksum();
+
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path_));
+ }
+ }
+
+ memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
const {
int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
@@ -781,8 +868,7 @@
ICING_ASSIGN_OR_RETURN(
int metadata,
ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
- int proto_size = metadata & 0x00FFFFFF;
- current_offset_ += sizeof(metadata) + proto_size;
+ current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
}
if (current_offset_ < file_size_) {
@@ -829,7 +915,7 @@
ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
memcpy(&metadata, mmapped_file->region(), metadata_size);
// Checks magic number
- uint8_t stored_k_proto_magic = metadata >> 24;
+ uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
if (stored_k_proto_magic != kProtoMagic) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
@@ -842,7 +928,7 @@
libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
if (file_size == header_->rewind_offset) {
- // No changes made, don't need to update the checksum.
+ // No new protos appended, don't need to update the checksum.
return libtextclassifier3::Status::OK;
}
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index 3a9060d..fad5248 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -48,7 +48,10 @@
// https://stackoverflow.com/a/47368753
FileBackedProtoLogTest() {}
- void SetUp() override { file_path_ = GetTestTempDir() + "/proto_log"; }
+ void SetUp() override {
+ file_path_ = GetTestTempDir() + "/proto_log";
+ filesystem_.DeleteFile(file_path_.c_str());
+ }
void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
@@ -93,7 +96,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -110,7 +113,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a proto
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -144,7 +147,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write the first proto
DocumentProto document1 =
@@ -191,7 +194,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a third proto
DocumentProto document3 =
@@ -213,7 +216,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write the first proto
DocumentProto document1 =
@@ -260,7 +263,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a third proto
DocumentProto document3 =
@@ -360,7 +363,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write and persist the first proto
ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
@@ -430,7 +433,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
{
// Empty iterator
@@ -481,7 +484,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
ICING_EXPECT_OK(proto_log->WriteProto(document));
@@ -499,7 +502,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Checksum should be consistent across instances
EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
@@ -514,6 +517,166 @@
}
}
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes and erases proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // Checks if the erased area is set to 0.
+ int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+ MemoryMappedFile mmapped_file(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY);
+
+ // document1_offset + sizeof(int) is the start byte of the proto where
+ // sizeof(int) is the size of the proto metadata.
+ mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
+ for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
+ ASSERT_THAT(mmapped_file.region()[i], Eq(0));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes 2 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Erases the first proto
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // The first proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // The second proto should be returned.
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+}
+
+TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace", "uri3").Build();
+ DocumentProto document4 =
+ DocumentBuilder().SetKey("namespace", "uri4").Build();
+
+ int64_t document2_offset;
+ int64_t document3_offset;
+
+ {
+ // Erase data after the rewind position. This won't update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes 3 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+ proto_log->WriteProto(document3));
+
+ // Erases the 1st proto, checksum won't be updated immediately because the
+ // rewind position is 0.
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2293202502))));
+ } // New checksum is updated in destructor.
+
+ {
+ // Erase data before the rewind position. This will update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Erases the 2nd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(639634028))));
+ }
+
+ {
+ // Append data and erase data before the rewind position. This will update
+ // the checksum twice: in EraseProto() and destructor.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Append a new document which is after the rewind position.
+ ICING_ASSERT_OK(proto_log->WriteProto(document4));
+
+ // Erases the 3rd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(1990198693))));
+ } // Checksum is updated with the newly appended document.
+
+ {
+ // A successful creation means that the checksum matches.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.data_loss);
+ }
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index e4ec0cd..eb89db8 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -187,7 +187,7 @@
//
// Returns:
// OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
- libtextclassifier3::Status TruncateTo(int32_t len);
+ libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
// Flushes content to underlying file.
//
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index c973885..5e0a46e 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -59,6 +59,7 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -148,30 +149,31 @@
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
+ StatusProto::Code code;
switch (internal_status.CanonicalCode()) {
case libtextclassifier3::StatusCode::OK:
- status_proto->set_code(StatusProto::OK);
+ code = StatusProto::OK;
break;
case libtextclassifier3::StatusCode::DATA_LOSS:
- status_proto->set_code(StatusProto::WARNING_DATA_LOSS);
+ code = StatusProto::WARNING_DATA_LOSS;
break;
case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
- status_proto->set_code(StatusProto::INVALID_ARGUMENT);
+ code = StatusProto::INVALID_ARGUMENT;
break;
case libtextclassifier3::StatusCode::NOT_FOUND:
- status_proto->set_code(StatusProto::NOT_FOUND);
+ code = StatusProto::NOT_FOUND;
break;
case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
- status_proto->set_code(StatusProto::FAILED_PRECONDITION);
+ code = StatusProto::FAILED_PRECONDITION;
break;
case libtextclassifier3::StatusCode::ABORTED:
- status_proto->set_code(StatusProto::ABORTED);
+ code = StatusProto::ABORTED;
break;
case libtextclassifier3::StatusCode::INTERNAL:
// TODO(b/147699081): Cleanup our internal use of INTERNAL since it
// doesn't match with what it *should* indicate as described in
// go/icing-library-apis.
- status_proto->set_code(StatusProto::INTERNAL);
+ code = StatusProto::INTERNAL;
break;
case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
// TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
@@ -179,17 +181,35 @@
// internally to indicate other resources are exhausted (e.g.
// DocHitInfos) - although none of these are exposed through the API.
// Consider separating the two cases out more clearly.
- status_proto->set_code(StatusProto::OUT_OF_SPACE);
+ code = StatusProto::OUT_OF_SPACE;
break;
- default:
+ case libtextclassifier3::StatusCode::ALREADY_EXISTS:
+ code = StatusProto::ALREADY_EXISTS;
+ break;
+ case libtextclassifier3::StatusCode::CANCELLED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNKNOWN:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::PERMISSION_DENIED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::OUT_OF_RANGE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNIMPLEMENTED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAVAILABLE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAUTHENTICATED:
// Other internal status codes aren't supported externally yet. If it
// should be supported, add another switch-case above.
- ICING_LOG(FATAL) << IcingStringUtil::StringPrintf(
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
"Internal status code %d not supported in the external API",
internal_status.error_code());
+ code = StatusProto::UNKNOWN;
break;
}
-
+ status_proto->set_code(code);
status_proto->set_message(internal_status.error_message());
}
@@ -681,12 +701,14 @@
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteByNamespace(name_space);
- TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete Namespace: " << name_space;
+ TransformStatus(status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
return delete_result;
}
@@ -707,15 +729,82 @@
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteBySchemaType(schema_type);
- TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete SchemaType: " << schema_type;
+ TransformStatus(status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
return delete_result;
}
+DeleteResultProto IcingSearchEngine::DeleteByQuery(
+ const SearchSpecProto& search_spec) {
+ ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
+ << " from doc store";
+
+ DeleteResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ libtextclassifier3::Status status =
+ ValidateSearchSpec(search_spec, performance_configuration_);
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+
+ // Gets unordered results from query processor
+ auto query_processor_or = QueryProcessor::Create(
+ index_.get(), language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get(), clock_.get());
+ if (!query_processor_or.ok()) {
+ TransformStatus(query_processor_or.status(), result_status);
+ return result_proto;
+ }
+ std::unique_ptr<QueryProcessor> query_processor =
+ std::move(query_processor_or).ValueOrDie();
+
+ auto query_results_or = query_processor->ParseSearch(search_spec);
+ if (!query_results_or.ok()) {
+ TransformStatus(query_results_or.status(), result_status);
+ return result_proto;
+ }
+ QueryProcessor::QueryResults query_results =
+ std::move(query_results_or).ValueOrDie();
+
+ ICING_LOG(ERROR) << "Deleting the docs that matched the query.";
+ bool found_results = false;
+ while (query_results.root_iterator->Advance().ok()) {
+ ICING_LOG(ERROR)
+ << "Deleting doc "
+ << query_results.root_iterator->doc_hit_info().document_id();
+ found_results = true;
+ status = document_store_->Delete(
+ query_results.root_iterator->doc_hit_info().document_id());
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+ }
+ if (found_results) {
+ result_proto.mutable_status()->set_code(StatusProto::OK);
+ } else {
+ result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ result_proto.mutable_status()->set_message(
+ "No documents matched the query to delete by!");
+ }
+ return result_proto;
+}
+
PersistToDiskResultProto IcingSearchEngine::PersistToDisk() {
ICING_VLOG(1) << "Persisting data to disk";
@@ -1147,6 +1236,9 @@
// Ensures that current directory is still present.
if (!filesystem_->CreateDirectoryRecursively(
current_document_dir.c_str())) {
+ // Can't even create the old directory. Mark as uninitialized and return
+ // INTERNAL.
+ initialized_ = false;
return absl_ports::InternalError(
"Failed to create file directory for document store");
}
@@ -1159,6 +1251,9 @@
// TODO(b/144458732): Implement a more robust version of
// TC_ASSIGN_OR_RETURN that can support error logging.
if (!document_store_or.ok()) {
+ // Unable to create DocumentStore from the old file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
ICING_LOG(ERROR) << "Failed to create document store instance";
return absl_ports::Annotate(
absl_ports::InternalError("Failed to create document store instance"),
@@ -1173,13 +1268,18 @@
}
// Recreates the doc store instance
- ICING_ASSIGN_OR_RETURN(
- document_store_,
+ auto document_store_or =
DocumentStore::Create(filesystem_.get(), current_document_dir,
- clock_.get(), schema_store_.get()),
- absl_ports::InternalError(
- "Document store has been optimized, but a valid document store "
- "instance can't be created"));
+ clock_.get(), schema_store_.get());
+ if (!document_store_or.ok()) {
+ // Unable to create DocumentStore from the new file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
+ return absl_ports::InternalError(
+ "Document store has been optimized, but a valid document store "
+ "instance can't be created");
+ }
+ document_store_ = std::move(document_store_or).ValueOrDie();
// Deletes tmp directory
if (!filesystem_->DeleteDirectoryRecursively(
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 6ae76d7..55d6b2f 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -128,6 +128,9 @@
//
// Returns:
// OK on success
+ // ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
+ // type or contains a type that has multiple properties with the same
+ // name.
// INVALID_ARGUMENT if 'new_schema' is invalid
// FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
// has not been initialized yet.
@@ -256,6 +259,21 @@
DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
ICING_LOCKS_EXCLUDED(mutex_);
+ // Deletes all Documents that match the query specified in search_spec. Delete
+ // changes are automatically applied to disk, callers can also call
+ // PersistToDisk() to flush changes immediately.
+ //
+ // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND if the query doesn't match any documents
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on IO error
+ DeleteResultProto DeleteByQuery(const SearchSpecProto& search_spec)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
// Retrieves, scores, ranks, and returns the results according to the specs.
// Results can be empty. If there're multiple pages of results,
// SearchResultProto.next_page_token will be populated and that can be used to
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index b0946c9..5a8bb80 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -55,6 +55,7 @@
using ::testing::IsEmpty;
using ::testing::Lt;
using ::testing::Matcher;
+using ::testing::Ne;
using ::testing::Return;
using ::testing::SizeIs;
using ::testing::StrEq;
@@ -470,6 +471,163 @@
HasSubstr("Unable to open file for write"));
}
+TEST_F(IcingSearchEngineTest, SetSchemaDelete2) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 3. Set a schema that deletes email. This should fail.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema, false).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ // 4. Try to delete by email type.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(),
+ Eq(StatusProto::OK));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, SetSchemaDelete) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 3. Set a schema that deletes email. With force=true this should succeed.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema, true).status().code(),
+ Eq(StatusProto::OK));
+
+ // 4. Try to delete by email type.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(),
+ Eq(StatusProto::NOT_FOUND));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, SetSchemaDuplicateTypesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Create a schema with types { "Email", "Message" and "Email" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ type = schema.add_types();
+ type->set_schema_type("Message");
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ *schema.add_types() = schema.types(0);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(),
+ Eq(StatusProto::ALREADY_EXISTS));
+}
+
+TEST_F(IcingSearchEngineTest,
+ SetSchemaDuplicatePropertiesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Create a schema with an Email type with properties { "title", "body" and
+ // "title" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(),
+ Eq(StatusProto::ALREADY_EXISTS));
+}
+
TEST_F(IcingSearchEngineTest, SetSchema) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
@@ -1519,6 +1677,82 @@
EqualsProto(expected_get_result_proto));
}
+TEST_F(IcingSearchEngineTest, OptimizationFailureUninitializesIcing) {
+  // Set up the filesystem to fail.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ bool just_swapped_files = false;
+ auto create_dir_lambda = [this, &just_swapped_files](const char* dir_name) {
+ if (just_swapped_files) {
+ // We should fail the first call immediately after swapping files.
+ just_swapped_files = false;
+ return false;
+ }
+ return filesystem()->CreateDirectoryRecursively(dir_name);
+ };
+ ON_CALL(*mock_filesystem, CreateDirectoryRecursively)
+ .WillByDefault(create_dir_lambda);
+ auto swap_lambda = [&just_swapped_files](const char* first_dir,
+ const char* second_dir) {
+ just_swapped_files = true;
+ return false;
+ };
+ ON_CALL(*mock_filesystem, SwapFiles).WillByDefault(swap_lambda);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // The mocks should cause an unrecoverable error during Optimize - returning
+ // INTERNAL.
+ ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::INTERNAL));
+
+ // Ordinary operations should fail safely.
+ SchemaProto simple_schema;
+ auto type = simple_schema.add_types();
+ type->set_schema_type("type0");
+ auto property = type->add_properties();
+ property->set_property_name("prop0");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ DocumentProto simple_doc = DocumentBuilder()
+ .SetKey("namespace0", "uri0")
+ .SetSchema("type0")
+ .AddStringProperty("prop0", "foo")
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ ResultSpecProto result_spec;
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ EXPECT_THAT(icing.SetSchema(simple_schema).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Put(simple_doc).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ // Reset should get icing back to a safe (empty) and working state.
+ EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(simple_schema).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(simple_doc).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::OK));
+}
+
TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
SchemaProto schema;
// Add an email type
@@ -1528,6 +1762,10 @@
property->set_property_name("subject");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
// Add an message type
type = schema.add_types();
type->set_schema_type("message");
@@ -1535,6 +1773,10 @@
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
DocumentProto document1 =
DocumentBuilder()
.SetKey("namespace1", "uri1")
@@ -1550,10 +1792,10 @@
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
@@ -1582,6 +1824,88 @@
*expected_get_result_proto.mutable_document() = document2;
EXPECT_THAT(icing.Get("namespace2", "uri2"),
EqualsProto(expected_get_result_proto));
+
+  // Search for "message"; only document2 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteSchemaTypeByQuery) {
+ SchemaProto schema = CreateMessageSchema();
+ // Add an email type
+ SchemaProto tmp = CreateEmailSchema();
+ *schema.add_types() = tmp.types(0);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema(schema.types(0).schema_type())
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema(schema.types(1).schema_type())
+ .AddStringProperty("subject", "subject subject2")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete the first type. The first doc should be irretrievable. The
+ // second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.add_schema_type_filters(schema.types(0).schema_type());
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest, DeleteByNamespace) {
@@ -1594,6 +1918,89 @@
.Build();
DocumentProto document2 =
DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace1", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(icing.Get("namespace3", "uri3"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete namespace1. Document1 and document2 should be irretrievable.
+ // Document3 should still be present.
+ EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri2) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(icing.Get("namespace3", "uri3"),
+ EqualsProto(expected_get_result_proto));
+
+  // Search for "message"; only document3 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteNamespaceByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
.SetKey("namespace2", "uri2")
.SetSchema("Message")
.AddStringProperty("body", "message body2")
@@ -1619,7 +2026,9 @@
// Delete the first namespace. The first doc should be irretrievable. The
// second should still be present.
- EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(),
+ SearchSpecProto search_spec;
+ search_spec.add_namespace_filters("namespace1");
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
Eq(StatusProto::OK));
expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
@@ -1634,6 +2043,153 @@
*expected_get_result_proto.mutable_document() = document2;
EXPECT_THAT(icing.Get("namespace2", "uri2"),
EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'body1'. The first doc should be irretrievable.
+ // The second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("body1");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'foo', which should be none of them. Both docs
+ // should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::NOT_FOUND));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) {
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 00d116f..eb01731 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -31,6 +31,7 @@
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -192,8 +193,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -239,8 +241,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -287,8 +290,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -335,8 +339,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 8dfb9c2..824c440 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -47,6 +47,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -91,8 +92,10 @@
ICING_ASSERT_OK_AND_ASSIGN(index_,
Index::Create(options, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
ICING_ASSERT_OK_AND_ASSIGN(
normalizer_,
diff --git a/icing/index/index.cc b/icing/index/index.cc
index d4a2508..e7f2fbc 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -24,8 +24,8 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/hit.h"
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
@@ -102,10 +102,10 @@
TermMatchType::Code term_match_type) {
switch (term_match_type) {
case TermMatchType::EXACT_ONLY:
- return std::make_unique<DocHitInfoIteratorTermExact>(
+ return std::make_unique<DocHitInfoIteratorTermLiteExact>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
case TermMatchType::PREFIX:
- return std::make_unique<DocHitInfoIteratorTermPrefix>(
+ return std::make_unique<DocHitInfoIteratorTermLitePrefix>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
default:
return absl_ports::InvalidArgumentError(
@@ -163,9 +163,14 @@
// Step 2: Update the lexicon, either add the term or update its properties
if (tvi_or.ok()) {
+ tvi = tvi_or.ValueOrDie();
+ if (seen_tokens_.find(tvi) != seen_tokens_.end()) {
+ ICING_VLOG(1) << "A hit for term " << term
+ << " has already been added. Skipping.";
+ return libtextclassifier3::Status::OK;
+ }
ICING_VLOG(1) << "Term " << term
<< " is already present in lexicon. Updating.";
- tvi = tvi_or.ValueOrDie();
// Already in the lexicon. Just update the properties.
ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_));
@@ -175,6 +180,7 @@
ICING_ASSIGN_OR_RETURN(
tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_));
}
+ seen_tokens_.insert(tvi);
// Step 3: Add the hit itself
Hit hit(section_id_, document_id_, score,
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 070e82a..f7ca285 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -37,6 +37,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
namespace icing {
namespace lib {
@@ -48,6 +49,7 @@
using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::IsTrue;
+using ::testing::Ne;
using ::testing::NiceMock;
using ::testing::Not;
using ::testing::SizeIs;
@@ -255,11 +257,16 @@
}
TEST_F(IndexTest, SingleHitDedupeIndex) {
+ Crc32 empty_crc = index_->ComputeChecksum();
// Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ Crc32 first_hit_crc = index_->ComputeChecksum();
+ EXPECT_THAT(first_hit_crc.Get(), Ne(empty_crc.Get()));
EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ Crc32 second_hit_crc = index_->ComputeChecksum();
+ EXPECT_THAT(second_hit_crc.Get(), Eq(first_hit_crc.Get()));
// Assert
ICING_ASSERT_OK_AND_ASSIGN(
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc
index 482a5ab..c6cb86d 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc
@@ -82,12 +82,10 @@
"Couldn't get current time. Try again in a bit");
}
- if (options_.filter_deleted) {
- if (!document_store_.DoesDocumentExist(
- delegate_->doc_hit_info().document_id())) {
- // Document doesn't exist, keep searching
- return Advance();
- }
+ if (!document_store_.DoesDocumentExist(
+ delegate_->doc_hit_info().document_id())) {
+ // Document doesn't exist, keep searching
+ return Advance();
}
// Try to get the DocumentFilterData
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index bf027e4..9119610 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -37,10 +37,6 @@
class DocHitInfoIteratorFilter : public DocHitInfoIterator {
public:
struct Options {
- // Filter out/don't return DocHitInfos that are associated with nonexistent
- // Documents.
- bool filter_deleted = true;
-
// List of namespaces that documents must have. An empty vector means that
// all namespaces are valid, and no documents will be filtered out.
//
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
index e769013..9eb147a 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
@@ -105,33 +105,6 @@
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
-TEST_F(DocHitInfoIteratorDeletedFilterTest, TurnOffDeletedFilterOk) {
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(test_document1_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- document_store_->Put(test_document2_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- document_store_->Put(test_document3_));
-
- // Deletes test document 2
- ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(),
- test_document2_.uri()));
-
- std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
- DocHitInfo(document_id2),
- DocHitInfo(document_id3)};
- std::unique_ptr<DocHitInfoIterator> original_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- options_.filter_deleted = false;
- DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
-
- EXPECT_THAT(GetDocumentIds(&filtered_iterator),
- ElementsAre(document_id1, document_id2, document_id3));
-}
-
TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) {
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(test_document1_));
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
similarity index 88%
rename from icing/index/iterator/doc-hit-info-iterator-term.cc
rename to icing/index/lite/doc-hit-info-iterator-term-lite.cc
index 97ca3c4..a975f86 100644
--- a/icing/index/iterator/doc-hit-info-iterator-term.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
#include <cstdint>
@@ -40,7 +40,7 @@
} // namespace
-libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() {
+libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
if (cached_hits_idx_ == -1) {
ICING_RETURN_IF_ERROR(RetrieveMoreHits());
} else {
@@ -59,7 +59,7 @@
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() {
+libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
// Exact match only. All hits in lite lexicon are exact.
ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
@@ -70,12 +70,13 @@
return libtextclassifier3::Status::OK;
}
-std::string DocHitInfoIteratorTermExact::ToString() const {
+std::string DocHitInfoIteratorTermLiteExact::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_);
}
-libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() {
+libtextclassifier3::Status
+DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
// Take union of lite terms.
int term_len = term_.length();
int terms_matched = 0;
@@ -97,7 +98,7 @@
return libtextclassifier3::Status::OK;
}
-void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() {
+void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
// Re-sort cached document_ids and merge sections.
sort(cached_hits_.begin(), cached_hits_.end());
@@ -116,7 +117,7 @@
cached_hits_.resize(idx + 1);
}
-std::string DocHitInfoIteratorTermPrefix::ToString() const {
+std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_, "*");
}
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
similarity index 63%
rename from icing/index/iterator/doc-hit-info-iterator-term.h
rename to icing/index/lite/doc-hit-info-iterator-term-lite.h
index 21d1dd6..bd2de6d 100644
--- a/icing/index/iterator/doc-hit-info-iterator-term.h
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
-#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
#include <cstdint>
#include <vector>
@@ -28,11 +28,12 @@
namespace icing {
namespace lib {
-class DocHitInfoIteratorTerm : public DocHitInfoIterator {
+class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
public:
- explicit DocHitInfoIteratorTerm(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index, const std::string term,
- SectionIdMask section_restrict_mask)
+ explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
: term_(term),
lite_index_(lite_index),
cached_hits_idx_(-1),
@@ -66,14 +67,14 @@
const SectionIdMask section_restrict_mask_;
};
-class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm {
+class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
public:
- explicit DocHitInfoIteratorTermExact(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
+ explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
std::string ToString() const override;
@@ -81,14 +82,14 @@
libtextclassifier3::Status RetrieveMoreHits() override;
};
-class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm {
+class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
public:
- explicit DocHitInfoIteratorTermPrefix(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
+ explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
std::string ToString() const override;
@@ -105,4 +106,4 @@
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc
new file mode 100644
index 0000000..0640135
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.cc
@@ -0,0 +1,166 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+ std::string mask(kMaxSectionId + 1, '0');
+ for (SectionId i = kMaxSectionId; i >= 0; --i) {
+ if (section_id_mask & (1U << i)) {
+ mask[kMaxSectionId - i] = '1';
+ }
+ }
+ return mask;
+}
+
+} // namespace
+
+libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() {
+ if (posting_list_accessor_ == nullptr ||
+ cached_doc_hit_infos_idx_ == (cached_doc_hit_infos_.size() - 2)) {
+ // If we haven't retrieved any hits before or we've already returned all but
+ // the last cached hit, then go get some more!
+ // We hold back the last cached hit because it could have more hits on the
+ // next posting list in the chain.
+ ICING_RETURN_IF_ERROR(RetrieveMoreHits());
+ } else {
+ ++cached_doc_hit_infos_idx_;
+ }
+ if (cached_doc_hit_infos_idx_ == -1 ||
+ cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) {
+ // Nothing more for the iterator to return. Set these members to invalid
+ // values.
+ doc_hit_info_ = DocHitInfo();
+ hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ doc_hit_info_ = cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_);
+ hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() {
+ DocHitInfo last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = cached_doc_hit_infos_.back();
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
+    // Carry over the last hit. It might need to be merged with the first hit
+    // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(last_doc_hit_info);
+ }
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(posting_list_accessor_,
+ main_index_->GetAccessorForExactTerm(term_));
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ ++num_blocks_inspected_;
+ cached_doc_hit_infos_.reserve(hits.size() + 1);
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+ // We want exact hits, skip prefix-only hits.
+ if (hit.is_prefix_hit()) {
+ continue;
+ }
+ if (cached_doc_hit_infos_.empty() ||
+ hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
+ cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
+ }
+ cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainExact::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_);
+}
+
+libtextclassifier3::Status
+DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() {
+ DocHitInfo last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = cached_doc_hit_infos_.back();
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
+    // Carry over the last hit. It might need to be merged with the first hit
+    // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(last_doc_hit_info);
+ }
+
+ ++num_blocks_inspected_;
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(
+ MainIndex::GetPrefixAccessorResult result,
+ main_index_->GetAccessorForPrefixTerm(term_));
+ posting_list_accessor_ = std::move(result.accessor);
+ exact_ = result.exact;
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ cached_doc_hit_infos_.reserve(hits.size());
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+ // If we only want hits from prefix sections.
+ if (!exact_ && !hit.is_in_prefix_section()) {
+ continue;
+ }
+ if (cached_doc_hit_infos_.empty() ||
+ hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
+ cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
+ }
+ cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainPrefix::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_, "*");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h
new file mode 100644
index 0000000..1f77226
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.h
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class DocHitInfoIteratorTermMain : public DocHitInfoIterator {
+ public:
+ explicit DocHitInfoIteratorTermMain(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : term_(term),
+ main_index_(main_index),
+ cached_doc_hit_infos_idx_(-1),
+ num_advance_calls_(0),
+ num_blocks_inspected_(0),
+ next_posting_list_id_(PostingListIdentifier::kInvalid),
+ section_restrict_mask_(section_restrict_mask) {}
+
+ libtextclassifier3::Status Advance() override;
+
+ int32_t GetNumBlocksInspected() const override {
+ return num_blocks_inspected_;
+ }
+ int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+
+ protected:
+ // Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_.
+ virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+ const std::string term_;
+ // The accessor of the posting list chain for the requested term.
+ std::unique_ptr<PostingListAccessor> posting_list_accessor_;
+
+ MainIndex* main_index_;
+ // Stores hits retrieved from the index. This may only be a subset of the hits
+ // that are present in the index. Current value pointed to by the Iterator is
+ // tracked by cached_doc_hit_infos_idx_.
+ std::vector<DocHitInfo> cached_doc_hit_infos_;
+ int cached_doc_hit_infos_idx_;
+ int num_advance_calls_;
+ int num_blocks_inspected_;
+ PostingListIdentifier next_posting_list_id_;
+ // Mask indicating which sections hits should be considered for.
+ // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+ const SectionIdMask section_restrict_mask_;
+};
+
+class DocHitInfoIteratorTermMainExact : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainExact(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+class DocHitInfoIteratorTermMainPrefix : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainPrefix(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+  // After retrieving DocHitInfos from the index, there may be a DocHitInfo
+  // for docid 1 and "foo" and a DocHitInfo for docid 1 and "fool". These
+  // DocHitInfos should be merged.
+ void SortAndDedupeDocumentIds();
+ // Whether or not posting_list_accessor_ holds a posting list chain for
+ // 'term' or for a term for which 'term' is a prefix. This is necessary to
+ // determine whether to return hits that are not from a prefix section (hits
+ // not from a prefix section should only be returned if exact_ is true).
+ bool exact_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
diff --git a/icing/index/main/flash-index-storage-header.h b/icing/index/main/flash-index-storage-header.h
new file mode 100644
index 0000000..f81e99e
--- /dev/null
+++ b/icing/index/main/flash-index-storage-header.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+#define ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+
+namespace icing {
+namespace lib {
+
+// The class used to manage the flash block that contains the header for
+// FlashIndexStorage. This contains information about the index blocks that
+// store the posting lists.
+class HeaderBlock {
+ public:
+ // The class used to access the actual header.
+ struct Header {
+ // A magic used to mark the beginning of a valid header.
+ static constexpr int kMagic = 0x6dfba6ae;
+ int magic;
+ int block_size;
+ int last_indexed_docid;
+ // The size of the index_block_infos array.
+ int num_index_block_infos;
+
+ struct IndexBlockInfo {
+ // The size of the posting lists that fit on all the index blocks in this
+ // chain. Each block on this posting list will have posting lists of size
+ // posting_list_bytes.
+ int posting_list_bytes;
+ // The block index of the first block in the free list chain.
+ int free_list_block_index;
+ };
+ // Variable-size array, num_index_block_infos long. Can have a max length
+ // of log(block_size). This array is used to maintain a free list for the
+ // available blocks.
+ IndexBlockInfo index_block_infos[0];
+ };
+
+ // Read HeaderBlock from the specified fd.
+ //
+ // RETURNS:
+ // - HeaderBlock, on success
+ // - INTERNAL if unable to read block_size bytes from fd.
+ static libtextclassifier3::StatusOr<HeaderBlock> Read(
+ const Filesystem* filesystem, int fd, int block_size) {
+ std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(block_size);
+ if (!filesystem->PRead(fd, buffer.get(), block_size, 0)) {
+ return absl_ports::InternalError("Unable to reader header block!");
+ }
+ return HeaderBlock(filesystem, std::move(buffer), block_size);
+ }
+
+ // Make a new HeaderBlock with the specified size.
+ explicit HeaderBlock(const Filesystem* filesystem, int block_size)
+ : HeaderBlock(filesystem, std::make_unique<uint8_t[]>(block_size),
+ block_size) {
+ std::memset(header_buffer_.get(), 0, block_size);
+ }
+
+ Header* header() const {
+ return reinterpret_cast<Header*>(header_buffer_.get());
+ }
+
+ // Add another entry to the index_block_infos array and return a pointer to
+ // that entry. Returns a nullptr if the index_block_infos array is already
+ // at a max size.
+ Header::IndexBlockInfo* AddIndexBlockInfo() {
+ if (size() + sizeof(Header::IndexBlockInfo) > block_size_) {
+ return nullptr;
+ }
+ ++header()->num_index_block_infos;
+ return header()->index_block_infos + (header()->num_index_block_infos - 1);
+ }
+
+ // Returns the size of the header block currently in use.
+ int size() const {
+ return sizeof(Header) +
+ header()->num_index_block_infos * sizeof(Header::IndexBlockInfo);
+ }
+
+ // Writes the header to fd. Returns true on success.
+ bool Write(int fd) {
+ return filesystem_->PWrite(fd, 0, header_buffer_.get(), block_size_);
+ }
+
+ private:
+ explicit HeaderBlock(const Filesystem* filesystem,
+ std::unique_ptr<uint8_t[]> buffer, int block_size)
+ : filesystem_(filesystem),
+ header_buffer_(std::move(buffer)),
+ block_size_(block_size) {}
+
+ const Filesystem* filesystem_; // does NOT own!
+ std::unique_ptr<uint8_t[]> header_buffer_;
+ int block_size_;
+};
+static_assert(16 == sizeof(HeaderBlock::Header),
+ "Header has changed size. Consider how this change might affect "
+ "pre-existing indices.");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
diff --git a/icing/index/main/flash-index-storage.cc b/icing/index/main/flash-index-storage.cc
new file mode 100644
index 0000000..b88d7fe
--- /dev/null
+++ b/icing/index/main/flash-index-storage.cc
@@ -0,0 +1,511 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/flash-index-storage.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <unordered_set>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/math-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+uint32_t SelectBlockSize() {
+ // This should be close to the flash page size.
+ static constexpr uint32_t kMinBlockSize = 4096;
+
+ // Determine a good block size.
+ uint32_t page_size = getpagesize();
+ uint32_t block_size = std::max(kMinBlockSize, page_size);
+
+ // Align up to the nearest page size.
+ return math_util::RoundUpTo(block_size, page_size);
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ bool in_memory) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+ FlashIndexStorage storage(index_filename, filesystem, in_memory);
+ if (!storage.Init()) {
+ return absl_ports::InternalError(
+ "Unable to successfully read header block!");
+ }
+ return storage;
+}
+
+FlashIndexStorage::FlashIndexStorage(const std::string& index_filename,
+ const Filesystem* filesystem,
+ bool has_in_memory_freelists)
+ : index_filename_(index_filename),
+ num_blocks_(0),
+ filesystem_(filesystem),
+ has_in_memory_freelists_(has_in_memory_freelists) {}
+
+FlashIndexStorage::~FlashIndexStorage() {
+ if (header_block_ != nullptr) {
+ FlushInMemoryFreeList();
+ PersistToDisk();
+ }
+}
+
+bool FlashIndexStorage::Init() {
+ block_fd_ = ScopedFd(filesystem_->OpenForWrite(index_filename_.c_str()));
+ if (!block_fd_.is_valid()) {
+ return false;
+ }
+
+ // Read in or create the header.
+ return InitHeader();
+}
+
+bool FlashIndexStorage::InitHeader() {
+ // Look for an existing file size.
+ int64_t file_size = filesystem_->GetFileSize(block_fd_.get());
+ if (file_size == Filesystem::kBadFileSize) {
+ ICING_LOG(ERROR) << "Could not initialize main index. Bad file size.";
+ return false;
+ }
+
+ if (file_size == 0) {
+ if (!CreateHeader()) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to create header.";
+ return false;
+ }
+ } else {
+ if (!OpenHeader(file_size)) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to open header.";
+ return false;
+ }
+ }
+ in_memory_freelists_.resize(header_block_->header()->num_index_block_infos);
+
+ return true;
+}
+
+bool FlashIndexStorage::CreateHeader() {
+ uint32_t block_size = SelectBlockSize();
+ header_block_ = std::make_unique<HeaderBlock>(filesystem_, block_size);
+ // Initialize.
+ header_block_->header()->magic = HeaderBlock::Header::kMagic;
+ header_block_->header()->block_size = block_size;
+ header_block_->header()->last_indexed_docid = kInvalidDocumentId;
+
+ // Work down from the largest posting list that fits in
+ // block_size. We don't care about locality of blocks because this
+ // is a flash index.
+ for (uint32_t posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(block_size);
+ posting_list_bytes >= posting_list_utils::min_posting_list_size();
+ posting_list_bytes /= 2) {
+ uint32_t aligned_posting_list_bytes =
+ (posting_list_bytes / sizeof(Hit) * sizeof(Hit));
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Block size %u: %u", header_block_->header()->num_index_block_infos,
+ aligned_posting_list_bytes);
+
+ // Initialize free list to empty.
+ HeaderBlock::Header::IndexBlockInfo* block_info =
+ header_block_->AddIndexBlockInfo();
+ if (block_info == nullptr) {
+      // This should never happen anyway. Min block size is 4k, so adding these
+ // IndexBlockInfos should never exceed the block size.
+ return false;
+ }
+ block_info->posting_list_bytes = aligned_posting_list_bytes;
+ block_info->free_list_block_index = kInvalidBlockIndex;
+ }
+
+ // Write the header.
+ if (!header_block_->Write(block_fd_.get())) {
+ filesystem_->Truncate(block_fd_.get(), 0);
+ return false;
+ }
+ num_blocks_ = 1;
+ return true;
+}
+
+bool FlashIndexStorage::OpenHeader(int64_t file_size) {
+ uint32_t block_size = SelectBlockSize();
+ // Read and validate header.
+ ICING_ASSIGN_OR_RETURN(
+ HeaderBlock read_header,
+ HeaderBlock::Read(filesystem_, block_fd_.get(), block_size), false);
+ if (read_header.header()->magic != HeaderBlock::Header::kMagic) {
+ ICING_LOG(ERROR) << "Index header block wrong magic";
+ return false;
+ }
+ if (file_size % read_header.header()->block_size != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Index size %" PRIu64 " not a multiple of block size %u", file_size,
+ read_header.header()->block_size);
+ return false;
+ }
+
+ if (file_size < static_cast<int64_t>(read_header.header()->block_size)) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Index size %" PRIu64 " shorter than block size %u", file_size,
+ read_header.header()->block_size);
+ return false;
+ }
+
+ if (read_header.header()->block_size % getpagesize() != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Block size %u is not a multiple of page size %d",
+ read_header.header()->block_size, getpagesize());
+ return false;
+ }
+ num_blocks_ = file_size / read_header.header()->block_size;
+ if (block_size != read_header.header()->block_size) {
+ // The block_size changed? That's weird. But the old block_size is still
+ // valid (it must be some multiple of the new block_size). So reinitialize
+ // with that old block size. Using the old block size means that we can
+ // still use the main index, but reads/writes won't be as efficient in terms
+ // of flash IO because the 'blocks' that we're reading are actually multiple
+ // pages long.
+ ICING_LOG(ERROR) << "Block size of existing header ("
+ << read_header.header()->block_size
+ << ") does not match the requested block size ("
+ << block_size << "). Defaulting to existing block size "
+ << read_header.header()->block_size;
+ ICING_ASSIGN_OR_RETURN(HeaderBlock read_header,
+ HeaderBlock::Read(filesystem_, block_fd_.get(),
+ read_header.header()->block_size),
+ false);
+ }
+ header_block_ = std::make_unique<HeaderBlock>(std::move(read_header));
+
+ // Check for memory alignment on posting_list_bytes. See b/29983315.
+ // The issue of potential corruption to the header could also be handled by
+ // checksumming the header block.
+ for (int i = 0; i < header_block_->header()->num_index_block_infos; ++i) {
+ int posting_list_bytes =
+ header_block_->header()->index_block_infos[i].posting_list_bytes;
+ if (posting_list_bytes % sizeof(Hit) != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Posting list size misaligned, index %u, size %u, hit %zu, "
+ "file_size %" PRIu64,
+ i, header_block_->header()->index_block_infos[i].posting_list_bytes,
+ sizeof(Hit), file_size);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool FlashIndexStorage::PersistToDisk() {
+ // First, write header.
+ if (!header_block_->Write(block_fd_.get())) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Write index header failed: %s", strerror(errno));
+ return false;
+ }
+
+ // Then sync.
+ return filesystem_->DataSync(block_fd_.get());
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingList(PostingListIdentifier id) const {
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(id.block_index()));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(id.posting_list_index()));
+ PostingListHolder holder = {std::move(posting_list), std::move(block), id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::GetIndexBlock(
+ int block_index) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %d when only %d blocks have "
+ "been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ *filesystem_, index_filename_, offset, block_size());
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock(
+ int block_index, uint32_t posting_list_size) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %d when only %d blocks have "
+ "been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromUninitializedRegion(
+ *filesystem_, index_filename_, offset, block_size(), posting_list_size);
+}
+
+int FlashIndexStorage::FindBestIndexBlockInfo(
+ uint32_t posting_list_bytes) const {
+ int i = header_block_->header()->num_index_block_infos - 1;
+ for (; i >= 0; i--) {
+ if (header_block_->header()->index_block_infos[i].posting_list_bytes >=
+ posting_list_bytes) {
+ return i;
+ }
+ }
+ return i;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromInMemoryFreeList(int block_info_index) {
+ // Get something from in memory free list.
+ ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
+ in_memory_freelists_[block_info_index].TryPop());
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed. First, we
+ // need to free this posting list.
+ ICING_ASSIGN_OR_RETURN(IndexBlock block,
+ GetIndexBlock(posting_list_id.block_index()));
+ block.FreePostingList(posting_list_id.posting_list_index());
+
+ // Now, we can allocate a posting list from the same index block. It may not
+ // be the same posting list that was just freed, but that's okay.
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ posting_list_id =
+ PostingListIdentifier(posting_list_id.block_index(), posting_list_index,
+ posting_list_id.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromOnDiskFreeList(int block_info_index) {
+ // Get something from the free list.
+ uint32_t block_index = header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index;
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ // Get the index block
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(block_index));
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id = PostingListIdentifier(
+ block_index, posting_list_index, block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ if (!block.has_free_posting_lists()) {
+ RemoveFromOnDiskFreeList(block_index, block_info_index, &block);
+ }
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocateNewPostingList(int block_info_index) {
+ uint32_t block_index = GrowIndex();
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::ResourceExhaustedError(
+ "Unable to grow the index further!");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ IndexBlock block,
+ CreateIndexBlock(block_index, header_block_->header()
+ ->index_block_infos[block_info_index]
+ .posting_list_bytes));
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id = PostingListIdentifier(
+ block_index, posting_list_index, block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ if (block.has_free_posting_lists()) {
+ AddToOnDiskFreeList(block_index, block_info_index, &block);
+ }
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocatePostingList(uint32_t min_posting_list_bytes) {
+ int max_block_size = IndexBlock::CalculateMaxPostingListBytes(block_size());
+ if (min_posting_list_bytes > max_block_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d exceeds max posting list size %d",
+ min_posting_list_bytes, max_block_size));
+ }
+ int best_block_info_index = FindBestIndexBlockInfo(min_posting_list_bytes);
+
+ auto holder_or = GetPostingListFromInMemoryFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ // Nothing in memory. Look for something in the block file.
+ holder_or = GetPostingListFromOnDiskFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ return AllocateNewPostingList(best_block_info_index);
+}
+
+void FlashIndexStorage::AddToOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block) {
+ index_block->set_next_block_index(header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index);
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = block_index;
+}
+
+void FlashIndexStorage::RemoveFromOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block) {
+ // Cannot be used anymore. Move free ptr to the next block.
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = index_block->next_block_index();
+ index_block->set_next_block_index(kInvalidBlockIndex);
+}
+
+void FlashIndexStorage::FreePostingList(PostingListHolder holder) {
+ uint32_t posting_list_bytes = holder.block.get_posting_list_bytes();
+ int best_block_info_index = FindBestIndexBlockInfo(posting_list_bytes);
+
+ // It *should* be guaranteed elsewhere that FindBestIndexBlockInfo will not
+  // return a value >= in_memory_freelists_.size(), but check regardless. If it
+ // doesn't fit for some reason, then put it in the Header free list instead.
+ if (has_in_memory_freelists_ &&
+ best_block_info_index < in_memory_freelists_.size()) {
+ in_memory_freelists_[best_block_info_index].Push(holder.id);
+ } else {
+ bool was_full = !holder.block.has_free_posting_lists();
+ holder.block.FreePostingList(holder.id.posting_list_index());
+ // If this block was not already full, then it is already in the free list.
+ if (was_full) {
+ AddToOnDiskFreeList(holder.id.block_index(), best_block_info_index,
+ &holder.block);
+ }
+ }
+}
+
+int FlashIndexStorage::GrowIndex() {
+ if (num_blocks_ >= kMaxBlockIndex) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf("Reached max block index %u",
+ kMaxBlockIndex);
+ return kInvalidBlockIndex;
+ }
+
+ // Grow the index file.
+ if (!filesystem_->Grow(
+ block_fd_.get(),
+ static_cast<uint64_t>(num_blocks_ + 1) * block_size())) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Error growing index file: %s", strerror(errno));
+ return kInvalidBlockIndex;
+ }
+
+ return num_blocks_++;
+}
+
+void FlashIndexStorage::FlushInMemoryFreeList() {
+ for (int i = 0; i < in_memory_freelists_.size(); ++i) {
+ FreeList& freelist = in_memory_freelists_.at(i);
+ auto freelist_elt_or = freelist.TryPop();
+ while (freelist_elt_or.ok()) {
+ PostingListIdentifier freelist_elt = freelist_elt_or.ValueOrDie();
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed.
+ // First, we need to free this posting list.
+ auto block_or = GetIndexBlock(freelist_elt.block_index());
+ if (!block_or.ok()) {
+ // Can't read the block. Nothing to do here. This posting list will have
+ // to leak. Just proceed to the next freelist element.
+ freelist_elt_or = freelist.TryPop();
+ continue;
+ }
+ IndexBlock block = std::move(block_or).ValueOrDie();
+ bool was_full = !block.has_free_posting_lists();
+ block.FreePostingList(freelist_elt.posting_list_index());
+ // If this block was not already full, then it is already in the free
+ // list.
+ if (was_full) {
+ AddToOnDiskFreeList(freelist_elt.block_index(), /*block_info_index=*/i,
+ &block);
+ }
+ freelist_elt_or = freelist.TryPop();
+ }
+ }
+}
+
+// FreeList.
+void FlashIndexStorage::FreeList::Push(PostingListIdentifier id) {
+ if (free_list_.size() >= kMaxSize) {
+ ICING_LOG(WARNING)
+ << "Freelist for posting lists of size (block_size / "
+ << (1u << id.posting_list_index_bits())
+ << ") has reached max size. Dropping freed posting list [block_index:"
+ << id.block_index()
+ << ", posting_list_index:" << id.posting_list_index() << "]";
+ return;
+ }
+
+ free_list_.push_back(id);
+}
+
+libtextclassifier3::StatusOr<PostingListIdentifier>
+FlashIndexStorage::FreeList::TryPop() {
+ if (free_list_.empty()) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ PostingListIdentifier id = free_list_.back();
+ free_list_.pop_back();
+ return id;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
new file mode 100644
index 0000000..958f131
--- /dev/null
+++ b/icing/index/main/flash-index-storage.h
@@ -0,0 +1,275 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_
+#define ICING_INDEX_FLASH_INDEX_STORAGE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/main/flash-index-storage-header.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// The PostingListHolder struct exists to group together related PostingListUsed
+// IndexBlock pairs and their ids.
+struct PostingListHolder {
+ // PostingListUseds interpret data that they themselves do NOT own. The data
+ // being interpreted is stored on a flash block and its memory mapping is
+ // owned by the IndexBlock. As such, the lifecycle of the PostingListUsed must
+ // NOT exceed the lifecycle of the IndexBlock.
+ PostingListUsed posting_list;
+ IndexBlock block;
+ // The PostingListIdentifier, which identifies both the IndexBlock and the
+ // PostingListUsed, is also returned for convenience.
+ PostingListIdentifier id;
+};
+
+// The FlashIndexStorage class manages the actual file that makes up the index.
+// It allocates IndexBlocks as needed and maintains freelists to prevent
+// excessive block fragmentation.
+//
+// It maintains two types of free lists:
+// 1. On-disk, Header free list - This free list is stored in the Header
+// block. There is a free list for every possible posting list size. Each
+// entry for a posting list size contains the block_index of the
+// IndexBlock that starts the free list chain. Each IndexBlock in the free
+// list chain stores the index of the next IndexBlock in the chain.
+// 2. In-memory free list - Like the Header free list, there is a free list of
+// every possible posting list size. This free list contains not just the
+// block_index of the available IndexBlock, but also the posting_list_index
+// of the available PostingListUsed within the IndexBlock. This is because,
+// unlike the Header free list, PostingListUseds are not actually freed
+// when added to this free list.
+//
+// Whether or not the in-memory free list is used can be chosen via the
+// in_memory param to the Create factory function.
+//
+// The advantage of using the in-memory free list is that it reduces the amount
+// of flash writes made while editing the index (because actually freeing the
+// PostingLists would require writing to that flash block). The disadvantage is
+// that it introduces code complexity and potentially leaks blocks if power is
+// lost or if FlashIndexStorage is destroyed before emptying the free list.
+class FlashIndexStorage {
+ public:
+ // Creates a FlashIndexStorage at index_filename. in_memory determines whether
+ // or not the FlashIndexStorage maintains an in-memory freelist in order to
+ // avoid writes to the on-disk freelist.
+ //
+ // RETURNS:
+ // - On success, a valid instance of FlashIndexStorage
+ // - INTERNAL error if unable to create a new header or read the existing
+ // one from disk.
+ static libtextclassifier3::StatusOr<FlashIndexStorage> Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ bool in_memory = true);
+
+ // Retrieve the PostingList referred to by PostingListIdentifier. This posting
+ // list must have been previously allocated by a prior call to
+ // AllocatePostingList.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - INVALID_ARGUMENT if id.posting_list_index() is out of bounds in the
+ // IndexBlock referred to by id.block_index()
+ // - INTERNAL_ERROR if unable to access the region in file.
+ libtextclassifier3::StatusOr<PostingListHolder> GetPostingList(
+ PostingListIdentifier id) const;
+
+ // Allocates and returns a PostingListHolder containing a PostingListUsed that
+ // can fit min_posting_list_bytes.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to create a
+ // PostingListUsed of the requested size.
+ libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList(
+ uint32_t min_posting_list_bytes);
+
+ ~FlashIndexStorage();
+ FlashIndexStorage(FlashIndexStorage&&) = default;
+ FlashIndexStorage(const FlashIndexStorage&) = delete;
+ FlashIndexStorage& operator=(FlashIndexStorage&&) = default;
+ FlashIndexStorage& operator=(const FlashIndexStorage&) = delete;
+
+ // Free the PostingListUsed that this holder holds.
+ void FreePostingList(PostingListHolder holder);
+
+ // Used to track the largest docid indexed in the index.
+ DocumentId get_last_indexed_docid() const {
+ return header_block_->header()->last_indexed_docid;
+ }
+ void set_last_indexed_docid(DocumentId docid) {
+ header_block_->header()->last_indexed_docid = docid;
+ }
+
+ // Updates the header and persists all changes to the index to disk. Returns
+ // true on success.
+ bool PersistToDisk();
+
+ // Returns the size of the index file in bytes.
+ int64_t GetDiskUsage() const {
+ return filesystem_->GetDiskUsage(block_fd_.get());
+ }
+
+ int num_blocks() const { return num_blocks_; }
+
+ // Info about the index based on the block size.
+ int block_size() const { return header_block_->header()->block_size; }
+
+ // Num blocks starts at 1 since the first block is the header.
+ bool empty() const { return num_blocks_ <= 1; }
+
+ // The percentage of the maximum index size that is free. Allocated blocks are
+ // treated as fully used, even if they are only partially used. In this way,
+ // min_free_fraction is a lower bound of available space.
+ double min_free_fraction() const {
+ return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex;
+ }
+
+ private:
+ FlashIndexStorage(const std::string& index_filename,
+ const Filesystem* filesystem, bool has_in_memory_freelists);
+
+ // Init the index from persistence. Create if file does not exist. We do not
+ // erase corrupt files.
+ //
+ // Returns false if unable to create a new header or if the existing one is
+ // corrupt.
+ bool Init();
+
+ // Create or open the header block. Returns true on success.
+ bool InitHeader();
+
+ // Create a new header block for an empty index file.
+ bool CreateHeader();
+
+ // Loads the header stored at the beginning of the index file and validates
+ // the values stored in it.
+ bool OpenHeader(int64_t file_size);
+
+  // Add the IndexBlock referred to by block_index to the on-disk free list
+  // with index block_info_index.
+ void AddToOnDiskFreeList(uint32_t block_index, int block_info_index,
+ IndexBlock* index_block);
+
+ // Remove the IndexBlock referred to by block_index from the Header free list
+ // with index block_info_index.
+ void RemoveFromOnDiskFreeList(uint32_t block_index, int block_info_index,
+ IndexBlock* index_block);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the in-memory freelist at block_info_index
+ // - NOT_FOUND if there was no entry in the freelist
+ // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
+ // allocated for some reason.
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromInMemoryFreeList(int block_info_index);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the on-disk freelist at block_info_index
+ // - NOT_FOUND if there was no entry in the freelist
+ // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
+ // allocated for some reason.
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromOnDiskFreeList(int block_info_index);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from a newly allocated
+ // IndexBlock.
+ // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new
+ // IndexBlock.
+ libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList(
+ int block_info_index);
+
+ // Returns:
+ // - On success, a newly created IndexBlock at block_index with posting
+ // lists of size posting_list_size
+ // - INTERNAL_ERROR if unable to access the region in file representing the
+ // IndexBlock
+ libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock(
+ int block_index, uint32_t posting_list_size) const;
+
+ // Returns:
+ // - On success, the IndexBlock that exists at block_index
+ // - INTERNAL_ERROR if unable to access the region in file representing the
+ // IndexBlock
+ libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(int block_index) const;
+
+ // Add a new block to the end of the file and return its block
+ // index. Returns kInvalidBlockIndex if unable to grow the index file.
+ int GrowIndex();
+
+ // Return the index into index_block_infos of the smallest posting_list free
+ // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds
+ // the max-sized posting list.
+ int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const;
+
+ // Flushes the in-memory free list to disk.
+ void FlushInMemoryFreeList();
+
+ // Underlying filename.
+ std::string index_filename_;
+
+ // We open the index file into this fd.
+ ScopedFd block_fd_;
+ int num_blocks_; // can be inferred from index file size
+
+ std::unique_ptr<HeaderBlock> header_block_;
+
+ // In-memory cache of free posting lists.
+ struct FreeList {
+    // Experimentally determined that the high watermark for the largest
+    // freelist was ~3500.
+ static constexpr size_t kMaxSize = 4096;
+
+ // Push a new PostingListIdentifier if there is space.
+ void Push(PostingListIdentifier id);
+
+ // Attempt to pop a PostingListIdentifier.
+ //
+ // RETURNS:
+ // - identifier of a free posting list, on success
+ // - NOT_FOUND if there are no free posting lists on this free list.
+ libtextclassifier3::StatusOr<PostingListIdentifier> TryPop();
+
+ private:
+ std::vector<PostingListIdentifier> free_list_;
+ };
+ std::vector<FreeList> in_memory_freelists_;
+
+ const Filesystem* filesystem_; // not owned; can't be null
+
+ bool has_in_memory_freelists_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_
diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/index/main/flash-index-storage_test.cc
new file mode 100644
index 0000000..cf899b3
--- /dev/null
+++ b/icing/index/main/flash-index-storage_test.cc
@@ -0,0 +1,540 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/flash-index-storage.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/hit.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Not;
+
+class FlashIndexStorageTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ protected:
+ std::string test_dir_;
+ std::string file_name_;
+ Filesystem filesystem_;
+};
+
+TEST_F(FlashIndexStorageTest, CorruptHeader) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+ {
+ // Corrupt the header file by changing pl_bytes
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = sizeof(Hit) - 1; // This is intentionally invalid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+ // Read the header file - should fail because pl_bytes is not divisible
+ // by sizeof(Hit), which is 5 as of writing
+ ASSERT_THAT(FlashIndexStorage::Create(file_name_, &filesystem_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+ {
+ // Correct the pl_bytes header alignment
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = 2 * sizeof(Hit); // Should be valid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+
+ // Delete the file
+ filesystem_.DeleteFile(file_name_.c_str());
+}
+
+TEST_F(FlashIndexStorageTest, EmptyStorage) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ // An 'empty' FlashIndexStorage should have:
+ // 1. One block allocated for the header
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ // 2. The invalid DocumentId stored in its header
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+    // 3. Its disk usage should be the equivalent of one block.
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+ {
+ // Read the valid header. All functions should return the same values.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+    // 3. Now, free the first posting list. This should add it to the free
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, /*in_memory=*/false));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+    // 3. Now, free the first posting list. This should add it to the free
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
+ PostingListIdentifier id1 = PostingListIdentifier::kInvalid;
+ int half_block_posting_list_size = 0;
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ half_block_posting_list_size = (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+      // 3. Now, free the first posting list. This should add it to the free
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+ }
+
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+ // 4. The FlashIndexStorage should go out of scope and flush the in-memory
+ // posting list to disk
+ }
+
+ {
+ // Recreate the flash index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ {
+ // 5. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ const int kQuarterBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 4;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get a PL that is 1/4 block size. Even though a 1/4 block PL could
+ // theoretically fit in the same block, we'll allocate a new one because PLs
+ // on a block are required to be the same size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should have grown by one block.
+ EXPECT_THAT(posting_list_holder2.id.block_index(),
+ Not(Eq(id1.block_index())));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Request another 1/4 block-size posting list. This should NOT grow the
+ // index because there should be three free posting lists on block2.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should have remained the same size as before and the
+ // third posting list holder should use the same block as the second
+ // posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.block_index(),
+ Eq(posting_list_holder2.id.block_index()));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(3 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, AllocateTooLargePostingList) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ // Request a PL that is 2x block size.
+ const int kDoubleBlockSize = flash_index_storage.block_size() * 2;
+ EXPECT_THAT(flash_index_storage.AllocatePostingList(kDoubleBlockSize),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/index-block.cc b/icing/index/main/index-block.cc
index 9d7df3c..652dbc6 100644
--- a/icing/index/main/index-block.cc
+++ b/icing/index/main/index-block.cc
@@ -105,11 +105,12 @@
posting_lists_start_ptr_(mmapped_block.mutable_region() +
sizeof(BlockHeader)),
block_size_in_bytes_(mmapped_block.region_size()),
- mmapped_block_(std::move(mmapped_block)) {}
+ mmapped_block_(
+ std::make_unique<MemoryMappedFile>(std::move(mmapped_block))) {}
libtextclassifier3::Status IndexBlock::Reset(int posting_list_bytes) {
- ICING_RETURN_IF_ERROR(ValidatePostingListBytes(posting_list_bytes,
- mmapped_block_.region_size()));
+ ICING_RETURN_IF_ERROR(ValidatePostingListBytes(
+ posting_list_bytes, mmapped_block_->region_size()));
header_->free_list_posting_list_index = kInvalidPostingListIndex;
header_->next_block_index = kInvalidBlockIndex;
header_->posting_list_bytes = posting_list_bytes;
diff --git a/icing/index/main/index-block.h b/icing/index/main/index-block.h
index 1d17e34..edf9a79 100644
--- a/icing/index/main/index-block.h
+++ b/icing/index/main/index-block.h
@@ -20,6 +20,7 @@
#include <algorithm>
#include <limits>
+#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
@@ -95,6 +96,12 @@
IndexBlock(IndexBlock&&) = default;
IndexBlock& operator=(IndexBlock&&) = default;
+ ~IndexBlock() {
+ if (mmapped_block_ != nullptr) {
+ mmapped_block_->PersistToDisk();
+ }
+ }
+
// Instantiate a PostingListUsed at posting_list_index with the existing
// content in the IndexBlock.
//
@@ -206,7 +213,7 @@
uint32_t block_size_in_bytes_;
// MemoryMappedFile used to interact with the underlying flash block.
- MemoryMappedFile mmapped_block_;
+ std::unique_ptr<MemoryMappedFile> mmapped_block_;
};
} // namespace lib
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
new file mode 100644
index 0000000..878038f
--- /dev/null
+++ b/icing/index/main/main-index.cc
@@ -0,0 +1,339 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/index/main/main-index.h"
+
+#include <cstring>
+#include <memory>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Finds the best prefix term in lexicon for which "prefix" is a prefix.
+// 'Best' is defined as the shortest term that holds a valid posting list id.
+// Returns a valid FindTermResult with found=true if either:
+// 1. prefix exists as a term in lexicon.
+// 2. the shortest, valid prefix in the lexicon exists and contains prefix
+// hits.
+// Returns a FindTermResult with found=false and undefined values of tvi and
+// exact if no term was found.
+struct FindTermResult {
+ // TVI of the term that was found. Undefined if found=false.
+ uint32_t tvi;
+ // Whether or not a valid term with prefix hits was found.
+ bool found;
+ // Whether or not that term is equal to 'prefix'
+ bool exact;
+};
+FindTermResult FindShortestValidTermWithPrefixHits(
+ const IcingDynamicTrie* lexicon, const std::string& prefix) {
+ // For prefix indexing: when we are doing a prefix match for "prefix", find
+ // the tvi to the equivalent posting list. prefix's own posting list might not
+ // exist but one of its children acts as a proxy.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *lexicon, GetHasHitsInPrefixSectionPropertyId());
+ uint32_t tvi = 0;
+ bool found = false;
+ bool exact = false;
+ for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid();
+ it.Advance()) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id));
+
+ // Posting list id might be invalid if this is also a backfill term.
+ // Suppose that the main index has two pre-existing prefix hits "foot" and
+ // "fool" - it will have a branch point posting list for "foo". Then, let's
+ // suppose that the other index adds hits for "foul", "four" and "far". This
+ // will result in branch points for "fo" and "f".
+ // If "fo" was added before "f", then the iterator would first give us "fo".
+ // "fo" will have an invalid posting_list_id because it hasn't been
+ // backfilled yet, so we need to continue iterating to "foo".
+ if (posting_list_id.is_valid()) {
+ exact = (prefix.size() == strlen(it.GetKey()));
+ tvi = it.GetValueIndex();
+ // Found it. Does it have prefix hits?
+ found = exact || hits_in_prefix_section.HasProperty(tvi);
+ break;
+ }
+ }
+ FindTermResult result = {tvi, found, exact};
+ return result;
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<MainIndex> MainIndex::Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
+ MainIndex main_index;
+ ICING_RETURN_IF_ERROR(
+ main_index.Init(index_filename, filesystem, icing_filesystem));
+ return main_index;
+}
+
+// TODO(b/139087650) : Migrate off of IcingFilesystem.
+libtextclassifier3::Status MainIndex::Init(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
+ std::string flash_index_file = index_filename + "-main-index";
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index,
+ FlashIndexStorage::Create(flash_index_file, filesystem));
+ flash_index_ = std::make_unique<FlashIndexStorage>(std::move(flash_index));
+
+ std::string lexicon_file = index_filename + "-main-lexicon";
+ IcingDynamicTrie::RuntimeOptions runtime_options;
+ main_lexicon_ = std::make_unique<IcingDynamicTrie>(
+ lexicon_file, runtime_options, icing_filesystem);
+ IcingDynamicTrie::Options lexicon_options;
+ if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
+ !main_lexicon_->Init()) {
+ return absl_ports::InternalError("Failed to initialize lexicon trie");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
+MainIndex::GetAccessorForExactTerm(const std::string& term) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term %s is not present in main lexicon.", term.c_str()));
+ }
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_.get(), posting_list_id));
+ return std::make_unique<PostingListAccessor>(std::move(accessor));
+}
+
+libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult>
+MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
+ bool exact = false;
+ // For prefix indexing: when we are doing a prefix match for
+ // "prefix", find the tvi to the equivalent posting list. prefix's
+ // own posting list might not exist but its shortest child acts as a proxy.
+ //
+ // For example, if there are only two hits in the index are prefix hits for
+ // "bar" and "bat", then both will appear on a posting list for "ba". "b"
+ // won't have a posting list, but "ba" will suffice.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *main_lexicon_, GetHasHitsInPrefixSectionPropertyId());
+ IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str());
+ if (!main_itr.IsValid()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term: %s is not present in the main lexicon.", prefix.c_str()));
+ }
+ exact = (prefix.length() == strlen(main_itr.GetKey()));
+
+ if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
+ // Found it, but it doesn't have prefix hits. Exit early. No need to
+ // retrieve the posting list because there's nothing there for us.
+ return absl_ports::NotFoundError("The prefix has no hits in prefix sections.");
+ }
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_.get(), posting_list_id));
+ GetPrefixAccessorResult result = {std::make_unique<PostingListAccessor>(std::move(pl_accessor)), exact};
+ return result;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) {
+ // Maps new branching points in main lexicon to the term such that
+ // branching_point_term is a prefix of term and there are no terms smaller
+ // than term and greater than branching_point_term.
+ std::string prefix;
+ LexiconMergeOutputs outputs;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // If term were inserted in the main lexicon, what new branching would it
+ // create? (It always creates at most one.)
+ int prefix_len = main_lexicon_->FindNewBranchingPrefixLength(
+ other_term_itr.GetKey(), /*utf8=*/true);
+ if (prefix_len <= 0) {
+ continue;
+ }
+ prefix.assign(other_term_itr.GetKey(), prefix_len);
+
+ // Figure out backfill tvi. Might not exist since all children terms could
+ // only contain hits from non-prefix sections.
+ //
+ // Ex. Suppose that the main lexicon contains "foot" and "fool" and that
+ // we're adding "foul". The new branching prefix will be "fo". The backfill
+ // prefix will be "foo" - all hits in prefix section on "foo" will need to
+ // be added to the new "fo" posting list later.
+ FindTermResult result =
+ FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix);
+ if (!result.found || result.exact) {
+ continue;
+ }
+
+ // This is a new prefix that will need backfilling from its next-in-line
+ // posting list. This new prefix will have to have a posting list eventually
+ // so insert a default PostingListIdentifier as a placeholder.
+ uint32_t branching_prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_id,
+ &branching_prefix_tvi, false, &new_key)) {
+ return absl_ports::InternalError("Could not insert branching prefix");
+ }
+
+ // Backfills only contain prefix hits by default. So set these here but
+ // could be overridden when adding hits from the other index later.
+ if (!main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasNoExactHitsPropertyId()) ||
+ !main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+
+ outputs.backfill_map[branching_prefix_tvi] = result.tvi;
+ }
+ return outputs;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon);
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, "");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ uint32_t new_main_tvi;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(other_term_itr.GetKey(), &posting_list_id,
+ &new_main_tvi,
+ /*replace=*/false)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not insert term: ", other_term_itr.GetKey()));
+ }
+
+ // Copy the properties from the other lexicon over to the main lexicon.
+ uint32_t other_tvi = other_term_itr.GetValueIndex();
+ if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi,
+ new_main_tvi)) {
+ return absl_ports::InternalError("Could not insert term");
+ }
+
+ // Add other to main mapping.
+ outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi);
+ }
+ return outputs;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReader has_prefix_prop_reader(
+ other_lexicon, GetHasHitsInPrefixSectionPropertyId());
+ if (!has_prefix_prop_reader.Exists()) {
+ return outputs;
+ }
+ std::string prefix;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, "");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // Only expand terms that have hits in prefix sections.
+ if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) {
+ continue;
+ }
+
+ // Get prefixes where there is already a branching point in the main
+ // lexicon. We skip prefixes which don't already have a branching point.
+ std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths(
+ other_term_itr.GetKey(), /*utf8=*/true);
+
+ int buf_start = outputs.prefix_tvis_buf.size();
+ // Add prefixes.
+ for (int prefix_length : prefix_lengths) {
+ if (prefix_length <= 0) {
+ continue;
+ }
+
+ prefix.assign(other_term_itr.GetKey(), prefix_length);
+ uint32_t prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_identifier =
+ PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_identifier,
+ &prefix_tvi, /*replace=*/false, &new_key)) {
+ return absl_ports::InternalError("Could not insert prefix");
+ }
+
+ // Prefix tvi will have hits in prefix section.
+ if (!main_lexicon_->SetProperty(prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError(
+ "Setting has hits in prefix section prop failed");
+ }
+
+ // If it hasn't been added by non-prefix term insertions in
+ // AddBackfillBranchPoints and AddTerms, it is a prefix-only term.
+ if (new_key && !main_lexicon_->SetProperty(
+ prefix_tvi, GetHasNoExactHitsPropertyId())) {
+ return absl_ports::InternalError("Setting no exact hits prop failed");
+ }
+
+ outputs.prefix_tvis_buf.push_back(prefix_tvi);
+ }
+
+ // Any prefixes added? Then add to map.
+ if (buf_start < outputs.prefix_tvis_buf.size()) {
+ outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = {
+ buf_start, outputs.prefix_tvis_buf.size() - buf_start};
+ }
+ }
+ return outputs;
+}
+
+bool MainIndex::CopyProperties(
+ const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi) {
+ for (uint32_t property_id = 0; property_id < prop_reader.size();
+ ++property_id) {
+ if (property_id == GetHasNoExactHitsPropertyId()) {
+ // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits
+ // for this term, then HasNoExactHits needs to be set to false in
+ // main_lexicon. If other_lexicon has no exact hits for this term, then
+ // HasNoExactHits in the main_lexicon should not be modified.
+ if (!prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->ClearProperty(new_main_tvi, property_id)) {
+ LOG(ERROR) << "Clearing prefix prop failed";
+ return false;
+ }
+ } else {
+ // If other_lexicon has this property set for this term, then that
+ // property needs to be set for the main_lexicon. If other_lexicon
+ // doesn't have this property set, then the main_lexicon is left unchanged.
+ if (prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->SetProperty(new_main_tvi, property_id)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
new file mode 100644
index 0000000..15bec1f
--- /dev/null
+++ b/icing/index/main/main-index.h
@@ -0,0 +1,182 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
+#define ICING_INDEX_MAIN_MAIN_INDEX_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+class MainIndex {
+ public:
+ static libtextclassifier3::StatusOr<MainIndex> Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ // Get a PostingListAccessor that holds the posting list chain for 'term'.
+ //
+ // RETURNS:
+ // - On success, a valid PostingListAccessor
+ // - NOT_FOUND if term is not present in the main index.
+ libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
+ GetAccessorForExactTerm(const std::string& term);
+
+ // Get a PostingListAccessor for 'prefix'.
+ //
+ // RETURNS:
+ // - On success, a result containing a valid PostingListAccessor.
+ // - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
+ // prefix are present in the main index.
+ struct GetPrefixAccessorResult {
+ // A PostingListAccessor that holds the posting list chain for the term
+ // that best represents 'prefix' in the main index.
+ std::unique_ptr<PostingListAccessor> accessor;
+ // True if the returned posting list chain is for 'prefix' or false if the
+ // returned posting list chain is for a term for which 'prefix' is a prefix.
+ bool exact;
+ };
+ libtextclassifier3::StatusOr<GetPrefixAccessorResult>
+ GetAccessorForPrefixTerm(const std::string& prefix);
+
+ struct LexiconMergeOutputs {
+ // Maps from main_lexicon tvi for new branching point to the main_lexicon
+ // tvi for posting list whose hits must be backfilled.
+ std::unordered_map<uint32_t, uint32_t> backfill_map;
+
+ // Maps from lexicon tvis to main_lexicon tvis.
+ std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;
+
+ // Maps from the lexicon tvi to the beginning position in
+ // prefix_tvis_buf and the length.
+ std::unordered_map<uint32_t, std::pair<int, int>>
+ other_tvi_to_prefix_main_tvis;
+
+ // Stores tvis that are mapped to by other_tvi_to_prefix_tvis.
+ std::vector<uint32_t> prefix_tvis_buf;
+ };
+
+ // Merge the lexicon into the main lexicon and populate the data
+ // structures necessary to translate lite tvis to main tvis, track backfilling
+ // and expanding lite terms to prefix terms.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
+ const IcingDynamicTrie& other_lexicon) {
+ // Backfill branch points need to be added first so that the backfill_map
+ // can be correctly populated.
+ ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
+ AddBackfillBranchPoints(other_lexicon));
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddTerms(other_lexicon, std::move(outputs)));
+ // Non-backfill branch points need to be added last so that the mapping of
+ // newly added terms to prefix terms can be correctly populated (prefix
+ // terms might be branch points between two new terms or between a
+ // pre-existing term and a new term).
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddBranchPoints(other_lexicon, std::move(outputs)));
+ return outputs;
+ }
+
+ // Add hits to the main index and backfill from existing posting lists to new
+ // backfill branch points.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
+ // that exceeds the max TermId, is not valid, or is not less than
+ // pre-existing hits in the main index.
+ // - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
+ // - RESOURCE_EXHAUSTED error if unable to grow the index
+ libtextclassifier3::Status AddHits(
+ const TermIdCodec& term_id_codec,
+ std::unordered_map<uint32_t, uint32_t>&& backfill_map,
+ std::vector<LiteIndex::Element>&& hits);
+
+ private:
+ libtextclassifier3::Status Init(const std::string& index_filename,
+ const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ // Helpers for merging the lexicon
+ // Add all 'backfill' branch points. Backfill branch points are prefix
+ // branch points that are a prefix of terms that existed in the lexicon
+ // to the merge.
+ //
+ // For example, if the main lexicon only contains "foot" and is then merged
+ // with a lite lexicon containing only "fool", then a backfill branch point
+ // for "foo" will be added to contain prefix hits from both the pre-existing
+ // posting list for "foot" and the new posting list for "fool".
+ //
+ // Populates LexiconMergeOutputs.backfill_map
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
+ const IcingDynamicTrie& other_lexicon);
+
+ // Add all terms from the lexicon.
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+ // Add all branch points for terms added from the lexicon.
+ // For example, if the main lexicon is empty and is then merged with a
+ // lexicon containing only "foot" and "fool", then a branch point for "foo"
+ // will be added to contain prefix hits from both "foot" and "fool".
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
+ // LexiconMergeOutputs.prefix_tvis_buf;
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+ // Copies all properties from old_tvi in the other lexicon to the new_tvi in
+ // the main lexicon.
+ // Returns true on success, false if an IO error is encountered.
+ bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi);
+
+ std::unique_ptr<FlashIndexStorage> flash_index_;
+ std::unique_ptr<IcingDynamicTrie> main_lexicon_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_MAIN_INDEX_H_
diff --git a/icing/index/main/posting-list-accessor.cc b/icing/index/main/posting-list-accessor.cc
new file mode 100644
index 0000000..a4f8ca7
--- /dev/null
+++ b/icing/index/main/posting-list-accessor.cc
@@ -0,0 +1,194 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-accessor.h"
+
+#include <memory>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<PostingListAccessor> PostingListAccessor::Create(
+ FlashIndexStorage *storage) {
+ uint32_t max_posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(storage->block_size());
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array =
+ std::make_unique<uint8_t[]>(max_posting_list_bytes);
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list_buffer,
+ PostingListUsed::CreateFromUnitializedRegion(
+ posting_list_buffer_array.get(), max_posting_list_bytes));
+ return PostingListAccessor(storage, std::move(posting_list_buffer_array),
+ std::move(posting_list_buffer));
+}
+
+libtextclassifier3::StatusOr<PostingListAccessor>
+PostingListAccessor::CreateFromExisting(
+ FlashIndexStorage *storage,
+ PostingListIdentifier existing_posting_list_id) {
+ // Our posting_list_buffer_ will start as empty.
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, Create(storage));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage->GetPostingList(existing_posting_list_id));
+ pl_accessor.preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ return pl_accessor;
+}
+
+// Returns the next batch of hits for the provided posting list.
+libtextclassifier3::StatusOr<std::vector<Hit>>
+PostingListAccessor::GetNextHitsBatch() {
+ if (preexisting_posting_list_ == nullptr) {
+ if (has_reached_posting_list_chain_end_) {
+ return std::vector<Hit>();
+ }
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve hits from a PostingListAccessor that was not created "
+ "from a preexisting posting list.");
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
+ preexisting_posting_list_->posting_list.GetHits());
+ uint32_t block_index = preexisting_posting_list_->block.next_block_index();
+ if (block_index != kInvalidBlockIndex) {
+ PostingListIdentifier next_posting_list_id(
+ block_index, /*posting_list_index=*/0,
+ preexisting_posting_list_->block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->GetPostingList(next_posting_list_id));
+ preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ } else {
+ has_reached_posting_list_chain_end_ = true;
+ preexisting_posting_list_.reset();
+ }
+ return batch;
+}
+
+libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) {
+ PostingListUsed &active_pl = (preexisting_posting_list_ != nullptr)
+ ? preexisting_posting_list_->posting_list
+ : posting_list_buffer_;
+ libtextclassifier3::Status status = active_pl.PrependHit(hit);
+ if (!absl_ports::IsResourceExhausted(status)) {
+ return status;
+ }
+ // There is no more room to add hits to this current posting list! Therefore,
+ // we need to either move those hits to a larger posting list or flush this
+ // posting list and create another max-sized posting list in the chain.
+ if (preexisting_posting_list_ != nullptr) {
+ FlushPreexistingPostingList();
+ } else {
+ ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+ }
+
+ // Re-add hit. Should always fit since we just cleared posting_list_buffer_.
+ // It's fine to explicitly reference posting_list_buffer_ here because there's
+ // no way of reaching this line while preexisting_posting_list_ is still in
+ // use.
+ return posting_list_buffer_.PrependHit(hit);
+}
+
+void PostingListAccessor::FlushPreexistingPostingList() {
+ if (preexisting_posting_list_->block.max_num_posting_lists() == 1) {
+ // If this is a max-sized posting list, then just keep track of the id for
+ // chaining. It'll be flushed to disk when preexisting_posting_list_ is
+ // destructed.
+ prev_block_identifier_ = preexisting_posting_list_->id;
+ } else {
+ // If this is NOT a max-sized posting list, then our hits have outgrown this
+ // particular posting list. Move the hits into the in-memory posting list
+ // and free this posting list.
+ //
+ // Move will always succeed since posting_list_buffer_ is max_pl_bytes.
+ posting_list_buffer_.MoveFrom(&preexisting_posting_list_->posting_list);
+
+ // Now that all the contents of this posting list have been copied, there's
+ // no more use for it. Make it available to be used for another posting
+ // list.
+ storage_->FreePostingList(std::move(*preexisting_posting_list_));
+ }
+ preexisting_posting_list_.reset();
+}
+
+libtextclassifier3::Status PostingListAccessor::FlushInMemoryPostingList() {
+ // We exceeded max_pl_bytes(). Need to flush posting_list_buffer_ and update
+ // the chain.
+ uint32_t max_posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(storage_->block_size());
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->AllocatePostingList(max_posting_list_bytes));
+ holder.block.set_next_block_index(prev_block_identifier_.block_index());
+ prev_block_identifier_ = holder.id;
+ return holder.posting_list.MoveFrom(&posting_list_buffer_);
+}
+
+PostingListAccessor::FinalizeResult PostingListAccessor::Finalize(
+ PostingListAccessor accessor) {
+ if (accessor.preexisting_posting_list_ != nullptr) {
+ // Our hits are already in an existing posting list. Nothing else to do, but
+ // return its id.
+ FinalizeResult result = {libtextclassifier3::Status::OK,
+ accessor.preexisting_posting_list_->id};
+ return result;
+ }
+ if (accessor.posting_list_buffer_.BytesUsed() <= 0) {
+ FinalizeResult result = {absl_ports::InvalidArgumentError(
+ "Can't finalize an empty PostingListAccessor. "
+ "There's nothing to Finalize!"),
+ PostingListIdentifier::kInvalid};
+ return result;
+ }
+ uint32_t posting_list_bytes =
+ accessor.posting_list_buffer_.MinPostingListSizeToFit();
+ if (accessor.prev_block_identifier_.is_valid()) {
+ posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ accessor.storage_->block_size());
+ }
+ auto holder_or = accessor.storage_->AllocatePostingList(posting_list_bytes);
+ if (!holder_or.ok()) {
+ FinalizeResult result = {holder_or.status(),
+ accessor.prev_block_identifier_};
+ return result;
+ }
+ PostingListHolder holder = std::move(holder_or).ValueOrDie();
+ if (accessor.prev_block_identifier_.is_valid()) {
+ holder.block.set_next_block_index(
+ accessor.prev_block_identifier_.block_index());
+ }
+
+ // Move to allocated area. This should never actually return an error. We know
+ // that holder.posting_list is valid because it wouldn't have been
+ // returned by AllocatePostingList if it wasn't. We know posting_list_buffer_
+ // is valid because we created it in-memory. And finally, we know that the
+ // hits from posting_list_buffer_ will fit in holder.posting_list because we
+ // requested it be at least posting_list_bytes large.
+ auto status = holder.posting_list.MoveFrom(&accessor.posting_list_buffer_);
+ if (!status.ok()) {
+ FinalizeResult result = {std::move(status),
+ accessor.prev_block_identifier_};
+ return result;
+ }
+ FinalizeResult result = {libtextclassifier3::Status::OK, holder.id};
+ return result;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-accessor.h b/icing/index/main/posting-list-accessor.h
new file mode 100644
index 0000000..e1bb3c0
--- /dev/null
+++ b/icing/index/main/posting-list-accessor.h
@@ -0,0 +1,168 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_POSTING_LIST_ACCESSOR_H_
+#define ICING_INDEX_POSTING_LIST_ACCESSOR_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+
+namespace icing {
+namespace lib {
+
+// This class serves to:
+// 1. Expose PostingListUseds to clients of FlashIndexStorage
+// 2. Ensure the corresponding instance of IndexBlock has the same lifecycle as
+// the instance of PostingListUsed that the client has access to, while
+// not exposing IndexBlock's api surface.
+// 3. Ensure that PostingListUseds can only be freed by calling methods which
+// will also properly maintain the FlashIndexStorage free list and prevent
+// callers from modifying the Posting List after freeing.
+
+// This class is used to provide a simple abstraction for adding hits to posting
+// lists. PostingListAccessor handles 1) selection of properly-sized posting
+// lists for the accumulated hits during Finalize() and 2) chaining of max-sized
+// posting lists.
+class PostingListAccessor {
+ public:
+ // Creates an empty PostingListAccessor.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListAccessor
+ // - INVALID_ARGUMENT error if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<PostingListAccessor> Create(
+ FlashIndexStorage* storage);
+
+ // Create a PostingListAccessor with an existing posting list identified by
+ // existing_posting_list_id.
+ //
+ // The PostingListAccessor will add hits to this posting list until it is
+ // necessary either to 1) chain the posting list (if it is max-sized) or 2)
+ // move its hits to a larger posting list.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListAccessor
+ // - INVALID_ARGUMENT if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<PostingListAccessor> CreateFromExisting(
+ FlashIndexStorage* storage,
+ PostingListIdentifier existing_posting_list_id);
+
+ // Retrieve the next batch of hits for the posting list chain
+ //
+ // RETURNS:
+ // - On success, a vector of hits in the posting list chain
+ // - INTERNAL if called on an instance of PostingListAccessor that was
+ // created via PostingListAccessor::Create, if unable to read the next
+ // posting list in the chain or if the posting list has been corrupted
+ // somehow.
+ libtextclassifier3::StatusOr<std::vector<Hit>> GetNextHitsBatch();
+
+ // Prepend one hit. This may result in flushing the posting list to disk (if
+ // the PostingListAccessor holds a max-sized posting list that is full) or
+ // freeing a pre-existing posting list if it is too small to fit all hits
+ // necessary.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
+ // previously added hit.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status PrependHit(const Hit& hit);
+
+ struct FinalizeResult {
+ // - OK on success
+ // - INVALID_ARGUMENT if there was no pre-existing posting list and no
+ // hits were added
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a
+ // new posting list.
+ libtextclassifier3::Status status;
+ // Id of the posting list chain that was finalized. Guaranteed to be valid
+ // if status is OK. May be valid if status is non-OK, but previous blocks
+ // were written.
+ PostingListIdentifier id;
+ };
+ // Write all accumulated hits to storage.
+ //
+ // If accessor points to a posting list chain with multiple posting lists in
+ // the chain and unable to write the last posting list in the chain, Finalize
+ // will return the error and also populate id with the id of the
+ // second-to-last posting list.
+ static FinalizeResult Finalize(PostingListAccessor accessor);
+
+ private:
+ explicit PostingListAccessor(
+ FlashIndexStorage* storage,
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array,
+ PostingListUsed posting_list_buffer)
+ : storage_(storage),
+ prev_block_identifier_(PostingListIdentifier::kInvalid),
+ posting_list_buffer_array_(std::move(posting_list_buffer_array)),
+ posting_list_buffer_(std::move(posting_list_buffer)),
+ has_reached_posting_list_chain_end_(false) {}
+
+ // Flushes preexisting_posting_list_ to disk if it's a max-sized posting list
+ // and populates prev_block_identifier.
+ // If it's not a max-sized posting list, moves the contents of
+ // preexisting_posting_list_ to posting_list_buffer_ and frees
+ // preexisting_posting_list_.
+ // Sets preexisting_posting_list_ to nullptr.
+ void FlushPreexistingPostingList();
+
+ // Flushes posting_list_buffer_ to a max-sized posting list on disk, setting
+ // its next pointer to prev_block_identifier_ and updating
+ // prev_block_identifier_ to point to the just-written posting list.
+ libtextclassifier3::Status FlushInMemoryPostingList();
+
+ // Frees all posting lists in the posting list chain starting at
+ // prev_block_identifier_.
+ libtextclassifier3::Status FreePostingListChain();
+
+ FlashIndexStorage* storage_; // Does not own.
+
+ // The PostingListIdentifier of the first max-sized posting list in the
+ // posting list chain or PostingListIdentifier::kInvalid if there is no
+ // posting list chain.
+ PostingListIdentifier prev_block_identifier_;
+
+ // An editor to an existing posting list on disk. If available (non-NULL),
+ // we'll try to add all hits to this posting list. Once this posting list
+ // fills up, we'll either 1) chain it (if a max-sized posting list) and put
+ // future hits in posting_list_buffer_ or 2) copy all of its hits into
+ // posting_list_buffer_ and free this pl (if not a max-sized posting list).
+ // TODO(tjbarron) provide a benchmark to demonstrate the effects that re-using
+ // existing posting lists has on latency.
+ std::unique_ptr<PostingListHolder> preexisting_posting_list_;
+
+ // In-memory posting list used to buffer hits before writing them to the
+ // smallest on-disk posting list that will fit them.
+ // posting_list_buffer_array_ owns the memory region that posting_list_buffer_
+ // interprets. Therefore, posting_list_buffer_array_ must have the same
+ // lifecycle as posting_list_buffer_.
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array_;
+ PostingListUsed posting_list_buffer_;
+
+ bool has_reached_posting_list_chain_end_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_POSTING_LIST_ACCESSOR_H_
diff --git a/icing/index/main/posting-list-accessor_test.cc b/icing/index/main/posting-list-accessor_test.cc
new file mode 100644
index 0000000..8a5ef07
--- /dev/null
+++ b/icing/index/main/posting-list-accessor_test.cc
@@ -0,0 +1,384 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-accessor.h"
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/hit-test-utils.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Lt;
+using ::testing::SizeIs;
+
+TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result.status);
+ EXPECT_THAT(result.id.block_index(), Eq(1));
+ EXPECT_THAT(result.id.posting_list_index(), Eq(0));
+
+ // Retrieve some hits.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(pl_holder.block.next_block_index(), Eq(kInvalidBlockIndex));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add a single hit. This will fit in a min-sized posting list.
+ Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Add one more hit. The minimum size for a posting list must be able to fit
+ // at least two hits, so this should NOT cause the previous pl to be
+ // reallocated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ result1.id));
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/1);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit2));
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the same posting list as the first hit.
+ EXPECT_THAT(result2.id, Eq(result1.id));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result2.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAre(hit2, hit1)));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // The smallest posting list size is 15 bytes. The first four hits will be
+ // compressed to one byte each and will be able to fit in the 5 byte padded
+ // region. The last hit will fit in one of the special hits. The posting list
+ // will be ALMOST_FULL and can fit at most 2 more hits.
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Now let's add some more hits!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ result1.id));
+ // The current posting list can fit at most 2 more hits. Adding 12 more hits
+ // should result in these hits being moved to a larger posting list.
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/12,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the second (new) block because the posting
+ // list should have grown beyond the size that the first block maintains.
+ EXPECT_THAT(result2.id.block_index(), Eq(2));
+ EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result2.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier second_block_id = result1.id;
+ // Should have been allocated to the second block, which holds a max-sized
+ // posting list.
+ EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now let's retrieve them!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(second_block_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ pl_holder.posting_list.GetHits());
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.block.next_block_index();
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage.GetPostingList(pl_id));
+ EXPECT_THAT(
+ pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest,
+ PreexistingMultiBlockReusesBlocksProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier first_add_id = result1.id;
+ EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now add a couple more hits. These should fit on the existing, not full
+ // second block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ first_add_id));
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/50,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ PostingListIdentifier second_add_id = result2.id;
+ EXPECT_THAT(second_add_id, Eq(first_add_id));
+
+ // We should be able to retrieve all 5050 hits.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(second_add_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ pl_holder.posting_list.GetHits());
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.block.next_block_index();
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage.GetPostingList(pl_id));
+ EXPECT_THAT(
+ pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest, InvalidHitReturnsInvalidArgument) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit invalid_hit;
+ EXPECT_THAT(pl_accessor.PrependHit(invalid_hit),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+
+ Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kMaxHitScore);
+ EXPECT_THAT(pl_accessor.PrependHit(hit2),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore);
+ EXPECT_THAT(pl_accessor.PrependHit(hit3),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, NewPostingListNoHitsAdded) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ EXPECT_THAT(result1.status,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPostingListNoHitsAdded) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor2,
+ PostingListAccessor::CreateFromExisting(
+ &flash_index_storage, result1.id));
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor2));
+ ICING_ASSERT_OK(result2.status);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-identifier.cc b/icing/index/main/posting-list-identifier.cc
new file mode 100644
index 0000000..1cdac65
--- /dev/null
+++ b/icing/index/main/posting-list-identifier.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-identifier.h"
+
+namespace icing {
+namespace lib {
+
+PostingListIdentifier PostingListIdentifier::kInvalid(
+ kInvalidBlockIndex, /*posting_list_index=*/0,
+ PostingListIdentifier::kEncodedPostingListIndexBits - 1);
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-identifier.h b/icing/index/main/posting-list-identifier.h
new file mode 100644
index 0000000..4953865
--- /dev/null
+++ b/icing/index/main/posting-list-identifier.h
@@ -0,0 +1,116 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+#define ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/legacy/index/icing-bit-util.h"
+
+namespace icing {
+namespace lib {
+
+// 1M blocks * 4K page size = 4GB index
+inline constexpr int kBlockIndexBits = 20;
+inline constexpr int kMaxBlockIndex = (1u << kBlockIndexBits) - 1;
+
+// Class used to store information necessary to identify any posting list within
+// the index.
+//
+// The 20 leftmost bits in this identifier encode the block index. The 12
+// rightmost bits encode both the posting list index and the maximum number of
+// bits required to encode a posting list index on that block.
+//
+// Ex. An index block containing a max of 68 posting lists each of size 60
+// bytes (and thus 7 posting list bits), with a block index of 13 and a posting
+// list index of 5.
+// 0000 0000 0000 0000 1101 1111 0000 0101
+// |__________block-index_______|__pad__|_pl-index_|
+//
+// "pad" is a run of '1' bits beginning just below bit
+// kEncodedPostingListIndexBits (12) and continuing rightward until a
+// terminating '0' bit is reached. This encodes the posting list bits value:
+// the number of bits to the right of that terminating '0'.
+//
+// This value will eventually be stored in the Main Lexicon.
+class PostingListIdentifier {
+ // 1 bit is wasted to encode max pl index bits so there can be at most 2^11
+ // posting lists per block. Block size would have to be >=40020 bytes for
+ // there to be more than 2K+ posting lists in a block.
+ static constexpr int kEncodedPostingListIndexBits = 12;
+ static_assert(kEncodedPostingListIndexBits + kBlockIndexBits <=
+ 8 * sizeof(uint32_t),
+ "Not enough room in PostingListIdentifier value to encode "
+ "block index and posting list index.");
+
+ public:
+ static PostingListIdentifier kInvalid;
+
+ // 1. block_index - the index of this block within the FlashIndexStorage file
+ // 2. posting_list_index - the index of this posting list within the block
+ // 3. posting_list_index_bits - the number of bits needed to encode the
+ // largest posting_list_index that this block can have.
+ PostingListIdentifier(uint32_t block_index,
+ PostingListIndex posting_list_index,
+ int posting_list_index_bits) {
+ val_ = 0;
+ BITFIELD_OR(val_, /*offset=*/0, /*len=*/posting_list_index_bits,
+ /*val=*/static_cast<uint64_t>(posting_list_index));
+ BITFIELD_OR(
+ val_, /*offset=*/posting_list_index_bits + 1,
+ /*len=*/kEncodedPostingListIndexBits - posting_list_index_bits - 1,
+ /*val=*/~0u);
+ BITFIELD_OR(val_, /*offset=*/kEncodedPostingListIndexBits,
+ /*len=*/kBlockIndexBits,
+ /*val=*/block_index);
+ }
+
+ int block_index() const {
+ return BITFIELD_GET(val_, kEncodedPostingListIndexBits, kBlockIndexBits);
+ }
+
+ PostingListIndex posting_list_index() const {
+ return BITFIELD_GET(val_, 0, posting_list_index_bits());
+ }
+
+ // Returns the maximum number of bits that a posting list index on the block
+ // referred to by block_index could use.
+ int posting_list_index_bits() const {
+ for (int bits = kEncodedPostingListIndexBits - 1; bits >= 0; --bits) {
+ if (((1u << bits) & val_) == 0) {
+ // Got to the zero bit. This is the start of pl index.
+ return bits;
+ }
+ }
+ return -1;
+ }
+
+ bool is_valid() const { return *this != kInvalid; }
+
+ bool operator==(const PostingListIdentifier& rhs) const {
+ return val_ == rhs.val_;
+ }
+ bool operator!=(const PostingListIdentifier& rhs) const {
+ return !(*this == rhs);
+ }
+
+ private:
+ uint32_t val_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc
index 4396007..71752dd 100644
--- a/icing/jni/icing-search-engine-jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -302,6 +302,24 @@
}
JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByQuery(
+ JNIEnv* env, jclass clazz, jlong native_pointer,
+ jbyteArray search_spec_bytes) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::SearchSpecProto search_spec_proto;
+ if (!ParseProtoFromJniByteArray(env, search_spec_bytes, &search_spec_proto)) {
+ ICING_LOG(ERROR) << "Failed to parse SearchSpecProto in nativeDeleteByQuery";
+ return nullptr;
+ }
+ icing::lib::DeleteResultProto delete_result_proto =
+ icing->DeleteByQuery(search_spec_proto);
+
+ return SerializeProtoToJniByteArray(env, delete_result_proto);
+}
+
+JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk(
JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc
index 1954cd3..2eb64ac 100644
--- a/icing/legacy/core/icing-string-util.cc
+++ b/icing/legacy/core/icing-string-util.cc
@@ -11,13 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-//
-// This is a list of IsGoogleLetter letters. It is copied from
-// google3/util/utf8/proptables/letters.txt CL 19164202.
#include "icing/legacy/core/icing-string-util.h"
#include <stdarg.h>
@@ -34,7 +27,6 @@
namespace icing {
namespace lib {
-namespace {} // namespace
uint32_t IcingStringUtil::UpdateCrc32(uint32_t crc, const char *str, int len) {
if (len > 0) {
crc = ~crc32(~crc, reinterpret_cast<const Bytef *>(str), len);
diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h
index 4ea93ec..767e581 100644
--- a/icing/legacy/core/icing-string-util.h
+++ b/icing/legacy/core/icing-string-util.h
@@ -12,10 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-
#ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
#define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index ee3d3a2..29843ba 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -96,14 +96,28 @@
namespace icing {
namespace lib {
+namespace {
+constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1;
+constexpr uint32_t kInvalidNextIndex = ~0U;
+
+// Returns the number of valid nexts in the array.
+int GetValidNextsSize(IcingDynamicTrie::Next *next_array_start,
+ int next_array_length) {
+ int valid_nexts_length = 0;
+ for (; valid_nexts_length < next_array_length &&
+ next_array_start[valid_nexts_length].node_index() != kInvalidNodeIndex;
+ ++valid_nexts_length) {
+ }
+ return valid_nexts_length;
+}
+} // namespace
+
// Based on the bit field widths.
const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27;
const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16;
-const uint32_t IcingDynamicTrie::kInvalidNodeIndex = (1U << 24) - 1;
-const uint32_t IcingDynamicTrie::kInvalidNextIndex = ~0U;
const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U;
const int IcingDynamicTrie::kMaxNextArraySize;
@@ -891,7 +905,7 @@
bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
uint8_t *buf, uint32_t buf_size) const {
- uint32_t size = hdr.ByteSize();
+ uint32_t size = hdr.ByteSizeLong();
if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false;
memcpy(buf, &kMagic, sizeof(kMagic));
memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t));
@@ -1502,6 +1516,53 @@
deleted_bitmap_->Truncate(0);
}
+bool IcingDynamicTrie::ClearSuffixAndValue(uint32_t suffix_value_index) {
+ // The size 1 below is for a '\0' between the suffix and the value.
+ size_t suffix_and_value_length =
+ strlen(this->storage_->GetSuffix(suffix_value_index)) + 1 +
+ this->value_size();
+ char *mutable_suffix_and_value = this->storage_->GetMutableSuffix(
+ suffix_value_index, suffix_and_value_length);
+
+ if (mutable_suffix_and_value == nullptr) {
+ return false;
+ }
+
+ memset(mutable_suffix_and_value, 0, suffix_and_value_length);
+ return true;
+}
+
+bool IcingDynamicTrie::ResetNext(uint32_t next_index) {
+ Next *mutable_next =
+ this->storage_->GetMutableNextArray(next_index, /*len=*/1);
+
+ if (mutable_next == nullptr) {
+ return false;
+ }
+
+ mutable_next->set_val(0);
+ mutable_next->set_node_index(kInvalidNodeIndex);
+ return true;
+}
+
+bool IcingDynamicTrie::SortNextArray(const Node *node) {
+ if (node == nullptr) {
+ // Nothing to sort, return success directly.
+ return true;
+ }
+
+ uint32_t next_array_buffer_size = 1u << node->log2_num_children();
+ Next *next_array_start = this->storage_->GetMutableNextArray(
+ node->next_index(), next_array_buffer_size);
+
+ if (next_array_start == nullptr) {
+ return false;
+ }
+
+ std::sort(next_array_start, next_array_start + next_array_buffer_size - 1);
+ return true;
+}
+
bool IcingDynamicTrie::Insert(const char *key, const void *value,
uint32_t *value_index, bool replace,
bool *pnew_key) {
@@ -1641,15 +1702,12 @@
new_leaf_node->set_log2_num_children(0);
// Figure out the real length of the existing next array.
- Next *cur_next = storage_->GetMutableNextArray(
- best_node->next_index(), 1 << best_node->log2_num_children());
- int next_len = 0;
- for (; next_len < (1 << best_node->log2_num_children()) &&
- cur_next[next_len].node_index() != kInvalidNodeIndex;
- next_len++) {
- }
+ uint32_t next_array_buffer_size = 1u << best_node->log2_num_children();
+ Next *cur_next = storage_->GetMutableNextArray(best_node->next_index(),
+ next_array_buffer_size);
+ int next_len = GetValidNextsSize(cur_next, next_array_buffer_size);
Next *new_next = cur_next;
- if (next_len == (1 << best_node->log2_num_children())) {
+ if (next_len == (next_array_buffer_size)) {
// Allocate a new, larger, array.
new_next = storage_->AllocNextArray(next_len + 1);
memcpy(new_next, cur_next, sizeof(Next) * next_len);
@@ -2072,7 +2130,8 @@
}
void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
- int *key_offset, bool prefix) const {
+ int *key_offset, bool prefix,
+ bool utf8) const {
// Find the best node such that:
//
// - If key is NOT in the trie, key[0..key_offset) is a prefix to
@@ -2093,6 +2152,8 @@
const Node *cur_node = storage_->GetRootNode();
const char *cur_key = key;
+ const Node *utf8_node = cur_node;
+ const char *utf8_key = cur_key;
while (!cur_node->is_leaf()) {
const Next *found = GetNextByChar(cur_node, *cur_key);
if (!found) break;
@@ -2108,12 +2169,101 @@
break;
}
cur_key++;
+
+ if (utf8 && i18n_utils::IsLeadUtf8Byte(*cur_key)) {
+ utf8_node = cur_node;
+ utf8_key = cur_key;
+ }
+ }
+
+ if (utf8) {
+ // Rewind.
+ cur_node = utf8_node;
+ cur_key = utf8_key;
}
*best_node_index = storage_->GetNodeIndex(cur_node);
*key_offset = reinterpret_cast<const char *>(cur_key) - key;
}
+int IcingDynamicTrie::FindNewBranchingPrefixLength(const char *key,
+ bool utf8) const {
+ if (storage_->empty()) {
+ return kNoBranchFound;
+ }
+
+ uint32_t best_node_index;
+ int key_offset;
+ FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true, utf8);
+ const Node *cur_node = storage_->GetNode(best_node_index);
+ const char *cur_key = key + key_offset;
+ if (cur_node->is_leaf()) {
+ // Prefix in the trie. Split at leaf.
+ const char *prev_suffix = storage_->GetSuffix(cur_node->next_index());
+ while (*prev_suffix != '\0' && *prev_suffix == *cur_key) {
+ prev_suffix++;
+ cur_key++;
+ }
+
+ // Equal strings? No branching.
+ if (*prev_suffix == '\0' && *cur_key == '\0') {
+ return kNoBranchFound;
+ }
+
+ if (utf8) {
+ // Rewind to utf8 boundary.
+ size_t offset = i18n_utils::SafeTruncateUtf8Length(key, cur_key - key);
+ cur_key = key + offset;
+ }
+
+ return cur_key - key;
+ } else if (cur_node->log2_num_children() == 0) {
+ // Intermediate node going from no branching to branching.
+ return cur_key - key;
+ }
+
+ // If we've reached this point, then we're already at a branch point. So there
+ // is no *new* branch point.
+ return kNoBranchFound;
+}
+
+std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(const char *key,
+ bool utf8) const {
+ std::vector<int> prefix_lengths;
+
+ if (storage_->empty()) {
+ return prefix_lengths;
+ }
+
+ const Node *cur_node = storage_->GetRootNode();
+ const char *cur_key = key;
+ while (*cur_key && !cur_node->is_leaf()) {
+ // Branching prefix?
+ if (cur_node->log2_num_children() > 0) {
+ int len = cur_key - key;
+ if (utf8) {
+ // Do not cut mid-utf8. Walk up to utf8 boundary.
+ len = i18n_utils::SafeTruncateUtf8Length(key, len);
+ if (prefix_lengths.empty() || len != prefix_lengths.back()) {
+ prefix_lengths.push_back(len);
+ }
+ } else {
+ prefix_lengths.push_back(len);
+ }
+ }
+
+ // Move to next.
+ const Next *found = GetNextByChar(cur_node, *cur_key);
+ if (found == nullptr) {
+ break;
+ }
+ cur_node = storage_->GetNode(found->node_index());
+
+ ++cur_key;
+ }
+ return prefix_lengths;
+}
+
void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
Stats stats;
CollectStats(&stats);
@@ -2248,6 +2398,102 @@
return deleted_bitmap_->SetBit(idx, false);
}
+// Steps:
+// 1. Find the key in the trie.
+// 2. Remove the suffix and the value.
+// 3. Reset the nexts that point to the nodes to be removed.
+// 4. Sort any next array if needed.
+bool IcingDynamicTrie::Delete(const std::string_view key) {
+ if (!is_initialized()) {
+ ICING_LOG(ERROR) << "DynamicTrie not initialized";
+ return false;
+ }
+
+ if (storage_->empty()) {
+ // Nothing to delete.
+ return true;
+ }
+
+ // Tries to find the key in the trie, starting from the root.
+ const Node *current_node = storage_->GetRootNode();
+
+ // The node after which we start to remove data.
+ const Node *last_multichild_node = nullptr;
+
+ // While visiting the trie nodes, we store the indices of Nexts that point
+ // to all the nodes after last_multichild_node. Those nodes must be
+ // consecutive and all have only one child. Resetting those Nexts means that
+ // we remove the data of the key.
+ std::vector<uint32_t> nexts_to_reset;
+ nexts_to_reset.reserve(key.length());
+
+ // Iterates through chars in the key, finds nodes in the trie until a leaf
+ // node is reached. The max number of loops is key.length() + 1 because we
+ // start from the root.
+ for (size_t i = 0; i <= key.length(); ++i) {
+ if (current_node->is_leaf()) {
+ // Leaf node, now check the suffix.
+ if (key.substr(i) != storage_->GetSuffix(current_node->next_index())) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+ // Otherwise, key is found.
+ break;
+ }
+
+ // Finds the next char.
+ const Next *next;
+ if (i == key.length()) {
+ // When we're at the end of the key, the next char is the termination char
+ // '\0'.
+ next = GetNextByChar(current_node, '\0');
+ } else {
+ next = GetNextByChar(current_node, key[i]);
+ }
+
+ if (next == nullptr) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+
+ // Checks the real size of next array.
+ uint32_t next_array_buffer_size = 1u << current_node->log2_num_children();
+ Next *next_array_start = storage_->GetMutableNextArray(
+ current_node->next_index(), next_array_buffer_size);
+ int valid_next_array_size =
+ GetValidNextsSize(next_array_start, next_array_buffer_size);
+ if (valid_next_array_size == 0) {
+ // Key does not exist in the trie, nothing to delete.
+ // This shouldn't happen, but we put a sanity check here in case something
+ // is wrong.
+ return true;
+ } else if (valid_next_array_size == 1) {
+ // Single-child branch will be deleted.
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ } else {
+ // We see a new node with multiple children, all the previously seen nodes
+ // shouldn't be removed.
+ last_multichild_node = current_node;
+ nexts_to_reset.clear();
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ }
+
+ // Updates current_node.
+ current_node = storage_->GetNode(next->node_index());
+ }
+ // Now we've found the key in the trie.
+
+ ClearSuffixAndValue(current_node->next_index());
+
+ // Resets nexts to remove key information.
+ for (uint32_t next_index : nexts_to_reset) {
+ ResetNext(next_index);
+ }
+ SortNextArray(last_multichild_node);
+
+ return true;
+}
+
bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
if (!is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index c33be96..7fe290b 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -288,6 +288,16 @@
// Empty out the trie without closing or removing.
void Clear();
+ // Clears the suffix and value at the given index. Returns true on success.
+ bool ClearSuffixAndValue(uint32_t suffix_value_index);
+
+ // Resets the next at the given index so that it points to no node.
+ // Returns true on success.
+ bool ResetNext(uint32_t next_index);
+
+ // Sorts the next array of the node. Returns true on success.
+ bool SortNextArray(const Node *node);
+
// Sync to disk.
bool Sync() override;
@@ -375,6 +385,16 @@
bool is_full_match() const { return value_index != kInvalidValueIndex; }
};
+ static constexpr int kNoBranchFound = -1;
+  // Returns the length of the prefix at which a new branch would be created
+  // if key were inserted. If utf8 is true, does not cut key mid-utf8. Returns
+  // kNoBranchFound if no new branch would be created.
+ int FindNewBranchingPrefixLength(const char *key, bool utf8) const;
+
+  // Finds the lengths of all prefixes of key at which the trie branches,
+  // excluding the key itself. If utf8 is true, does not cut key mid-utf8.
+ std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const;
+
void GetDebugInfo(int verbosity, std::string *out) const override;
double min_free_fraction() const;
@@ -402,6 +422,10 @@
// Clears the deleted property for each value.
bool ClearDeleted(uint32_t value_index);
+  // Deletes the entry associated with the key. Data cannot be recovered after
+  // the deletion. Returns true on success.
+ bool Delete(std::string_view key);
+
// Clear a specific property id from all values. For each value that has this
// property cleared, also check to see if it was the only property set; if
// so, set the deleted property for the value to indicate it no longer has any
@@ -575,8 +599,6 @@
void GetHeader(IcingDynamicTrieHeader *hdr) const;
void SetHeader(const IcingDynamicTrieHeader &new_hdr);
- static const uint32_t kInvalidNodeIndex;
- static const uint32_t kInvalidNextIndex;
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
@@ -587,7 +609,7 @@
const Next *LowerBound(const Next *start, const Next *end,
uint8_t key_char) const;
void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
- bool prefix) const;
+ bool prefix, bool utf8 = false) const;
// For value properties. This truncates the data by clearing it, but leaving
// the storage intact.
diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc
index 4fae52a..193765b 100644
--- a/icing/legacy/index/icing-dynamic-trie_test.cc
+++ b/icing/legacy/index/icing-dynamic-trie_test.cc
@@ -746,6 +746,222 @@
}
}
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenRootIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts a key, the root is a leaf.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("foo", &value));
+ ASSERT_TRUE(trie.Find("foo", &value));
+
+ // Deletes the key.
+ EXPECT_TRUE(trie.Delete("foo"));
+ EXPECT_FALSE(trie.Find("foo", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenLastCharIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "bar" and "ba", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+ // Deletes "bar". "r" is a leaf node in the trie.
+ EXPECT_TRUE(trie.Delete("bar"));
+ EXPECT_FALSE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithTerminationNode) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "bar" and "ba", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+  // Deletes "ba", which is a key with a termination node in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_FALSE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleNexts) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "ba", "bb", "bc", and "bd", the trie structure looks like:
+ // root
+ // |
+ // b
+ // / | | \
+ // a b c d
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Insert("bb", &value));
+ ASSERT_TRUE(trie.Insert("bc", &value));
+ ASSERT_TRUE(trie.Insert("bd", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+ ASSERT_TRUE(trie.Find("bb", &value));
+ ASSERT_TRUE(trie.Find("bc", &value));
+ ASSERT_TRUE(trie.Find("bd", &value));
+
+ // Deletes "bc".
+ EXPECT_TRUE(trie.Delete("bc"));
+ EXPECT_FALSE(trie.Find("bc", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bb", &value));
+ EXPECT_TRUE(trie.Find("bd", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleTrieBranches) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "batter", "battle", and "bar", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // t r
+ // |
+ // t
+ // / \
+ // e l
+ // | |
+ // r e
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("batter", &value));
+ ASSERT_TRUE(trie.Insert("battle", &value));
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Find("batter", &value));
+ ASSERT_TRUE(trie.Find("battle", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+
+ // Deletes "batter".
+ EXPECT_TRUE(trie.Delete("batter"));
+ EXPECT_FALSE(trie.Find("batter", &value));
+ EXPECT_TRUE(trie.Find("battle", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, InsertionShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+ ASSERT_TRUE(trie.Insert("foo", &value));
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+ ASSERT_FALSE(trie.Find("bed", &value));
+
+ // Inserts after deletion
+ EXPECT_TRUE(trie.Insert("bed", &value));
+ EXPECT_TRUE(trie.Insert("bedroom", &value));
+ EXPECT_TRUE(trie.Find("bed", &value));
+ EXPECT_TRUE(trie.Find("bedroom", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, IteratorShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+ ASSERT_TRUE(trie.Insert("foo", &value));
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+
+ // Iterates through all keys
+ IcingDynamicTrie::Iterator iterator_all(trie, "");
+ std::vector<std::string> results;
+ for (; iterator_all.IsValid(); iterator_all.Advance()) {
+ results.emplace_back(iterator_all.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar", "foo"));
+
+ // Iterates through keys that start with "b"
+ IcingDynamicTrie::Iterator iterator_b(trie, "b");
+ results.clear();
+ for (; iterator_b.IsValid(); iterator_b.Advance()) {
+ results.emplace_back(iterator_b.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar"));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletingNonExistingKeyShouldReturnTrue) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+
+  // "ba" and "bedroom" are not keys in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_TRUE(trie.Delete("bedroom"));
+
+ // The original keys are not affected.
+ EXPECT_TRUE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("bed", &value));
+}
+
} // namespace
// The tests below are accessing private methods and fields of IcingDynamicTrie
diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h
index 31e012a..5a064ea 100644
--- a/icing/legacy/index/icing-mock-filesystem.h
+++ b/icing/legacy/index/icing-mock-filesystem.h
@@ -31,65 +31,78 @@
class IcingMockFilesystem : public IcingFilesystem {
public:
- MOCK_CONST_METHOD1(DeleteFile, bool(const char *file_name));
+ MOCK_METHOD(bool, DeleteFile, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(DeleteDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, DeleteDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(DeleteDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, DeleteDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD1(FileExists, bool(const char *file_name));
+ MOCK_METHOD(bool, FileExists, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(DirectoryExists, bool(const char *dir_name));
+ MOCK_METHOD(bool, DirectoryExists, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(GetBasenameIndex, int(const char *file_name));
+ MOCK_METHOD(int, GetBasenameIndex, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetBasename, std::string(const char *file_name));
+ MOCK_METHOD(std::string, GetBasename, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetDirname, std::string(const char *file_name));
+ MOCK_METHOD(std::string, GetDirname, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD2(ListDirectory, bool(const char *dir_name,
- std::vector<std::string> *entries));
+ MOCK_METHOD(bool, ListDirectory,
+ (const char *dir_name, std::vector<std::string> *entries),
+ (const, override));
- MOCK_CONST_METHOD2(GetMatchingFiles,
- bool(const char *glob, std::vector<std::string> *matches));
+ MOCK_METHOD(bool, GetMatchingFiles,
+ (const char *glob, std::vector<std::string> *matches),
+ (const, override));
- MOCK_CONST_METHOD1(OpenForWrite, int(const char *file_name));
+ MOCK_METHOD(int, OpenForWrite, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForAppend, int(const char *file_name));
+ MOCK_METHOD(int, OpenForAppend, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForRead, int(const char *file_name));
+ MOCK_METHOD(int, OpenForRead, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(int fd));
+ MOCK_METHOD(uint64_t, GetFileSize, (int fd), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(const char *filename));
+ MOCK_METHOD(uint64_t, GetFileSize, (const char *filename), (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Truncate, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(const char *filename, uint64_t new_size));
+ MOCK_METHOD(bool, Truncate, (const char *filename, uint64_t new_size),
+ (const, override));
- MOCK_CONST_METHOD2(Grow, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Grow, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD3(Write, bool(int fd, const void *data, size_t data_size));
- MOCK_CONST_METHOD4(PWrite, bool(int fd, off_t offset, const void *data,
- size_t data_size));
+ MOCK_METHOD(bool, Write, (int fd, const void *data, size_t data_size),
+ (const, override));
+ MOCK_METHOD(bool, PWrite,
+ (int fd, off_t offset, const void *data, size_t data_size),
+ (const, override));
- MOCK_CONST_METHOD1(DataSync, bool(int fd));
+ MOCK_METHOD(bool, DataSync, (int fd), (const, override));
- MOCK_CONST_METHOD2(RenameFile,
- bool(const char *old_name, const char *new_name));
+ MOCK_METHOD(bool, RenameFile, (const char *old_name, const char *new_name),
+ (const, override));
- MOCK_CONST_METHOD2(SwapFiles, bool(const char *one, const char *two));
+ MOCK_METHOD(bool, SwapFiles, (const char *one, const char *two),
+ (const, override));
- MOCK_CONST_METHOD1(CreateDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, CreateDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(CreateDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, CreateDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD2(CopyFile, bool(const char *src, const char *dst));
+ MOCK_METHOD(bool, CopyFile, (const char *src, const char *dst),
+ (const, override));
- MOCK_CONST_METHOD4(ComputeChecksum, bool(int fd, uint32_t *checksum,
- uint64_t offset, uint64_t length));
+ MOCK_METHOD(bool, ComputeChecksum,
+ (int fd, uint32_t *checksum, uint64_t offset, uint64_t length),
+ (const, override));
- MOCK_CONST_METHOD1(GetDiskUsage, uint64_t(const char *path));
+ MOCK_METHOD(uint64_t, GetDiskUsage, (const char *path), (const, override));
};
} // namespace lib
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index 000bf3a..29404d9 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -30,6 +30,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -107,8 +108,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -219,8 +221,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -349,8 +352,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -464,8 +468,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 36dbfd9..0d2c2c5 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -36,6 +36,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -59,8 +60,10 @@
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
SchemaStore::Create(&filesystem_, test_dir_));
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 3b3bf61..676ea92 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -40,6 +40,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -60,8 +61,10 @@
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
// Setup the schema
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
diff --git a/icing/store/document-filter-data.h b/icing/store/document-filter-data.h
index 198bc49..3970132 100644
--- a/icing/store/document-filter-data.h
+++ b/icing/store/document-filter-data.h
@@ -25,6 +25,7 @@
namespace lib {
using SchemaTypeId = int16_t;
+inline constexpr SchemaTypeId kInvalidSchemaTypeId = -1;
class DocumentFilterData {
public:
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 93cebaa..79b91df 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -329,8 +329,22 @@
auto iterator = document_log_->GetIterator();
auto iterator_status = iterator.Advance();
while (iterator_status.ok()) {
- ICING_ASSIGN_OR_RETURN(DocumentWrapper document_wrapper,
- document_log_->ReadProto(iterator.GetOffset()));
+ libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
+ document_log_->ReadProto(iterator.GetOffset());
+
+ if (absl_ports::IsNotFound(document_wrapper_or.status())) {
+ // The erased document still occupies 1 document id.
+ DocumentId new_document_id = document_id_mapper_->num_elements();
+ ICING_RETURN_IF_ERROR(
+ ClearDerivedData(/*name_space=*/"", /*uri=*/"", new_document_id));
+ iterator_status = iterator.Advance();
+ continue;
+ } else if (!document_wrapper_or.ok()) {
+ return document_wrapper_or.status();
+ }
+
+ DocumentWrapper document_wrapper =
+ std::move(document_wrapper_or).ValueOrDie();
if (document_wrapper.deleted()) {
if (!document_wrapper.document().uri().empty()) {
// Individual document deletion.
@@ -351,17 +365,22 @@
}
} else if (!document_wrapper.document().namespace_().empty()) {
// Namespace deletion.
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesNamespaceDeleted(
- document_wrapper.document().namespace_()));
-
+ ICING_ASSIGN_OR_RETURN(
+ NamespaceId namespace_id,
+ namespace_mapper_->Get(document_wrapper.document().namespace_()));
+ // Tombstone indicates it's a soft delete.
+ ICING_RETURN_IF_ERROR(BatchDelete(namespace_id, kInvalidSchemaTypeId,
+ /*soft_delete=*/true));
} else if (!document_wrapper.document().schema().empty()) {
// SchemaType deletion.
auto schema_type_id_or = schema_store_->GetSchemaTypeId(
document_wrapper.document().schema());
if (schema_type_id_or.ok()) {
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(
- schema_type_id_or.ValueOrDie()));
+ // Tombstone indicates it's a soft delete.
+ ICING_RETURN_IF_ERROR(BatchDelete(kInvalidNamespaceId,
+ schema_type_id_or.ValueOrDie(),
+ /*soft_delete=*/true));
} else {
// The deleted schema type doesn't have a SchemaTypeId we can refer
// to in the FilterCache.
@@ -845,7 +864,8 @@
}
libtextclassifier3::Status DocumentStore::Delete(
- const std::string_view name_space, const std::string_view uri) {
+ const std::string_view name_space, const std::string_view uri,
+ bool soft_delete) {
// Try to get the DocumentId first
auto document_id_or = GetDocumentId(name_space, uri);
if (!document_id_or.ok()) {
@@ -865,25 +885,63 @@
", uri: ", uri));
}
+ if (soft_delete) {
+ return SoftDelete(name_space, uri, document_id);
+ } else {
+ uint64_t document_log_offset = file_offset_or.ValueOrDie();
+ return HardDelete(name_space, uri, document_id, document_log_offset);
+ }
+}
+
+libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
+ bool soft_delete) {
+ // Copy out the document to get namespace and uri.
+ ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
+ DoesDocumentExistAndGetFileOffset(document_id));
+ auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
+ if (!document_wrapper_or.ok()) {
+ ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
+ << "Failed to read from document log";
+ return document_wrapper_or.status();
+ }
+ DocumentWrapper document_wrapper =
+ std::move(document_wrapper_or).ValueOrDie();
+
+ if (soft_delete) {
+ return SoftDelete(document_wrapper.document().namespace_(),
+ document_wrapper.document().uri(), document_id);
+ } else {
+ return HardDelete(document_wrapper.document().namespace_(),
+ document_wrapper.document().uri(), document_id,
+ document_log_offset);
+ }
+}
+
+libtextclassifier3::Status DocumentStore::SoftDelete(
+ std::string_view name_space, std::string_view uri, DocumentId document_id) {
// Update ground truth first.
- // To delete a proto we don't directly remove it. Instead, we mark it as
- // deleted first by appending a tombstone of it and actually remove it from
- // file later in Optimize()
- // TODO(b/144458732): Implement a more robust version of ICING_RETURN_IF_ERROR
- // that can support error logging.
+ // Mark the document as deleted by appending a tombstone of it and actually
+ // remove it from file later in Optimize()
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
libtextclassifier3::Status status =
document_log_->WriteProto(CreateDocumentTombstone(name_space, uri))
.status();
if (!status.ok()) {
return absl_ports::Annotate(
- status, absl_ports::StrCat("Failed to delete Document. namespace: ",
+ status, absl_ports::StrCat("Failed to delete Document. namespace:",
name_space, ", uri: ", uri));
}
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id_or.ValueOrDie(), kDocDeletedFlag));
+ return document_id_mapper_->Set(document_id, kDocDeletedFlag);
+}
- return libtextclassifier3::Status::OK;
+libtextclassifier3::Status DocumentStore::HardDelete(
+ std::string_view name_space, std::string_view uri, DocumentId document_id,
+ uint64_t document_log_offset) {
+ // Erases document proto.
+ ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
+ return ClearDerivedData(name_space, uri, document_id);
}
libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
@@ -899,7 +957,14 @@
<< " from score_cache_";
return score_data_or.status();
}
- return *std::move(score_data_or).ValueOrDie();
+
+ DocumentAssociatedScoreData document_associated_score_data =
+ *std::move(score_data_or).ValueOrDie();
+ if (document_associated_score_data.document_score() < 0) {
+    // A negative / invalid score means that the score data has been deleted.
+ return absl_ports::NotFoundError("Document score data not found.");
+ }
+ return document_associated_score_data;
}
libtextclassifier3::StatusOr<DocumentFilterData>
@@ -910,68 +975,134 @@
<< " from filter_cache_";
return filter_data_or.status();
}
- return *std::move(filter_data_or).ValueOrDie();
+ DocumentFilterData document_filter_data =
+ *std::move(filter_data_or).ValueOrDie();
+ if (document_filter_data.namespace_id() == kInvalidNamespaceId) {
+ // An invalid namespace id means that the filter data has been deleted.
+ return absl_ports::NotFoundError("Document filter data not found.");
+ }
+ return document_filter_data;
}
libtextclassifier3::Status DocumentStore::DeleteByNamespace(
- std::string_view name_space) {
+ std::string_view name_space, bool soft_delete) {
auto namespace_id_or = namespace_mapper_->Get(name_space);
if (!namespace_id_or.ok()) {
return absl_ports::Annotate(
namespace_id_or.status(),
- absl_ports::StrCat("Failed to delete by namespace. namespace: ",
- name_space));
+ absl_ports::StrCat("Failed to find namespace: ", name_space));
+ }
+ NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+
+ int num_updated_documents = 0;
+ if (soft_delete) {
+    // To soft-delete an entire namespace, we append a tombstone that only contains
+ // the deleted bit and the name of the deleted namespace.
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
+ libtextclassifier3::Status status =
+ document_log_->WriteProto(CreateNamespaceTombstone(name_space))
+ .status();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete namespace. namespace = "
+ << name_space;
+ return status;
+ }
}
- // Update ground truth first.
- // To delete an entire namespace, we append a tombstone that only contains
- // the deleted bit and the name of the deleted namespace.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateNamespaceTombstone(name_space)).status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete namespace. namespace = "
- << name_space;
- return status;
- }
+ ICING_ASSIGN_OR_RETURN(
+ num_updated_documents,
+ BatchDelete(namespace_id, kInvalidSchemaTypeId, soft_delete));
- ICING_ASSIGN_OR_RETURN(bool updated_existing_document,
- UpdateDerivedFilesNamespaceDeleted(name_space));
- if (!updated_existing_document) {
+ if (num_updated_documents <= 0) {
// Treat the fact that no existing documents had this namespace to be the
// same as this namespace not existing at all.
return absl_ports::NotFoundError(
absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
}
+
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<bool>
-DocumentStore::UpdateDerivedFilesNamespaceDeleted(std::string_view name_space) {
- auto namespace_id_or = namespace_mapper_->Get(name_space);
- if (!namespace_id_or.ok()) {
- return namespace_id_or.status();
+libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
+ std::string_view schema_type, bool soft_delete) {
+ auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
+ if (!schema_type_id_or.ok()) {
+ return absl_ports::Annotate(
+ schema_type_id_or.status(),
+ absl_ports::StrCat("Failed to find schema type. schema_type: ",
+ schema_type));
+ }
+ SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
+
+ int num_updated_documents = 0;
+ if (soft_delete) {
+ // To soft-delete an entire schema type, we append a tombstone that only
+ // contains the deleted bit and the name of the deleted schema type.
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
+ libtextclassifier3::Status status =
+ document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
+ .status();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete schema_type. schema_type = "
+ << schema_type;
+ return status;
+ }
}
- // Guaranteed to have a NamespaceId now.
- NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+ ICING_ASSIGN_OR_RETURN(
+ num_updated_documents,
+ BatchDelete(kInvalidNamespaceId, schema_type_id, soft_delete));
+ if (num_updated_documents <= 0) {
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "No documents found with schema type '", schema_type, "'"));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
+ NamespaceId namespace_id, SchemaTypeId schema_type_id, bool soft_delete) {
// Tracks if there were any existing documents with this namespace that we
// will mark as deleted.
- bool updated_existing_document = false;
+ int num_updated_documents = 0;
- // Traverse FilterCache and delete all docs that match namespace_id
+ // Traverse FilterCache and delete all docs that match namespace_id and
+ // schema_type_id.
for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
++document_id) {
// filter_cache_->Get can only fail if document_id is < 0
// or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
filter_cache_->Get(document_id));
- if (data->namespace_id() == namespace_id) {
+
+ // Check namespace only when the input namespace id is valid.
+ if (namespace_id != kInvalidNamespaceId &&
+ (data->namespace_id() == kInvalidNamespaceId ||
+ data->namespace_id() != namespace_id)) {
+ // The document has already been hard-deleted or isn't from the desired
+ // namespace.
+ continue;
+ }
+
+ // Check schema type only when the input schema type id is valid.
+ if (schema_type_id != kInvalidSchemaTypeId &&
+ (data->schema_type_id() == kInvalidSchemaTypeId ||
+ data->schema_type_id() != schema_type_id)) {
+ // The document has already been hard-deleted or doesn't have the
+ // desired schema type.
+ continue;
+ }
+
+    // The document has the desired namespace and schema type; it either exists
+ // or has been soft-deleted / expired.
+ if (soft_delete) {
if (DoesDocumentExist(document_id)) {
- updated_existing_document = true;
+ ++num_updated_documents;
}
// docid_mapper_->Set can only fail if document_id is < 0
@@ -980,65 +1111,29 @@
// docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
ICING_RETURN_IF_ERROR(
document_id_mapper_->Set(document_id, kDocDeletedFlag));
+ } else {
+ // Hard delete. Try to copy out the document to get namespace and uri.
+ // Getting namespace and uri is necessary to delete entries in
+ // document_key_mapper_.
+ auto document_or = Get(document_id);
+ if (absl_ports::IsNotFound(document_or.status())) {
+ // Document not found.
+ continue;
+ } else if (!document_or.ok()) {
+ // Real error, pass up.
+ return document_or.status();
+ }
+ DocumentProto document_copy = std::move(document_or).ValueOrDie();
+
+ // Erase from the ground truth. Delete() won't return NOT_FOUND because
+ // NOT_FOUND should have been caught by Get() above.
+ ICING_RETURN_IF_ERROR(Delete(document_copy.namespace_(),
+ document_copy.uri(), /*soft_delete=*/false));
+ ++num_updated_documents;
}
}
- return updated_existing_document;
-}
-
-libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
- std::string_view schema_type) {
- auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
- if (!schema_type_id_or.ok()) {
- return absl_ports::Annotate(
- schema_type_id_or.status(),
- absl_ports::StrCat("Failed to delete by schema type. schema_type: ",
- schema_type));
- }
-
- // Update ground truth first.
- // To delete an entire schema type, we append a tombstone that only contains
- // the deleted bit and the name of the deleted schema type.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
- .status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete schema_type. schema_type = "
- << schema_type;
- return status;
- }
-
- // Guaranteed to have a SchemaTypeId now
- SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
-
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(schema_type_id));
-
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status DocumentStore::UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id) {
- // Traverse FilterCache and delete all docs that match schema_type_id.
- for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
- ++document_id) {
- // filter_cache_->Get can only fail if document_id is < 0
- // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
- ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
- filter_cache_->Get(document_id));
- if (data->schema_type_id() == schema_type_id) {
- // docid_mapper_->Set can only fail if document_id is < 0
- // or >= docid_mapper_->num_elements. So the only possible way to get an
- // error here would be if filter_cache_->num_elements >
- // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id, kDocDeletedFlag));
- }
- }
-
- return libtextclassifier3::Status::OK;
+ return num_updated_documents;
}
libtextclassifier3::Status DocumentStore::PersistToDisk() {
@@ -1328,5 +1423,27 @@
return filter_cache_->Set(document_id, filter_data);
}
+libtextclassifier3::Status DocumentStore::ClearDerivedData(
+ const std::string_view name_space, const std::string_view uri,
+ DocumentId document_id) {
+ if (!name_space.empty() && !uri.empty()) {
+ document_key_mapper_->Delete(MakeFingerprint(name_space, uri));
+ }
+
+ ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
+
+ // Resets the score cache entry
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ document_id, DocumentAssociatedScoreData(/*document_score=*/-1,
+ /*creation_timestamp_ms=*/-1)));
+
+ // Resets the filter cache entry
+ ICING_RETURN_IF_ERROR(UpdateFilterCache(
+ document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
+ /*expiration_timestamp_ms=*/-1)));
+
+ return libtextclassifier3::Status::OK;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 3f4b72f..52ea176 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -147,17 +147,40 @@
// boolean whether a document exists or not
bool DoesDocumentExist(DocumentId document_id) const;
- // Deletes the document identified by the given namespace and uri
+ // Deletes the document identified by the given namespace and uri. The
+ // document proto will be marked as deleted if 'soft_delete' is true,
+ // otherwise the document proto will be erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if no document exists with namespace, uri
// INTERNAL_ERROR on IO error
libtextclassifier3::Status Delete(std::string_view name_space,
- std::string_view uri);
+ std::string_view uri,
+ bool soft_delete = false);
+
+ // Deletes the document identified by the given document_id. The
+ // document proto will be marked as deleted if 'soft_delete' is true,
+ // otherwise the document proto will be erased immediately.
+ //
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ // INVALID_ARGUMENT if document_id is invalid.
+ libtextclassifier3::Status Delete(DocumentId document_id,
+ bool soft_delete = false);
// Returns the NamespaceId of the string namespace
//
@@ -180,6 +203,7 @@
// DocumentAssociatedScoreData on success
// OUT_OF_RANGE if document_id is negative or exceeds previously seen
// DocumentIds
+ // NOT_FOUND if no score data is found
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const;
@@ -194,30 +218,43 @@
// DocumentFilterData on success
// OUT_OF_RANGE if document_id is negative or exceeds previously seen
// DocumentIds
+ // NOT_FOUND if no filter data is found
libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
DocumentId document_id) const;
- // Deletes all documents belonging to the given namespace.
+ // Deletes all documents belonging to the given namespace. The documents will
+ // be marked as deleted if 'soft_delete' is true, otherwise they will be
+ // erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if namespace doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteByNamespace(std::string_view name_space);
+ libtextclassifier3::Status DeleteByNamespace(std::string_view name_space,
+ bool soft_delete = false);
- // Deletes all documents belonging to the given schema type
+ // Deletes all documents belonging to the given schema type. The documents
+ // will be marked as deleted if 'soft_delete' is true, otherwise they will be
+ // erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if schema_type doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type);
+ libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type,
+ bool soft_delete = false);
// Syncs all the data and metadata changes to disk.
//
@@ -424,32 +461,44 @@
// INTERNAL on I/O error
libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
- // Update derived files that `name_space` has been deleted. This is primarily
- // useful if we're trying to update derived files when we've already seen a
- // namespace tombstone, and don't need to write another tombstone.
+ // Helper function to do batch deletes. Documents with the given
+ // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
+ // to specify the namespace or schema type, pass in kInvalidNamespaceId or
+ // kInvalidSchemaTypeId. The document protos will be marked as deleted if
+ // 'soft_delete' is true, otherwise the document protos with their derived
+ // data will be erased / cleared immediately.
//
// NOTE: Space is not reclaimed in the derived files until Optimize() is
// called.
//
// Returns:
- // bool on whether an existing document was actually updated to be deleted
+ // Number of documents that were actually updated to be deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<bool> UpdateDerivedFilesNamespaceDeleted(
- std::string_view name_space);
+ libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
+ SchemaTypeId schema_type_id,
+ bool soft_delete);
- // Update derived files that the schema type schema_type_id has been deleted.
- // This is primarily useful if we're trying to update derived files when we've
- // already seen a schema type tombstone, and don't need to write another
- // tombstone.
- //
- // NOTE: Space is not reclaimed in the derived files until Optimize() is
- // called.
+ // Marks the document identified by the given name_space, uri and document_id
+ // as deleted, to be removed later during Optimize().
//
// Returns:
// OK on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id);
+ libtextclassifier3::Status SoftDelete(std::string_view name_space,
+ std::string_view uri,
+ DocumentId document_id);
+
+ // Erases the document identified by the given name_space, uri and document_id
+ // from the document_log and erases its uri from the document_key_mapper_, the
+ // space will be reclaimed later during Optimize().
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status HardDelete(std::string_view name_space,
+ std::string_view uri,
+ DocumentId document_id,
+ uint64_t document_log_offset);
// Helper method to find a DocumentId that is associated with the given
// namespace and uri.
@@ -488,6 +537,11 @@
// Updates the entry in the filter cache for document_id.
libtextclassifier3::Status UpdateFilterCache(
DocumentId document_id, const DocumentFilterData& filter_data);
+
+ // Helper method to clear the derived data of a document
+ libtextclassifier3::Status ClearDerivedData(std::string_view name_space,
+ std::string_view uri,
+ DocumentId document_id);
};
} // namespace lib
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index ad56b9a..f857481 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -60,9 +60,6 @@
: test_dir_(GetTestTempDir() + "/icing"),
document_store_dir_(test_dir_ + "/document_store"),
schema_store_dir_(test_dir_ + "/schema_store") {
- filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
test_document1_ =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -88,6 +85,11 @@
}
void SetUp() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("email");
@@ -270,7 +272,7 @@
IsFalse());
}
-TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) {
+TEST_F(DocumentStoreTest, GetSoftDeletedDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -281,7 +283,26 @@
IsOkAndHolds(EqualsProto(test_document1_)));
ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ test_document1_.uri(),
+ /*soft_delete=*/true));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, GetHardDeletedDocumentNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri(),
+ /*soft_delete=*/false));
EXPECT_THAT(
document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -343,20 +364,6 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteOk) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
-
- // Get() after Delete() returns NOT_FOUND
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- doc_store->Put(DocumentProto(test_document1_)));
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
- EXPECT_THAT(doc_store->Get(document_id),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
@@ -394,7 +401,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -422,7 +429,8 @@
// DELETE namespace.1. document1 and document 4 should be deleted. document2
// and document3 should still be retrievable.
- ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1"));
+ ICING_EXPECT_OK(
+ doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/true));
EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
@@ -433,7 +441,47 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ DocumentProto document1 = test_document1_;
+ document1.set_namespace_("namespace.1");
+ document1.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document1));
+
+ DocumentProto document2 = test_document1_;
+ document2.set_namespace_("namespace.2");
+ document2.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document2));
+
+ DocumentProto document3 = test_document1_;
+ document3.set_namespace_("namespace.3");
+ document3.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document3));
+
+ DocumentProto document4 = test_document1_;
+ document4.set_namespace_("namespace.1");
+ document4.set_uri("uri2");
+ ICING_ASSERT_OK(doc_store->Put(document4));
+
+ // DELETE namespace.1. document1 and document 4 should be deleted. document2
+ // and document3 should still be retrievable.
+ ICING_EXPECT_OK(
+ doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/false));
+ EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
+ IsOkAndHolds(EqualsProto(document2)));
+ EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()),
+ IsOkAndHolds(EqualsProto(document3)));
+ EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNonexistentNamespaceNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -444,7 +492,8 @@
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace"),
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace",
+ /*soft_delete=*/true),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
@@ -452,7 +501,27 @@
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceNonexistentNamespaceNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace",
+ /*soft_delete=*/false),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -464,7 +533,25 @@
// At this point, there are no existing documents with the namespace, even
// though Icing's derived files know about this namespace. We should still
// return NOT_FOUND since nothing existing has this namespace.
- EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_()),
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(),
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceNoExistingDocumentsNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ // At this point, there are no existing documents with the namespace, even
+ // though Icing's derived files know about this namespace. We should still
+ // return NOT_FOUND since nothing existing has this namespace.
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(),
+ /*soft_delete=*/false),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
@@ -536,7 +623,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeOk) {
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("email");
@@ -593,7 +680,8 @@
// Delete the "email" type and ensure that it works across both
// email_document's namespaces. And that other documents aren't affected.
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("email", /*soft_delete=*/true));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -604,7 +692,8 @@
IsOkAndHolds(EqualsProto(person_document)));
// Delete the "message" type and check that other documents aren't affected
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("message"));
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("message", /*soft_delete=*/true));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -615,7 +704,88 @@
IsOkAndHolds(EqualsProto(person_document)));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+ type_config = schema.add_types();
+ type_config->set_schema_type("person");
+
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ DocumentProto email_document_1 = DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_1_document_id,
+ document_store->Put(email_document_1));
+
+ DocumentProto email_document_2 = DocumentBuilder()
+ .SetKey("namespace2", "2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_2_document_id,
+ document_store->Put(email_document_2));
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("message")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store->Put(message_document));
+
+ DocumentProto person_document = DocumentBuilder()
+ .SetKey("namespace", "4")
+ .SetSchema("person")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id,
+ document_store->Put(person_document));
+
+ // Delete the "email" type and ensure that it works across both
+ // email_document's namespaces. And that other documents aren't affected.
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("email", /*soft_delete=*/false));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+
+ // Delete the "message" type and check that other documents aren't affected
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("message", /*soft_delete=*/false));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -626,7 +796,8 @@
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type"),
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type",
+ /*soft_delete=*/true),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
@@ -635,7 +806,28 @@
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) {
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type",
+ /*soft_delete=*/false),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -644,10 +836,23 @@
ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
test_document1_.uri()));
- // At this point, there are no existing documents with the schema type, but we
- // still return OK because the SchemaStore is the ground truth on schemas and
- // knows about the type
- ICING_EXPECT_OK(document_store->DeleteBySchemaType(test_document1_.schema()));
+ EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(),
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNoExistingDocumentsNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(),
+ /*soft_delete=*/false),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
@@ -1177,7 +1382,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) {
+TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1193,14 +1398,71 @@
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
- // FilterCache doesn't care if the document has been deleted
- ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
+ // Soft deletion keeps the associated entry of the document intact.
+ EXPECT_THAT(doc_store->GetDocumentFilterData(document_id).status(), IsOk());
+}
+
+TEST_F(DocumentStoreTest, HardDeleteClearsFilterCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
EXPECT_THAT(
doc_store->GetDocumentFilterData(document_id),
IsOkAndHolds(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+ // Associated entry of the deleted document is removed.
+ EXPECT_THAT(doc_store->GetDocumentFilterData(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
+ // Soft deletion keeps the associated entry of the document intact.
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id).status(),
+ IsOk());
+}
+
+TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+ // Associated entry of the deleted document is removed.
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(DocumentStoreTest,
diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h
index 4571df2..23c7b69 100644
--- a/icing/store/key-mapper.h
+++ b/icing/store/key-mapper.h
@@ -84,6 +84,9 @@
// Returns any encountered IO errors.
libtextclassifier3::StatusOr<T> Get(std::string_view key) const;
+ // Deletes data related to the given key. Returns true on success.
+ bool Delete(std::string_view key);
+
// Returns a map of values to keys. Empty map if the mapper is empty.
std::unordered_map<T, std::string> GetValuesToKeys() const;
@@ -255,6 +258,11 @@
}
template <typename T>
+bool KeyMapper<T>::Delete(std::string_view key) {
+ return trie_.Delete(key);
+}
+
+template <typename T>
std::unordered_map<T, std::string> KeyMapper<T>::GetValuesToKeys() const {
std::unordered_map<T, std::string> values_to_keys;
for (IcingDynamicTrie::Iterator itr(trie_, /*prefix=*/""); itr.IsValid();
diff --git a/icing/store/namespace-id.h b/icing/store/namespace-id.h
index 4225be3..374e7a8 100644
--- a/icing/store/namespace-id.h
+++ b/icing/store/namespace-id.h
@@ -22,6 +22,7 @@
// Id of unique namespace in DocumentProto. Generated in DocumentStore.
using NamespaceId = int16_t;
+inline constexpr NamespaceId kInvalidNamespaceId = -1;
} // namespace lib
} // namespace icing
diff --git a/icing/tokenization/icu/icu-language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc
index 0ef1824..9213fbe 100644
--- a/icing/tokenization/icu/icu-language-segmenter-factory.cc
+++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc
@@ -15,6 +15,7 @@
#include "icing/tokenization/icu/icu-language-segmenter.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 31c2726..d0b90d1 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -409,6 +409,71 @@
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ IteratorMultipleAdvancesResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
language_segmenter_factory::Create(GetOptions()));
@@ -992,6 +1057,19 @@
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Validates that query syntax characters are returned as individual terms.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, IcuLanguageSegmenterAllLocalesTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
diff --git a/icing/tokenization/ios/ios-language-segmenter-factory.cc b/icing/tokenization/ios/ios-language-segmenter-factory.cc
new file mode 100644
index 0000000..3af7914
--- /dev/null
+++ b/icing/tokenization/ios/ios-language-segmenter-factory.cc
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/ios/ios-language-segmenter.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+namespace {
+constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
+} // namespace
+
+// Creates a language segmenter with the given locale.
+//
+// Returns:
+// A LanguageSegmenter on success
+// INVALID_ARGUMENT if locale string is invalid
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions options) {
+ // Word connector rules for "en_US_POSIX" (American English (Computer)) are
+ // different from other locales. E.g. "email.subject" will be split into 3
+ // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
+ // term in other locales. Our current LanguageSegmenter doesn't handle this
+ // special rule, so we replace it with "en_US".
+ if (options.locale == kLocaleAmericanEnglishComputer) {
+ ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer
+ << " not supported. Converting to locale en_US";
+ options.locale = "en_US";
+ }
+ return std::make_unique<IosLanguageSegmenter>(std::move(options.locale));
+}
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/ios/ios-language-segmenter.h b/icing/tokenization/ios/ios-language-segmenter.h
new file mode 100644
index 0000000..1aa1f1b
--- /dev/null
+++ b/icing/tokenization/ios/ios-language-segmenter.h
@@ -0,0 +1,88 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_IOS_IOS_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_IOS_IOS_LANGUAGE_SEGMENTER_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+// This class is used to segment sentences into words based on rules from
+// CFStringTokenizer, some extra rules are applied in this class:
+//
+// 1. All ASCII terms will be returned.
+// 2. For non-ASCII terms, only the alphabetic terms are returned, which means
+// non-ASCII punctuation and special characters are left out.
+// 3. Multiple continuous whitespaces are treated as one.
+//
+// The rules above are common to the high-level tokenizers that might use this
+// class. Other special tokenization logic will be in each tokenizer.
+//
+// This implementation has a few notable deviations from the ICU-based
+// implementations:
+// 1. This implementation doesn't treat ':' as a word connector. ICU does.
+// 2. When the locale is Japanese, this implementation treats internal periods
+//    as word breaks rather than connectors. "N.B.A." becomes {"N", ".",
+// "B", ".", "A", "."} rather than {"N.B.A", "."} (which is what ICU and
+//    all other locales do).
+// 3. Locale can have other effects on segmentation - this is often when the
+// wrong locale is specified for CJKT text.
+// 4. Some CJKT segmentation deviates from ICU results even when the correct
+// locale is specified.
+class IosLanguageSegmenter : public LanguageSegmenter {
+ public:
+ explicit IosLanguageSegmenter(std::string locale)
+ : locale_(std::move(locale)) {}
+
+ IosLanguageSegmenter(const IosLanguageSegmenter&) = delete;
+ IosLanguageSegmenter& operator=(const IosLanguageSegmenter&) = delete;
+
+ // The segmentation depends on the language detected in the input text.
+ //
+ // Note: It could happen that the language detected from text is wrong, then
+ // there would be a small chance that the text is segmented incorrectly.
+ //
+ // Returns:
+ // An iterator of terms on success
+ // INTERNAL_ERROR if any error occurs
+ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ Segment(std::string_view text) const override;
+
+ // The segmentation depends on the language detected in the input text.
+ //
+ // Note: It could happen that the language detected from text is wrong, then
+ // there would be a small chance that the text is segmented incorrectly.
+ //
+ // Returns:
+ // A list of terms on success
+ // INTERNAL_ERROR if any error occurs
+ libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
+ std::string_view text) const override;
+
+ private:
+ std::string locale_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_IOS_IOS_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/ios/ios-language-segmenter_test.cc b/icing/tokenization/ios/ios-language-segmenter_test.cc
new file mode 100644
index 0000000..b6831e2
--- /dev/null
+++ b/icing/tokenization/ios/ios-language-segmenter_test.cc
@@ -0,0 +1,1265 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+// Returns a vector containing all terms retrieved by Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsAdvance(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling
+// ResetToStart/ResetAfter with the current position to simulate Advancing on
+// the iterator.
+std::vector<std::string_view> GetAllTermsResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ if (!itr->ResetToStart().ok()) {
+ return terms;
+ }
+ terms.push_back(itr->GetTerm());
+ const char* text_begin = itr->GetTerm().data();
+ // Calling ResetToTermStartingAfter with the current position should get the
+ // very next term in the sequence.
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by alternating calls to
+// Advance and calls to ResetAfter with the current position to simulate
+// Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ std::vector<std::string_view> terms;
+
+ bool is_ok = true;
+ int current_pos = 0;
+ while (is_ok) {
+ // Alternate between using Advance and ResetToTermAfter.
+ if (terms.size() % 2 == 0) {
+ is_ok = itr->Advance();
+ } else {
+ // Calling ResetToTermStartingAfter with the current position should get
+ // the very next term in the sequence.
+ current_pos = itr->GetTerm().data() - text_begin;
+ is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
+ }
+ if (is_ok) {
+ terms.push_back(itr->GetTerm());
+ }
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetBefore with
+// the current position, starting at the end of the text. This vector should be
+// in reverse order of GetAllTerms and missing the last term.
+std::vector<std::string_view> GetAllTermsResetBefore(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ int last_pos = 0;
+ while (itr->Advance()) {
+ last_pos = itr->GetTerm().data() - text_begin;
+ }
+ std::vector<std::string_view> terms;
+ // Calling ResetToTermEndingBefore with the current position should get the
+ // previous term in the sequence.
+ for (int current_pos = last_pos;
+ itr->ResetToTermEndingBefore(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+class IosLanguageSegmenterAllLocalesTest
+ : public testing::TestWithParam<const char*> {
+ protected:
+ static std::string GetLocale() { return GetParam(); }
+ static language_segmenter_factory::SegmenterOptions GetOptions() {
+ return language_segmenter_factory::SegmenterOptions(GetLocale());
+ }
+};
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, EmptyText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, SimpleText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII punctuation marks are kept
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("Hello, World!!!"),
+ IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+ IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+ IsOkAndHolds(ElementsAre("100", "%")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
+ IsOkAndHolds(ElementsAre("A", "&", "B")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII special characters are kept
+ EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
+ IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
+ IsOkAndHolds(ElementsAre("A", "+", "B")));
+ // 0x0009 is the unicode for tab (within ASCII range).
+ std::string text_with_tab = absl_ports::StrCat(
+ "Hello", UCharToString(0x0009), UCharToString(0x0009), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
+ IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009),
+ UCharToString(0x0009), "World")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Full-width (non-ASCII) punctuation marks and special characters are left
+ // out.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms("。?·Hello!×"));
+ EXPECT_THAT(terms, ElementsAre("Hello"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, Acronym) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // LOCALE DEVIATION!! When the locale is Japanese, internal periods are
+ // considered word breaks.
+ std::vector<std::string> exp_terms;
+ if (GetOptions().locale == ULOC_JAPAN) {
+ exp_terms = {"U", ".", "S", ".", " ", "Bank"};
+ } else {
+ exp_terms = {"U.S", ".", " ", "Bank"};
+ }
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
+ IsOkAndHolds(ElementsAreArray(exp_terms)));
+
+ // LOCALE DEVIATION!! When the locale is Japanese, internal periods are
+ // considered word breaks.
+ if (GetOptions().locale == ULOC_JAPAN) {
+ exp_terms = {"I", ".", "B", ".", "M", "."};
+ } else {
+ exp_terms = {"I.B.M", "."};
+ }
+ EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
+ IsOkAndHolds(ElementsAreArray(exp_terms)));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
+ IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
+ IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // According to unicode word break rules
+ // WB6(https://unicode.org/reports/tr29/#WB6),
+ // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
+ // punctuation characters are used as word connecters. That is, words don't
+ // break before and after them. Here we just test some that we care about.
+
+ // Word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
+ IsOkAndHolds(ElementsAre("com.google.android")));
+ // DIFFERENCE!! iOS doesn't agree that ':' is a word connector
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> term,
+ language_segmenter->GetAllTerms("com:google:android"));
+ EXPECT_THAT(term, ElementsAre("com", ":", "google", ":", "android"));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
+ IsOkAndHolds(ElementsAre("com'google'android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
+ IsOkAndHolds(ElementsAre("com_google_android")));
+
+ // Word connecters can be mixed
+ // DIFFERENCE!! iOS doesn't agree that ':' is a word connector
+ // TODO(b/157565185) resolve the handling of ':' as a connector.
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
+ IsOkAndHolds(ElementsAre("com.google.android", ":", "icing")));
+
+ // Any heading and trailing characters are not connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
+ IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
+
+ // Not word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
+ IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
+ IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
+ IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
+ IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
+ IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
+ IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
+ IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
+ IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
+ IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com\"google\"android"),
+ IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+}
+
+// Verifies apostrophe handling: an apostrophe between letters stays inside
+// the token (contractions), a leading apostrophe is its own token, and the
+// right single quote (U+2019) behaves the same as the ASCII apostrophe.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Apostrophes) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
+              IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
+              IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
+              IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
+  // A trailing (possessive) apostrophe is segmented as its own token.
+  EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
+              IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
+  // 0x2019 is the single right quote, should be treated the same as "'"
+  std::string token_with_quote =
+      absl_ports::StrCat("He", UCharToString(0x2019), "ll");
+  std::string text_with_quote =
+      absl_ports::StrCat(token_with_quote, " be back.");
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms(text_with_quote),
+      IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
+}
+
+// Parentheses are always standalone tokens, regardless of whether they are
+// correctly paired around the word.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Parentheses) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
+              IsOkAndHolds(ElementsAre("(", "Hello", ")")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
+              IsOkAndHolds(ElementsAre(")", "Hello", "(")));
+}
+
+// Double and single quotes surrounding a word are standalone tokens.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Quotes) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
+              IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
+              IsOkAndHolds(ElementsAre("'", "Hello", "'")));
+}
+
+// Mixed letter-digit terms are kept as single tokens.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Alphanumeric) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  // Alphanumeric terms are allowed
+  EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+              IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+// Numeric terms stay whole: decimal points and thousands separators are kept
+// inside the token, but a leading minus sign is split off.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Number) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  // A decimal point inside digits does not break the token.
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+      IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+  // Comma separators inside digits do not break the token either.
+  EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+              IsOkAndHolds(ElementsAre("3,456.789")));
+
+  // The sign is NOT part of the numeric token.
+  EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+              IsOkAndHolds(ElementsAre("-", "123")));
+}
+
+// Runs of consecutive whitespace collapse into a single whitespace token,
+// both in the middle of text and at the beginning.
+TEST_P(IosLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Multiple continuous whitespaces are treated as one.
+  const int kNumSeparators = 256;
+  std::string text_with_spaces =
+      absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+  EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+              IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+
+  // Multiple continuous whitespaces are treated as one. Whitespace at the
+  // beginning of the text doesn't affect the results of GetTerm() after the
+  // iterator is done.
+  text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
+                                        "Hello", " ", "World");
+  ICING_ASSERT_OK_AND_ASSIGN(auto itr,
+                             language_segmenter->Segment(text_with_spaces));
+  std::vector<std::string_view> terms;
+  while (itr->Advance()) {
+    terms.push_back(itr->GetTerm());
+  }
+  EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World"));
+  // Once the iterator is exhausted, GetTerm() returns an empty view.
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// Segmentation of the four main languages that do not delimit words with
+// whitespace, with documented deviations from ICU per locale.
+TEST_P(IosLanguageSegmenterAllLocalesTest, CJKT) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
+  // have whitespaces as word delimiter.
+
+  // Chinese
+  // DIFFERENCE/LOCALE DEVIATION!! SIMPLIFIED_CHINESE agrees with ICU that
+  // "每天" should be treated as a single token. All other locales split it
+  // into two tokens.
+  std::vector<std::string> exp_terms;
+  if (GetOptions().locale == ULOC_SIMPLIFIED_CHINESE) {
+    exp_terms = {"我", "每天", "走路", "去", "上班"};
+  } else if (GetOptions().locale == ULOC_JAPAN) {
+    // LOCALE DEVIATION!! JAPANESE groups "去上" and leaves "班" on its own.
+    // All other locales, like ICU, break the text into "去" and "上班".
+    exp_terms = {"我", "每", "天", "走路", "去上", "班"};
+  } else {
+    exp_terms = {"我", "每", "天", "走路", "去", "上班"};
+  }
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::string_view> terms,
+      language_segmenter->GetAllTerms("我每天走路去上班。"));
+  EXPECT_THAT(terms, ElementsAreArray(exp_terms));
+
+  // Japanese
+  // DIFFERENCE!! Disagreement over how to segment "歩い" (iOS groups) and
+  // "てい" (iOS splits). This difference persists even when locale is set to
+  // JAPAN.
+  if (GetOptions().locale == ULOC_SIMPLIFIED_CHINESE ||
+      GetOptions().locale == ULOC_TRADITIONAL_CHINESE) {
+    // LOCALE DEVIATION!! There is also disagreement when locale is CHINESE
+    // about how to tokenize "毎日", "仕事", "歩い", which are all split, and
+    // "てい" which is grouped.
+    exp_terms = {"私", "は", "毎", "日", "仕", "事",
+                 "に", "歩", "い", "てい", "ます"};
+  } else {
+    exp_terms = {"私", "は", "毎日", "仕事", "に", "歩い", "て", "い", "ます"};
+  }
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"));
+  EXPECT_THAT(terms, ElementsAreArray(exp_terms));
+
+  // Khmer
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"));
+  EXPECT_THAT(terms, ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"));
+
+  // Thai
+  // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
+  // This difference persists even when locale is set to THAI
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"));
+  EXPECT_THAT(terms, ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุกวัน"));
+}
+
+// Accented Latin letters are treated as regular letters and kept in one
+// token.
+TEST_P(IosLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
+              IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
+}
+
+// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
+// Languages that delimit words with whitespace (here Turkish and Korean)
+// segment on the whitespace; the whitespace itself is emitted as a token.
+TEST_P(IosLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Turkish
+  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+                             language_segmenter->GetAllTerms("merhaba dünya"));
+  EXPECT_THAT(terms, ElementsAre("merhaba", " ", "dünya"));
+  // Korean
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("나는 매일 출근합니다."));
+  EXPECT_THAT(terms, ElementsAre("나는", " ", "매일", " ", "출근합니다", "."));
+}
+
+// TODO(samzheng): more mixed languages test cases
+// Text mixing scripts (Latin + CJK, Korean + Latin) is segmented per script,
+// with documented per-locale deviations for the CJK portion.
+TEST_P(IosLanguageSegmenterAllLocalesTest, MixedLanguages) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // DIFFERENCE/LOCALE DEVIATION!! JAPANESE agrees with ICU that "你好" should
+  // be treated as a single token. All other locales other than
+  // SIMPLIFIED_CHINESE split it into two tokens.
+  std::vector<std::string> exp_terms;
+  if (GetOptions().locale == ULOC_JAPAN) {
+    exp_terms = {"How", " ", "are", " ", "you", "你好",
+                 "吗", "お", "元気", "です", "か"};
+  } else if (GetOptions().locale == ULOC_TRADITIONAL_CHINESE) {
+    // LOCALE DEVIATION!! TRADITIONAL_CHINESE disagrees over tokenization of
+    // "你好" and "元気", both of which it breaks up.
+    exp_terms = {"How", " ", "are", " ", "you", "你", "好",
+                 "吗", "お", "元", "気", "です", "か"};
+  } else if (GetOptions().locale == ULOC_SIMPLIFIED_CHINESE) {
+    // LOCALE DEVIATION!! SIMPLIFIED_CHINESE disagrees over tokenization of
+    // "元気", which it breaks up.
+    exp_terms = {"How", " ", "are", " ", "you", "你好",
+                 "吗", "お", "元", "気", "です", "か"};
+  } else {
+    // LOCALE DEVIATION!! All other locales disagree over the tokenization of
+    // "你好", which it breaks up.
+    exp_terms = {"How", " ", "are", " ", "you", "你",
+                 "好", "吗", "お", "元気", "です", "か"};
+  }
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::string_view> terms,
+      language_segmenter->GetAllTerms("How are you你好吗お元気ですか"));
+  EXPECT_THAT(terms, ElementsAreArray(exp_terms));
+
+  // Korean mixed with Latin: no locale deviation expected here.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("나는 California에 산다"));
+  EXPECT_THAT(terms, ElementsAre("나는", " ", "California", "에", " ", "산다"));
+}
+
+// The returned terms must be views into the caller's buffer, not copies;
+// verified by comparing the underlying data pointers.
+TEST_P(IosLanguageSegmenterAllLocalesTest, NotCopyStrings) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Validates that the input strings are not copied
+  const std::string text = "Hello World";
+  const char* word1_address = text.c_str();
+  const char* word2_address = text.c_str() + 6;  // start of "World"
+  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+                             language_segmenter->GetAllTerms(text));
+  ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+  const char* word1_result_address = terms.at(0).data();
+  const char* word2_result_address = terms.at(2).data();
+
+  // The underlying char* should be the same
+  EXPECT_THAT(word1_address, Eq(word1_result_address));
+  EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+// ResetToStart on a freshly-created iterator positions it at the first term.
+TEST_P(IosLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// ResetToStart after a single Advance rewinds the iterator to the first
+// term.
+TEST_P(IosLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  ASSERT_TRUE(itr->Advance());  // itr points to 'How'
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// ResetToStart after several Advances rewinds the iterator to the first
+// term.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       IteratorMultipleAdvancesResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  ASSERT_TRUE(itr->Advance());
+  ASSERT_TRUE(itr->Advance());
+  ASSERT_TRUE(itr->Advance());
+  ASSERT_TRUE(itr->Advance());  // itr points to ' '
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// ResetToStart works even after the iterator has been fully exhausted.
+TEST_P(IosLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  while (itr->Advance()) {
+    // Do nothing.
+  }
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// Out-of-bounds offsets passed to ResetToTermStartingAfter return
+// INVALID_ARGUMENT and leave the iterator position unchanged.
+TEST_P(IosLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // First, establish a valid position ("you" at byte 8).
+  auto position_or = itr->ResetToTermStartingAfter(7);
+  EXPECT_THAT(position_or, IsOk());
+  EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
+  ASSERT_THAT(itr->GetTerm(), Eq("you"));
+
+  // Negative offsets are rejected; the position must not move.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+  // Offsets at or past the end of the text are rejected too.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+}
+
+// Tests that ResetToTermAfter and Advance produce the same output. With the
+// exception of the first term, which is inaccessible via ResetToTermAfter,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermAfter calls with the current position
+// provided as the argument.
+// Mixed-script input: ResetToTermStartingAfter must yield the same term
+// stream as Advance (see comment block above).
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetAfter(reset_to_term_itr.get());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Thai input: ResetToTermStartingAfter must yield the same term stream as
+// Advance.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ThaiResetToTermAfterEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetAfter(reset_to_term_itr.get());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Korean input: ResetToTermStartingAfter must yield the same term stream as
+// Advance.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       KoreanResetToTermAfterEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetAfter(reset_to_term_itr.get());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as
+// ResetToTermAfter(current_position) can be used to simulate Advance, users
+// should be able to mix ResetToTermAfter(current_position) calls and Advance
+// calls to mimic calling Advance.
+// Mixed-script input: alternating Advance and ResetToTermStartingAfter
+// calls must produce the same term stream as Advance alone.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_and_reset_terms =
+      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+  EXPECT_THAT(advance_and_reset_terms,
+              testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Thai input: alternating Advance and ResetToTermStartingAfter calls must
+// produce the same term stream as Advance alone.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ThaiResetToTermAfterInteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_and_reset_terms =
+      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+  EXPECT_THAT(advance_and_reset_terms,
+              testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Korean input: alternating Advance and ResetToTermStartingAfter calls must
+// produce the same term stream as Advance alone.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       KoreanResetToTermAfterInteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_and_reset_terms =
+      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+  EXPECT_THAT(advance_and_reset_terms,
+              testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Spot-checks ResetToTermStartingAfter at specific byte offsets in
+// mixed-script text, including resets that move backwards and a reset past
+// the last term (NOT_FOUND).
+TEST_P(IosLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> itr,
+      language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^ ^^  ^ ^  ^  ^
+  // Bytes:   0  3 4 78 1114172023 29 35
+  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  // DIFFERENCE/LOCALE DEVIATION!! JAPANESE and SIMPLIFIED_CHINESE agree with
+  // ICU that "你好" should be treated as a single token. All other locales
+  // split it into two tokens.
+  std::string exp_token;
+  if (GetLocale() == ULOC_JAPAN || GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+    exp_token = "你好";
+  } else {
+    exp_token = "你";
+  }
+  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+  EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+  // Resetting backwards (to an earlier offset) works too.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+  EXPECT_THAT(itr->GetTerm(), Eq("か"));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+  EXPECT_THAT(itr->GetTerm(), Eq("吗"));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  // There is no term starting after the last one: NOT_FOUND, and the
+  // iterator has no current term.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// ResetToTermStartingAfter over a run of collapsed whitespace: the
+// whitespace run acts as a single term, and offsets past the last term give
+// NOT_FOUND.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ContinuousWhitespacesResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Multiple continuous whitespaces are treated as one.
+  constexpr std::string_view kTextWithSpace = "Hello          World";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kTextWithSpace));
+
+  // String: "Hello          World"
+  //          ^    ^         ^
+  // Bytes:   0    5         15
+  auto offset_or = itr->ResetToTermStartingAfter(0);
+  EXPECT_THAT(offset_or.status(), IsOk());
+  EXPECT_THAT(offset_or.ValueOrDie(), Eq(5));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  // Any offset inside the whitespace run resolves to the run itself...
+  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+  // ...and the start of the run resolves to the following term.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+  // No term starts after "World": NOT_FOUND at and beyond byte 15.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// ResetToTermStartingAfter over Chinese text, with the documented locale
+// deviation for "每天".
+TEST_P(IosLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+  // don't have whitespaces as word delimiter. Chinese
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kChinese));
+  // String: "我每天走路去上班。"
+  //          ^ ^^ ^  ^^
+  // Bytes:   0 3 6 9 15 18
+  std::string exp_token;
+  // DIFFERENCE/LOCALE DEVIATION!! SIMPLIFIED_CHINESE agrees with ICU that
+  // "每天" should be treated as a single token. All other locales split it
+  // into two tokens.
+  if (GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+    exp_token = "每天";
+  } else {
+    exp_token = "每";
+  }
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->GetTerm(), Eq("走路"));
+
+  // No term starts after the final punctuation: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(21),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// ResetToTermStartingAfter over Japanese text, with the documented Chinese-
+// locale deviations in tokenization.
+TEST_P(IosLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Japanese
+  constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kJapanese));
+  // String: "私は毎日仕事に歩いています。"
+  //          ^ ^ ^  ^  ^ ^ ^ ^  ^
+  // Bytes:   0 3 6  12 18212427 33
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq("は"));
+
+  // No term starts after the final punctuation: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  // LOCALE DEVIATION!! There is disagreement when locale is CHINESE about how
+  // to tokenize "毎日", "仕事", "歩い", which are all split, and "てい" which
+  // is grouped.
+  std::string exp_term;
+  int exp_offset;
+  if (GetLocale() == ULOC_SIMPLIFIED_CHINESE ||
+      GetLocale() == ULOC_TRADITIONAL_CHINESE) {
+    // Since "毎日" is broken up when the locale is CHINESE, ResetAfter(7) will
+    // point to "日" instead of the next segment ("仕事") like other locales.
+    exp_term = "日";
+    exp_offset = 9;
+  } else {
+    exp_term = "仕事";
+    exp_offset = 12;
+  }
+  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(exp_offset)));
+  EXPECT_THAT(itr->GetTerm(), Eq(exp_term));
+}
+
+// ResetToTermStartingAfter over Khmer text, including a reset past the last
+// term (NOT_FOUND).
+TEST_P(IosLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kKhmer));
+  // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+  //          ^  ^   ^      ^
+  // Bytes:   0  9   24     45
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
+
+  // No term starts after the final term: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  // Resetting backwards after the NOT_FOUND works.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+}
+
+// ResetToTermStartingAfter over Thai text, including the documented iOS
+// grouping of "ทุกวัน".
+TEST_P(IosLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Thai
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kThai));
+  // String: "ฉันเดินไปทำงานทุกวัน"
+  //          ^  ^   ^  ^    ^
+  // Bytes:   0  9   21 27   42
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
+
+  // No term starts after the final term: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+
+  // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
+  // This difference persists even when locale is set to THAI
+  EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ทุกวัน"));
+}
+// Out-of-bounds offsets passed to ResetToTermEndingBefore return
+// INVALID_ARGUMENT and leave the iterator position unchanged.
+TEST_P(IosLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // First, establish a valid position ("are" at byte 4).
+  ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+  ASSERT_THAT(itr->GetTerm(), Eq("are"));
+
+  // Negative offsets are rejected; the position must not move.
+  EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+  // Offsets at or past the end of the text are rejected too.
+  EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("are"));
+}
+
+// Tests that ResetToTermBefore and Advance produce the same output. With the
+// exception of the last term, which is inaccessible via ResetToTermBefore,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermBefore calls with the current position
+// provided as the argument (after their order has been reversed).
+// Mixed-script input: ResetToTermEndingBefore must yield the same terms as
+// Advance, modulo the last term and reversed order (see comment above).
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+  // Can't produce the last term via calls to ResetToTermBefore. So skip
+  // past that one.
+  auto itr = advance_terms.begin();
+  std::advance(itr, advance_terms.size() - 1);
+  advance_terms.erase(itr);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetBefore(reset_to_term_itr.get());
+  // ResetToTermBefore walks backwards; reverse to compare against Advance.
+  std::reverse(reset_terms.begin(), reset_terms.end());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end with no current term.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty());
+  EXPECT_THAT(advance_itr->GetTerm(), IsEmpty());
+}
+
+// Thai input: ResetToTermEndingBefore must yield the same terms as Advance,
+// modulo the last term and reversed order.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ThaiResetToTermBeforeEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+  // Can't produce the last term via calls to ResetToTermBefore. So skip
+  // past that one.
+  auto itr = advance_terms.begin();
+  std::advance(itr, advance_terms.size() - 1);
+  advance_terms.erase(itr);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetBefore(reset_to_term_itr.get());
+  // ResetToTermBefore walks backwards; reverse to compare against Advance.
+  std::reverse(reset_terms.begin(), reset_terms.end());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Korean input: ResetToTermEndingBefore must yield the same terms as
+// Advance, modulo the last term and reversed order.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       KoreanResetToTermBeforeEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+  // Can't produce the last term via calls to ResetToTermBefore. So skip
+  // past that one.
+  auto itr = advance_terms.begin();
+  std::advance(itr, advance_terms.size() - 1);
+  advance_terms.erase(itr);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetBefore(reset_to_term_itr.get());
+  // ResetToTermBefore walks backwards; reverse to compare against Advance.
+  std::reverse(reset_terms.begin(), reset_terms.end());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ std::string exp_token;
+ int exp_offset;
+ if (GetOptions().locale == ULOC_TRADITIONAL_CHINESE ||
+ GetOptions().locale == ULOC_SIMPLIFIED_CHINESE) {
+ // LOCALE DEVIATION!! SIMPLIFIED_CHINESE disagrees over tokenization of
+ // "元気", which it breaks up.
+ exp_offset = 26;
+ exp_token = "気";
+ } else {
+ exp_offset = 23;
+ exp_token = "元気";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+  // DIFFERENCE/LOCALE DEVIATION!! JAPANESE and SIMPLIFIED_CHINESE agree with
+ // ICU that "你好" should be treated as a single token. All other locales
+ // split it into two tokens.
+ if (GetLocale() == ULOC_JAPAN || GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+ exp_offset = 8;
+ exp_token = "you";
+ } else {
+ exp_offset = 11;
+ exp_token = "你";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->GetTerm(), Eq("です"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+ ContinuousWhitespacesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^^ ^ ^ ^
+ // Bytes: 0 3 6 9 15 18
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ std::string exp_token;
+ int exp_offset;
+  // DIFFERENCE/LOCALE DEVIATION!! SIMPLIFIED_CHINESE agrees with ICU that
+ // "每天" should be treated as a single token. All other locales split it into
+ // two tokens.
+ if (GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+ exp_offset = 0;
+ exp_token = "我";
+ } else {
+ exp_offset = 3;
+ exp_token = "每";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+ if (GetOptions().locale == ULOC_JAPAN) {
+ // LOCALE DEVIATION!! JAPANESE groups "去上" and leaves "班" on its own.
+    // All other locales, like ICU, break the text into "去" and "上班".
+ exp_offset = 9;
+ exp_token = "走路";
+ } else {
+ exp_offset = 15;
+ exp_token = "去";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ // LOCALE DEVIATION!! There is disagreement when locale is CHINESE about how
+ // to tokenize "毎日", "仕事", "歩い", which are all split, and "てい" which
+ // is grouped.
+ std::string exp_term;
+ int exp_offset;
+ if (GetLocale() == ULOC_SIMPLIFIED_CHINESE ||
+ GetLocale() == ULOC_TRADITIONAL_CHINESE) {
+ // TODO(b/157565185) For some reason, CFStringTokenizerGoToTokenAtIndex
+ // believes that "いています" is one token when locale is
+ // SIMPLIFIED/TRADITIONAL CHINESE, but CFStringTokenizerAdvanceToNextToken
+ // thinks that it is three: "い" "てい", "ます". Other locales and ICU agree
+ // that that segment should be "歩い", "て", "い", "ます".
+ // This is the only case where CFStringTokenizerGoToTokenAtIndex and
+ // CFStringTokenizerAdvanceToNextToken disagree. Find a way around this
+ // (such as rewinding past the desired segment and then advancing to it) if
+ // this is still an issue after adding language detection.
+ exp_term = "歩";
+ exp_offset = 21;
+ } else {
+ // Since "てい" is broken up when the locale is not CHINESE,
+ // ResetBefore(33) will point to "い" at offset 30.
+ exp_term = "い";
+ exp_offset = 30;
+ }
+ auto offset_or = itr->ResetToTermEndingBefore(33);
+ EXPECT_THAT(offset_or, IsOk());
+ EXPECT_THAT(offset_or.ValueOrDie(), Eq(exp_offset));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_term));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
+ // This difference persists even when locale is set to THAI
+ EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทำงาน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ LocaleName, IosLanguageSegmenterAllLocalesTest,
+ testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
+ ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
+ ULOC_KOREA,
+ ULOC_SIMPLIFIED_CHINESE,
+ ULOC_TRADITIONAL_CHINESE,
+ "es_ES", // Spanish
+ "hi_IN", // Hindi
+ "th_TH", // Thai
+ "lo_LA", // Lao
+ "km_KH", // Khmer
+ "ar_DZ", // Arabic
+ "ru_RU", // Russian
+ "pt_PT", // Portuguese
+                    "en_US_POSIX", // American English (Computer)
+                    "wrong_locale", // Will fall back to ICU default locale
+                    "" // Will fall back to ICU default locale
+ ));
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index ce50d0b..e60c168 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,11 +18,14 @@
#include <memory>
#include <string_view>
+#ifdef __ANDROID__
#include "icing/jni/jni-cache.h"
+#else // __ANDROID__
+class JniCache; // forward declaration to let non-Android builds work.
+#endif // __ANDROID__
+
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/util/i18n-utils.h"
-#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -30,7 +33,7 @@
namespace language_segmenter_factory {
struct SegmenterOptions {
- explicit SegmenterOptions(std::string locale = ULOC_US,
+ explicit SegmenterOptions(std::string locale,
const JniCache* jni_cache = nullptr)
: locale(std::move(locale)), jni_cache(jni_cache) {}
@@ -46,7 +49,7 @@
// A LanguageSegmenter on success
// INVALID_ARGUMENT if locale string is invalid
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterOptions options = SegmenterOptions());
+ SegmenterOptions options);
} // namespace language_segmenter_factory
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index c7b068d..a1b031a 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -43,8 +43,10 @@
};
TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -62,8 +64,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -77,8 +81,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithNegativeOffsetNotOk) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -95,8 +101,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
@@ -106,8 +114,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
@@ -115,8 +125,10 @@
}
TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -130,8 +142,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithZeroNotFound) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -142,8 +156,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -157,8 +173,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index 49ddfca..bd86169 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -20,6 +20,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -59,8 +60,9 @@
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
@@ -95,8 +97,9 @@
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
for (int i = 1; i < input_string.length(); i += 2) {
@@ -134,8 +137,9 @@
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string;
while (input_string.length() < state.range(0)) {
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index f2fc678..d9db75a 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -24,6 +24,7 @@
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -49,8 +50,10 @@
}
TEST_F(PlainTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -81,8 +84,10 @@
}
TEST_F(PlainTokenizerTest, Whitespace) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -107,8 +112,10 @@
}
TEST_F(PlainTokenizerTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -136,8 +143,10 @@
}
TEST_F(PlainTokenizerTest, SpecialCharacters) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -157,8 +166,10 @@
}
TEST_F(PlainTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -209,8 +220,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -226,8 +239,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -243,8 +258,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -291,8 +308,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 351f7c1..9b71e8a 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -22,6 +22,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -46,8 +47,10 @@
}
TEST_F(RawQueryTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -59,8 +62,10 @@
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -159,8 +164,10 @@
}
TEST_F(RawQueryTokenizerTest, Exclustion) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -226,8 +233,10 @@
}
TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -314,8 +323,10 @@
}
TEST_F(RawQueryTokenizerTest, OR) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -435,8 +446,10 @@
// CJKT are treated the same way by language segmenter and raw tokenizer, so
// here we test Chinese and Japanese to represent CJKT.
TEST_F(RawQueryTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -488,8 +501,10 @@
// Raw tokenizer identifies all characters that it doesn't know as OTHER type,
// so we can choose comma "," to represent all OTHER characters.
TEST_F(RawQueryTokenizerTest, OtherChars) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -533,8 +548,10 @@
}
TEST_F(RawQueryTokenizerTest, Mix) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
index f79bc68..db973f3 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "icing/jni/jni-cache.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
index a01d944..4b50231 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
@@ -443,6 +443,74 @@
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
+TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
@@ -1060,6 +1128,21 @@
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
+TEST_P(ReverseJniLanguageSegmenterTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Validates that the input strings are not copied
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, ReverseJniLanguageSegmenterTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index 2256022..bb26364 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -24,164 +24,13 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-namespace {
-
-// Returns the lead byte of the UTF-8 character that includes the byte at
-// current_byte_index within it.
-int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
- while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
- --current_byte_index;
- }
- return current_byte_index;
-}
-
-class CharacterIterator {
- public:
- explicit CharacterIterator(std::string_view text)
- : CharacterIterator(text, 0, 0) {}
- CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
- : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: desired_utf8_index <= text_.length()
- // desired_utf8_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf8(int desired_utf8_index) {
- if (desired_utf8_index > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- // Need to work forwards.
- while (utf8_index_ < desired_utf8_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > desired_utf8_index) {
- // Ah! Don't go too far!
- break;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: 0 <= desired_utf8_index
- bool RewindToUtf8(int desired_utf8_index) {
- if (desired_utf8_index < 0) {
- // Enforce the requirement.
- return false;
- }
- // Need to work backwards.
- while (utf8_index_ > desired_utf8_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- if (utf8_index_ < 0) {
- // Somehow, there wasn't a single UTF-8 lead byte at
- // requested_byte_index or an earlier byte.
- return false;
- }
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Advances current position to desired_utf16_index.
- // REQUIRES: desired_utf16_index <= text_.utf16_length()
- // desired_utf16_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf16(int desired_utf16_index) {
- while (utf16_index_ < desired_utf16_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf16_length = i18n_utils::GetUtf16Length(uchar32);
- if (utf16_index_ + utf16_length > desired_utf16_index) {
- // Ah! Don't go too far!
- break;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += utf16_length;
- }
- return true;
- }
-
- // Rewinds current position to desired_utf16_index.
- // REQUIRES: 0 <= desired_utf16_index
- bool RewindToUtf16(int desired_utf16_index) {
- if (desired_utf16_index < 0) {
- return false;
- }
- while (utf16_index_ > desired_utf16_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- bool IsValidCharacter() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[utf8_index_])) {
- return true;
- }
-
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) {
- return true;
- }
-
- return false;
- }
-
- int utf8_index() const { return utf8_index_; }
- int utf16_index() const { return utf16_index_; }
-
- private:
- std::string_view text_;
- int utf8_index_;
- int utf16_index_;
-};
-
-} // namespace
-
class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
public:
explicit ReverseJniLanguageSegmenterIterator(
@@ -229,7 +78,7 @@
// Check if the current term is valid. We consider any term valid if its
// first character is valid. If it's not valid, then we need to advance to
// the next term.
- if (term_start_.IsValidCharacter()) {
+ if (IsValidTerm()) {
return true;
}
return Advance();
@@ -382,8 +231,7 @@
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf8_index() > offset ||
- !term_start_.IsValidCharacter()) {
+ if (term_end_exclusive_.utf8_index() > offset || !IsValidTerm()) {
return ResetToTermEndingBefore(term_start_.utf8_index());
}
return term_start_.utf8_index();
@@ -414,6 +262,21 @@
/*utf16_index=*/ReverseJniBreakIterator::kDone);
}
+ bool IsValidTerm() const {
+ // Rule 1: all ASCII terms will be returned.
+    // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) {
+ return true;
+ }
+
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) {
+ return true;
+ }
+ return false;
+ }
+
// All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
// this class needs to maintain state to convert between UTF-16 and UTF-8.
std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
index 8ed38b2..6c5e3f6 100644
--- a/icing/tokenization/simple/space-language-segmenter_test.cc
+++ b/icing/tokenization/simple/space-language-segmenter_test.cc
@@ -18,6 +18,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -28,21 +29,27 @@
using ::testing::IsEmpty;
TEST(SpaceLanguageSegmenterTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
}
TEST(SpaceLanguageSegmenterTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
IsOkAndHolds(ElementsAre("Hello", " ", "World")));
}
TEST(SpaceLanguageSegmenterTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
@@ -55,8 +62,10 @@
}
TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Alphanumeric terms are allowed
EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
@@ -64,8 +73,10 @@
}
TEST(SpaceLanguageSegmenterTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Alphanumeric terms are allowed
EXPECT_THAT(
@@ -80,8 +91,10 @@
}
TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Multiple continuous whitespaces are treated as one.
const int kNumSeparators = 256;
@@ -92,8 +105,10 @@
}
TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Validates that the input strings are not copied
const std::string text = "Hello World";
const char* word1_address = text.c_str();
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
new file mode 100644
index 0000000..3707f95
--- /dev/null
+++ b/icing/util/character-iterator.cc
@@ -0,0 +1,127 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/character-iterator.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Returns the lead byte of the UTF-8 character that includes the byte at
+// current_byte_index within it.
+int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
+ while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
+ --current_byte_index;
+ }
+ return current_byte_index;
+}
+
+} // namespace
+
+bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work forwards.
+ while (utf8_index_ < desired_utf8_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > desired_utf8_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index < 0) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work backwards.
+ while (utf8_index_ > desired_utf8_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at
+ // requested_byte_index or an earlier byte.
+ return false;
+ }
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
+ while (utf16_index_ < desired_utf16_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+ if (utf16_index_ + utf16_length > desired_utf16_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += utf16_length;
+ }
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
+ if (desired_utf16_index < 0) {
+ return false;
+ }
+ while (utf16_index_ > desired_utf16_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
new file mode 100644
index 0000000..22de6c5
--- /dev/null
+++ b/icing/util/character-iterator.h
@@ -0,0 +1,70 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_CHARACTER_ITERATOR_H_
+#define ICING_UTIL_CHARACTER_ITERATOR_H_
+
+#include "icing/util/i18n-utils.h"
+
+namespace icing {
+namespace lib {
+
+class CharacterIterator {
+ public:
+ explicit CharacterIterator(std::string_view text)
+ : CharacterIterator(text, 0, 0) {}
+
+ CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
+ : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: desired_utf8_index <= text_.length()
+ // desired_utf8_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf8(int desired_utf8_index);
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: 0 <= desired_utf8_index
+ bool RewindToUtf8(int desired_utf8_index);
+
+ // Advances current position to desired_utf16_index.
+ // REQUIRES: desired_utf16_index <= text_.utf16_length()
+ // desired_utf16_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf16(int desired_utf16_index);
+
+ // Rewinds current position to desired_utf16_index.
+ // REQUIRES: 0 <= desired_utf16_index
+ bool RewindToUtf16(int desired_utf16_index);
+
+ int utf8_index() const { return utf8_index_; }
+ int utf16_index() const { return utf16_index_; }
+
+ bool operator==(const CharacterIterator& rhs) const {
+ return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ &&
+ utf16_index_ == rhs.utf16_index_;
+ }
+
+ private:
+ std::string_view text_;
+ int utf8_index_;
+ int utf16_index_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_CHARACTER_ITERATOR_H_
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index 9cf992f..d6754d5 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -99,16 +99,17 @@
return;
}
- while (truncate_to_length > 0) {
- if (IsLeadUtf8Byte(str->at(truncate_to_length))) {
- str->resize(truncate_to_length);
- return;
- }
- truncate_to_length--;
- }
+ str->resize(SafeTruncateUtf8Length(str->c_str(), truncate_to_length));
+}
- // Truncates to an empty string
- str->resize(0);
+int SafeTruncateUtf8Length(const char* str, int desired_length) {
+ while (desired_length > 0) {
+ if (IsLeadUtf8Byte(str[desired_length])) {
+ break;
+ }
+ --desired_length;
+ }
+ return desired_length;
}
bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index e103bab..82ae828 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -50,6 +50,13 @@
// Returns the char at the given position.
UChar32 GetUChar32At(const char* data, int length, int position);
+// Returns the safe position to truncate a UTF8 string at so that multi-byte
+// UTF8 characters are not cut in the middle. The returned value will always be
+// 0 <= val <= desired_length.
+//
+// REQUIRES: 0 <= desired_length < strlen(str)
+int SafeTruncateUtf8Length(const char* str, int desired_length);
+
// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
// in the middle. The string will be truncated in place.
void SafeTruncateUtf8(std::string* str, int truncate_to_length);
diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java
index f4e301d..125da12 100644
--- a/java/src/com/google/android/icing/IcingSearchEngine.java
+++ b/java/src/com/google/android/icing/IcingSearchEngine.java
@@ -328,6 +328,27 @@
}
@NonNull
+ public DeleteResultProto deleteByQuery(@NonNull SearchSpecProto searchSpec) {
+ byte[] deleteResultBytes = nativeDeleteByQuery(nativePointer, searchSpec.toByteArray());
+ if (deleteResultBytes == null) {
+ Log.e(TAG, "Received null DeleteResultProto from native.");
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DeleteResultProto.parseFrom(
+ deleteResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DeleteResultProto.", e);
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
public PersistToDiskResultProto persistToDisk() {
byte[] persistToDiskResultBytes = nativePersistToDisk(nativePointer);
if (persistToDiskResultBytes == null) {
@@ -438,6 +459,8 @@
private static native byte[] nativeDeleteBySchemaType(long nativePointer, String schemaType);
+ private static native byte[] nativeDeleteByQuery(long nativePointer, byte[] searchSpecBytes);
+
private static native byte[] nativePersistToDisk(long nativePointer);
private static native byte[] nativeOptimize(long nativePointer);
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index d907d4e..ed7e318 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -335,6 +335,58 @@
assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
}
+
+ @Test
+ public void testDeleteByQuery() throws Exception {
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
+ IcingSearchEngine icing = new IcingSearchEngine(options);
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+            .build();
+ assertThat(icing.put(emailDocument1).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ DocumentProto emailDocument2 =
+ createEmailDocument("namespace", "uri2").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("bar"))
+            .build();
+ assertThat(icing.put(emailDocument2).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ // .setQuery("")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+
+ SearchResultProto searchResultProto =
+ icing.search(
+ searchSpec,
+ ScoringSpecProto.getDefaultInstance(),
+ ResultSpecProto.getDefaultInstance());
+ assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(2);
+ // assertThat(searchResultProto.getResults(0).getDocument()).isEqualTo(emailDocument1);
+
+ DeleteResultProto deleteResultProto = icing.deleteByQuery(searchSpec);
+ assertThat(deleteResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ GetResultProto getResultProto = icing.get("namespace", "uri1");
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
+ getResultProto = icing.get("namespace", "uri2");
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
+ }
+
@Test
public void testPersistToDisk() throws Exception {
IcingSearchEngineOptions options =
diff --git a/proto/icing/proto/status.proto b/proto/icing/proto/status.proto
index 2733a15..08677b0 100644
--- a/proto/icing/proto/status.proto
+++ b/proto/icing/proto/status.proto
@@ -24,7 +24,7 @@
// Canonical status to indicate the results of API calls.
// Next tag: 3
message StatusProto {
- // Next tag: 9
+ // Next tag: 10
enum Code {
// A default for all other use-cases. Should never be used in practice. This
// may happen if there are backwards-compatibility issues.
@@ -62,6 +62,12 @@
// make some space on the underlying filesystem.
OUT_OF_SPACE = 8;
+ // An operation is invalid because the resource already exists and can't be
+ // replaced. For example, this status is used when a SchemaProto contains
+ // multiple definitions of the same type or multiple properties with the
+ // same name within a type.
+ ALREADY_EXISTS = 9;
+
// Any future status codes.
}
optional Code code = 1;