Pull upstream changes.
Change-Id: I44831fdadcdb67f2e19570a35cb4c76faf8397f9
diff --git a/icing/absl_ports/annotate.cc b/icing/absl_ports/annotate.cc
index d283e13..dfe5566 100644
--- a/icing/absl_ports/annotate.cc
+++ b/icing/absl_ports/annotate.cc
@@ -33,7 +33,7 @@
std::string new_msg =
(!s.error_message().empty())
- ? absl_ports::StrCat(s.error_message(), kErrorSeparator, msg)
+ ? absl_ports::StrCat(msg, kErrorSeparator, s.error_message())
: std::string(msg);
return libtextclassifier3::Status(s.CanonicalCode(), new_msg);
}
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 62943b8..95511ac 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -78,6 +78,23 @@
namespace icing {
namespace lib {
+namespace {
+
+bool IsEmptyBuffer(const char* buffer, int size) {
+ return std::all_of(buffer, buffer + size,
+ [](const char byte) { return byte == 0; });
+}
+
+// Helper function to get stored proto size from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
+
+// Helper function to get stored proto magic from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
+
+} // namespace
+
template <typename ProtoT>
class FileBackedProtoLog {
public:
@@ -206,10 +223,19 @@
//
// Returns:
// A proto on success
+ // NOT_FOUND if the proto at the given offset has been erased
// OUT_OF_RANGE_ERROR if file_offset exceeds file size
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+ // Erases the data of a proto located at file_offset from the file.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status EraseProto(int64_t file_offset);
+
// Calculates and returns the disk usage in bytes. Rounds up to the nearest
// block size.
//
@@ -239,7 +265,7 @@
Iterator(const Filesystem& filesystem, const std::string& file_path,
int64_t initial_offset);
- // Advances to the position of next proto.
+ // Advances to the position of the next proto, whether it has been erased or not.
//
// Returns:
// OK on success
@@ -716,10 +742,15 @@
int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
// Copy out however many bytes it says the proto is
- int stored_size = metadata & 0x00FFFFFF;
+ int stored_size = GetProtoSize(metadata);
ICING_RETURN_IF_ERROR(
mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+
+ if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+ return absl_ports::NotFoundError("The proto data has been erased.");
+ }
+
google::protobuf::io::ArrayInputStream proto_stream(
mmapped_file.mutable_region(), stored_size);
@@ -736,6 +767,62 @@
}
template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
+ int64_t file_offset) {
+ int64_t file_size = filesystem_->GetFileSize(fd_.get());
+ if (file_offset >= file_size) {
+ // file_size is the total size of the file, i.e. one past the last valid
+ // offset, so subtract one to report the largest readable offset.
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Trying to erase data at a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+
+ MemoryMappedFile mmapped_file(
+ *filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+
+ // Read out the metadata
+ ICING_ASSIGN_OR_RETURN(
+ int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
+ GetProtoSize(metadata)));
+
+ // We need to update the crc checksum if the erased area is before the rewind
+ // position.
+ if (file_offset + sizeof(metadata) < header_->rewind_offset) {
+ // We need to calculate [original string xor 0s].
+ // The xored string is the same as the original string because 0 xor 0 = 0,
+ // 1 xor 0 = 1.
+ const std::string_view xored_str(mmapped_file.region(),
+ mmapped_file.region_size());
+
+ Crc32 crc(header_->log_checksum);
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t new_crc,
+ crc.UpdateWithXor(
+ xored_str,
+ /*full_data_size=*/header_->rewind_offset - sizeof(Header),
+ /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
+
+ header_->log_checksum = new_crc;
+ header_->header_checksum = header_->CalculateHeaderChecksum();
+
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path_));
+ }
+ }
+
+ memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
const {
int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
@@ -781,8 +868,7 @@
ICING_ASSIGN_OR_RETURN(
int metadata,
ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
- int proto_size = metadata & 0x00FFFFFF;
- current_offset_ += sizeof(metadata) + proto_size;
+ current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
}
if (current_offset_ < file_size_) {
@@ -829,7 +915,7 @@
ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
memcpy(&metadata, mmapped_file->region(), metadata_size);
// Checks magic number
- uint8_t stored_k_proto_magic = metadata >> 24;
+ uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
if (stored_k_proto_magic != kProtoMagic) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
@@ -842,7 +928,7 @@
libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
if (file_size == header_->rewind_offset) {
- // No changes made, don't need to update the checksum.
+ // No new protos appended, don't need to update the checksum.
return libtextclassifier3::Status::OK;
}
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index 3a9060d..fad5248 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -48,7 +48,10 @@
// https://stackoverflow.com/a/47368753
FileBackedProtoLogTest() {}
- void SetUp() override { file_path_ = GetTestTempDir() + "/proto_log"; }
+ void SetUp() override {
+ file_path_ = GetTestTempDir() + "/proto_log";
+ filesystem_.DeleteFile(file_path_.c_str());
+ }
void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
@@ -93,7 +96,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -110,7 +113,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a proto
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -144,7 +147,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write the first proto
DocumentProto document1 =
@@ -191,7 +194,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a third proto
DocumentProto document3 =
@@ -213,7 +216,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write the first proto
DocumentProto document1 =
@@ -260,7 +263,7 @@
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a third proto
DocumentProto document3 =
@@ -360,7 +363,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write and persist the first proto
ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
@@ -430,7 +433,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
{
// Empty iterator
@@ -481,7 +484,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
ICING_EXPECT_OK(proto_log->WriteProto(document));
@@ -499,7 +502,7 @@
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Checksum should be consistent across instances
EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
@@ -514,6 +517,166 @@
}
}
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes and erases proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // Checks if the erased area is set to 0.
+ int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+ MemoryMappedFile mmapped_file(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY);
+
+ // document1_offset + sizeof(int) is the start byte of the proto where
+ // sizeof(int) is the size of the proto metadata.
+ mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
+ for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
+ ASSERT_THAT(mmapped_file.region()[i], Eq(0));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes 2 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Erases the first proto
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // The first proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // The second proto should be returned.
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+}
+
+TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace", "uri3").Build();
+ DocumentProto document4 =
+ DocumentBuilder().SetKey("namespace", "uri4").Build();
+
+ int64_t document2_offset;
+ int64_t document3_offset;
+
+ {
+ // Erase data after the rewind position. This won't update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes 3 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+ proto_log->WriteProto(document3));
+
+ // Erases the 1st proto, checksum won't be updated immediately because the
+ // rewind position is 0.
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2293202502))));
+ } // New checksum is updated in destructor.
+
+ {
+ // Erase data before the rewind position. This will update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Erases the 2nd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(639634028))));
+ }
+
+ {
+ // Append data and erase data before the rewind position. This will update
+ // the checksum twice: in EraseProto() and destructor.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Append a new document which is after the rewind position.
+ ICING_ASSERT_OK(proto_log->WriteProto(document4));
+
+ // Erases the 3rd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(1990198693))));
+ } // Checksum is updated with the newly appended document.
+
+ {
+ // A successful creation means that the checksum matches.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.data_loss);
+ }
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index e4ec0cd..eb89db8 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -187,7 +187,7 @@
//
// Returns:
// OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
- libtextclassifier3::Status TruncateTo(int32_t len);
+ libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
// Flushes content to underlying file.
//
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index c973885..5e0a46e 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -59,6 +59,7 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -148,30 +149,31 @@
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
+ StatusProto::Code code;
switch (internal_status.CanonicalCode()) {
case libtextclassifier3::StatusCode::OK:
- status_proto->set_code(StatusProto::OK);
+ code = StatusProto::OK;
break;
case libtextclassifier3::StatusCode::DATA_LOSS:
- status_proto->set_code(StatusProto::WARNING_DATA_LOSS);
+ code = StatusProto::WARNING_DATA_LOSS;
break;
case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
- status_proto->set_code(StatusProto::INVALID_ARGUMENT);
+ code = StatusProto::INVALID_ARGUMENT;
break;
case libtextclassifier3::StatusCode::NOT_FOUND:
- status_proto->set_code(StatusProto::NOT_FOUND);
+ code = StatusProto::NOT_FOUND;
break;
case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
- status_proto->set_code(StatusProto::FAILED_PRECONDITION);
+ code = StatusProto::FAILED_PRECONDITION;
break;
case libtextclassifier3::StatusCode::ABORTED:
- status_proto->set_code(StatusProto::ABORTED);
+ code = StatusProto::ABORTED;
break;
case libtextclassifier3::StatusCode::INTERNAL:
// TODO(b/147699081): Cleanup our internal use of INTERNAL since it
// doesn't match with what it *should* indicate as described in
// go/icing-library-apis.
- status_proto->set_code(StatusProto::INTERNAL);
+ code = StatusProto::INTERNAL;
break;
case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
// TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
@@ -179,17 +181,35 @@
// internally to indicate other resources are exhausted (e.g.
// DocHitInfos) - although none of these are exposed through the API.
// Consider separating the two cases out more clearly.
- status_proto->set_code(StatusProto::OUT_OF_SPACE);
+ code = StatusProto::OUT_OF_SPACE;
break;
- default:
+ case libtextclassifier3::StatusCode::ALREADY_EXISTS:
+ code = StatusProto::ALREADY_EXISTS;
+ break;
+ case libtextclassifier3::StatusCode::CANCELLED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNKNOWN:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::PERMISSION_DENIED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::OUT_OF_RANGE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNIMPLEMENTED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAVAILABLE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAUTHENTICATED:
// Other internal status codes aren't supported externally yet. If it
// should be supported, add another switch-case above.
- ICING_LOG(FATAL) << IcingStringUtil::StringPrintf(
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
"Internal status code %d not supported in the external API",
internal_status.error_code());
+ code = StatusProto::UNKNOWN;
break;
}
-
+ status_proto->set_code(code);
status_proto->set_message(internal_status.error_message());
}
@@ -681,12 +701,14 @@
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteByNamespace(name_space);
- TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete Namespace: " << name_space;
+ TransformStatus(status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
return delete_result;
}
@@ -707,15 +729,82 @@
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteBySchemaType(schema_type);
- TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete SchemaType: " << schema_type;
+ TransformStatus(status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
return delete_result;
}
+DeleteResultProto IcingSearchEngine::DeleteByQuery(
+ const SearchSpecProto& search_spec) {
+ ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
+ << " from doc store";
+
+ DeleteResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ libtextclassifier3::Status status =
+ ValidateSearchSpec(search_spec, performance_configuration_);
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+
+ // Gets unordered results from query processor
+ auto query_processor_or = QueryProcessor::Create(
+ index_.get(), language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get(), clock_.get());
+ if (!query_processor_or.ok()) {
+ TransformStatus(query_processor_or.status(), result_status);
+ return result_proto;
+ }
+ std::unique_ptr<QueryProcessor> query_processor =
+ std::move(query_processor_or).ValueOrDie();
+
+ auto query_results_or = query_processor->ParseSearch(search_spec);
+ if (!query_results_or.ok()) {
+ TransformStatus(query_results_or.status(), result_status);
+ return result_proto;
+ }
+ QueryProcessor::QueryResults query_results =
+ std::move(query_results_or).ValueOrDie();
+
+ ICING_LOG(ERROR) << "Deleting the docs that matched the query.";
+ bool found_results = false;
+ while (query_results.root_iterator->Advance().ok()) {
+ ICING_LOG(ERROR)
+ << "Deleting doc "
+ << query_results.root_iterator->doc_hit_info().document_id();
+ found_results = true;
+ status = document_store_->Delete(
+ query_results.root_iterator->doc_hit_info().document_id());
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+ }
+ if (found_results) {
+ result_proto.mutable_status()->set_code(StatusProto::OK);
+ } else {
+ result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ result_proto.mutable_status()->set_message(
+ "No documents matched the query to delete by!");
+ }
+ return result_proto;
+}
+
PersistToDiskResultProto IcingSearchEngine::PersistToDisk() {
ICING_VLOG(1) << "Persisting data to disk";
@@ -1147,6 +1236,9 @@
// Ensures that current directory is still present.
if (!filesystem_->CreateDirectoryRecursively(
current_document_dir.c_str())) {
+ // Can't even create the old directory. Mark as uninitialized and return
+ // INTERNAL.
+ initialized_ = false;
return absl_ports::InternalError(
"Failed to create file directory for document store");
}
@@ -1159,6 +1251,9 @@
// TODO(b/144458732): Implement a more robust version of
// TC_ASSIGN_OR_RETURN that can support error logging.
if (!document_store_or.ok()) {
+ // Unable to create DocumentStore from the old file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
ICING_LOG(ERROR) << "Failed to create document store instance";
return absl_ports::Annotate(
absl_ports::InternalError("Failed to create document store instance"),
@@ -1173,13 +1268,18 @@
}
// Recreates the doc store instance
- ICING_ASSIGN_OR_RETURN(
- document_store_,
+ auto document_store_or =
DocumentStore::Create(filesystem_.get(), current_document_dir,
- clock_.get(), schema_store_.get()),
- absl_ports::InternalError(
- "Document store has been optimized, but a valid document store "
- "instance can't be created"));
+ clock_.get(), schema_store_.get());
+ if (!document_store_or.ok()) {
+ // Unable to create DocumentStore from the new file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
+ return absl_ports::InternalError(
+ "Document store has been optimized, but a valid document store "
+ "instance can't be created");
+ }
+ document_store_ = std::move(document_store_or).ValueOrDie();
// Deletes tmp directory
if (!filesystem_->DeleteDirectoryRecursively(
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 6ae76d7..55d6b2f 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -128,6 +128,9 @@
//
// Returns:
// OK on success
+ // ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
+ // type or contains a type that has multiple properties with the same
+ // name.
// INVALID_ARGUMENT if 'new_schema' is invalid
// FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
// has not been initialized yet.
@@ -256,6 +259,21 @@
DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
ICING_LOCKS_EXCLUDED(mutex_);
+ // Deletes all Documents that match the query specified in search_spec. Delete
+ // changes are automatically applied to disk, callers can also call
+ // PersistToDisk() to flush changes immediately.
+ //
+ // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND if the query doesn't match any documents
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on IO error
+ DeleteResultProto DeleteByQuery(const SearchSpecProto& search_spec)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
// Retrieves, scores, ranks, and returns the results according to the specs.
// Results can be empty. If there're multiple pages of results,
// SearchResultProto.next_page_token will be populated and that can be used to
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index b0946c9..5a8bb80 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -55,6 +55,7 @@
using ::testing::IsEmpty;
using ::testing::Lt;
using ::testing::Matcher;
+using ::testing::Ne;
using ::testing::Return;
using ::testing::SizeIs;
using ::testing::StrEq;
@@ -470,6 +471,163 @@
HasSubstr("Unable to open file for write"));
}
+TEST_F(IcingSearchEngineTest, SetSchemaDelete2) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 3. Set a schema that deletes email. This should fail.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema, false).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ // 4. Try to delete by email type.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(),
+ Eq(StatusProto::OK));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, SetSchemaDelete) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 3. Set a schema that deletes email. With force=true this should succeed.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema, true).status().code(),
+ Eq(StatusProto::OK));
+
+ // 4. Try to delete by email type.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(),
+ Eq(StatusProto::NOT_FOUND));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, SetSchemaDuplicateTypesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Create a schema with types { "Email", "Message" and "Email" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ type = schema.add_types();
+ type->set_schema_type("Message");
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ *schema.add_types() = schema.types(0);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(),
+ Eq(StatusProto::ALREADY_EXISTS));
+}
+
+TEST_F(IcingSearchEngineTest,
+ SetSchemaDuplicatePropertiesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Create a schema with an Email type with properties { "title", "body" and
+ // "title" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(),
+ Eq(StatusProto::ALREADY_EXISTS));
+}
+
TEST_F(IcingSearchEngineTest, SetSchema) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
@@ -1519,6 +1677,82 @@
EqualsProto(expected_get_result_proto));
}
+TEST_F(IcingSearchEngineTest, OptimizationFailureUninitializesIcing) {
+  // Set up the filesystem to fail.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ bool just_swapped_files = false;
+ auto create_dir_lambda = [this, &just_swapped_files](const char* dir_name) {
+ if (just_swapped_files) {
+ // We should fail the first call immediately after swapping files.
+ just_swapped_files = false;
+ return false;
+ }
+ return filesystem()->CreateDirectoryRecursively(dir_name);
+ };
+ ON_CALL(*mock_filesystem, CreateDirectoryRecursively)
+ .WillByDefault(create_dir_lambda);
+ auto swap_lambda = [&just_swapped_files](const char* first_dir,
+ const char* second_dir) {
+ just_swapped_files = true;
+ return false;
+ };
+ ON_CALL(*mock_filesystem, SwapFiles).WillByDefault(swap_lambda);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // The mocks should cause an unrecoverable error during Optimize - returning
+ // INTERNAL.
+ ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::INTERNAL));
+
+ // Ordinary operations should fail safely.
+ SchemaProto simple_schema;
+ auto type = simple_schema.add_types();
+ type->set_schema_type("type0");
+ auto property = type->add_properties();
+ property->set_property_name("prop0");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ DocumentProto simple_doc = DocumentBuilder()
+ .SetKey("namespace0", "uri0")
+ .SetSchema("type0")
+ .AddStringProperty("prop0", "foo")
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ ResultSpecProto result_spec;
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ EXPECT_THAT(icing.SetSchema(simple_schema).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Put(simple_doc).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ // Reset should get icing back to a safe (empty) and working state.
+ EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(simple_schema).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(simple_doc).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::OK));
+}
+
TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
SchemaProto schema;
// Add an email type
@@ -1528,6 +1762,10 @@
property->set_property_name("subject");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
// Add an message type
type = schema.add_types();
type->set_schema_type("message");
@@ -1535,6 +1773,10 @@
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
DocumentProto document1 =
DocumentBuilder()
.SetKey("namespace1", "uri1")
@@ -1550,10 +1792,10 @@
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
@@ -1582,6 +1824,88 @@
*expected_get_result_proto.mutable_document() = document2;
EXPECT_THAT(icing.Get("namespace2", "uri2"),
EqualsProto(expected_get_result_proto));
+
+  // Search for "message"; only document2 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteSchemaTypeByQuery) {
+ SchemaProto schema = CreateMessageSchema();
+ // Add an email type
+ SchemaProto tmp = CreateEmailSchema();
+ *schema.add_types() = tmp.types(0);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema(schema.types(0).schema_type())
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema(schema.types(1).schema_type())
+ .AddStringProperty("subject", "subject subject2")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete the first type. The first doc should be irretrievable. The
+ // second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.add_schema_type_filters(schema.types(0).schema_type());
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest, DeleteByNamespace) {
@@ -1594,6 +1918,89 @@
.Build();
DocumentProto document2 =
DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace1", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(icing.Get("namespace3", "uri3"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete namespace1. Document1 and document2 should be irretrievable.
+ // Document3 should still be present.
+ EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri2) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(icing.Get("namespace3", "uri3"),
+ EqualsProto(expected_get_result_proto));
+
+  // Search for "message"; only document3 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteNamespaceByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
.SetKey("namespace2", "uri2")
.SetSchema("Message")
.AddStringProperty("body", "message body2")
@@ -1619,7 +2026,9 @@
// Delete the first namespace. The first doc should be irretrievable. The
// second should still be present.
- EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(),
+ SearchSpecProto search_spec;
+ search_spec.add_namespace_filters("namespace1");
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
Eq(StatusProto::OK));
expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
@@ -1634,6 +2043,153 @@
*expected_get_result_proto.mutable_document() = document2;
EXPECT_THAT(icing.Get("namespace2", "uri2"),
EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'body1'. The first doc should be irretrievable.
+ // The second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("body1");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'foo', which should be none of them. Both docs
+ // should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::NOT_FOUND));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) {
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 00d116f..eb01731 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -31,6 +31,7 @@
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -192,8 +193,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -239,8 +241,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -287,8 +290,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -335,8 +339,9 @@
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 8dfb9c2..824c440 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -47,6 +47,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -91,8 +92,10 @@
ICING_ASSERT_OK_AND_ASSIGN(index_,
Index::Create(options, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
ICING_ASSERT_OK_AND_ASSIGN(
normalizer_,
diff --git a/icing/index/index.cc b/icing/index/index.cc
index d4a2508..e7f2fbc 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -24,8 +24,8 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/hit.h"
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
@@ -102,10 +102,10 @@
TermMatchType::Code term_match_type) {
switch (term_match_type) {
case TermMatchType::EXACT_ONLY:
- return std::make_unique<DocHitInfoIteratorTermExact>(
+ return std::make_unique<DocHitInfoIteratorTermLiteExact>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
case TermMatchType::PREFIX:
- return std::make_unique<DocHitInfoIteratorTermPrefix>(
+ return std::make_unique<DocHitInfoIteratorTermLitePrefix>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
default:
return absl_ports::InvalidArgumentError(
@@ -163,9 +163,14 @@
// Step 2: Update the lexicon, either add the term or update its properties
if (tvi_or.ok()) {
+ tvi = tvi_or.ValueOrDie();
+ if (seen_tokens_.find(tvi) != seen_tokens_.end()) {
+ ICING_VLOG(1) << "A hit for term " << term
+ << " has already been added. Skipping.";
+ return libtextclassifier3::Status::OK;
+ }
ICING_VLOG(1) << "Term " << term
<< " is already present in lexicon. Updating.";
- tvi = tvi_or.ValueOrDie();
// Already in the lexicon. Just update the properties.
ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_));
@@ -175,6 +180,7 @@
ICING_ASSIGN_OR_RETURN(
tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_));
}
+ seen_tokens_.insert(tvi);
// Step 3: Add the hit itself
Hit hit(section_id_, document_id_, score,
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 070e82a..f7ca285 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -37,6 +37,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
namespace icing {
namespace lib {
@@ -48,6 +49,7 @@
using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::IsTrue;
+using ::testing::Ne;
using ::testing::NiceMock;
using ::testing::Not;
using ::testing::SizeIs;
@@ -255,11 +257,16 @@
}
TEST_F(IndexTest, SingleHitDedupeIndex) {
+ Crc32 empty_crc = index_->ComputeChecksum();
// Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ Crc32 first_hit_crc = index_->ComputeChecksum();
+ EXPECT_THAT(first_hit_crc.Get(), Ne(empty_crc.Get()));
EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ Crc32 second_hit_crc = index_->ComputeChecksum();
+ EXPECT_THAT(second_hit_crc.Get(), Eq(first_hit_crc.Get()));
// Assert
ICING_ASSERT_OK_AND_ASSIGN(
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc
index 482a5ab..c6cb86d 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc
@@ -82,12 +82,10 @@
"Couldn't get current time. Try again in a bit");
}
- if (options_.filter_deleted) {
- if (!document_store_.DoesDocumentExist(
- delegate_->doc_hit_info().document_id())) {
- // Document doesn't exist, keep searching
- return Advance();
- }
+ if (!document_store_.DoesDocumentExist(
+ delegate_->doc_hit_info().document_id())) {
+ // Document doesn't exist, keep searching
+ return Advance();
}
// Try to get the DocumentFilterData
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index bf027e4..9119610 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -37,10 +37,6 @@
class DocHitInfoIteratorFilter : public DocHitInfoIterator {
public:
struct Options {
- // Filter out/don't return DocHitInfos that are associated with nonexistent
- // Documents.
- bool filter_deleted = true;
-
// List of namespaces that documents must have. An empty vector means that
// all namespaces are valid, and no documents will be filtered out.
//
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
index e769013..9eb147a 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
@@ -105,33 +105,6 @@
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
-TEST_F(DocHitInfoIteratorDeletedFilterTest, TurnOffDeletedFilterOk) {
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(test_document1_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- document_store_->Put(test_document2_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- document_store_->Put(test_document3_));
-
- // Deletes test document 2
- ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(),
- test_document2_.uri()));
-
- std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
- DocHitInfo(document_id2),
- DocHitInfo(document_id3)};
- std::unique_ptr<DocHitInfoIterator> original_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- options_.filter_deleted = false;
- DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
-
- EXPECT_THAT(GetDocumentIds(&filtered_iterator),
- ElementsAre(document_id1, document_id2, document_id3));
-}
-
TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) {
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(test_document1_));
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
similarity index 88%
rename from icing/index/iterator/doc-hit-info-iterator-term.cc
rename to icing/index/lite/doc-hit-info-iterator-term-lite.cc
index 97ca3c4..a975f86 100644
--- a/icing/index/iterator/doc-hit-info-iterator-term.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
#include <cstdint>
@@ -40,7 +40,7 @@
} // namespace
-libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() {
+libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
if (cached_hits_idx_ == -1) {
ICING_RETURN_IF_ERROR(RetrieveMoreHits());
} else {
@@ -59,7 +59,7 @@
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() {
+libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
// Exact match only. All hits in lite lexicon are exact.
ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
@@ -70,12 +70,13 @@
return libtextclassifier3::Status::OK;
}
-std::string DocHitInfoIteratorTermExact::ToString() const {
+std::string DocHitInfoIteratorTermLiteExact::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_);
}
-libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() {
+libtextclassifier3::Status
+DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
// Take union of lite terms.
int term_len = term_.length();
int terms_matched = 0;
@@ -97,7 +98,7 @@
return libtextclassifier3::Status::OK;
}
-void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() {
+void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
// Re-sort cached document_ids and merge sections.
sort(cached_hits_.begin(), cached_hits_.end());
@@ -116,7 +117,7 @@
cached_hits_.resize(idx + 1);
}
-std::string DocHitInfoIteratorTermPrefix::ToString() const {
+std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_, "*");
}
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
similarity index 63%
rename from icing/index/iterator/doc-hit-info-iterator-term.h
rename to icing/index/lite/doc-hit-info-iterator-term-lite.h
index 21d1dd6..bd2de6d 100644
--- a/icing/index/iterator/doc-hit-info-iterator-term.h
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
-#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
#include <cstdint>
#include <vector>
@@ -28,11 +28,12 @@
namespace icing {
namespace lib {
-class DocHitInfoIteratorTerm : public DocHitInfoIterator {
+class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
public:
- explicit DocHitInfoIteratorTerm(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index, const std::string term,
- SectionIdMask section_restrict_mask)
+ explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
: term_(term),
lite_index_(lite_index),
cached_hits_idx_(-1),
@@ -66,14 +67,14 @@
const SectionIdMask section_restrict_mask_;
};
-class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm {
+class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
public:
- explicit DocHitInfoIteratorTermExact(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
+ explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
std::string ToString() const override;
@@ -81,14 +82,14 @@
libtextclassifier3::Status RetrieveMoreHits() override;
};
-class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm {
+class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
public:
- explicit DocHitInfoIteratorTermPrefix(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
+ explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
std::string ToString() const override;
@@ -105,4 +106,4 @@
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc
new file mode 100644
index 0000000..0640135
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.cc
@@ -0,0 +1,166 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+ std::string mask(kMaxSectionId + 1, '0');
+ for (SectionId i = kMaxSectionId; i >= 0; --i) {
+ if (section_id_mask & (1U << i)) {
+ mask[kMaxSectionId - i] = '1';
+ }
+ }
+ return mask;
+}
+
+} // namespace
+
+libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() {
+ if (posting_list_accessor_ == nullptr ||
+ cached_doc_hit_infos_idx_ == (cached_doc_hit_infos_.size() - 2)) {
+ // If we haven't retrieved any hits before or we've already returned all but
+ // the last cached hit, then go get some more!
+ // We hold back the last cached hit because it could have more hits on the
+ // next posting list in the chain.
+ ICING_RETURN_IF_ERROR(RetrieveMoreHits());
+ } else {
+ ++cached_doc_hit_infos_idx_;
+ }
+ if (cached_doc_hit_infos_idx_ == -1 ||
+ cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) {
+ // Nothing more for the iterator to return. Set these members to invalid
+ // values.
+ doc_hit_info_ = DocHitInfo();
+ hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ doc_hit_info_ = cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_);
+ hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() {
+ DocHitInfo last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = cached_doc_hit_infos_.back();
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
+    // Carry over the last hit. It might need to be merged with the first hit
+    // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(last_doc_hit_info);
+ }
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(posting_list_accessor_,
+ main_index_->GetAccessorForExactTerm(term_));
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ ++num_blocks_inspected_;
+ cached_doc_hit_infos_.reserve(hits.size() + 1);
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+ // We want exact hits, skip prefix-only hits.
+ if (hit.is_prefix_hit()) {
+ continue;
+ }
+ if (cached_doc_hit_infos_.empty() ||
+ hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
+ cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
+ }
+ cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainExact::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_);
+}
+
+libtextclassifier3::Status
+DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() {
+ DocHitInfo last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = cached_doc_hit_infos_.back();
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
+    // Carry over the last hit. It might need to be merged with the first hit
+    // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(last_doc_hit_info);
+ }
+
+ ++num_blocks_inspected_;
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(
+ MainIndex::GetPrefixAccessorResult result,
+ main_index_->GetAccessorForPrefixTerm(term_));
+ posting_list_accessor_ = std::move(result.accessor);
+ exact_ = result.exact;
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ cached_doc_hit_infos_.reserve(hits.size());
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+ // If we only want hits from prefix sections.
+ if (!exact_ && !hit.is_in_prefix_section()) {
+ continue;
+ }
+ if (cached_doc_hit_infos_.empty() ||
+ hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
+ cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
+ }
+ cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainPrefix::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_, "*");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h
new file mode 100644
index 0000000..1f77226
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.h
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class DocHitInfoIteratorTermMain : public DocHitInfoIterator {
+ public:
+ explicit DocHitInfoIteratorTermMain(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : term_(term),
+ main_index_(main_index),
+ cached_doc_hit_infos_idx_(-1),
+ num_advance_calls_(0),
+ num_blocks_inspected_(0),
+ next_posting_list_id_(PostingListIdentifier::kInvalid),
+ section_restrict_mask_(section_restrict_mask) {}
+
+ libtextclassifier3::Status Advance() override;
+
+ int32_t GetNumBlocksInspected() const override {
+ return num_blocks_inspected_;
+ }
+ int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+
+ protected:
+ // Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_.
+ virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+ const std::string term_;
+ // The accessor of the posting list chain for the requested term.
+ std::unique_ptr<PostingListAccessor> posting_list_accessor_;
+
+ MainIndex* main_index_;
+ // Stores hits retrieved from the index. This may only be a subset of the hits
+ // that are present in the index. Current value pointed to by the Iterator is
+ // tracked by cached_doc_hit_infos_idx_.
+ std::vector<DocHitInfo> cached_doc_hit_infos_;
+ int cached_doc_hit_infos_idx_;
+ int num_advance_calls_;
+ int num_blocks_inspected_;
+ PostingListIdentifier next_posting_list_id_;
+ // Mask indicating which sections hits should be considered for.
+ // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+ const SectionIdMask section_restrict_mask_;
+};
+
+class DocHitInfoIteratorTermMainExact : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainExact(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+class DocHitInfoIteratorTermMainPrefix : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainPrefix(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+  // After retrieving DocHitInfos from the index, there may be a DocHitInfo
+  // for docid 1 and "foo" and a DocHitInfo for docid 1 and "fool". These
+  // DocHitInfos should be merged.
+ void SortAndDedupeDocumentIds();
+ // Whether or not posting_list_accessor_ holds a posting list chain for
+ // 'term' or for a term for which 'term' is a prefix. This is necessary to
+ // determine whether to return hits that are not from a prefix section (hits
+ // not from a prefix section should only be returned if exact_ is true).
+ bool exact_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
diff --git a/icing/index/main/flash-index-storage-header.h b/icing/index/main/flash-index-storage-header.h
new file mode 100644
index 0000000..f81e99e
--- /dev/null
+++ b/icing/index/main/flash-index-storage-header.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+#define ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+
+namespace icing {
+namespace lib {
+
+// The class used to manage the flash block that contains the header for
+// FlashIndexStorage. This contains information about the index blocks that
+// store the posting lists.
+class HeaderBlock {
+ public:
+ // The class used to access the actual header.
+ struct Header {
+ // A magic used to mark the beginning of a valid header.
+ static constexpr int kMagic = 0x6dfba6ae;
+ int magic;
+ int block_size;
+ int last_indexed_docid;
+ // The size of the index_block_infos array.
+ int num_index_block_infos;
+
+ struct IndexBlockInfo {
+ // The size of the posting lists that fit on all the index blocks in this
+ // chain. Each block on this posting list will have posting lists of size
+ // posting_list_bytes.
+ int posting_list_bytes;
+ // The block index of the first block in the free list chain.
+ int free_list_block_index;
+ };
+ // Variable-size array, num_index_block_infos long. Can have a max length
+ // of log(block_size). This array is used to maintain a free list for the
+ // available blocks.
+ IndexBlockInfo index_block_infos[0];
+ };
+
+ // Read HeaderBlock from the specified fd.
+ //
+ // RETURNS:
+ // - HeaderBlock, on success
+ // - INTERNAL if unable to read block_size bytes from fd.
+ static libtextclassifier3::StatusOr<HeaderBlock> Read(
+ const Filesystem* filesystem, int fd, int block_size) {
+ std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(block_size);
+ if (!filesystem->PRead(fd, buffer.get(), block_size, 0)) {
+ return absl_ports::InternalError("Unable to reader header block!");
+ }
+ return HeaderBlock(filesystem, std::move(buffer), block_size);
+ }
+
+ // Make a new HeaderBlock with the specified size.
+ explicit HeaderBlock(const Filesystem* filesystem, int block_size)
+ : HeaderBlock(filesystem, std::make_unique<uint8_t[]>(block_size),
+ block_size) {
+ std::memset(header_buffer_.get(), 0, block_size);
+ }
+
+ Header* header() const {
+ return reinterpret_cast<Header*>(header_buffer_.get());
+ }
+
+ // Add another entry to the index_block_infos array and return a pointer to
+ // that entry. Returns a nullptr if the index_block_infos array is already
+ // at a max size.
+ Header::IndexBlockInfo* AddIndexBlockInfo() {
+ if (size() + sizeof(Header::IndexBlockInfo) > block_size_) {
+ return nullptr;
+ }
+ ++header()->num_index_block_infos;
+ return header()->index_block_infos + (header()->num_index_block_infos - 1);
+ }
+
+ // Returns the size of the header block currently in use.
+ int size() const {
+ return sizeof(Header) +
+ header()->num_index_block_infos * sizeof(Header::IndexBlockInfo);
+ }
+
+ // Writes the header to fd. Returns true on success.
+ bool Write(int fd) {
+ return filesystem_->PWrite(fd, 0, header_buffer_.get(), block_size_);
+ }
+
+ private:
+ explicit HeaderBlock(const Filesystem* filesystem,
+ std::unique_ptr<uint8_t[]> buffer, int block_size)
+ : filesystem_(filesystem),
+ header_buffer_(std::move(buffer)),
+ block_size_(block_size) {}
+
+ const Filesystem* filesystem_; // does NOT own!
+ std::unique_ptr<uint8_t[]> header_buffer_;
+ int block_size_;
+};
+static_assert(16 == sizeof(HeaderBlock::Header),
+ "Header has changed size. Consider how this change might affect "
+ "pre-existing indices.");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
diff --git a/icing/index/main/flash-index-storage.cc b/icing/index/main/flash-index-storage.cc
new file mode 100644
index 0000000..b88d7fe
--- /dev/null
+++ b/icing/index/main/flash-index-storage.cc
@@ -0,0 +1,511 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/flash-index-storage.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <unordered_set>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/math-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+uint32_t SelectBlockSize() {
+ // This should be close to the flash page size.
+ static constexpr uint32_t kMinBlockSize = 4096;
+
+ // Determine a good block size.
+ uint32_t page_size = getpagesize();
+ uint32_t block_size = std::max(kMinBlockSize, page_size);
+
+ // Align up to the nearest page size.
+ return math_util::RoundUpTo(block_size, page_size);
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ bool in_memory) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+ FlashIndexStorage storage(index_filename, filesystem, in_memory);
+ if (!storage.Init()) {
+ return absl_ports::InternalError(
+ "Unable to successfully read header block!");
+ }
+ return storage;
+}
+
+FlashIndexStorage::FlashIndexStorage(const std::string& index_filename,
+ const Filesystem* filesystem,
+ bool has_in_memory_freelists)
+ : index_filename_(index_filename),
+ num_blocks_(0),
+ filesystem_(filesystem),
+ has_in_memory_freelists_(has_in_memory_freelists) {}
+
+FlashIndexStorage::~FlashIndexStorage() {
+ if (header_block_ != nullptr) {
+ FlushInMemoryFreeList();
+ PersistToDisk();
+ }
+}
+
+bool FlashIndexStorage::Init() {
+ block_fd_ = ScopedFd(filesystem_->OpenForWrite(index_filename_.c_str()));
+ if (!block_fd_.is_valid()) {
+ return false;
+ }
+
+ // Read in or create the header.
+ return InitHeader();
+}
+
+bool FlashIndexStorage::InitHeader() {
+ // Look for an existing file size.
+ int64_t file_size = filesystem_->GetFileSize(block_fd_.get());
+ if (file_size == Filesystem::kBadFileSize) {
+ ICING_LOG(ERROR) << "Could not initialize main index. Bad file size.";
+ return false;
+ }
+
+ if (file_size == 0) {
+ if (!CreateHeader()) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to create header.";
+ return false;
+ }
+ } else {
+ if (!OpenHeader(file_size)) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to open header.";
+ return false;
+ }
+ }
+ in_memory_freelists_.resize(header_block_->header()->num_index_block_infos);
+
+ return true;
+}
+
+bool FlashIndexStorage::CreateHeader() {
+ uint32_t block_size = SelectBlockSize();
+ header_block_ = std::make_unique<HeaderBlock>(filesystem_, block_size);
+ // Initialize.
+ header_block_->header()->magic = HeaderBlock::Header::kMagic;
+ header_block_->header()->block_size = block_size;
+ header_block_->header()->last_indexed_docid = kInvalidDocumentId;
+
+ // Work down from the largest posting list that fits in
+ // block_size. We don't care about locality of blocks because this
+ // is a flash index.
+ for (uint32_t posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(block_size);
+ posting_list_bytes >= posting_list_utils::min_posting_list_size();
+ posting_list_bytes /= 2) {
+ uint32_t aligned_posting_list_bytes =
+ (posting_list_bytes / sizeof(Hit) * sizeof(Hit));
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Block size %u: %u", header_block_->header()->num_index_block_infos,
+ aligned_posting_list_bytes);
+
+ // Initialize free list to empty.
+ HeaderBlock::Header::IndexBlockInfo* block_info =
+ header_block_->AddIndexBlockInfo();
+ if (block_info == nullptr) {
+      // This should never happen anyway. Min block size is 4k, so adding these
+ // IndexBlockInfos should never exceed the block size.
+ return false;
+ }
+ block_info->posting_list_bytes = aligned_posting_list_bytes;
+ block_info->free_list_block_index = kInvalidBlockIndex;
+ }
+
+ // Write the header.
+ if (!header_block_->Write(block_fd_.get())) {
+ filesystem_->Truncate(block_fd_.get(), 0);
+ return false;
+ }
+ num_blocks_ = 1;
+ return true;
+}
+
+bool FlashIndexStorage::OpenHeader(int64_t file_size) {
+ uint32_t block_size = SelectBlockSize();
+ // Read and validate header.
+ ICING_ASSIGN_OR_RETURN(
+ HeaderBlock read_header,
+ HeaderBlock::Read(filesystem_, block_fd_.get(), block_size), false);
+ if (read_header.header()->magic != HeaderBlock::Header::kMagic) {
+ ICING_LOG(ERROR) << "Index header block wrong magic";
+ return false;
+ }
+ if (file_size % read_header.header()->block_size != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Index size %" PRIu64 " not a multiple of block size %u", file_size,
+ read_header.header()->block_size);
+ return false;
+ }
+
+ if (file_size < static_cast<int64_t>(read_header.header()->block_size)) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Index size %" PRIu64 " shorter than block size %u", file_size,
+ read_header.header()->block_size);
+ return false;
+ }
+
+ if (read_header.header()->block_size % getpagesize() != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Block size %u is not a multiple of page size %d",
+ read_header.header()->block_size, getpagesize());
+ return false;
+ }
+ num_blocks_ = file_size / read_header.header()->block_size;
+ if (block_size != read_header.header()->block_size) {
+ // The block_size changed? That's weird. But the old block_size is still
+ // valid (it must be some multiple of the new block_size). So reinitialize
+ // with that old block size. Using the old block size means that we can
+ // still use the main index, but reads/writes won't be as efficient in terms
+ // of flash IO because the 'blocks' that we're reading are actually multiple
+ // pages long.
+ ICING_LOG(ERROR) << "Block size of existing header ("
+ << read_header.header()->block_size
+ << ") does not match the requested block size ("
+ << block_size << "). Defaulting to existing block size "
+ << read_header.header()->block_size;
+ ICING_ASSIGN_OR_RETURN(HeaderBlock read_header,
+ HeaderBlock::Read(filesystem_, block_fd_.get(),
+ read_header.header()->block_size),
+ false);
+ }
+ header_block_ = std::make_unique<HeaderBlock>(std::move(read_header));
+
+ // Check for memory alignment on posting_list_bytes. See b/29983315.
+ // The issue of potential corruption to the header could also be handled by
+ // checksumming the header block.
+ for (int i = 0; i < header_block_->header()->num_index_block_infos; ++i) {
+ int posting_list_bytes =
+ header_block_->header()->index_block_infos[i].posting_list_bytes;
+ if (posting_list_bytes % sizeof(Hit) != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Posting list size misaligned, index %u, size %u, hit %zu, "
+ "file_size %" PRIu64,
+ i, header_block_->header()->index_block_infos[i].posting_list_bytes,
+ sizeof(Hit), file_size);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool FlashIndexStorage::PersistToDisk() {
+ // First, write header.
+ if (!header_block_->Write(block_fd_.get())) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Write index header failed: %s", strerror(errno));
+ return false;
+ }
+
+ // Then sync.
+ return filesystem_->DataSync(block_fd_.get());
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingList(PostingListIdentifier id) const {
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(id.block_index()));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(id.posting_list_index()));
+ PostingListHolder holder = {std::move(posting_list), std::move(block), id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::GetIndexBlock(
+ int block_index) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %d when only %d blocks have "
+ "been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ *filesystem_, index_filename_, offset, block_size());
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock(
+ int block_index, uint32_t posting_list_size) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %d when only %d blocks have "
+ "been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromUninitializedRegion(
+ *filesystem_, index_filename_, offset, block_size(), posting_list_size);
+}
+
+int FlashIndexStorage::FindBestIndexBlockInfo(
+ uint32_t posting_list_bytes) const {
+ int i = header_block_->header()->num_index_block_infos - 1;
+ for (; i >= 0; i--) {
+ if (header_block_->header()->index_block_infos[i].posting_list_bytes >=
+ posting_list_bytes) {
+ return i;
+ }
+ }
+ return i;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromInMemoryFreeList(int block_info_index) {
+ // Get something from in memory free list.
+ ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
+ in_memory_freelists_[block_info_index].TryPop());
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed. First, we
+ // need to free this posting list.
+ ICING_ASSIGN_OR_RETURN(IndexBlock block,
+ GetIndexBlock(posting_list_id.block_index()));
+ block.FreePostingList(posting_list_id.posting_list_index());
+
+ // Now, we can allocate a posting list from the same index block. It may not
+ // be the same posting list that was just freed, but that's okay.
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ posting_list_id =
+ PostingListIdentifier(posting_list_id.block_index(), posting_list_index,
+ posting_list_id.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromOnDiskFreeList(int block_info_index) {
+ // Get something from the free list.
+ uint32_t block_index = header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index;
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ // Get the index block
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(block_index));
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id = PostingListIdentifier(
+ block_index, posting_list_index, block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ if (!block.has_free_posting_lists()) {
+ RemoveFromOnDiskFreeList(block_index, block_info_index, &block);
+ }
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocateNewPostingList(int block_info_index) {
+ uint32_t block_index = GrowIndex();
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::ResourceExhaustedError(
+ "Unable to grow the index further!");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ IndexBlock block,
+ CreateIndexBlock(block_index, header_block_->header()
+ ->index_block_infos[block_info_index]
+ .posting_list_bytes));
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id = PostingListIdentifier(
+ block_index, posting_list_index, block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ if (block.has_free_posting_lists()) {
+ AddToOnDiskFreeList(block_index, block_info_index, &block);
+ }
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocatePostingList(uint32_t min_posting_list_bytes) {
+ int max_block_size = IndexBlock::CalculateMaxPostingListBytes(block_size());
+ if (min_posting_list_bytes > max_block_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d exceeds max posting list size %d",
+ min_posting_list_bytes, max_block_size));
+ }
+ int best_block_info_index = FindBestIndexBlockInfo(min_posting_list_bytes);
+
+ auto holder_or = GetPostingListFromInMemoryFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ // Nothing in memory. Look for something in the block file.
+ holder_or = GetPostingListFromOnDiskFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ return AllocateNewPostingList(best_block_info_index);
+}
+
+void FlashIndexStorage::AddToOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block) {
+ index_block->set_next_block_index(header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index);
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = block_index;
+}
+
+void FlashIndexStorage::RemoveFromOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block) {
+ // Cannot be used anymore. Move free ptr to the next block.
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = index_block->next_block_index();
+ index_block->set_next_block_index(kInvalidBlockIndex);
+}
+
+void FlashIndexStorage::FreePostingList(PostingListHolder holder) {
+ uint32_t posting_list_bytes = holder.block.get_posting_list_bytes();
+ int best_block_info_index = FindBestIndexBlockInfo(posting_list_bytes);
+
+ // It *should* be guaranteed elsewhere that FindBestIndexBlockInfo will not
+  // return a value >= in_memory_freelists_.size(), but check regardless. If it
+ // doesn't fit for some reason, then put it in the Header free list instead.
+ if (has_in_memory_freelists_ &&
+ best_block_info_index < in_memory_freelists_.size()) {
+ in_memory_freelists_[best_block_info_index].Push(holder.id);
+ } else {
+ bool was_full = !holder.block.has_free_posting_lists();
+ holder.block.FreePostingList(holder.id.posting_list_index());
+ // If this block was not already full, then it is already in the free list.
+ if (was_full) {
+ AddToOnDiskFreeList(holder.id.block_index(), best_block_info_index,
+ &holder.block);
+ }
+ }
+}
+
+int FlashIndexStorage::GrowIndex() {
+ if (num_blocks_ >= kMaxBlockIndex) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf("Reached max block index %u",
+ kMaxBlockIndex);
+ return kInvalidBlockIndex;
+ }
+
+ // Grow the index file.
+ if (!filesystem_->Grow(
+ block_fd_.get(),
+ static_cast<uint64_t>(num_blocks_ + 1) * block_size())) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Error growing index file: %s", strerror(errno));
+ return kInvalidBlockIndex;
+ }
+
+ return num_blocks_++;
+}
+
+void FlashIndexStorage::FlushInMemoryFreeList() {
+ for (int i = 0; i < in_memory_freelists_.size(); ++i) {
+ FreeList& freelist = in_memory_freelists_.at(i);
+ auto freelist_elt_or = freelist.TryPop();
+ while (freelist_elt_or.ok()) {
+ PostingListIdentifier freelist_elt = freelist_elt_or.ValueOrDie();
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed.
+ // First, we need to free this posting list.
+ auto block_or = GetIndexBlock(freelist_elt.block_index());
+ if (!block_or.ok()) {
+ // Can't read the block. Nothing to do here. This posting list will have
+ // to leak. Just proceed to the next freelist element.
+ freelist_elt_or = freelist.TryPop();
+ continue;
+ }
+ IndexBlock block = std::move(block_or).ValueOrDie();
+ bool was_full = !block.has_free_posting_lists();
+ block.FreePostingList(freelist_elt.posting_list_index());
+ // If this block was not already full, then it is already in the free
+ // list.
+ if (was_full) {
+ AddToOnDiskFreeList(freelist_elt.block_index(), /*block_info_index=*/i,
+ &block);
+ }
+ freelist_elt_or = freelist.TryPop();
+ }
+ }
+}
+
+// FreeList.
+void FlashIndexStorage::FreeList::Push(PostingListIdentifier id) {
+ if (free_list_.size() >= kMaxSize) {
+ ICING_LOG(WARNING)
+ << "Freelist for posting lists of size (block_size / "
+ << (1u << id.posting_list_index_bits())
+ << ") has reached max size. Dropping freed posting list [block_index:"
+ << id.block_index()
+ << ", posting_list_index:" << id.posting_list_index() << "]";
+ return;
+ }
+
+ free_list_.push_back(id);
+}
+
+libtextclassifier3::StatusOr<PostingListIdentifier>
+FlashIndexStorage::FreeList::TryPop() {
+ if (free_list_.empty()) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ PostingListIdentifier id = free_list_.back();
+ free_list_.pop_back();
+ return id;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
new file mode 100644
index 0000000..958f131
--- /dev/null
+++ b/icing/index/main/flash-index-storage.h
@@ -0,0 +1,275 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_
+#define ICING_INDEX_FLASH_INDEX_STORAGE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/main/flash-index-storage-header.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// The PostingListHolder struct exists to group together related PostingListUsed
+// IndexBlock pairs and their ids.
+struct PostingListHolder {
+ // PostingListUseds interpret data that they themselves do NOT own. The data
+ // being interpreted is stored on a flash block and its memory mapping is
+ // owned by the IndexBlock. As such, the lifecycle of the PostingListUsed must
+ // NOT exceed the lifecycle of the IndexBlock.
+ PostingListUsed posting_list;
+ IndexBlock block;
+ // The PostingListIdentifier, which identifies both the IndexBlock and the
+ // PostingListUsed, is also returned for convenience.
+ PostingListIdentifier id;
+};
+
+// The FlashIndexStorage class manages the actual file that makes up the index.
+// It allocates IndexBlocks as needed and maintains freelists to prevent
+// excessive block fragmentation.
+//
+// It maintains two types of free lists:
+// 1. On-disk, Header free list - This free list is stored in the Header
+// block. There is a free list for every possible posting list size. Each
+// entry for a posting list size contains the block_index of the
+// IndexBlock that starts the free list chain. Each IndexBlock in the free
+// list chain stores the index of the next IndexBlock in the chain.
+// 2. In-memory free list - Like the Header free list, there is a free list of
+// every possible posting list size. This free list contains not just the
+// block_index of the available IndexBlock, but also the posting_list_index
+// of the available PostingListUsed within the IndexBlock. This is because,
+// unlike the Header free list, PostingListUseds are not actually freed
+// when added to this free list.
+//
+// Whether or not the in-memory free list is used can be chosen via the
+// in_memory param to the Create factory function.
+//
+// The advantage of using the in-memory free list is that it reduces the amount
+// of flash writes made while editing the index (because actually freeing the
+// PostingLists would require writing to that flash block). The disadvantage is
+// that it introduces code complexity and potentially leaks blocks if power is
+// lost or if FlashIndexStorage is destroyed before emptying the free list.
+class FlashIndexStorage {
+ public:
+ // Creates a FlashIndexStorage at index_filename. in_memory determines whether
+ // or not the FlashIndexStorage maintains an in-memory freelist in order to
+ // avoid writes to the on-disk freelist.
+ //
+ // RETURNS:
+ // - On success, a valid instance of FlashIndexStorage
+ // - INTERNAL error if unable to create a new header or read the existing
+ // one from disk.
+ static libtextclassifier3::StatusOr<FlashIndexStorage> Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ bool in_memory = true);
+
+ // Retrieve the PostingList referred to by PostingListIdentifier. This posting
+ // list must have been previously allocated by a prior call to
+ // AllocatePostingList.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - INVALID_ARGUMENT if id.posting_list_index() is out of bounds in the
+ // IndexBlock referred to by id.block_index()
+ // - INTERNAL_ERROR if unable to access the region in file.
+ libtextclassifier3::StatusOr<PostingListHolder> GetPostingList(
+ PostingListIdentifier id) const;
+
+ // Allocates and returns a PostingListHolder containing a PostingListUsed that
+ // can fit min_posting_list_bytes.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to create a
+ // PostingListUsed of the requested size.
+ libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList(
+ uint32_t min_posting_list_bytes);
+
+ ~FlashIndexStorage();
+ FlashIndexStorage(FlashIndexStorage&&) = default;
+ FlashIndexStorage(const FlashIndexStorage&) = delete;
+ FlashIndexStorage& operator=(FlashIndexStorage&&) = default;
+ FlashIndexStorage& operator=(const FlashIndexStorage&) = delete;
+
+ // Free the PostingListUsed that this holder holds.
+ void FreePostingList(PostingListHolder holder);
+
+ // Used to track the largest docid indexed in the index.
+ DocumentId get_last_indexed_docid() const {
+ return header_block_->header()->last_indexed_docid;
+ }
+ void set_last_indexed_docid(DocumentId docid) {
+ header_block_->header()->last_indexed_docid = docid;
+ }
+
+ // Updates the header and persists all changes to the index to disk. Returns
+ // true on success.
+ bool PersistToDisk();
+
+ // Returns the size of the index file in bytes.
+ int64_t GetDiskUsage() const {
+ return filesystem_->GetDiskUsage(block_fd_.get());
+ }
+
+ int num_blocks() const { return num_blocks_; }
+
+ // Info about the index based on the block size.
+ int block_size() const { return header_block_->header()->block_size; }
+
+ // Num blocks starts at 1 since the first block is the header.
+ bool empty() const { return num_blocks_ <= 1; }
+
+ // The percentage of the maximum index size that is free. Allocated blocks are
+ // treated as fully used, even if they are only partially used. In this way,
+ // min_free_fraction is a lower bound of available space.
+ double min_free_fraction() const {
+ return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex;
+ }
+
+ private:
+ FlashIndexStorage(const std::string& index_filename,
+ const Filesystem* filesystem, bool has_in_memory_freelists);
+
+ // Init the index from persistence. Create if file does not exist. We do not
+ // erase corrupt files.
+ //
+ // Returns false if unable to create a new header or if the existing one is
+ // corrupt.
+ bool Init();
+
+ // Create or open the header block. Returns true on success.
+ bool InitHeader();
+
+ // Create a new header block for an empty index file.
+ bool CreateHeader();
+
+ // Loads the header stored at the beginning of the index file and validates
+ // the values stored in it.
+ bool OpenHeader(int64_t file_size);
+
+  // Add the IndexBlock referred to by block_index to the on-disk free list
+  // with index block_info_index.
+ void AddToOnDiskFreeList(uint32_t block_index, int block_info_index,
+ IndexBlock* index_block);
+
+ // Remove the IndexBlock referred to by block_index from the Header free list
+ // with index block_info_index.
+ void RemoveFromOnDiskFreeList(uint32_t block_index, int block_info_index,
+ IndexBlock* index_block);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the in-memory freelist at block_info_index
+ // - NOT_FOUND if there was no entry in the freelist
+ // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
+ // allocated for some reason.
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromInMemoryFreeList(int block_info_index);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the on-disk freelist at block_info_index
+ // - NOT_FOUND if there was no entry in the freelist
+ // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
+ // allocated for some reason.
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromOnDiskFreeList(int block_info_index);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from a newly allocated
+ // IndexBlock.
+ // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new
+ // IndexBlock.
+ libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList(
+ int block_info_index);
+
+ // Returns:
+ // - On success, a newly created IndexBlock at block_index with posting
+ // lists of size posting_list_size
+ // - INTERNAL_ERROR if unable to access the region in file representing the
+ // IndexBlock
+ libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock(
+ int block_index, uint32_t posting_list_size) const;
+
+ // Returns:
+ // - On success, the IndexBlock that exists at block_index
+ // - INTERNAL_ERROR if unable to access the region in file representing the
+ // IndexBlock
+ libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(int block_index) const;
+
+ // Add a new block to the end of the file and return its block
+ // index. Returns kInvalidBlockIndex if unable to grow the index file.
+ int GrowIndex();
+
+ // Return the index into index_block_infos of the smallest posting_list free
+ // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds
+ // the max-sized posting list.
+ int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const;
+
+ // Flushes the in-memory free list to disk.
+ void FlushInMemoryFreeList();
+
+ // Underlying filename.
+ std::string index_filename_;
+
+ // We open the index file into this fd.
+ ScopedFd block_fd_;
+ int num_blocks_; // can be inferred from index file size
+
+ std::unique_ptr<HeaderBlock> header_block_;
+
+ // In-memory cache of free posting lists.
+ struct FreeList {
+    // Experimentally determined that the high watermark for the largest
+    // freelist was ~3500.
+ static constexpr size_t kMaxSize = 4096;
+
+ // Push a new PostingListIdentifier if there is space.
+ void Push(PostingListIdentifier id);
+
+ // Attempt to pop a PostingListIdentifier.
+ //
+ // RETURNS:
+ // - identifier of a free posting list, on success
+ // - NOT_FOUND if there are no free posting lists on this free list.
+ libtextclassifier3::StatusOr<PostingListIdentifier> TryPop();
+
+ private:
+ std::vector<PostingListIdentifier> free_list_;
+ };
+ std::vector<FreeList> in_memory_freelists_;
+
+ const Filesystem* filesystem_; // not owned; can't be null
+
+ bool has_in_memory_freelists_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_
diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/index/main/flash-index-storage_test.cc
new file mode 100644
index 0000000..cf899b3
--- /dev/null
+++ b/icing/index/main/flash-index-storage_test.cc
@@ -0,0 +1,540 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/flash-index-storage.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/hit.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Not;
+
+class FlashIndexStorageTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ protected:
+ std::string test_dir_;
+ std::string file_name_;
+ Filesystem filesystem_;
+};
+
+TEST_F(FlashIndexStorageTest, CorruptHeader) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+ {
+ // Corrupt the header file by changing pl_bytes
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = sizeof(Hit) - 1; // This is intentionally invalid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+ // Read the header file - should fail because pl_bytes is not divisible
+ // by sizeof(Hit), which is 5 as of writing
+ ASSERT_THAT(FlashIndexStorage::Create(file_name_, &filesystem_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+ {
+ // Correct the pl_bytes header alignment
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = 2 * sizeof(Hit); // Should be valid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+
+ // Delete the file
+ filesystem_.DeleteFile(file_name_.c_str());
+}
+
+TEST_F(FlashIndexStorageTest, EmptyStorage) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ // An 'empty' FlashIndexStorage should have:
+ // 1. One block allocated for the header
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ // 2. The invalid DocumentId stored in its header
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+    // 3. Its disk usage should be the equivalent of one block.
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+ {
+ // Read the valid header. All functions should return the same values.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+    // 3. Now, free the first posting list. This should add it to the free
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, /*in_memory=*/false));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+    // 3. Now, free the first posting list. This should add it to the free
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
+ PostingListIdentifier id1 = PostingListIdentifier::kInvalid;
+ int half_block_posting_list_size = 0;
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ half_block_posting_list_size = (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+      // 3. Now, free the first posting list. This should add it to the free
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+ }
+
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+ // 4. The FlashIndexStorage should go out of scope and flush the in-memory
+ // posting list to disk
+ }
+
+ {
+ // Recreate the flash index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ {
+ // 5. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ const int kQuarterBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 4;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get a PL that is 1/4 block size. Even though a 1/4 block PL could
+ // theoretically fit in the same block, we'll allocate a new one because PLs
+ // on a block are required to be the same size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should have grown by one block.
+ EXPECT_THAT(posting_list_holder2.id.block_index(),
+ Not(Eq(id1.block_index())));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Request another 1/4 block-size posting list. This should NOT grow the
+ // index because there should be three free posting lists on block2.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should have remained the same size as before and the
+ // third posting list holder should use the same block as the second
+ // posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.block_index(),
+ Eq(posting_list_holder2.id.block_index()));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(3 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, AllocateTooLargePostingList) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ // Request a PL that is 2x block size.
+ const int kDoubleBlockSize = flash_index_storage.block_size() * 2;
+ EXPECT_THAT(flash_index_storage.AllocatePostingList(kDoubleBlockSize),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/index-block.cc b/icing/index/main/index-block.cc
index 9d7df3c..652dbc6 100644
--- a/icing/index/main/index-block.cc
+++ b/icing/index/main/index-block.cc
@@ -105,11 +105,12 @@
posting_lists_start_ptr_(mmapped_block.mutable_region() +
sizeof(BlockHeader)),
block_size_in_bytes_(mmapped_block.region_size()),
- mmapped_block_(std::move(mmapped_block)) {}
+ mmapped_block_(
+ std::make_unique<MemoryMappedFile>(std::move(mmapped_block))) {}
libtextclassifier3::Status IndexBlock::Reset(int posting_list_bytes) {
- ICING_RETURN_IF_ERROR(ValidatePostingListBytes(posting_list_bytes,
- mmapped_block_.region_size()));
+ ICING_RETURN_IF_ERROR(ValidatePostingListBytes(
+ posting_list_bytes, mmapped_block_->region_size()));
header_->free_list_posting_list_index = kInvalidPostingListIndex;
header_->next_block_index = kInvalidBlockIndex;
header_->posting_list_bytes = posting_list_bytes;
diff --git a/icing/index/main/index-block.h b/icing/index/main/index-block.h
index 1d17e34..edf9a79 100644
--- a/icing/index/main/index-block.h
+++ b/icing/index/main/index-block.h
@@ -20,6 +20,7 @@
#include <algorithm>
#include <limits>
+#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
@@ -95,6 +96,12 @@
IndexBlock(IndexBlock&&) = default;
IndexBlock& operator=(IndexBlock&&) = default;
+ ~IndexBlock() {
+ if (mmapped_block_ != nullptr) {
+ mmapped_block_->PersistToDisk();
+ }
+ }
+
// Instantiate a PostingListUsed at posting_list_index with the existing
// content in the IndexBlock.
//
@@ -206,7 +213,7 @@
uint32_t block_size_in_bytes_;
// MemoryMappedFile used to interact with the underlying flash block.
- MemoryMappedFile mmapped_block_;
+ std::unique_ptr<MemoryMappedFile> mmapped_block_;
};
} // namespace lib
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
new file mode 100644
index 0000000..878038f
--- /dev/null
+++ b/icing/index/main/main-index.cc
@@ -0,0 +1,339 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/index/main/main-index.h"
+
+#include <cstring>
+#include <memory>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Finds the best prefix term in lexicon for which "prefix" is a prefix.
+// 'Best' is defined as the shortest term that holds a valid posting list id.
+// Returns a valid FindTermResult with found=true if either:
+// 1. prefix exists as a term in lexicon.
+// 2. the shortest, valid prefix in the lexicon exists and contains prefix
+// hits.
+// Returns a FindTermResult with found=false and undefined values of tvi and
+// exact if no term was found.
+struct FindTermResult {
+ // TVI of the term that was found. Undefined if found=false.
+ uint32_t tvi;
+ // Whether or not a valid term with prefix hits was found.
+ bool found;
+ // Whether or not that term is equal to 'prefix'
+ bool exact;
+};
+FindTermResult FindShortestValidTermWithPrefixHits(
+ const IcingDynamicTrie* lexicon, const std::string& prefix) {
+ // For prefix indexing: when we are doing a prefix match for "prefix", find
+ // the tvi to the equivalent posting list. prefix's own posting list might not
+ // exist but one of its children acts as a proxy.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *lexicon, GetHasHitsInPrefixSectionPropertyId());
+ uint32_t tvi = 0;
+ bool found = false;
+ bool exact = false;
+ for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid();
+ it.Advance()) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id));
+
+ // Posting list id might be invalid if this is also a backfill term.
+ // Suppose that the main index has two pre-existing prefix hits "foot" and
+ // "fool" - it will have a branch point posting list for "foo". Then, let's
+ // suppose that the other index adds hits for "foul", "four" and "far". This
+ // will result in branch points for "fo" and "f".
+ // If "fo" was added before "f", then the iterator would first give us "fo".
+ // "fo" will have an invalid posting_list_id because it hasn't been
+ // backfilled yet, so we need to continue iterating to "foo".
+ if (posting_list_id.is_valid()) {
+ exact = (prefix.size() == strlen(it.GetKey()));
+ tvi = it.GetValueIndex();
+ // Found it. Does it have prefix hits?
+ found = exact || hits_in_prefix_section.HasProperty(tvi);
+ break;
+ }
+ }
+ FindTermResult result = {tvi, found, exact};
+ return result;
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<MainIndex> MainIndex::Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
+ MainIndex main_index;
+ ICING_RETURN_IF_ERROR(
+ main_index.Init(index_filename, filesystem, icing_filesystem));
+ return main_index;
+}
+
+// TODO(b/139087650) : Migrate off of IcingFilesystem.
+libtextclassifier3::Status MainIndex::Init(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
+ std::string flash_index_file = index_filename + "-main-index";
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index,
+ FlashIndexStorage::Create(flash_index_file, filesystem));
+ flash_index_ = std::make_unique<FlashIndexStorage>(std::move(flash_index));
+
+ std::string lexicon_file = index_filename + "-main-lexicon";
+ IcingDynamicTrie::RuntimeOptions runtime_options;
+ main_lexicon_ = std::make_unique<IcingDynamicTrie>(
+ lexicon_file, runtime_options, icing_filesystem);
+ IcingDynamicTrie::Options lexicon_options;
+ if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
+ !main_lexicon_->Init()) {
+ return absl_ports::InternalError("Failed to initialize lexicon trie");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
+MainIndex::GetAccessorForExactTerm(const std::string& term) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term %s is not present in main lexicon.", term.c_str()));
+ }
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_.get(), posting_list_id));
+ return std::make_unique<PostingListAccessor>(std::move(accessor));
+}
+
+libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult>
+MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
+ bool exact = false;
+ // For prefix indexing: when we are doing a prefix match for
+ // "prefix", find the tvi to the equivalent posting list. prefix's
+ // own posting list might not exist but its shortest child acts as a proxy.
+ //
+ // For example, if there are only two hits in the index are prefix hits for
+ // "bar" and "bat", then both will appear on a posting list for "ba". "b"
+ // won't have a posting list, but "ba" will suffice.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *main_lexicon_, GetHasHitsInPrefixSectionPropertyId());
+ IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str());
+ if (!main_itr.IsValid()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term: %s is not present in the main lexicon.", prefix.c_str()));
+ }
+ exact = (prefix.length() == strlen(main_itr.GetKey()));
+
+ if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
+ // Found it, but it doesn't have prefix hits. Exit early. No need to
+ // retrieve the posting list because there's nothing there for us.
+ return absl_ports::NotFoundError("The prefix has no hits in prefix sections.");
+ }
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_.get(), posting_list_id));
+ GetPrefixAccessorResult result = {std::make_unique<PostingListAccessor>(std::move(pl_accessor)), exact};
+ return result;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) {
+ // Maps new branching points in main lexicon to the term such that
+ // branching_point_term is a prefix of term and there are no terms smaller
+ // than term and greater than branching_point_term.
+ std::string prefix;
+ LexiconMergeOutputs outputs;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // If term were inserted in the main lexicon, what new branching would it
+ // create? (It always creates at most one.)
+ int prefix_len = main_lexicon_->FindNewBranchingPrefixLength(
+ other_term_itr.GetKey(), /*utf8=*/true);
+ if (prefix_len <= 0) {
+ continue;
+ }
+ prefix.assign(other_term_itr.GetKey(), prefix_len);
+
+ // Figure out backfill tvi. Might not exist since all children terms could
+ // only contain hits from non-prefix sections.
+ //
+ // Ex. Suppose that the main lexicon contains "foot" and "fool" and that
+ // we're adding "foul". The new branching prefix will be "fo". The backfill
+ // prefix will be "foo" - all hits in prefix section on "foo" will need to
+ // be added to the new "fo" posting list later.
+ FindTermResult result =
+ FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix);
+ if (!result.found || result.exact) {
+ continue;
+ }
+
+ // This is a new prefix that will need backfilling from its next-in-line
+ // posting list. This new prefix will have to have a posting list eventually
+ // so insert a default PostingListIdentifier as a placeholder.
+ uint32_t branching_prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_id,
+ &branching_prefix_tvi, false, &new_key)) {
+ return absl_ports::InternalError("Could not insert branching prefix");
+ }
+
+ // Backfills only contain prefix hits by default. So set these here but
+ // could be overridden when adding hits from the other index later.
+ if (!main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasNoExactHitsPropertyId()) ||
+ !main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+
+ outputs.backfill_map[branching_prefix_tvi] = result.tvi;
+ }
+ return outputs;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon);
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, "");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ uint32_t new_main_tvi;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(other_term_itr.GetKey(), &posting_list_id,
+ &new_main_tvi,
+ /*replace=*/false)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not insert term: ", other_term_itr.GetKey()));
+ }
+
+ // Copy the properties from the other lexicon over to the main lexicon.
+ uint32_t other_tvi = other_term_itr.GetValueIndex();
+ if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi,
+ new_main_tvi)) {
+ return absl_ports::InternalError("Could not insert term");
+ }
+
+ // Add other to main mapping.
+ outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi);
+ }
+ return outputs;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReader has_prefix_prop_reader(
+ other_lexicon, GetHasHitsInPrefixSectionPropertyId());
+ if (!has_prefix_prop_reader.Exists()) {
+ return outputs;
+ }
+ std::string prefix;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, "");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // Only expand terms that have hits in prefix sections.
+ if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) {
+ continue;
+ }
+
+ // Get prefixes where there is already a branching point in the main
+ // lexicon. We skip prefixes which don't already have a branching point.
+ std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths(
+ other_term_itr.GetKey(), /*utf8=*/true);
+
+ int buf_start = outputs.prefix_tvis_buf.size();
+ // Add prefixes.
+ for (int prefix_length : prefix_lengths) {
+ if (prefix_length <= 0) {
+ continue;
+ }
+
+ prefix.assign(other_term_itr.GetKey(), prefix_length);
+ uint32_t prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_identifier =
+ PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_identifier,
+ &prefix_tvi, /*replace=*/false, &new_key)) {
+ return absl_ports::InternalError("Could not insert prefix");
+ }
+
+ // Prefix tvi will have hits in prefix section.
+ if (!main_lexicon_->SetProperty(prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError(
+ "Setting has hits in prefix section prop failed");
+ }
+
+ // If it hasn't been added by non-prefix term insertions in
+ // AddBackfillBranchPoints and AddTerms, it is a prefix-only term.
+ if (new_key && !main_lexicon_->SetProperty(
+ prefix_tvi, GetHasNoExactHitsPropertyId())) {
+ return absl_ports::InternalError("Setting no exact hits prop failed");
+ }
+
+ outputs.prefix_tvis_buf.push_back(prefix_tvi);
+ }
+
+ // Any prefixes added? Then add to map.
+ if (buf_start < outputs.prefix_tvis_buf.size()) {
+ outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = {
+ buf_start, outputs.prefix_tvis_buf.size() - buf_start};
+ }
+ }
+ return outputs;
+}
+
+bool MainIndex::CopyProperties(
+ const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi) {
+ for (uint32_t property_id = 0; property_id < prop_reader.size();
+ ++property_id) {
+ if (property_id == GetHasNoExactHitsPropertyId()) {
+ // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits
+ // for this term, then HasNoExactHits needs to be set to false in
+ // main_lexicon. If other_lexicon has no exact hits for this term, then
+ // HasNoExactHits in the main_lexicon should not be modified.
+ if (!prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->ClearProperty(new_main_tvi, property_id)) {
+ LOG(ERROR) << "Clearing prefix prop failed";
+ return false;
+ }
+ } else {
+ // If other_lexicon has this property set for this term, then that
+ // property needs to be set for the main_lexicon. If other_lexicon
+ // doesn't have this property set, then the main_lexicon is left unchanged.
+ if (prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->SetProperty(new_main_tvi, property_id)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
new file mode 100644
index 0000000..15bec1f
--- /dev/null
+++ b/icing/index/main/main-index.h
@@ -0,0 +1,182 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
+#define ICING_INDEX_MAIN_MAIN_INDEX_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+class MainIndex {
+ public:
+ static libtextclassifier3::StatusOr<MainIndex> Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ // Get a PostingListAccessor that holds the posting list chain for 'term'.
+ //
+ // RETURNS:
+ // - On success, a valid PostingListAccessor
+ // - NOT_FOUND if term is not present in the main index.
+ libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
+ GetAccessorForExactTerm(const std::string& term);
+
+ // Get a PostingListAccessor for 'prefix'.
+ //
+ // RETURNS:
+ // - On success, a result containing a valid PostingListAccessor.
+ // - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
+ // prefix are present in the main index.
+ struct GetPrefixAccessorResult {
+ // A PostingListAccessor that holds the posting list chain for the term
+ // that best represents 'prefix' in the main index.
+ std::unique_ptr<PostingListAccessor> accessor;
+ // True if the returned posting list chain is for 'prefix' or false if the
+ // returned posting list chain is for a term for which 'prefix' is a prefix.
+ bool exact;
+ };
+ libtextclassifier3::StatusOr<GetPrefixAccessorResult>
+ GetAccessorForPrefixTerm(const std::string& prefix);
+
+ struct LexiconMergeOutputs {
+ // Maps from main_lexicon tvi for new branching point to the main_lexicon
+ // tvi for posting list whose hits must be backfilled.
+ std::unordered_map<uint32_t, uint32_t> backfill_map;
+
+ // Maps from lexicon tvis to main_lexicon tvis.
+ std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;
+
+ // Maps from the lexicon tvi to the beginning position in
+ // prefix_tvis_buf and the length.
+ std::unordered_map<uint32_t, std::pair<int, int>>
+ other_tvi_to_prefix_main_tvis;
+
+ // Stores tvis that are mapped to by other_tvi_to_prefix_tvis.
+ std::vector<uint32_t> prefix_tvis_buf;
+ };
+
+ // Merge the lexicon into the main lexicon and populate the data
+ // structures necessary to translate lite tvis to main tvis, track backfilling
+ // and expanding lite terms to prefix terms.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
+ const IcingDynamicTrie& other_lexicon) {
+ // Backfill branch points need to be added first so that the backfill_map
+ // can be correctly populated.
+ ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
+ AddBackfillBranchPoints(other_lexicon));
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddTerms(other_lexicon, std::move(outputs)));
+ // Non-backfill branch points need to be added last so that the mapping of
+ // newly added terms to prefix terms can be correctly populated (prefix
+ // terms might be branch points between two new terms or between a
+ // pre-existing term and a new term).
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddBranchPoints(other_lexicon, std::move(outputs)));
+ return outputs;
+ }
+
+ // Add hits to the main index and backfill from existing posting lists to new
+ // backfill branch points.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
+ // that exceeds the max TermId, is not valid, or is not less than
+ // pre-existing hits in the main index.
+ // - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
+ // - RESOURCE_EXHAUSTED error if unable to grow the index
+ libtextclassifier3::Status AddHits(
+ const TermIdCodec& term_id_codec,
+ std::unordered_map<uint32_t, uint32_t>&& backfill_map,
+ std::vector<LiteIndex::Element>&& hits);
+
+ private:
+ libtextclassifier3::Status Init(const std::string& index_filename,
+ const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ // Helpers for merging the lexicon
+ // Add all 'backfill' branch points. Backfill branch points are prefix
+ // branch points that are a prefix of terms that existed in the lexicon
+ // to the merge.
+ //
+ // For example, if the main lexicon only contains "foot" and is then merged
+ // with a lite lexicon containing only "fool", then a backfill branch point
+ // for "foo" will be added to contain prefix hits from both the pre-existing
+ // posting list for "foot" and the new posting list for "fool".
+ //
+ // Populates LexiconMergeOutputs.backfill_map
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
+ const IcingDynamicTrie& other_lexicon);
+
+ // Add all terms from the lexicon.
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+ // Add all branch points for terms added from the lexicon.
+ // For example, if the main lexicon is empty and is then merged with a
+ // lexicon containing only "foot" and "fool", then a branch point for "foo"
+ // will be added to contain prefix hits from both "foot" and "fool".
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
+ // LexiconMergeOutputs.prefix_tvis_buf;
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+ // Copies all properties from old_tvi in the other lexicon to the new_tvi in
+ // the main lexicon.
+ // Returns true on success, false if an IO error is encountered.
+ bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi);
+
+ std::unique_ptr<FlashIndexStorage> flash_index_;
+ std::unique_ptr<IcingDynamicTrie> main_lexicon_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_MAIN_INDEX_H_
diff --git a/icing/index/main/posting-list-accessor.cc b/icing/index/main/posting-list-accessor.cc
new file mode 100644
index 0000000..a4f8ca7
--- /dev/null
+++ b/icing/index/main/posting-list-accessor.cc
@@ -0,0 +1,194 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-accessor.h"
+
+#include <memory>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<PostingListAccessor> PostingListAccessor::Create(
+ FlashIndexStorage *storage) {
+ uint32_t max_posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(storage->block_size());
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array =
+ std::make_unique<uint8_t[]>(max_posting_list_bytes);
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list_buffer,
+ PostingListUsed::CreateFromUnitializedRegion(
+ posting_list_buffer_array.get(), max_posting_list_bytes));
+ return PostingListAccessor(storage, std::move(posting_list_buffer_array),
+ std::move(posting_list_buffer));
+}
+
+libtextclassifier3::StatusOr<PostingListAccessor>
+PostingListAccessor::CreateFromExisting(
+ FlashIndexStorage *storage,
+ PostingListIdentifier existing_posting_list_id) {
+ // Our posting_list_buffer_ will start as empty.
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, Create(storage));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage->GetPostingList(existing_posting_list_id));
+ pl_accessor.preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ return pl_accessor;
+}
+
+// Returns the next batch of hits for the provided posting list.
+libtextclassifier3::StatusOr<std::vector<Hit>>
+PostingListAccessor::GetNextHitsBatch() {
+ if (preexisting_posting_list_ == nullptr) {
+ if (has_reached_posting_list_chain_end_) {
+ return std::vector<Hit>();
+ }
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve hits from a PostingListAccessor that was not created "
+ "from a preexisting posting list.");
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
+ preexisting_posting_list_->posting_list.GetHits());
+ uint32_t block_index = preexisting_posting_list_->block.next_block_index();
+ if (block_index != kInvalidBlockIndex) {
+ PostingListIdentifier next_posting_list_id(
+ block_index, /*posting_list_index=*/0,
+ preexisting_posting_list_->block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->GetPostingList(next_posting_list_id));
+ preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ } else {
+ has_reached_posting_list_chain_end_ = true;
+ preexisting_posting_list_.reset();
+ }
+ return batch;
+}
+
+libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) {
+ PostingListUsed &active_pl = (preexisting_posting_list_ != nullptr)
+ ? preexisting_posting_list_->posting_list
+ : posting_list_buffer_;
+ libtextclassifier3::Status status = active_pl.PrependHit(hit);
+ if (!absl_ports::IsResourceExhausted(status)) {
+ return status;
+ }
+ // There is no more room to add hits to this current posting list! Therefore,
+ // we need to either move those hits to a larger posting list or flush this
+ // posting list and create another max-sized posting list in the chain.
+ if (preexisting_posting_list_ != nullptr) {
+ FlushPreexistingPostingList();
+ } else {
+ ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+ }
+
+ // Re-add hit. Should always fit since we just cleared posting_list_buffer_.
+ // It's fine to explicitly reference posting_list_buffer_ here because there's
+ // no way of reaching this line while preexisting_posting_list_ is still in
+ // use.
+ return posting_list_buffer_.PrependHit(hit);
+}
+
+void PostingListAccessor::FlushPreexistingPostingList() {
+ if (preexisting_posting_list_->block.max_num_posting_lists() == 1) {
+ // If this is a max-sized posting list, then just keep track of the id for
+ // chaining. It'll be flushed to disk when preexisting_posting_list_ is
+ // destructed.
+ prev_block_identifier_ = preexisting_posting_list_->id;
+ } else {
+ // If this is NOT a max-sized posting list, then our hits have outgrown this
+ // particular posting list. Move the hits into the in-memory posting list
+ // and free this posting list.
+ //
+ // Move will always succeed since posting_list_buffer_ is max_pl_bytes.
+ posting_list_buffer_.MoveFrom(&preexisting_posting_list_->posting_list);
+
+ // Now that all the contents of this posting list have been copied, there's
+ // no more use for it. Make it available to be used for another posting
+ // list.
+ storage_->FreePostingList(std::move(*preexisting_posting_list_));
+ }
+ preexisting_posting_list_.reset();
+}
+
+libtextclassifier3::Status PostingListAccessor::FlushInMemoryPostingList() {
+ // We exceeded max_pl_bytes(). Need to flush posting_list_buffer_ and update
+ // the chain.
+ uint32_t max_posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(storage_->block_size());
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->AllocatePostingList(max_posting_list_bytes));
+ holder.block.set_next_block_index(prev_block_identifier_.block_index());
+ prev_block_identifier_ = holder.id;
+ return holder.posting_list.MoveFrom(&posting_list_buffer_);
+}
+
+PostingListAccessor::FinalizeResult PostingListAccessor::Finalize(
+ PostingListAccessor accessor) {
+ if (accessor.preexisting_posting_list_ != nullptr) {
+ // Our hits are already in an existing posting list. Nothing else to do, but
+ // return its id.
+ FinalizeResult result = {libtextclassifier3::Status::OK,
+ accessor.preexisting_posting_list_->id};
+ return result;
+ }
+ if (accessor.posting_list_buffer_.BytesUsed() <= 0) {
+ FinalizeResult result = {absl_ports::InvalidArgumentError(
+ "Can't finalize an empty PostingListAccessor. "
+ "There's nothing to Finalize!"),
+ PostingListIdentifier::kInvalid};
+ return result;
+ }
+ uint32_t posting_list_bytes =
+ accessor.posting_list_buffer_.MinPostingListSizeToFit();
+ if (accessor.prev_block_identifier_.is_valid()) {
+ posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ accessor.storage_->block_size());
+ }
+ auto holder_or = accessor.storage_->AllocatePostingList(posting_list_bytes);
+ if (!holder_or.ok()) {
+ FinalizeResult result = {holder_or.status(),
+ accessor.prev_block_identifier_};
+ return result;
+ }
+ PostingListHolder holder = std::move(holder_or).ValueOrDie();
+ if (accessor.prev_block_identifier_.is_valid()) {
+ holder.block.set_next_block_index(
+ accessor.prev_block_identifier_.block_index());
+ }
+
+ // Move to allocated area. This should never actually return an error. We know
+ // that holder.posting_list is valid because it wouldn't have been
+ // returned by AllocatePostingList if it wasn't. We know posting_list_buffer_
+ // is valid because we created it in-memory. And finally, we know that the
+ // hits from posting_list_buffer_ will fit in holder.posting_list because we
+ // requested it be at least posting_list_bytes large.
+ auto status = holder.posting_list.MoveFrom(&accessor.posting_list_buffer_);
+ if (!status.ok()) {
+ FinalizeResult result = {std::move(status),
+ accessor.prev_block_identifier_};
+ return result;
+ }
+ FinalizeResult result = {libtextclassifier3::Status::OK, holder.id};
+ return result;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-accessor.h b/icing/index/main/posting-list-accessor.h
new file mode 100644
index 0000000..e1bb3c0
--- /dev/null
+++ b/icing/index/main/posting-list-accessor.h
@@ -0,0 +1,168 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_POSTING_LIST_ACCESSOR_H_
+#define ICING_INDEX_POSTING_LIST_ACCESSOR_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+
+namespace icing {
+namespace lib {
+
+// This class serves to:
+// 1. Expose PostingListUseds to clients of FlashIndexStorage
+// 2. Ensure the corresponding instance of IndexBlock has the same lifecycle as
+// the instance of PostingListUsed that the client has access to, while
+// not exposing IndexBlock's api surface.
+// 3. Ensure that PostingListUseds can only be freed by calling methods which
+// will also properly maintain the FlashIndexStorage free list and prevent
+// callers from modifying the Posting List after freeing.
+
+// This class is used to provide a simple abstraction for adding hits to posting
+// lists. PostingListAccessor handles 1) selection of properly-sized posting
+// lists for the accumulated hits during Finalize() and 2) chaining of max-sized
+// posting lists.
+class PostingListAccessor {
+ public:
+ // Creates an empty PostingListAccessor.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListAccessor
+ // - INVALID_ARGUMENT error if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<PostingListAccessor> Create(
+ FlashIndexStorage* storage);
+
+ // Create a PostingListAccessor with an existing posting list identified by
+ // existing_posting_list_id.
+ //
+ // The PostingListAccessor will add hits to this posting list until it is
+ // necessary either to 1) chain the posting list (if it is max-sized) or 2)
+ // move its hits to a larger posting list.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListAccessor
+ // - INVALID_ARGUMENT if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<PostingListAccessor> CreateFromExisting(
+ FlashIndexStorage* storage,
+ PostingListIdentifier existing_posting_list_id);
+
+ // Retrieve the next batch of hits for the posting list chain
+ //
+ // RETURNS:
+ // - On success, a vector of hits in the posting list chain
+ // - INTERNAL if called on an instance of PostingListAccessor that was
+ // created via PostingListAccessor::Create, if unable to read the next
+ // posting list in the chain or if the posting list has been corrupted
+ // somehow.
+ libtextclassifier3::StatusOr<std::vector<Hit>> GetNextHitsBatch();
+
+ // Prepend one hit. This may result in flushing the posting list to disk (if
+ // the PostingListAccessor holds a max-sized posting list that is full) or
+ // freeing a pre-existing posting list if it is too small to fit all hits
+ // necessary.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
+ // previously added hit.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status PrependHit(const Hit& hit);
+
+ struct FinalizeResult {
+ // - OK on success
+ // - INVALID_ARGUMENT if there was no pre-existing posting list and no
+ // hits were added
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a
+ // new posting list.
+ libtextclassifier3::Status status;
+ // Id of the posting list chain that was finalized. Guaranteed to be valid
+ // if status is OK. May be valid if status is non-OK, but previous blocks
+ // were written.
+ PostingListIdentifier id;
+ };
+ // Write all accumulated hits to storage.
+ //
+ // If accessor points to a posting list chain with multiple posting lists in
+ // the chain and unable to write the last posting list in the chain, Finalize
+ // will return the error and also populate id with the id of the
+ // second-to-last posting list.
+ static FinalizeResult Finalize(PostingListAccessor accessor);
+
+ private:
+ explicit PostingListAccessor(
+ FlashIndexStorage* storage,
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array,
+ PostingListUsed posting_list_buffer)
+ : storage_(storage),
+ prev_block_identifier_(PostingListIdentifier::kInvalid),
+ posting_list_buffer_array_(std::move(posting_list_buffer_array)),
+ posting_list_buffer_(std::move(posting_list_buffer)),
+ has_reached_posting_list_chain_end_(false) {}
+
+ // Flushes preexisting_posting_list_ to disk if it's a max-sized posting list
+ // and populates prev_block_identifier.
+ // If it's not a max-sized posting list, moves the contents of
+ // preexisting_posting_list_ to posting_list_buffer_ and frees
+ // preexisting_posting_list_.
+ // Sets preexisting_posting_list_ to nullptr.
+ void FlushPreexistingPostingList();
+
+ // Flushes posting_list_buffer_ to a max-sized posting list on disk, setting
+ // its next pointer to prev_block_identifier_ and updating
+ // prev_block_identifier_ to point to the just-written posting list.
+ libtextclassifier3::Status FlushInMemoryPostingList();
+
+ // Frees all posting lists in the posting list chain starting at
+ // prev_block_identifier_.
+ libtextclassifier3::Status FreePostingListChain();
+
+ FlashIndexStorage* storage_; // Does not own.
+
+ // The PostingListIdentifier of the first max-sized posting list in the
+ // posting list chain or PostingListIdentifier::kInvalid if there is no
+ // posting list chain.
+ PostingListIdentifier prev_block_identifier_;
+
+ // An editor to an existing posting list on disk. If available (non-NULL),
+ // we'll try to add all hits to this posting list. Once this posting list
+ // fills up, we'll either 1) chain it (if a max-sized posting list) and put
+ // future hits in posting_list_buffer_ or 2) copy all of its hits into
+ // posting_list_buffer_ and free this pl (if not a max-sized posting list).
+ // TODO(tjbarron) provide a benchmark to demonstrate the effects that re-using
+ // existing posting lists has on latency.
+ std::unique_ptr<PostingListHolder> preexisting_posting_list_;
+
+ // In-memory posting list used to buffer hits before writing them to the
+ // smallest on-disk posting list that will fit them.
+ // posting_list_buffer_array_ owns the memory region that posting_list_buffer_
+ // interprets. Therefore, posting_list_buffer_array_ must have the same
+ // lifecycle as posting_list_buffer_.
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array_;
+ PostingListUsed posting_list_buffer_;
+
+ bool has_reached_posting_list_chain_end_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_POSTING_LIST_ACCESSOR_H_
diff --git a/icing/index/main/posting-list-accessor_test.cc b/icing/index/main/posting-list-accessor_test.cc
new file mode 100644
index 0000000..8a5ef07
--- /dev/null
+++ b/icing/index/main/posting-list-accessor_test.cc
@@ -0,0 +1,384 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-accessor.h"
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/hit-test-utils.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Lt;
+using ::testing::SizeIs;
+
+TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result.status);
+ EXPECT_THAT(result.id.block_index(), Eq(1));
+ EXPECT_THAT(result.id.posting_list_index(), Eq(0));
+
+ // Retrieve some hits.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(pl_holder.block.next_block_index(), Eq(kInvalidBlockIndex));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add a single hit. This will fit in a min-sized posting list.
+ Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Add one more hit. The minimum size for a posting list must be able to fit
+ // at least two hits, so this should NOT cause the previous pl to be
+ // reallocated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ result1.id));
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/1);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit2));
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the same posting list as the first hit.
+ EXPECT_THAT(result2.id, Eq(result1.id));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result2.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAre(hit2, hit1)));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // The smallest posting list size is 15 bytes. The first four hits will be
+ // compressed to one byte each and will be able to fit in the 5 byte padded
+ // region. The last hit will fit in one of the special hits. The posting list
+ // will be ALMOST_FULL and can fit at most 2 more hits.
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Now let's add some more hits!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ result1.id));
+ // The current posting list can fit at most 2 more hits. Adding 12 more hits
+ // should result in these hits being moved to a larger posting list.
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/12,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the second (new) block because the posting
+ // list should have grown beyond the size that the first block maintains.
+ EXPECT_THAT(result2.id.block_index(), Eq(2));
+ EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result2.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier second_block_id = result1.id;
+ // Should have been allocated to the second block, which holds a max-sized
+ // posting list.
+ EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now let's retrieve them!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(second_block_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ pl_holder.posting_list.GetHits());
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.block.next_block_index();
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage.GetPostingList(pl_id));
+ EXPECT_THAT(
+ pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest,
+ PreexistingMultiBlockReusesBlocksProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier first_add_id = result1.id;
+ EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now add a couple more hits. These should fit on the existing, not full
+ // second block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ first_add_id));
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/50,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ PostingListIdentifier second_add_id = result2.id;
+ EXPECT_THAT(second_add_id, Eq(first_add_id));
+
+ // We should be able to retrieve all 5050 hits.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(second_add_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ pl_holder.posting_list.GetHits());
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.block.next_block_index();
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage.GetPostingList(pl_id));
+ EXPECT_THAT(
+ pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest, InvalidHitReturnsInvalidArgument) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit invalid_hit;
+ EXPECT_THAT(pl_accessor.PrependHit(invalid_hit),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+
+ Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kMaxHitScore);
+ EXPECT_THAT(pl_accessor.PrependHit(hit2),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore);
+ EXPECT_THAT(pl_accessor.PrependHit(hit3),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, NewPostingListNoHitsAdded) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ EXPECT_THAT(result1.status,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPostingListNoHitsAdded) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor2,
+ PostingListAccessor::CreateFromExisting(
+ &flash_index_storage, result1.id));
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor2));
+ ICING_ASSERT_OK(result2.status);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-identifier.cc b/icing/index/main/posting-list-identifier.cc
new file mode 100644
index 0000000..1cdac65
--- /dev/null
+++ b/icing/index/main/posting-list-identifier.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-identifier.h"
+
+namespace icing {
+namespace lib {
+
+PostingListIdentifier PostingListIdentifier::kInvalid(
+ kInvalidBlockIndex, /*posting_list_index=*/0,
+ PostingListIdentifier::kEncodedPostingListIndexBits - 1);
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-identifier.h b/icing/index/main/posting-list-identifier.h
new file mode 100644
index 0000000..4953865
--- /dev/null
+++ b/icing/index/main/posting-list-identifier.h
@@ -0,0 +1,116 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+#define ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/legacy/index/icing-bit-util.h"
+
+namespace icing {
+namespace lib {
+
+// 1M blocks * 4K page size = 4GB index
+inline constexpr int kBlockIndexBits = 20;
+inline constexpr int kMaxBlockIndex = (1u << kBlockIndexBits) - 1;
+
+// Class used to store information necessary to identify any posting list within
+// the index.
+//
+// The 20 leftmost bits in this identifier encode the block index. The 12
+// rightmost bits encode both the posting list index and the maximum number of
+// bits required to encode a posting list index on that block.
+//
+// Ex. An index block containing a max of 68 posting lists each of size 60
+// bytes (and thus 7 posting list bits), with a block index of 13 and a posting
+// list index of 5.
+// 0000 0000 0000 0000 1101 1111 0000 0101
+// |__________block-index_______|__pad__|_pl-index_|
+//
+// "pad" is a run of '1' bits beginning just below bit
+// kEncodedPostingListIndexBits (12) and continuing rightward until a
+// terminating '0' bit is reached. This encodes the posting list bits value:
+// the number of bits to the right of that terminating '0'.
+//
+// This value will eventually be stored in the Main Lexicon.
+class PostingListIdentifier {
+ // 1 bit is wasted to encode max pl index bits so there can be at most 2^11
+ // posting lists per block. Block size would have to be >=40020 bytes for
+ // there to be more than 2K+ posting lists in a block.
+ static constexpr int kEncodedPostingListIndexBits = 12;
+ static_assert(kEncodedPostingListIndexBits + kBlockIndexBits <=
+ 8 * sizeof(uint32_t),
+ "Not enough room in PostingListIdentifier value to encode "
+ "block index and posting list index.");
+
+ public:
+ static PostingListIdentifier kInvalid;
+
+ // 1. block_index - the index of this block within the FlashIndexStorage file
+ // 2. posting_list_index - the index of this posting list within the block
+ // 3. posting_list_index_bits - the number of bits needed to encode the
+ // largest posting_list_index that this block can have.
+ PostingListIdentifier(uint32_t block_index,
+ PostingListIndex posting_list_index,
+ int posting_list_index_bits) {
+ val_ = 0;
+ BITFIELD_OR(val_, /*offset=*/0, /*len=*/posting_list_index_bits,
+ /*val=*/static_cast<uint64_t>(posting_list_index));
+ BITFIELD_OR(
+ val_, /*offset=*/posting_list_index_bits + 1,
+ /*len=*/kEncodedPostingListIndexBits - posting_list_index_bits - 1,
+ /*val=*/~0u);
+ BITFIELD_OR(val_, /*offset=*/kEncodedPostingListIndexBits,
+ /*len=*/kBlockIndexBits,
+ /*val=*/block_index);
+ }
+
+ int block_index() const {
+ return BITFIELD_GET(val_, kEncodedPostingListIndexBits, kBlockIndexBits);
+ }
+
+ PostingListIndex posting_list_index() const {
+ return BITFIELD_GET(val_, 0, posting_list_index_bits());
+ }
+
+ // Returns the maximum number of bits that a posting list index on the block
+ // referred to by block_index could use.
+ int posting_list_index_bits() const {
+ for (int bits = kEncodedPostingListIndexBits - 1; bits >= 0; --bits) {
+ if (((1u << bits) & val_) == 0) {
+ // Got to the zero bit. This is the start of pl index.
+ return bits;
+ }
+ }
+ return -1;
+ }
+
+ bool is_valid() const { return *this != kInvalid; }
+
+ bool operator==(const PostingListIdentifier& rhs) const {
+ return val_ == rhs.val_;
+ }
+ bool operator!=(const PostingListIdentifier& rhs) const {
+ return !(*this == rhs);
+ }
+
+ private:
+ uint32_t val_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc
index 4396007..71752dd 100644
--- a/icing/jni/icing-search-engine-jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -302,6 +302,24 @@
}
JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByQuery(
+ JNIEnv* env, jclass clazz, jlong native_pointer,
+ jbyteArray search_spec_bytes) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::SearchSpecProto search_spec_proto;
+ if (!ParseProtoFromJniByteArray(env, search_spec_bytes, &search_spec_proto)) {
+ ICING_LOG(ERROR) << "Failed to parse SearchSpecProto in nativeDeleteByQuery";
+ return nullptr;
+ }
+ icing::lib::DeleteResultProto delete_result_proto =
+ icing->DeleteByQuery(search_spec_proto);
+
+ return SerializeProtoToJniByteArray(env, delete_result_proto);
+}
+
+JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk(
JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc
index 1954cd3..2eb64ac 100644
--- a/icing/legacy/core/icing-string-util.cc
+++ b/icing/legacy/core/icing-string-util.cc
@@ -11,13 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-//
-// This is a list of IsGoogleLetter letters. It is copied from
-// google3/util/utf8/proptables/letters.txt CL 19164202.
#include "icing/legacy/core/icing-string-util.h"
#include <stdarg.h>
@@ -34,7 +27,6 @@
namespace icing {
namespace lib {
-namespace {} // namespace
uint32_t IcingStringUtil::UpdateCrc32(uint32_t crc, const char *str, int len) {
if (len > 0) {
crc = ~crc32(~crc, reinterpret_cast<const Bytef *>(str), len);
diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h
index 4ea93ec..767e581 100644
--- a/icing/legacy/core/icing-string-util.h
+++ b/icing/legacy/core/icing-string-util.h
@@ -12,10 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-
#ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
#define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index ee3d3a2..29843ba 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -96,14 +96,28 @@
namespace icing {
namespace lib {
+namespace {
+constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1;
+constexpr uint32_t kInvalidNextIndex = ~0U;
+
+// Returns the number of valid nexts in the array.
+int GetValidNextsSize(IcingDynamicTrie::Next *next_array_start,
+ int next_array_length) {
+ int valid_nexts_length = 0;
+ for (; valid_nexts_length < next_array_length &&
+ next_array_start[valid_nexts_length].node_index() != kInvalidNodeIndex;
+ ++valid_nexts_length) {
+ }
+ return valid_nexts_length;
+}
+} // namespace
+
// Based on the bit field widths.
const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27;
const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16;
-const uint32_t IcingDynamicTrie::kInvalidNodeIndex = (1U << 24) - 1;
-const uint32_t IcingDynamicTrie::kInvalidNextIndex = ~0U;
const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U;
const int IcingDynamicTrie::kMaxNextArraySize;
@@ -891,7 +905,7 @@
bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
uint8_t *buf, uint32_t buf_size) const {
- uint32_t size = hdr.ByteSize();
+ uint32_t size = hdr.ByteSizeLong();
if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false;
memcpy(buf, &kMagic, sizeof(kMagic));
memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t));
@@ -1502,6 +1516,53 @@
deleted_bitmap_->Truncate(0);
}
+bool IcingDynamicTrie::ClearSuffixAndValue(uint32_t suffix_value_index) {
+ // The size 1 below is for a '\0' between the suffix and the value.
+ size_t suffix_and_value_length =
+ strlen(this->storage_->GetSuffix(suffix_value_index)) + 1 +
+ this->value_size();
+ char *mutable_suffix_and_value = this->storage_->GetMutableSuffix(
+ suffix_value_index, suffix_and_value_length);
+
+ if (mutable_suffix_and_value == nullptr) {
+ return false;
+ }
+
+ memset(mutable_suffix_and_value, 0, suffix_and_value_length);
+ return true;
+}
+
+bool IcingDynamicTrie::ResetNext(uint32_t next_index) {
+ Next *mutable_next =
+ this->storage_->GetMutableNextArray(next_index, /*len=*/1);
+
+ if (mutable_next == nullptr) {
+ return false;
+ }
+
+ mutable_next->set_val(0);
+ mutable_next->set_node_index(kInvalidNodeIndex);
+ return true;
+}
+
+bool IcingDynamicTrie::SortNextArray(const Node *node) {
+ if (node == nullptr) {
+ // Nothing to sort, return success directly.
+ return true;
+ }
+
+ uint32_t next_array_buffer_size = 1u << node->log2_num_children();
+ Next *next_array_start = this->storage_->GetMutableNextArray(
+ node->next_index(), next_array_buffer_size);
+
+ if (next_array_start == nullptr) {
+ return false;
+ }
+
+ std::sort(next_array_start, next_array_start + next_array_buffer_size - 1);
+ return true;
+}
+
bool IcingDynamicTrie::Insert(const char *key, const void *value,
uint32_t *value_index, bool replace,
bool *pnew_key) {
@@ -1641,15 +1702,12 @@
new_leaf_node->set_log2_num_children(0);
// Figure out the real length of the existing next array.
- Next *cur_next = storage_->GetMutableNextArray(
- best_node->next_index(), 1 << best_node->log2_num_children());
- int next_len = 0;
- for (; next_len < (1 << best_node->log2_num_children()) &&
- cur_next[next_len].node_index() != kInvalidNodeIndex;
- next_len++) {
- }
+ uint32_t next_array_buffer_size = 1u << best_node->log2_num_children();
+ Next *cur_next = storage_->GetMutableNextArray(best_node->next_index(),
+ next_array_buffer_size);
+ int next_len = GetValidNextsSize(cur_next, next_array_buffer_size);
Next *new_next = cur_next;
- if (next_len == (1 << best_node->log2_num_children())) {
+ if (next_len == (next_array_buffer_size)) {
// Allocate a new, larger, array.
new_next = storage_->AllocNextArray(next_len + 1);
memcpy(new_next, cur_next, sizeof(Next) * next_len);
@@ -2072,7 +2130,8 @@
}
void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
- int *key_offset, bool prefix) const {
+ int *key_offset, bool prefix,
+ bool utf8) const {
// Find the best node such that:
//
// - If key is NOT in the trie, key[0..key_offset) is a prefix to
@@ -2093,6 +2152,8 @@
const Node *cur_node = storage_->GetRootNode();
const char *cur_key = key;
+ const Node *utf8_node = cur_node;
+ const char *utf8_key = cur_key;
while (!cur_node->is_leaf()) {
const Next *found = GetNextByChar(cur_node, *cur_key);
if (!found) break;
@@ -2108,12 +2169,101 @@
break;
}
cur_key++;
+
+ if (utf8 && i18n_utils::IsLeadUtf8Byte(*cur_key)) {
+ utf8_node = cur_node;
+ utf8_key = cur_key;
+ }
+ }
+
+ if (utf8) {
+ // Rewind.
+ cur_node = utf8_node;
+ cur_key = utf8_key;
}
*best_node_index = storage_->GetNodeIndex(cur_node);
*key_offset = reinterpret_cast<const char *>(cur_key) - key;
}
+int IcingDynamicTrie::FindNewBranchingPrefixLength(const char *key,
+ bool utf8) const {
+ if (storage_->empty()) {
+ return kNoBranchFound;
+ }
+
+ uint32_t best_node_index;
+ int key_offset;
+ FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true, utf8);
+ const Node *cur_node = storage_->GetNode(best_node_index);
+ const char *cur_key = key + key_offset;
+ if (cur_node->is_leaf()) {
+ // Prefix in the trie. Split at leaf.
+ const char *prev_suffix = storage_->GetSuffix(cur_node->next_index());
+ while (*prev_suffix != '\0' && *prev_suffix == *cur_key) {
+ prev_suffix++;
+ cur_key++;
+ }
+
+ // Equal strings? No branching.
+ if (*prev_suffix == '\0' && *cur_key == '\0') {
+ return kNoBranchFound;
+ }
+
+ if (utf8) {
+ // Rewind to utf8 boundary.
+ size_t offset = i18n_utils::SafeTruncateUtf8Length(key, cur_key - key);
+ cur_key = key + offset;
+ }
+
+ return cur_key - key;
+ } else if (cur_node->log2_num_children() == 0) {
+ // Intermediate node going from no branching to branching.
+ return cur_key - key;
+ }
+
+ // If we've reached this point, then we're already at a branch point. So there
+ // is no *new* branch point.
+ return kNoBranchFound;
+}
+
+std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(const char *key,
+ bool utf8) const {
+ std::vector<int> prefix_lengths;
+
+ if (storage_->empty()) {
+ return prefix_lengths;
+ }
+
+ const Node *cur_node = storage_->GetRootNode();
+ const char *cur_key = key;
+ while (*cur_key && !cur_node->is_leaf()) {
+ // Branching prefix?
+ if (cur_node->log2_num_children() > 0) {
+ int len = cur_key - key;
+ if (utf8) {
+ // Do not cut mid-utf8. Walk up to utf8 boundary.
+ len = i18n_utils::SafeTruncateUtf8Length(key, len);
+ if (prefix_lengths.empty() || len != prefix_lengths.back()) {
+ prefix_lengths.push_back(len);
+ }
+ } else {
+ prefix_lengths.push_back(len);
+ }
+ }
+
+ // Move to next.
+ const Next *found = GetNextByChar(cur_node, *cur_key);
+ if (found == nullptr) {
+ break;
+ }
+ cur_node = storage_->GetNode(found->node_index());
+
+ ++cur_key;
+ }
+ return prefix_lengths;
+}
+
void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
Stats stats;
CollectStats(&stats);
@@ -2248,6 +2398,102 @@
return deleted_bitmap_->SetBit(idx, false);
}
+// Steps:
+// 1. Find the key in the trie.
+// 2. Remove the suffix and the value.
+// 3. Reset the nexts that point to the nodes to be removed.
+// 4. Sort any next array if needed.
+bool IcingDynamicTrie::Delete(const std::string_view key) {
+ if (!is_initialized()) {
+ ICING_LOG(ERROR) << "DynamicTrie not initialized";
+ return false;
+ }
+
+ if (storage_->empty()) {
+ // Nothing to delete.
+ return true;
+ }
+
+ // Tries to find the key in the trie, starting from the root.
+ const Node *current_node = storage_->GetRootNode();
+
+ // The node after which we start to remove data.
+ const Node *last_multichild_node = nullptr;
+
+ // While visiting the trie nodes, we store the indices of Nexts that point
+ // to all the nodes after last_multichild_node. Those nodes must be
+ // consecutive and all have only one child. Resetting those Nexts means that
+ // we remove the data of the key.
+ std::vector<uint32_t> nexts_to_reset;
+ nexts_to_reset.reserve(key.length());
+
+ // Iterates through chars in the key, finds nodes in the trie until a leaf
+ // node is reached. The max number of loops is key.length() + 1 because we
+ // start from the root.
+ for (size_t i = 0; i <= key.length(); ++i) {
+ if (current_node->is_leaf()) {
+ // Leaf node, now check the suffix.
+ if (key.substr(i) != storage_->GetSuffix(current_node->next_index())) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+ // Otherwise, key is found.
+ break;
+ }
+
+ // Finds the next char.
+ const Next *next;
+ if (i == key.length()) {
+ // When we're at the end of the key, the next char is the termination char
+ // '\0'.
+ next = GetNextByChar(current_node, '\0');
+ } else {
+ next = GetNextByChar(current_node, key[i]);
+ }
+
+ if (next == nullptr) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+
+ // Checks the real size of next array.
+ uint32_t next_array_buffer_size = 1u << current_node->log2_num_children();
+ Next *next_array_start = storage_->GetMutableNextArray(
+ current_node->next_index(), next_array_buffer_size);
+ int valid_next_array_size =
+ GetValidNextsSize(next_array_start, next_array_buffer_size);
+ if (valid_next_array_size == 0) {
+ // Key does not exist in the trie, nothing to delete.
+ // This shouldn't happen, but we put a sanity check here in case something
+ // is wrong.
+ return true;
+ } else if (valid_next_array_size == 1) {
+ // Single-child branch will be deleted.
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ } else {
+ // We see a new node with multiple children, all the previously seen nodes
+ // shouldn't be removed.
+ last_multichild_node = current_node;
+ nexts_to_reset.clear();
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ }
+
+ // Updates current_node.
+ current_node = storage_->GetNode(next->node_index());
+ }
+ // Now we've found the key in the trie.
+
+ ClearSuffixAndValue(current_node->next_index());
+
+ // Resets nexts to remove key information.
+ for (uint32_t next_index : nexts_to_reset) {
+ ResetNext(next_index);
+ }
+ SortNextArray(last_multichild_node);
+
+ return true;
+}
+
bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
if (!is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index c33be96..7fe290b 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -288,6 +288,16 @@
// Empty out the trie without closing or removing.
void Clear();
+ // Clears the suffix and value at the given index. Returns true on success.
+ bool ClearSuffixAndValue(uint32_t suffix_value_index);
+
+ // Resets the next at the given index so that it points to no node.
+ // Returns true on success.
+ bool ResetNext(uint32_t next_index);
+
+ // Sorts the next array of the node. Returns true on success.
+ bool SortNextArray(const Node *node);
+
// Sync to disk.
bool Sync() override;
@@ -375,6 +385,16 @@
bool is_full_match() const { return value_index != kInvalidValueIndex; }
};
+ static constexpr int kNoBranchFound = -1;
+  // Returns the length of the prefix at which a new branch would be created
+  // if key were inserted. If utf8 is true, does not cut key mid-utf8. Returns
+  // kNoBranchFound if no new branch would be created.
+ int FindNewBranchingPrefixLength(const char *key, bool utf8) const;
+
+  // Finds the lengths of all prefixes of key at which the trie branches,
+  // excluding the key itself. If utf8 is true, does not cut key mid-utf8.
+ std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const;
+
void GetDebugInfo(int verbosity, std::string *out) const override;
double min_free_fraction() const;
@@ -402,6 +422,10 @@
// Clears the deleted property for each value.
bool ClearDeleted(uint32_t value_index);
+  // Deletes the entry associated with the key. Data cannot be recovered after
+  // the deletion. Returns true on success.
+ bool Delete(std::string_view key);
+
// Clear a specific property id from all values. For each value that has this
// property cleared, also check to see if it was the only property set; if
// so, set the deleted property for the value to indicate it no longer has any
@@ -575,8 +599,6 @@
void GetHeader(IcingDynamicTrieHeader *hdr) const;
void SetHeader(const IcingDynamicTrieHeader &new_hdr);
- static const uint32_t kInvalidNodeIndex;
- static const uint32_t kInvalidNextIndex;
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
@@ -587,7 +609,7 @@
const Next *LowerBound(const Next *start, const Next *end,
uint8_t key_char) const;
void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
- bool prefix) const;
+ bool prefix, bool utf8 = false) const;
// For value properties. This truncates the data by clearing it, but leaving
// the storage intact.
diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc
index 4fae52a..193765b 100644
--- a/icing/legacy/index/icing-dynamic-trie_test.cc
+++ b/icing/legacy/index/icing-dynamic-trie_test.cc
@@ -746,6 +746,222 @@
}
}
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenRootIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts a key, the root is a leaf.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("foo", &value));
+ ASSERT_TRUE(trie.Find("foo", &value));
+
+ // Deletes the key.
+ EXPECT_TRUE(trie.Delete("foo"));
+ EXPECT_FALSE(trie.Find("foo", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenLastCharIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "bar" and "ba", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+ // Deletes "bar". "r" is a leaf node in the trie.
+ EXPECT_TRUE(trie.Delete("bar"));
+ EXPECT_FALSE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithTerminationNode) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "bar" and "ba", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+  // Deletes "ba", which is a key with a termination node in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_FALSE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleNexts) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "ba", "bb", "bc", and "bd", the trie structure looks like:
+ // root
+ // |
+ // b
+ // / | | \
+ // a b c d
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Insert("bb", &value));
+ ASSERT_TRUE(trie.Insert("bc", &value));
+ ASSERT_TRUE(trie.Insert("bd", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+ ASSERT_TRUE(trie.Find("bb", &value));
+ ASSERT_TRUE(trie.Find("bc", &value));
+ ASSERT_TRUE(trie.Find("bd", &value));
+
+ // Deletes "bc".
+ EXPECT_TRUE(trie.Delete("bc"));
+ EXPECT_FALSE(trie.Find("bc", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bb", &value));
+ EXPECT_TRUE(trie.Find("bd", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleTrieBranches) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "batter", "battle", and "bar", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // t r
+ // |
+ // t
+ // / \
+ // e l
+ // | |
+ // r e
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("batter", &value));
+ ASSERT_TRUE(trie.Insert("battle", &value));
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Find("batter", &value));
+ ASSERT_TRUE(trie.Find("battle", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+
+ // Deletes "batter".
+ EXPECT_TRUE(trie.Delete("batter"));
+ EXPECT_FALSE(trie.Find("batter", &value));
+ EXPECT_TRUE(trie.Find("battle", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, InsertionShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+ ASSERT_TRUE(trie.Insert("foo", &value));
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+ ASSERT_FALSE(trie.Find("bed", &value));
+
+ // Inserts after deletion
+ EXPECT_TRUE(trie.Insert("bed", &value));
+ EXPECT_TRUE(trie.Insert("bedroom", &value));
+ EXPECT_TRUE(trie.Find("bed", &value));
+ EXPECT_TRUE(trie.Find("bedroom", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, IteratorShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+ ASSERT_TRUE(trie.Insert("foo", &value));
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+
+ // Iterates through all keys
+ IcingDynamicTrie::Iterator iterator_all(trie, "");
+ std::vector<std::string> results;
+ for (; iterator_all.IsValid(); iterator_all.Advance()) {
+ results.emplace_back(iterator_all.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar", "foo"));
+
+ // Iterates through keys that start with "b"
+ IcingDynamicTrie::Iterator iterator_b(trie, "b");
+ results.clear();
+ for (; iterator_b.IsValid(); iterator_b.Advance()) {
+ results.emplace_back(iterator_b.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar"));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletingNonExistingKeyShouldReturnTrue) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+
+  // "ba" and "bedroom" are not keys in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_TRUE(trie.Delete("bedroom"));
+
+ // The original keys are not affected.
+ EXPECT_TRUE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("bed", &value));
+}
+
} // namespace
// The tests below are accessing private methods and fields of IcingDynamicTrie
diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h
index 31e012a..5a064ea 100644
--- a/icing/legacy/index/icing-mock-filesystem.h
+++ b/icing/legacy/index/icing-mock-filesystem.h
@@ -31,65 +31,78 @@
class IcingMockFilesystem : public IcingFilesystem {
public:
- MOCK_CONST_METHOD1(DeleteFile, bool(const char *file_name));
+ MOCK_METHOD(bool, DeleteFile, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(DeleteDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, DeleteDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(DeleteDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, DeleteDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD1(FileExists, bool(const char *file_name));
+ MOCK_METHOD(bool, FileExists, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(DirectoryExists, bool(const char *dir_name));
+ MOCK_METHOD(bool, DirectoryExists, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(GetBasenameIndex, int(const char *file_name));
+ MOCK_METHOD(int, GetBasenameIndex, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetBasename, std::string(const char *file_name));
+ MOCK_METHOD(std::string, GetBasename, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetDirname, std::string(const char *file_name));
+ MOCK_METHOD(std::string, GetDirname, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD2(ListDirectory, bool(const char *dir_name,
- std::vector<std::string> *entries));
+ MOCK_METHOD(bool, ListDirectory,
+ (const char *dir_name, std::vector<std::string> *entries),
+ (const, override));
- MOCK_CONST_METHOD2(GetMatchingFiles,
- bool(const char *glob, std::vector<std::string> *matches));
+ MOCK_METHOD(bool, GetMatchingFiles,
+ (const char *glob, std::vector<std::string> *matches),
+ (const, override));
- MOCK_CONST_METHOD1(OpenForWrite, int(const char *file_name));
+ MOCK_METHOD(int, OpenForWrite, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForAppend, int(const char *file_name));
+ MOCK_METHOD(int, OpenForAppend, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForRead, int(const char *file_name));
+ MOCK_METHOD(int, OpenForRead, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(int fd));
+ MOCK_METHOD(uint64_t, GetFileSize, (int fd), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(const char *filename));
+ MOCK_METHOD(uint64_t, GetFileSize, (const char *filename), (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Truncate, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(const char *filename, uint64_t new_size));
+ MOCK_METHOD(bool, Truncate, (const char *filename, uint64_t new_size),
+ (const, override));
- MOCK_CONST_METHOD2(Grow, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Grow, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD3(Write, bool(int fd, const void *data, size_t data_size));
- MOCK_CONST_METHOD4(PWrite, bool(int fd, off_t offset, const void *data,
- size_t data_size));
+ MOCK_METHOD(bool, Write, (int fd, const void *data, size_t data_size),
+ (const, override));
+ MOCK_METHOD(bool, PWrite,
+ (int fd, off_t offset, const void *data, size_t data_size),
+ (const, override));
- MOCK_CONST_METHOD1(DataSync, bool(int fd));
+ MOCK_METHOD(bool, DataSync, (int fd), (const, override));
- MOCK_CONST_METHOD2(RenameFile,
- bool(const char *old_name, const char *new_name));
+ MOCK_METHOD(bool, RenameFile, (const char *old_name, const char *new_name),
+ (const, override));
- MOCK_CONST_METHOD2(SwapFiles, bool(const char *one, const char *two));
+ MOCK_METHOD(bool, SwapFiles, (const char *one, const char *two),
+ (const, override));
- MOCK_CONST_METHOD1(CreateDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, CreateDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(CreateDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, CreateDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD2(CopyFile, bool(const char *src, const char *dst));
+ MOCK_METHOD(bool, CopyFile, (const char *src, const char *dst),
+ (const, override));
- MOCK_CONST_METHOD4(ComputeChecksum, bool(int fd, uint32_t *checksum,
- uint64_t offset, uint64_t length));
+ MOCK_METHOD(bool, ComputeChecksum,
+ (int fd, uint32_t *checksum, uint64_t offset, uint64_t length),
+ (const, override));
- MOCK_CONST_METHOD1(GetDiskUsage, uint64_t(const char *path));
+ MOCK_METHOD(uint64_t, GetDiskUsage, (const char *path), (const, override));
};
} // namespace lib
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index 000bf3a..29404d9 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -30,6 +30,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -107,8 +108,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -219,8 +221,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -349,8 +352,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -464,8 +468,9 @@
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 36dbfd9..0d2c2c5 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -36,6 +36,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -59,8 +60,10 @@
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
SchemaStore::Create(&filesystem_, test_dir_));
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 3b3bf61..676ea92 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -40,6 +40,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -60,8 +61,10 @@
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
// Setup the schema
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
diff --git a/icing/store/document-filter-data.h b/icing/store/document-filter-data.h
index 198bc49..3970132 100644
--- a/icing/store/document-filter-data.h
+++ b/icing/store/document-filter-data.h
@@ -25,6 +25,7 @@
namespace lib {
using SchemaTypeId = int16_t;
+inline constexpr SchemaTypeId kInvalidSchemaTypeId = -1;
class DocumentFilterData {
public:
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 93cebaa..79b91df 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -329,8 +329,22 @@
auto iterator = document_log_->GetIterator();
auto iterator_status = iterator.Advance();
while (iterator_status.ok()) {
- ICING_ASSIGN_OR_RETURN(DocumentWrapper document_wrapper,
- document_log_->ReadProto(iterator.GetOffset()));
+ libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
+ document_log_->ReadProto(iterator.GetOffset());
+
+ if (absl_ports::IsNotFound(document_wrapper_or.status())) {
+ // The erased document still occupies 1 document id.
+ DocumentId new_document_id = document_id_mapper_->num_elements();
+ ICING_RETURN_IF_ERROR(
+ ClearDerivedData(/*name_space=*/"", /*uri=*/"", new_document_id));
+ iterator_status = iterator.Advance();
+ continue;
+ } else if (!document_wrapper_or.ok()) {
+ return document_wrapper_or.status();
+ }
+
+ DocumentWrapper document_wrapper =
+ std::move(document_wrapper_or).ValueOrDie();
if (document_wrapper.deleted()) {
if (!document_wrapper.document().uri().empty()) {
// Individual document deletion.
@@ -351,17 +365,22 @@
}
} else if (!document_wrapper.document().namespace_().empty()) {
// Namespace deletion.
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesNamespaceDeleted(
- document_wrapper.document().namespace_()));
-
+ ICING_ASSIGN_OR_RETURN(
+ NamespaceId namespace_id,
+ namespace_mapper_->Get(document_wrapper.document().namespace_()));
+ // Tombstone indicates it's a soft delete.
+ ICING_RETURN_IF_ERROR(BatchDelete(namespace_id, kInvalidSchemaTypeId,
+ /*soft_delete=*/true));
} else if (!document_wrapper.document().schema().empty()) {
// SchemaType deletion.
auto schema_type_id_or = schema_store_->GetSchemaTypeId(
document_wrapper.document().schema());
if (schema_type_id_or.ok()) {
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(
- schema_type_id_or.ValueOrDie()));
+ // Tombstone indicates it's a soft delete.
+ ICING_RETURN_IF_ERROR(BatchDelete(kInvalidNamespaceId,
+ schema_type_id_or.ValueOrDie(),
+ /*soft_delete=*/true));
} else {
// The deleted schema type doesn't have a SchemaTypeId we can refer
// to in the FilterCache.
@@ -845,7 +864,8 @@
}
libtextclassifier3::Status DocumentStore::Delete(
- const std::string_view name_space, const std::string_view uri) {
+ const std::string_view name_space, const std::string_view uri,
+ bool soft_delete) {
// Try to get the DocumentId first
auto document_id_or = GetDocumentId(name_space, uri);
if (!document_id_or.ok()) {
@@ -865,25 +885,63 @@
", uri: ", uri));
}
+ if (soft_delete) {
+ return SoftDelete(name_space, uri, document_id);
+ } else {
+ uint64_t document_log_offset = file_offset_or.ValueOrDie();
+ return HardDelete(name_space, uri, document_id, document_log_offset);
+ }
+}
+
+libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
+ bool soft_delete) {
+ // Copy out the document to get namespace and uri.
+ ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
+ DoesDocumentExistAndGetFileOffset(document_id));
+ auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
+ if (!document_wrapper_or.ok()) {
+ ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
+ << "Failed to read from document log";
+ return document_wrapper_or.status();
+ }
+ DocumentWrapper document_wrapper =
+ std::move(document_wrapper_or).ValueOrDie();
+
+ if (soft_delete) {
+ return SoftDelete(document_wrapper.document().namespace_(),
+ document_wrapper.document().uri(), document_id);
+ } else {
+ return HardDelete(document_wrapper.document().namespace_(),
+ document_wrapper.document().uri(), document_id,
+ document_log_offset);
+ }
+}
+
+libtextclassifier3::Status DocumentStore::SoftDelete(
+ std::string_view name_space, std::string_view uri, DocumentId document_id) {
// Update ground truth first.
- // To delete a proto we don't directly remove it. Instead, we mark it as
- // deleted first by appending a tombstone of it and actually remove it from
- // file later in Optimize()
- // TODO(b/144458732): Implement a more robust version of ICING_RETURN_IF_ERROR
- // that can support error logging.
+ // Mark the document as deleted by appending a tombstone of it and actually
+ // remove it from file later in Optimize()
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
libtextclassifier3::Status status =
document_log_->WriteProto(CreateDocumentTombstone(name_space, uri))
.status();
if (!status.ok()) {
return absl_ports::Annotate(
- status, absl_ports::StrCat("Failed to delete Document. namespace: ",
+ status, absl_ports::StrCat("Failed to delete Document. namespace:",
name_space, ", uri: ", uri));
}
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id_or.ValueOrDie(), kDocDeletedFlag));
+ return document_id_mapper_->Set(document_id, kDocDeletedFlag);
+}
- return libtextclassifier3::Status::OK;
+libtextclassifier3::Status DocumentStore::HardDelete(
+ std::string_view name_space, std::string_view uri, DocumentId document_id,
+ uint64_t document_log_offset) {
+ // Erases document proto.
+ ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
+ return ClearDerivedData(name_space, uri, document_id);
}
libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
@@ -899,7 +957,14 @@
<< " from score_cache_";
return score_data_or.status();
}
- return *std::move(score_data_or).ValueOrDie();
+
+ DocumentAssociatedScoreData document_associated_score_data =
+ *std::move(score_data_or).ValueOrDie();
+ if (document_associated_score_data.document_score() < 0) {
+    // A negative / invalid score means that the score data has been deleted.
+ return absl_ports::NotFoundError("Document score data not found.");
+ }
+ return document_associated_score_data;
}
libtextclassifier3::StatusOr<DocumentFilterData>
@@ -910,68 +975,134 @@
<< " from filter_cache_";
return filter_data_or.status();
}
- return *std::move(filter_data_or).ValueOrDie();
+ DocumentFilterData document_filter_data =
+ *std::move(filter_data_or).ValueOrDie();
+ if (document_filter_data.namespace_id() == kInvalidNamespaceId) {
+ // An invalid namespace id means that the filter data has been deleted.
+ return absl_ports::NotFoundError("Document filter data not found.");
+ }
+ return document_filter_data;
}
libtextclassifier3::Status DocumentStore::DeleteByNamespace(
- std::string_view name_space) {
+ std::string_view name_space, bool soft_delete) {
auto namespace_id_or = namespace_mapper_->Get(name_space);
if (!namespace_id_or.ok()) {
return absl_ports::Annotate(
namespace_id_or.status(),
- absl_ports::StrCat("Failed to delete by namespace. namespace: ",
- name_space));
+ absl_ports::StrCat("Failed to find namespace: ", name_space));
+ }
+ NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+
+ int num_updated_documents = 0;
+ if (soft_delete) {
+    // To soft-delete an entire namespace, we append a tombstone that only contains
+ // the deleted bit and the name of the deleted namespace.
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
+ libtextclassifier3::Status status =
+ document_log_->WriteProto(CreateNamespaceTombstone(name_space))
+ .status();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete namespace. namespace = "
+ << name_space;
+ return status;
+ }
}
- // Update ground truth first.
- // To delete an entire namespace, we append a tombstone that only contains
- // the deleted bit and the name of the deleted namespace.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateNamespaceTombstone(name_space)).status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete namespace. namespace = "
- << name_space;
- return status;
- }
+ ICING_ASSIGN_OR_RETURN(
+ num_updated_documents,
+ BatchDelete(namespace_id, kInvalidSchemaTypeId, soft_delete));
- ICING_ASSIGN_OR_RETURN(bool updated_existing_document,
- UpdateDerivedFilesNamespaceDeleted(name_space));
- if (!updated_existing_document) {
+ if (num_updated_documents <= 0) {
// Treat the fact that no existing documents had this namespace to be the
// same as this namespace not existing at all.
return absl_ports::NotFoundError(
absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
}
+
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<bool>
-DocumentStore::UpdateDerivedFilesNamespaceDeleted(std::string_view name_space) {
- auto namespace_id_or = namespace_mapper_->Get(name_space);
- if (!namespace_id_or.ok()) {
- return namespace_id_or.status();
+libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
+ std::string_view schema_type, bool soft_delete) {
+ auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
+ if (!schema_type_id_or.ok()) {
+ return absl_ports::Annotate(
+ schema_type_id_or.status(),
+ absl_ports::StrCat("Failed to find schema type. schema_type: ",
+ schema_type));
+ }
+ SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
+
+ int num_updated_documents = 0;
+ if (soft_delete) {
+ // To soft-delete an entire schema type, we append a tombstone that only
+ // contains the deleted bit and the name of the deleted schema type.
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
+ libtextclassifier3::Status status =
+ document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
+ .status();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete schema_type. schema_type = "
+ << schema_type;
+ return status;
+ }
}
- // Guaranteed to have a NamespaceId now.
- NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+ ICING_ASSIGN_OR_RETURN(
+ num_updated_documents,
+ BatchDelete(kInvalidNamespaceId, schema_type_id, soft_delete));
+ if (num_updated_documents <= 0) {
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "No documents found with schema type '", schema_type, "'"));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
+ NamespaceId namespace_id, SchemaTypeId schema_type_id, bool soft_delete) {
// Tracks if there were any existing documents with this namespace that we
// will mark as deleted.
- bool updated_existing_document = false;
+ int num_updated_documents = 0;
- // Traverse FilterCache and delete all docs that match namespace_id
+ // Traverse FilterCache and delete all docs that match namespace_id and
+ // schema_type_id.
for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
++document_id) {
// filter_cache_->Get can only fail if document_id is < 0
// or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
filter_cache_->Get(document_id));
- if (data->namespace_id() == namespace_id) {
+
+ // Check namespace only when the input namespace id is valid.
+ if (namespace_id != kInvalidNamespaceId &&
+ (data->namespace_id() == kInvalidNamespaceId ||
+ data->namespace_id() != namespace_id)) {
+ // The document has already been hard-deleted or isn't from the desired
+ // namespace.
+ continue;
+ }
+
+ // Check schema type only when the input schema type id is valid.
+ if (schema_type_id != kInvalidSchemaTypeId &&
+ (data->schema_type_id() == kInvalidSchemaTypeId ||
+ data->schema_type_id() != schema_type_id)) {
+ // The document has already been hard-deleted or doesn't have the
+ // desired schema type.
+ continue;
+ }
+
+    // The document has the desired namespace and schema type; it either exists
+ // or has been soft-deleted / expired.
+ if (soft_delete) {
if (DoesDocumentExist(document_id)) {
- updated_existing_document = true;
+ ++num_updated_documents;
}
// docid_mapper_->Set can only fail if document_id is < 0
@@ -980,65 +1111,29 @@
// docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
ICING_RETURN_IF_ERROR(
document_id_mapper_->Set(document_id, kDocDeletedFlag));
+ } else {
+ // Hard delete. Try to copy out the document to get namespace and uri.
+ // Getting namespace and uri is necessary to delete entries in
+ // document_key_mapper_.
+ auto document_or = Get(document_id);
+ if (absl_ports::IsNotFound(document_or.status())) {
+ // Document not found.
+ continue;
+ } else if (!document_or.ok()) {
+ // Real error, pass up.
+ return document_or.status();
+ }
+ DocumentProto document_copy = std::move(document_or).ValueOrDie();
+
+ // Erase from the ground truth. Delete() won't return NOT_FOUND because
+ // NOT_FOUND should have been caught by Get() above.
+ ICING_RETURN_IF_ERROR(Delete(document_copy.namespace_(),
+ document_copy.uri(), /*soft_delete=*/false));
+ ++num_updated_documents;
}
}
- return updated_existing_document;
-}
-
-libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
- std::string_view schema_type) {
- auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
- if (!schema_type_id_or.ok()) {
- return absl_ports::Annotate(
- schema_type_id_or.status(),
- absl_ports::StrCat("Failed to delete by schema type. schema_type: ",
- schema_type));
- }
-
- // Update ground truth first.
- // To delete an entire schema type, we append a tombstone that only contains
- // the deleted bit and the name of the deleted schema type.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
- .status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete schema_type. schema_type = "
- << schema_type;
- return status;
- }
-
- // Guaranteed to have a SchemaTypeId now
- SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
-
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(schema_type_id));
-
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status DocumentStore::UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id) {
- // Traverse FilterCache and delete all docs that match schema_type_id.
- for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
- ++document_id) {
- // filter_cache_->Get can only fail if document_id is < 0
- // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
- ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
- filter_cache_->Get(document_id));
- if (data->schema_type_id() == schema_type_id) {
- // docid_mapper_->Set can only fail if document_id is < 0
- // or >= docid_mapper_->num_elements. So the only possible way to get an
- // error here would be if filter_cache_->num_elements >
- // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id, kDocDeletedFlag));
- }
- }
-
- return libtextclassifier3::Status::OK;
+ return num_updated_documents;
}
libtextclassifier3::Status DocumentStore::PersistToDisk() {
@@ -1328,5 +1423,27 @@
return filter_cache_->Set(document_id, filter_data);
}
+libtextclassifier3::Status DocumentStore::ClearDerivedData(
+ const std::string_view name_space, const std::string_view uri,
+ DocumentId document_id) {
+ if (!name_space.empty() && !uri.empty()) {
+ document_key_mapper_->Delete(MakeFingerprint(name_space, uri));
+ }
+
+ ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
+
+ // Resets the score cache entry
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ document_id, DocumentAssociatedScoreData(/*document_score=*/-1,
+ /*creation_timestamp_ms=*/-1)));
+
+ // Resets the filter cache entry
+ ICING_RETURN_IF_ERROR(UpdateFilterCache(
+ document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
+ /*expiration_timestamp_ms=*/-1)));
+
+ return libtextclassifier3::Status::OK;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 3f4b72f..52ea176 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -147,17 +147,40 @@
// boolean whether a document exists or not
bool DoesDocumentExist(DocumentId document_id) const;
- // Deletes the document identified by the given namespace and uri
+ // Deletes the document identified by the given namespace and uri. The
+ // document proto will be marked as deleted if 'soft_delete' is true,
+ // otherwise the document proto will be erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if no document exists with namespace, uri
// INTERNAL_ERROR on IO error
libtextclassifier3::Status Delete(std::string_view name_space,
- std::string_view uri);
+ std::string_view uri,
+ bool soft_delete = false);
+
+ // Deletes the document identified by the given document_id. The
+ // document proto will be marked as deleted if 'soft_delete' is true,
+ // otherwise the document proto will be erased immediately.
+ //
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ // INVALID_ARGUMENT if document_id is invalid.
+ libtextclassifier3::Status Delete(DocumentId document_id,
+ bool soft_delete = false);
// Returns the NamespaceId of the string namespace
//
@@ -180,6 +203,7 @@
// DocumentAssociatedScoreData on success
// OUT_OF_RANGE if document_id is negative or exceeds previously seen
// DocumentIds
+ // NOT_FOUND if no score data is found
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const;
@@ -194,30 +218,43 @@
// DocumentFilterData on success
// OUT_OF_RANGE if document_id is negative or exceeds previously seen
// DocumentIds
+ // NOT_FOUND if no filter data is found
libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
DocumentId document_id) const;
- // Deletes all documents belonging to the given namespace.
+ // Deletes all documents belonging to the given namespace. The documents will
+ // be marked as deleted if 'soft_delete' is true, otherwise they will be
+ // erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if namespace doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteByNamespace(std::string_view name_space);
+ libtextclassifier3::Status DeleteByNamespace(std::string_view name_space,
+ bool soft_delete = false);
- // Deletes all documents belonging to the given schema type
+ // Deletes all documents belonging to the given schema type. The documents
+ // will be marked as deleted if 'soft_delete' is true, otherwise they will be
+ // erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //  1. The soft deletion uses less CPU power; it can be applied to
+ // non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if schema_type doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type);
+ libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type,
+ bool soft_delete = false);
// Syncs all the data and metadata changes to disk.
//
@@ -424,32 +461,44 @@
// INTERNAL on I/O error
libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
- // Update derived files that `name_space` has been deleted. This is primarily
- // useful if we're trying to update derived files when we've already seen a
- // namespace tombstone, and don't need to write another tombstone.
+ // Helper function to do batch deletes. Documents with the given
+ // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
+ // to specify the namespace or schema type, pass in kInvalidNamespaceId or
+ // kInvalidSchemaTypeId. The document protos will be marked as deleted if
+ // 'soft_delete' is true, otherwise the document protos with their derived
+ // data will be erased / cleared immediately.
//
// NOTE: Space is not reclaimed in the derived files until Optimize() is
// called.
//
// Returns:
- // bool on whether an existing document was actually updated to be deleted
+ // Number of documents that were actually updated to be deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<bool> UpdateDerivedFilesNamespaceDeleted(
- std::string_view name_space);
+ libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
+ SchemaTypeId schema_type_id,
+ bool soft_delete);
- // Update derived files that the schema type schema_type_id has been deleted.
- // This is primarily useful if we're trying to update derived files when we've
- // already seen a schema type tombstone, and don't need to write another
- // tombstone.
- //
- // NOTE: Space is not reclaimed in the derived files until Optimize() is
- // called.
+ // Marks the document identified by the given name_space, uri and document_id
+ // as deleted, to be removed later during Optimize().
//
// Returns:
// OK on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id);
+ libtextclassifier3::Status SoftDelete(std::string_view name_space,
+ std::string_view uri,
+ DocumentId document_id);
+
+ // Erases the document identified by the given name_space, uri and document_id
+ // from the document_log and erases its uri from the document_key_mapper_, the
+ // space will be reclaimed later during Optimize().
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status HardDelete(std::string_view name_space,
+ std::string_view uri,
+ DocumentId document_id,
+ uint64_t document_log_offset);
// Helper method to find a DocumentId that is associated with the given
// namespace and uri.
@@ -488,6 +537,11 @@
// Updates the entry in the filter cache for document_id.
libtextclassifier3::Status UpdateFilterCache(
DocumentId document_id, const DocumentFilterData& filter_data);
+
+ // Helper method to clear the derived data of a document
+ libtextclassifier3::Status ClearDerivedData(std::string_view name_space,
+ std::string_view uri,
+ DocumentId document_id);
};
} // namespace lib
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index ad56b9a..f857481 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -60,9 +60,6 @@
: test_dir_(GetTestTempDir() + "/icing"),
document_store_dir_(test_dir_ + "/document_store"),
schema_store_dir_(test_dir_ + "/schema_store") {
- filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
test_document1_ =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -88,6 +85,11 @@
}
void SetUp() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("email");
@@ -270,7 +272,7 @@
IsFalse());
}
-TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) {
+TEST_F(DocumentStoreTest, GetSoftDeletedDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -281,7 +283,26 @@
IsOkAndHolds(EqualsProto(test_document1_)));
ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ test_document1_.uri(),
+ /*soft_delete=*/true));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, GetHardDeletedDocumentNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri(),
+ /*soft_delete=*/false));
EXPECT_THAT(
document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -343,20 +364,6 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteOk) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
-
- // Get() after Delete() returns NOT_FOUND
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- doc_store->Put(DocumentProto(test_document1_)));
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
- EXPECT_THAT(doc_store->Get(document_id),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
@@ -394,7 +401,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -422,7 +429,8 @@
// DELETE namespace.1. document1 and document 4 should be deleted. document2
// and document3 should still be retrievable.
- ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1"));
+ ICING_EXPECT_OK(
+ doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/true));
EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
@@ -433,7 +441,47 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ DocumentProto document1 = test_document1_;
+ document1.set_namespace_("namespace.1");
+ document1.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document1));
+
+ DocumentProto document2 = test_document1_;
+ document2.set_namespace_("namespace.2");
+ document2.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document2));
+
+ DocumentProto document3 = test_document1_;
+ document3.set_namespace_("namespace.3");
+ document3.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document3));
+
+ DocumentProto document4 = test_document1_;
+ document4.set_namespace_("namespace.1");
+ document4.set_uri("uri2");
+ ICING_ASSERT_OK(doc_store->Put(document4));
+
+ // DELETE namespace.1. document1 and document 4 should be deleted. document2
+ // and document3 should still be retrievable.
+ ICING_EXPECT_OK(
+ doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/false));
+ EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
+ IsOkAndHolds(EqualsProto(document2)));
+ EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()),
+ IsOkAndHolds(EqualsProto(document3)));
+ EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNonexistentNamespaceNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -444,7 +492,8 @@
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace"),
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace",
+ /*soft_delete=*/true),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
@@ -452,7 +501,27 @@
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceNonexistentNamespaceNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace",
+ /*soft_delete=*/false),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -464,7 +533,25 @@
// At this point, there are no existing documents with the namespace, even
// though Icing's derived files know about this namespace. We should still
// return NOT_FOUND since nothing existing has this namespace.
- EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_()),
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(),
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceNoExistingDocumentsNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ // At this point, there are no existing documents with the namespace, even
+ // though Icing's derived files know about this namespace. We should still
+ // return NOT_FOUND since nothing existing has this namespace.
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(),
+ /*soft_delete=*/false),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
@@ -536,7 +623,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeOk) {
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("email");
@@ -593,7 +680,8 @@
// Delete the "email" type and ensure that it works across both
// email_document's namespaces. And that other documents aren't affected.
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("email", /*soft_delete=*/true));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -604,7 +692,8 @@
IsOkAndHolds(EqualsProto(person_document)));
// Delete the "message" type and check that other documents aren't affected
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("message"));
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("message", /*soft_delete=*/true));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -615,7 +704,88 @@
IsOkAndHolds(EqualsProto(person_document)));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+ type_config = schema.add_types();
+ type_config->set_schema_type("person");
+
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ DocumentProto email_document_1 = DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_1_document_id,
+ document_store->Put(email_document_1));
+
+ DocumentProto email_document_2 = DocumentBuilder()
+ .SetKey("namespace2", "2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_2_document_id,
+ document_store->Put(email_document_2));
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("message")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store->Put(message_document));
+
+ DocumentProto person_document = DocumentBuilder()
+ .SetKey("namespace", "4")
+ .SetSchema("person")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id,
+ document_store->Put(person_document));
+
+ // Delete the "email" type and ensure that it works across both
+ // email_document's namespaces. And that other documents aren't affected.
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("email", /*soft_delete=*/false));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+
+ // Delete the "message" type and check that other documents aren't affected
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("message", /*soft_delete=*/false));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -626,7 +796,8 @@
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type"),
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type",
+ /*soft_delete=*/true),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
@@ -635,7 +806,28 @@
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) {
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type",
+ /*soft_delete=*/false),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -644,10 +836,23 @@
ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
test_document1_.uri()));
- // At this point, there are no existing documents with the schema type, but we
- // still return OK because the SchemaStore is the ground truth on schemas and
- // knows about the type
- ICING_EXPECT_OK(document_store->DeleteBySchemaType(test_document1_.schema()));
+ EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(),
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNoExistingDocumentsNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(),
+ /*soft_delete=*/false),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
@@ -1177,7 +1382,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) {
+TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1193,14 +1398,71 @@
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
- // FilterCache doesn't care if the document has been deleted
- ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
+ // Soft deletion keeps the associated entry of the document intact.
+ EXPECT_THAT(doc_store->GetDocumentFilterData(document_id).status(), IsOk());
+}
+
+TEST_F(DocumentStoreTest, HardDeleteClearsFilterCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
EXPECT_THAT(
doc_store->GetDocumentFilterData(document_id),
IsOkAndHolds(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+ // Associated entry of the deleted document is removed.
+ EXPECT_THAT(doc_store->GetDocumentFilterData(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
+ // Soft deletion keeps the associated entry of the document intact.
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id).status(),
+ IsOk());
+}
+
+TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+ // Associated entry of the deleted document is removed.
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(DocumentStoreTest,
diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h
index 4571df2..23c7b69 100644
--- a/icing/store/key-mapper.h
+++ b/icing/store/key-mapper.h
@@ -84,6 +84,9 @@
// Returns any encountered IO errors.
libtextclassifier3::StatusOr<T> Get(std::string_view key) const;
+ // Deletes data related to the given key. Returns true on success.
+ bool Delete(std::string_view key);
+
// Returns a map of values to keys. Empty map if the mapper is empty.
std::unordered_map<T, std::string> GetValuesToKeys() const;
@@ -255,6 +258,11 @@
}
template <typename T>
+bool KeyMapper<T>::Delete(std::string_view key) {
+ return trie_.Delete(key);
+}
+
+template <typename T>
std::unordered_map<T, std::string> KeyMapper<T>::GetValuesToKeys() const {
std::unordered_map<T, std::string> values_to_keys;
for (IcingDynamicTrie::Iterator itr(trie_, /*prefix=*/""); itr.IsValid();
diff --git a/icing/store/namespace-id.h b/icing/store/namespace-id.h
index 4225be3..374e7a8 100644
--- a/icing/store/namespace-id.h
+++ b/icing/store/namespace-id.h
@@ -22,6 +22,7 @@
// Id of unique namespace in DocumentProto. Generated in DocumentStore.
using NamespaceId = int16_t;
+inline constexpr NamespaceId kInvalidNamespaceId = -1;
} // namespace lib
} // namespace icing
diff --git a/icing/tokenization/icu/icu-language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc
index 0ef1824..9213fbe 100644
--- a/icing/tokenization/icu/icu-language-segmenter-factory.cc
+++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc
@@ -15,6 +15,7 @@
#include "icing/tokenization/icu/icu-language-segmenter.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 31c2726..d0b90d1 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -409,6 +409,71 @@
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ IteratorMultipleAdvancesResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
language_segmenter_factory::Create(GetOptions()));
@@ -992,6 +1057,19 @@
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Validates that query syntax characters are returned as individual terms.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, IcuLanguageSegmenterAllLocalesTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
diff --git a/icing/tokenization/ios/ios-language-segmenter-factory.cc b/icing/tokenization/ios/ios-language-segmenter-factory.cc
new file mode 100644
index 0000000..3af7914
--- /dev/null
+++ b/icing/tokenization/ios/ios-language-segmenter-factory.cc
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/ios/ios-language-segmenter.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+namespace {
+constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
+} // namespace
+
+// Creates a language segmenter with the given locale.
+//
+// Returns:
+// A LanguageSegmenter on success
+// INVALID_ARGUMENT if locale string is invalid
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions options) {
+ // Word connector rules for "en_US_POSIX" (American English (Computer)) are
+ // different from other locales. E.g. "email.subject" will be split into 3
+ // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
+ // term in other locales. Our current LanguageSegmenter doesn't handle this
+ // special rule, so we replace it with "en_US".
+ if (options.locale == kLocaleAmericanEnglishComputer) {
+ ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer
+ << " not supported. Converting to locale en_US";
+ options.locale = "en_US";
+ }
+ return std::make_unique<IosLanguageSegmenter>(std::move(options.locale));
+}
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/ios/ios-language-segmenter.h b/icing/tokenization/ios/ios-language-segmenter.h
new file mode 100644
index 0000000..1aa1f1b
--- /dev/null
+++ b/icing/tokenization/ios/ios-language-segmenter.h
@@ -0,0 +1,88 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_IOS_IOS_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_IOS_IOS_LANGUAGE_SEGMENTER_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+// This class is used to segment sentences into words based on rules from
+// CFStringTokenizer, some extra rules are applied in this class:
+//
+// 1. All ASCII terms will be returned.
+// 2. For non-ASCII terms, only the alphabetic terms are returned, which means
+// non-ASCII punctuation and special characters are left out.
+// 3. Multiple continuous whitespaces are treated as one.
+//
+// The rules above are common to the high-level tokenizers that might use this
+// class. Other special tokenization logic will be in each tokenizer.
+//
+// This implementation has a few notable deviations from the ICU-based
+// implementations:
+// 1. This implementation doesn't treat ':' as a word connector. ICU does.
+// 2. When the locale is Japanese, this implementation treats internal periods
+//    as word breaks rather than connectors. "N.B.A." becomes {"N", ".",
+// "B", ".", "A", "."} rather than {"N.B.A", "."} (which is what ICU and
+//    all other locales do).
+// 3. Locale can have other effects on segmentation - this is often when the
+// wrong locale is specified for CJKT text.
+// 4. Some CJKT segmentation deviates from ICU results even when the correct
+// locale is specified.
+class IosLanguageSegmenter : public LanguageSegmenter {
+ public:
+ explicit IosLanguageSegmenter(std::string locale)
+ : locale_(std::move(locale)) {}
+
+ IosLanguageSegmenter(const IosLanguageSegmenter&) = delete;
+ IosLanguageSegmenter& operator=(const IosLanguageSegmenter&) = delete;
+
+ // The segmentation depends on the language detected in the input text.
+ //
+ // Note: It could happen that the language detected from text is wrong, then
+ // there would be a small chance that the text is segmented incorrectly.
+ //
+ // Returns:
+ // An iterator of terms on success
+ // INTERNAL_ERROR if any error occurs
+ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ Segment(std::string_view text) const override;
+
+ // The segmentation depends on the language detected in the input text.
+ //
+ // Note: It could happen that the language detected from text is wrong, then
+ // there would be a small chance that the text is segmented incorrectly.
+ //
+ // Returns:
+ // A list of terms on success
+ // INTERNAL_ERROR if any error occurs
+ libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
+ std::string_view text) const override;
+
+ private:
+ std::string locale_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_IOS_IOS_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/ios/ios-language-segmenter_test.cc b/icing/tokenization/ios/ios-language-segmenter_test.cc
new file mode 100644
index 0000000..b6831e2
--- /dev/null
+++ b/icing/tokenization/ios/ios-language-segmenter_test.cc
@@ -0,0 +1,1265 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+// Returns a vector containing all terms retrieved by Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsAdvance(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling
+// ResetToStart/ResetAfter with the current position to simulate Advancing on
+// the iterator.
+std::vector<std::string_view> GetAllTermsResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ if (!itr->ResetToStart().ok()) {
+ return terms;
+ }
+ terms.push_back(itr->GetTerm());
+ const char* text_begin = itr->GetTerm().data();
+ // Calling ResetToTermStartingAfter with the current position should get the
+ // very next term in the sequence.
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by alternating calls to
+// Advance and calls to ResetAfter with the current position to simulate
+// Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ std::vector<std::string_view> terms;
+
+ bool is_ok = true;
+ int current_pos = 0;
+ while (is_ok) {
+ // Alternate between using Advance and ResetToTermAfter.
+ if (terms.size() % 2 == 0) {
+ is_ok = itr->Advance();
+ } else {
+ // Calling ResetToTermStartingAfter with the current position should get
+ // the very next term in the sequence.
+ current_pos = itr->GetTerm().data() - text_begin;
+ is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
+ }
+ if (is_ok) {
+ terms.push_back(itr->GetTerm());
+ }
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetBefore with
+// the current position, starting at the end of the text. This vector should be
+// in reverse order of GetAllTerms and missing the last term.
+std::vector<std::string_view> GetAllTermsResetBefore(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ int last_pos = 0;
+ while (itr->Advance()) {
+ last_pos = itr->GetTerm().data() - text_begin;
+ }
+ std::vector<std::string_view> terms;
+ // Calling ResetToTermEndingBefore with the current position should get the
+ // previous term in the sequence.
+ for (int current_pos = last_pos;
+ itr->ResetToTermEndingBefore(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+class IosLanguageSegmenterAllLocalesTest
+ : public testing::TestWithParam<const char*> {
+ protected:
+ static std::string GetLocale() { return GetParam(); }
+ static language_segmenter_factory::SegmenterOptions GetOptions() {
+ return language_segmenter_factory::SegmenterOptions(GetLocale());
+ }
+};
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, EmptyText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, SimpleText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII punctuation marks are kept
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("Hello, World!!!"),
+ IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+ IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+ IsOkAndHolds(ElementsAre("100", "%")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
+ IsOkAndHolds(ElementsAre("A", "&", "B")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII special characters are kept
+ EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
+ IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
+ IsOkAndHolds(ElementsAre("A", "+", "B")));
+ // 0x0009 is the unicode for tab (within ASCII range).
+ std::string text_with_tab = absl_ports::StrCat(
+ "Hello", UCharToString(0x0009), UCharToString(0x0009), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
+ IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009),
+ UCharToString(0x0009), "World")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Full-width (non-ASCII) punctuation marks and special characters are left
+ // out.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms("。?·Hello!×"));
+ EXPECT_THAT(terms, ElementsAre("Hello"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, Acronym) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // LOCALE DEVIATION!! When the locale is Japanese, internal periods are
+ // considered word breaks.
+ std::vector<std::string> exp_terms;
+ if (GetOptions().locale == ULOC_JAPAN) {
+ exp_terms = {"U", ".", "S", ".", " ", "Bank"};
+ } else {
+ exp_terms = {"U.S", ".", " ", "Bank"};
+ }
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
+ IsOkAndHolds(ElementsAreArray(exp_terms)));
+
+ // LOCALE DEVIATION!! When the locale is Japanese, internal periods are
+ // considered word breaks.
+ if (GetOptions().locale == ULOC_JAPAN) {
+ exp_terms = {"I", ".", "B", ".", "M", "."};
+ } else {
+ exp_terms = {"I.B.M", "."};
+ }
+ EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
+ IsOkAndHolds(ElementsAreArray(exp_terms)));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
+ IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
+ IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // According to unicode word break rules
+ // WB6(https://unicode.org/reports/tr29/#WB6),
+ // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
+ // punctuation characters are used as word connecters. That is, words don't
+ // break before and after them. Here we just test some that we care about.
+
+ // Word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
+ IsOkAndHolds(ElementsAre("com.google.android")));
+ // DIFFERENCE!! iOS doesn't agree that ':' is a word connector
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> term,
+ language_segmenter->GetAllTerms("com:google:android"));
+ EXPECT_THAT(term, ElementsAre("com", ":", "google", ":", "android"));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
+ IsOkAndHolds(ElementsAre("com'google'android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
+ IsOkAndHolds(ElementsAre("com_google_android")));
+
+ // Word connecters can be mixed
+ // DIFFERENCE!! iOS doesn't agree that ':' is a word connector
+ // TODO(b/157565185) resolve the handling of ':' as a connector.
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
+ IsOkAndHolds(ElementsAre("com.google.android", ":", "icing")));
+
+ // Any heading and trailing characters are not connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
+ IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
+
+ // Not word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
+ IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
+ IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
+ IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
+ IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
+ IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
+ IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
+ IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
+ IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
+ IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com\"google\"android"),
+ IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+}
+
+// Verifies apostrophe handling: an apostrophe between letters stays inside
+// the token (contractions), a leading apostrophe is its own token, and the
+// right single quote (U+2019) behaves the same as the ASCII apostrophe.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Apostrophes) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
+              IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
+              IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
+              IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
+  // A trailing (possessive) apostrophe is segmented as its own token.
+  EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
+              IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
+  // 0x2019 is the single right quote, should be treated the same as "'"
+  std::string token_with_quote =
+      absl_ports::StrCat("He", UCharToString(0x2019), "ll");
+  std::string text_with_quote =
+      absl_ports::StrCat(token_with_quote, " be back.");
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms(text_with_quote),
+      IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
+}
+
+// Parentheses are always standalone tokens, regardless of whether they are
+// correctly paired around the word.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Parentheses) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
+              IsOkAndHolds(ElementsAre("(", "Hello", ")")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
+              IsOkAndHolds(ElementsAre(")", "Hello", "(")));
+}
+
+// Double and single quotes surrounding a word are standalone tokens.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Quotes) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
+              IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
+              IsOkAndHolds(ElementsAre("'", "Hello", "'")));
+}
+
+// Mixed letter-digit terms are kept as single tokens.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Alphanumeric) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  // Alphanumeric terms are allowed
+  EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+              IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+// Numeric terms stay whole: decimal points and thousands separators are kept
+// inside the token, but a leading minus sign is split off.
+TEST_P(IosLanguageSegmenterAllLocalesTest, Number) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+
+  // A decimal point inside digits does not break the token.
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+      IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+  // Comma separators inside digits do not break the token either.
+  EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+              IsOkAndHolds(ElementsAre("3,456.789")));
+
+  // The sign is NOT part of the numeric token.
+  EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+              IsOkAndHolds(ElementsAre("-", "123")));
+}
+
+// Runs of consecutive whitespace collapse into a single whitespace token,
+// both in the middle of text and at the beginning.
+TEST_P(IosLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Multiple continuous whitespaces are treated as one.
+  const int kNumSeparators = 256;
+  std::string text_with_spaces =
+      absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+  EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+              IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+
+  // Multiple continuous whitespaces are treated as one. Whitespace at the
+  // beginning of the text doesn't affect the results of GetTerm() after the
+  // iterator is done.
+  text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
+                                        "Hello", " ", "World");
+  ICING_ASSERT_OK_AND_ASSIGN(auto itr,
+                             language_segmenter->Segment(text_with_spaces));
+  std::vector<std::string_view> terms;
+  while (itr->Advance()) {
+    terms.push_back(itr->GetTerm());
+  }
+  EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World"));
+  // Once the iterator is exhausted, GetTerm() returns an empty view.
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// Segmentation of the four main languages that do not delimit words with
+// whitespace, with documented deviations from ICU per locale.
+TEST_P(IosLanguageSegmenterAllLocalesTest, CJKT) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
+  // have whitespaces as word delimiter.
+
+  // Chinese
+  // DIFFERENCE/LOCALE DEVIATION!! SIMPLIFIED_CHINESE agrees with ICU that
+  // "每天" should be treated as a single token. All other locales split it
+  // into two tokens.
+  std::vector<std::string> exp_terms;
+  if (GetOptions().locale == ULOC_SIMPLIFIED_CHINESE) {
+    exp_terms = {"我", "每天", "走路", "去", "上班"};
+  } else if (GetOptions().locale == ULOC_JAPAN) {
+    // LOCALE DEVIATION!! JAPANESE groups "去上" and leaves "班" on its own.
+    // All other locales, like ICU, break the text into "去" and "上班".
+    exp_terms = {"我", "每", "天", "走路", "去上", "班"};
+  } else {
+    exp_terms = {"我", "每", "天", "走路", "去", "上班"};
+  }
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::string_view> terms,
+      language_segmenter->GetAllTerms("我每天走路去上班。"));
+  EXPECT_THAT(terms, ElementsAreArray(exp_terms));
+
+  // Japanese
+  // DIFFERENCE!! Disagreement over how to segment "歩い" (iOS groups) and
+  // "てい" (iOS splits). This difference persists even when locale is set to
+  // JAPAN.
+  if (GetOptions().locale == ULOC_SIMPLIFIED_CHINESE ||
+      GetOptions().locale == ULOC_TRADITIONAL_CHINESE) {
+    // LOCALE DEVIATION!! There is also disagreement when locale is CHINESE
+    // about how to tokenize "毎日", "仕事", "歩い", which are all split, and
+    // "てい" which is grouped.
+    exp_terms = {"私", "は", "毎", "日", "仕", "事",
+                 "に", "歩", "い", "てい", "ます"};
+  } else {
+    exp_terms = {"私", "は", "毎日", "仕事", "に", "歩い", "て", "い", "ます"};
+  }
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"));
+  EXPECT_THAT(terms, ElementsAreArray(exp_terms));
+
+  // Khmer
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"));
+  EXPECT_THAT(terms, ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"));
+
+  // Thai
+  // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
+  // This difference persists even when locale is set to THAI
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"));
+  EXPECT_THAT(terms, ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุกวัน"));
+}
+
+// Accented Latin letters are treated as regular letters and kept in one
+// token.
+TEST_P(IosLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
+              IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
+}
+
+// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
+// Languages that delimit words with whitespace (here Turkish and Korean)
+// segment on the whitespace; the whitespace itself is emitted as a token.
+TEST_P(IosLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Turkish
+  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+                             language_segmenter->GetAllTerms("merhaba dünya"));
+  EXPECT_THAT(terms, ElementsAre("merhaba", " ", "dünya"));
+  // Korean
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("나는 매일 출근합니다."));
+  EXPECT_THAT(terms, ElementsAre("나는", " ", "매일", " ", "출근합니다", "."));
+}
+
+// TODO(samzheng): more mixed languages test cases
+// Text mixing scripts (Latin + CJK, Korean + Latin) is segmented per script,
+// with documented per-locale deviations for the CJK portion.
+TEST_P(IosLanguageSegmenterAllLocalesTest, MixedLanguages) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // DIFFERENCE/LOCALE DEVIATION!! JAPANESE agrees with ICU that "你好" should
+  // be treated as a single token. All other locales other than
+  // SIMPLIFIED_CHINESE split it into two tokens.
+  std::vector<std::string> exp_terms;
+  if (GetOptions().locale == ULOC_JAPAN) {
+    exp_terms = {"How", " ", "are", " ", "you", "你好",
+                 "吗", "お", "元気", "です", "か"};
+  } else if (GetOptions().locale == ULOC_TRADITIONAL_CHINESE) {
+    // LOCALE DEVIATION!! TRADITIONAL_CHINESE disagrees over tokenization of
+    // "你好" and "元気", both of which it breaks up.
+    exp_terms = {"How", " ", "are", " ", "you", "你", "好",
+                 "吗", "お", "元", "気", "です", "か"};
+  } else if (GetOptions().locale == ULOC_SIMPLIFIED_CHINESE) {
+    // LOCALE DEVIATION!! SIMPLIFIED_CHINESE disagrees over tokenization of
+    // "元気", which it breaks up.
+    exp_terms = {"How", " ", "are", " ", "you", "你好",
+                 "吗", "お", "元", "気", "です", "か"};
+  } else {
+    // LOCALE DEVIATION!! All other locales disagree over the tokenization of
+    // "你好", which it breaks up.
+    exp_terms = {"How", " ", "are", " ", "you", "你",
+                 "好", "吗", "お", "元気", "です", "か"};
+  }
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::string_view> terms,
+      language_segmenter->GetAllTerms("How are you你好吗お元気ですか"));
+  EXPECT_THAT(terms, ElementsAreArray(exp_terms));
+
+  // Korean mixed with Latin: no locale deviation expected here.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      terms, language_segmenter->GetAllTerms("나는 California에 산다"));
+  EXPECT_THAT(terms, ElementsAre("나는", " ", "California", "에", " ", "산다"));
+}
+
+// The returned terms must be views into the caller's buffer, not copies;
+// verified by comparing the underlying data pointers.
+TEST_P(IosLanguageSegmenterAllLocalesTest, NotCopyStrings) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Validates that the input strings are not copied
+  const std::string text = "Hello World";
+  const char* word1_address = text.c_str();
+  const char* word2_address = text.c_str() + 6;  // start of "World"
+  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+                             language_segmenter->GetAllTerms(text));
+  ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+  const char* word1_result_address = terms.at(0).data();
+  const char* word2_result_address = terms.at(2).data();
+
+  // The underlying char* should be the same
+  EXPECT_THAT(word1_address, Eq(word1_result_address));
+  EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+// ResetToStart on a freshly-created iterator positions it at the first term.
+TEST_P(IosLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// ResetToStart after a single Advance rewinds the iterator to the first
+// term.
+TEST_P(IosLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  ASSERT_TRUE(itr->Advance());  // itr points to 'How'
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// ResetToStart after several Advances rewinds the iterator to the first
+// term.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       IteratorMultipleAdvancesResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  ASSERT_TRUE(itr->Advance());
+  ASSERT_TRUE(itr->Advance());
+  ASSERT_TRUE(itr->Advance());
+  ASSERT_TRUE(itr->Advance());  // itr points to ' '
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// ResetToStart works even after the iterator has been fully exhausted.
+TEST_P(IosLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  while (itr->Advance()) {
+    // Do nothing.
+  }
+  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+// Out-of-bounds offsets passed to ResetToTermStartingAfter return
+// INVALID_ARGUMENT and leave the iterator position unchanged.
+TEST_P(IosLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // First, establish a valid position ("you" at byte 8).
+  auto position_or = itr->ResetToTermStartingAfter(7);
+  EXPECT_THAT(position_or, IsOk());
+  EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
+  ASSERT_THAT(itr->GetTerm(), Eq("you"));
+
+  // Negative offsets are rejected; the position must not move.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+  // Offsets at or past the end of the text are rejected too.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+}
+
+// Tests that ResetToTermAfter and Advance produce the same output. With the
+// exception of the first term, which is inaccessible via ResetToTermAfter,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermAfter calls with the current position
+// provided as the argument.
+// Mixed-script input: ResetToTermStartingAfter must yield the same term
+// stream as Advance (see comment block above).
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetAfter(reset_to_term_itr.get());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Thai input: ResetToTermStartingAfter must yield the same term stream as
+// Advance.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ThaiResetToTermAfterEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetAfter(reset_to_term_itr.get());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Korean input: ResetToTermStartingAfter must yield the same term stream as
+// Advance.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       KoreanResetToTermAfterEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetAfter(reset_to_term_itr.get());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as
+// ResetToTermAfter(current_position) can be used to simulate Advance, users
+// should be able to mix ResetToTermAfter(current_position) calls and Advance
+// calls to mimic calling Advance.
+// Mixed-script input: alternating Advance and ResetToTermStartingAfter
+// calls must produce the same term stream as Advance alone.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_and_reset_terms =
+      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+  EXPECT_THAT(advance_and_reset_terms,
+              testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Thai input: alternating Advance and ResetToTermStartingAfter calls must
+// produce the same term stream as Advance alone.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ThaiResetToTermAfterInteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_and_reset_terms =
+      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+  EXPECT_THAT(advance_and_reset_terms,
+              testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Korean input: alternating Advance and ResetToTermStartingAfter calls must
+// produce the same term stream as Advance alone.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       KoreanResetToTermAfterInteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_and_reset_terms =
+      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+  EXPECT_THAT(advance_and_reset_terms,
+              testing::ElementsAreArray(advance_terms));
+  // Both iterators should end in the same (exhausted) state.
+  EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Spot-checks ResetToTermStartingAfter at specific byte offsets in
+// mixed-script text, including resets that move backwards and a reset past
+// the last term (NOT_FOUND).
+TEST_P(IosLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> itr,
+      language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^ ^^  ^ ^  ^  ^
+  // Bytes:   0  3 4 78 1114172023 29 35
+  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  // DIFFERENCE/LOCALE DEVIATION!! JAPANESE and SIMPLIFIED_CHINESE agree with
+  // ICU that "你好" should be treated as a single token. All other locales
+  // split it into two tokens.
+  std::string exp_token;
+  if (GetLocale() == ULOC_JAPAN || GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+    exp_token = "你好";
+  } else {
+    exp_token = "你";
+  }
+  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+  EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+  // Resetting backwards (to an earlier offset) works too.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+  EXPECT_THAT(itr->GetTerm(), Eq("か"));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+  EXPECT_THAT(itr->GetTerm(), Eq("吗"));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  // There is no term starting after the last one: NOT_FOUND, and the
+  // iterator has no current term.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// ResetToTermStartingAfter over a run of collapsed whitespace: the
+// whitespace run acts as a single term, and offsets past the last term give
+// NOT_FOUND.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ContinuousWhitespacesResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Multiple continuous whitespaces are treated as one.
+  constexpr std::string_view kTextWithSpace = "Hello          World";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kTextWithSpace));
+
+  // String: "Hello          World"
+  //          ^    ^         ^
+  // Bytes:   0    5         15
+  auto offset_or = itr->ResetToTermStartingAfter(0);
+  EXPECT_THAT(offset_or.status(), IsOk());
+  EXPECT_THAT(offset_or.ValueOrDie(), Eq(5));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  // Any offset inside the whitespace run resolves to the run itself...
+  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+  // ...and the start of the run resolves to the following term.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+  // No term starts after "World": NOT_FOUND at and beyond byte 15.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// ResetToTermStartingAfter over Chinese text, with the documented locale
+// deviation for "每天".
+TEST_P(IosLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+  // don't have whitespaces as word delimiter. Chinese
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kChinese));
+  // String: "我每天走路去上班。"
+  //          ^ ^^ ^  ^^
+  // Bytes:   0 3 6 9 15 18
+  std::string exp_token;
+  // DIFFERENCE/LOCALE DEVIATION!! SIMPLIFIED_CHINESE agrees with ICU that
+  // "每天" should be treated as a single token. All other locales split it
+  // into two tokens.
+  if (GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+    exp_token = "每天";
+  } else {
+    exp_token = "每";
+  }
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->GetTerm(), Eq("走路"));
+
+  // No term starts after the final punctuation: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(21),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+// ResetToTermStartingAfter over Japanese text, with the documented Chinese-
+// locale deviations in tokenization.
+TEST_P(IosLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Japanese
+  constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kJapanese));
+  // String: "私は毎日仕事に歩いています。"
+  //          ^ ^ ^  ^  ^ ^ ^ ^  ^
+  // Bytes:   0 3 6  12 18212427 33
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->GetTerm(), Eq("は"));
+
+  // No term starts after the final punctuation: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  // LOCALE DEVIATION!! There is disagreement when locale is CHINESE about how
+  // to tokenize "毎日", "仕事", "歩い", which are all split, and "てい" which
+  // is grouped.
+  std::string exp_term;
+  int exp_offset;
+  if (GetLocale() == ULOC_SIMPLIFIED_CHINESE ||
+      GetLocale() == ULOC_TRADITIONAL_CHINESE) {
+    // Since "毎日" is broken up when the locale is CHINESE, ResetAfter(7) will
+    // point to "日" instead of the next segment ("仕事") like other locales.
+    exp_term = "日";
+    exp_offset = 9;
+  } else {
+    exp_term = "仕事";
+    exp_offset = 12;
+  }
+  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(exp_offset)));
+  EXPECT_THAT(itr->GetTerm(), Eq(exp_term));
+}
+
+// ResetToTermStartingAfter over Khmer text, including a reset past the last
+// term (NOT_FOUND).
+TEST_P(IosLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kKhmer));
+  // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+  //          ^  ^   ^      ^
+  // Bytes:   0  9   24     45
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
+
+  // No term starts after the final term: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  // Resetting backwards after the NOT_FOUND works.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+}
+
+// ResetToTermStartingAfter over Thai text, including the documented iOS
+// grouping of "ทุกวัน".
+TEST_P(IosLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  // Thai
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             language_segmenter->Segment(kThai));
+  // String: "ฉันเดินไปทำงานทุกวัน"
+  //          ^  ^   ^  ^    ^
+  // Bytes:   0  9   21 27   42
+  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
+
+  // No term starts after the final term: NOT_FOUND.
+  EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+  EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+
+  // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
+  // This difference persists even when locale is set to THAI
+  EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+  EXPECT_THAT(itr->GetTerm(), Eq("ทุกวัน"));
+}
+// Out-of-bounds offsets passed to ResetToTermEndingBefore return
+// INVALID_ARGUMENT and leave the iterator position unchanged.
+TEST_P(IosLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String: "How are you你好吗お元気ですか"
+  //          ^  ^^  ^^  ^  ^ ^  ^ ^  ^
+  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // First, establish a valid position ("are" at byte 4).
+  ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+  ASSERT_THAT(itr->GetTerm(), Eq("are"));
+
+  // Negative offsets are rejected; the position must not move.
+  EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+  // Offsets at or past the end of the text are rejected too.
+  EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(itr->GetTerm(), Eq("are"));
+}
+
+// Tests that ResetToTermBefore and Advance produce the same output. With the
+// exception of the last term, which is inaccessible via ResetToTermBefore,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermBefore calls with the current position
+// provided as the argument (after their order has been reversed).
+// Mixed-script input: ResetToTermEndingBefore must yield the same terms as
+// Advance, modulo the last term and reversed order (see comment above).
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+  // Can't produce the last term via calls to ResetToTermBefore. So skip
+  // past that one.
+  auto itr = advance_terms.begin();
+  std::advance(itr, advance_terms.size() - 1);
+  advance_terms.erase(itr);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kText));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetBefore(reset_to_term_itr.get());
+  // ResetToTermBefore walks backwards; reverse to compare against Advance.
+  std::reverse(reset_terms.begin(), reset_terms.end());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  // Both iterators should end with no current term.
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty());
+  EXPECT_THAT(advance_itr->GetTerm(), IsEmpty());
+}
+
+// Thai input: ResetToTermEndingBefore must yield the same terms as Advance,
+// modulo the last term and reversed order.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       ThaiResetToTermBeforeEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+  // Can't produce the last term via calls to ResetToTermBefore. So skip
+  // past that one.
+  auto itr = advance_terms.begin();
+  std::advance(itr, advance_terms.size() - 1);
+  advance_terms.erase(itr);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kThai));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetBefore(reset_to_term_itr.get());
+  // ResetToTermBefore walks backwards; reverse to compare against Advance.
+  std::reverse(reset_terms.begin(), reset_terms.end());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Korean input: ResetToTermEndingBefore must yield the same terms as
+// Advance, modulo the last term and reversed order.
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+       KoreanResetToTermBeforeEquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+                             language_segmenter_factory::Create(GetOptions()));
+  constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> advance_terms =
+      GetAllTermsAdvance(advance_itr.get());
+  // Can't produce the last term via calls to ResetToTermBefore. So skip
+  // past that one.
+  auto itr = advance_terms.begin();
+  std::advance(itr, advance_terms.size() - 1);
+  advance_terms.erase(itr);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+      segmenter->Segment(kKorean));
+  std::vector<std::string_view> reset_terms =
+      GetAllTermsResetBefore(reset_to_term_itr.get());
+  // ResetToTermBefore walks backwards; reverse to compare against Advance.
+  std::reverse(reset_terms.begin(), reset_terms.end());
+
+  EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+  EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ std::string exp_token;
+ int exp_offset;
+ if (GetOptions().locale == ULOC_TRADITIONAL_CHINESE ||
+ GetOptions().locale == ULOC_SIMPLIFIED_CHINESE) {
+ // LOCALE DEVIATION!! SIMPLIFIED_CHINESE disagrees over tokenization of
+ // "元気", which it breaks up.
+ exp_offset = 26;
+ exp_token = "気";
+ } else {
+ exp_offset = 23;
+ exp_token = "元気";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+  // DIFFERENCE/LOCALE DEVIATION!! JAPANESE and SIMPLIFIED_CHINESE agree with
+ // ICU that "你好" should be treated as a single token. All other locales
+ // split it into two tokens.
+ if (GetLocale() == ULOC_JAPAN || GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+ exp_offset = 8;
+ exp_token = "you";
+ } else {
+ exp_offset = 11;
+ exp_token = "你";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->GetTerm(), Eq("です"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest,
+ ContinuousWhitespacesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^^ ^ ^ ^
+ // Bytes: 0 3 6 9 15 18
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ std::string exp_token;
+ int exp_offset;
+  // DIFFERENCE/LOCALE DEVIATION!! SIMPLIFIED_CHINESE agrees with ICU that
+ // "每天" should be treated as a single token. All other locales split it into
+ // two tokens.
+ if (GetLocale() == ULOC_SIMPLIFIED_CHINESE) {
+ exp_offset = 0;
+ exp_token = "我";
+ } else {
+ exp_offset = 3;
+ exp_token = "每";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+
+ if (GetOptions().locale == ULOC_JAPAN) {
+ // LOCALE DEVIATION!! JAPANESE groups "去上" and leaves "班" on its own.
+    // All other locales, like ICU, break the text into "去" and "上班".
+ exp_offset = 9;
+ exp_token = "走路";
+ } else {
+ exp_offset = 15;
+ exp_token = "去";
+ }
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(exp_offset)));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_token));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ // LOCALE DEVIATION!! There is disagreement when locale is CHINESE about how
+ // to tokenize "毎日", "仕事", "歩い", which are all split, and "てい" which
+ // is grouped.
+ std::string exp_term;
+ int exp_offset;
+ if (GetLocale() == ULOC_SIMPLIFIED_CHINESE ||
+ GetLocale() == ULOC_TRADITIONAL_CHINESE) {
+ // TODO(b/157565185) For some reason, CFStringTokenizerGoToTokenAtIndex
+ // believes that "いています" is one token when locale is
+ // SIMPLIFIED/TRADITIONAL CHINESE, but CFStringTokenizerAdvanceToNextToken
+ // thinks that it is three: "い" "てい", "ます". Other locales and ICU agree
+ // that that segment should be "歩い", "て", "い", "ます".
+ // This is the only case where CFStringTokenizerGoToTokenAtIndex and
+ // CFStringTokenizerAdvanceToNextToken disagree. Find a way around this
+ // (such as rewinding past the desired segment and then advancing to it) if
+ // this is still an issue after adding language detection.
+ exp_term = "歩";
+ exp_offset = 21;
+ } else {
+ // Since "てい" is broken up when the locale is not CHINESE,
+ // ResetBefore(33) will point to "い" at offset 30.
+ exp_term = "い";
+ exp_offset = 30;
+ }
+ auto offset_or = itr->ResetToTermEndingBefore(33);
+ EXPECT_THAT(offset_or, IsOk());
+ EXPECT_THAT(offset_or.ValueOrDie(), Eq(exp_offset));
+ EXPECT_THAT(itr->GetTerm(), Eq(exp_term));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
+}
+
+TEST_P(IosLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
+ // This difference persists even when locale is set to THAI
+ EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทำงาน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ LocaleName, IosLanguageSegmenterAllLocalesTest,
+ testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
+ ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
+ ULOC_KOREA,
+ ULOC_SIMPLIFIED_CHINESE,
+ ULOC_TRADITIONAL_CHINESE,
+ "es_ES", // Spanish
+ "hi_IN", // Hindi
+ "th_TH", // Thai
+ "lo_LA", // Lao
+ "km_KH", // Khmer
+ "ar_DZ", // Arabic
+ "ru_RU", // Russian
+ "pt_PT", // Portuguese
+                    "en_US_POSIX", // American English (Computer)
+                    "wrong_locale", // Will fall back to ICU default locale
+                    "" // Will fall back to ICU default locale
+ ));
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index ce50d0b..e60c168 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,11 +18,14 @@
#include <memory>
#include <string_view>
+#ifdef __ANDROID__
#include "icing/jni/jni-cache.h"
+#else // __ANDROID__
+class JniCache; // forward declaration to let non-Android builds work.
+#endif // __ANDROID__
+
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/util/i18n-utils.h"
-#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -30,7 +33,7 @@
namespace language_segmenter_factory {
struct SegmenterOptions {
- explicit SegmenterOptions(std::string locale = ULOC_US,
+ explicit SegmenterOptions(std::string locale,
const JniCache* jni_cache = nullptr)
: locale(std::move(locale)), jni_cache(jni_cache) {}
@@ -46,7 +49,7 @@
// A LanguageSegmenter on success
// INVALID_ARGUMENT if locale string is invalid
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterOptions options = SegmenterOptions());
+ SegmenterOptions options);
} // namespace language_segmenter_factory
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index c7b068d..a1b031a 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -43,8 +43,10 @@
};
TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -62,8 +64,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -77,8 +81,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithNegativeOffsetNotOk) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -95,8 +101,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
@@ -106,8 +114,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
@@ -115,8 +125,10 @@
}
TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -130,8 +142,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithZeroNotFound) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -142,8 +156,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -157,8 +173,10 @@
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index 49ddfca..bd86169 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -20,6 +20,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -59,8 +60,9 @@
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
@@ -95,8 +97,9 @@
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
for (int i = 1; i < input_string.length(); i += 2) {
@@ -134,8 +137,9 @@
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string;
while (input_string.length() < state.range(0)) {
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index f2fc678..d9db75a 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -24,6 +24,7 @@
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -49,8 +50,10 @@
}
TEST_F(PlainTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -81,8 +84,10 @@
}
TEST_F(PlainTokenizerTest, Whitespace) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -107,8 +112,10 @@
}
TEST_F(PlainTokenizerTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -136,8 +143,10 @@
}
TEST_F(PlainTokenizerTest, SpecialCharacters) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -157,8 +166,10 @@
}
TEST_F(PlainTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -209,8 +220,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -226,8 +239,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -243,8 +258,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -291,8 +308,10 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 351f7c1..9b71e8a 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -22,6 +22,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -46,8 +47,10 @@
}
TEST_F(RawQueryTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -59,8 +62,10 @@
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -159,8 +164,10 @@
}
TEST_F(RawQueryTokenizerTest, Exclustion) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -226,8 +233,10 @@
}
TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -314,8 +323,10 @@
}
TEST_F(RawQueryTokenizerTest, OR) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -435,8 +446,10 @@
// CJKT are treated the same way by language segmenter and raw tokenizer, so
// here we test Chinese and Japanese to represent CJKT.
TEST_F(RawQueryTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -488,8 +501,10 @@
// Raw tokenizer identifies all characters that it doesn't know as OTHER type,
// so we can choose comma "," to represent all OTHER characters.
TEST_F(RawQueryTokenizerTest, OtherChars) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -533,8 +548,10 @@
}
TEST_F(RawQueryTokenizerTest, Mix) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
index f79bc68..db973f3 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "icing/jni/jni-cache.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
index a01d944..4b50231 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
@@ -443,6 +443,74 @@
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
+TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
@@ -1060,6 +1128,21 @@
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
+TEST_P(ReverseJniLanguageSegmenterTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Validates that the input strings are not copied
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, ReverseJniLanguageSegmenterTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index 2256022..bb26364 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -24,164 +24,13 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-namespace {
-
-// Returns the lead byte of the UTF-8 character that includes the byte at
-// current_byte_index within it.
-int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
- while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
- --current_byte_index;
- }
- return current_byte_index;
-}
-
-class CharacterIterator {
- public:
- explicit CharacterIterator(std::string_view text)
- : CharacterIterator(text, 0, 0) {}
- CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
- : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: desired_utf8_index <= text_.length()
- // desired_utf8_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf8(int desired_utf8_index) {
- if (desired_utf8_index > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- // Need to work forwards.
- while (utf8_index_ < desired_utf8_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > desired_utf8_index) {
- // Ah! Don't go too far!
- break;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: 0 <= desired_utf8_index
- bool RewindToUtf8(int desired_utf8_index) {
- if (desired_utf8_index < 0) {
- // Enforce the requirement.
- return false;
- }
- // Need to work backwards.
- while (utf8_index_ > desired_utf8_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- if (utf8_index_ < 0) {
- // Somehow, there wasn't a single UTF-8 lead byte at
- // requested_byte_index or an earlier byte.
- return false;
- }
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Advances current position to desired_utf16_index.
- // REQUIRES: desired_utf16_index <= text_.utf16_length()
- // desired_utf16_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf16(int desired_utf16_index) {
- while (utf16_index_ < desired_utf16_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf16_length = i18n_utils::GetUtf16Length(uchar32);
- if (utf16_index_ + utf16_length > desired_utf16_index) {
- // Ah! Don't go too far!
- break;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += utf16_length;
- }
- return true;
- }
-
- // Rewinds current position to desired_utf16_index.
- // REQUIRES: 0 <= desired_utf16_index
- bool RewindToUtf16(int desired_utf16_index) {
- if (desired_utf16_index < 0) {
- return false;
- }
- while (utf16_index_ > desired_utf16_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- bool IsValidCharacter() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[utf8_index_])) {
- return true;
- }
-
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) {
- return true;
- }
-
- return false;
- }
-
- int utf8_index() const { return utf8_index_; }
- int utf16_index() const { return utf16_index_; }
-
- private:
- std::string_view text_;
- int utf8_index_;
- int utf16_index_;
-};
-
-} // namespace
-
class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
public:
explicit ReverseJniLanguageSegmenterIterator(
@@ -229,7 +78,7 @@
// Check if the current term is valid. We consider any term valid if its
// first character is valid. If it's not valid, then we need to advance to
// the next term.
- if (term_start_.IsValidCharacter()) {
+ if (IsValidTerm()) {
return true;
}
return Advance();
@@ -382,8 +231,7 @@
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf8_index() > offset ||
- !term_start_.IsValidCharacter()) {
+ if (term_end_exclusive_.utf8_index() > offset || !IsValidTerm()) {
return ResetToTermEndingBefore(term_start_.utf8_index());
}
return term_start_.utf8_index();
@@ -414,6 +262,21 @@
/*utf16_index=*/ReverseJniBreakIterator::kDone);
}
+ bool IsValidTerm() const {
+ // Rule 1: all ASCII terms will be returned.
+    // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) {
+ return true;
+ }
+
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) {
+ return true;
+ }
+ return false;
+ }
+
// All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
// this class needs to maintain state to convert between UTF-16 and UTF-8.
std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
index 8ed38b2..6c5e3f6 100644
--- a/icing/tokenization/simple/space-language-segmenter_test.cc
+++ b/icing/tokenization/simple/space-language-segmenter_test.cc
@@ -18,6 +18,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -28,21 +29,27 @@
using ::testing::IsEmpty;
TEST(SpaceLanguageSegmenterTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
}
TEST(SpaceLanguageSegmenterTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
IsOkAndHolds(ElementsAre("Hello", " ", "World")));
}
TEST(SpaceLanguageSegmenterTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
@@ -55,8 +62,10 @@
}
TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Alphanumeric terms are allowed
EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
@@ -64,8 +73,10 @@
}
TEST(SpaceLanguageSegmenterTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Alphanumeric terms are allowed
EXPECT_THAT(
@@ -80,8 +91,10 @@
}
TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Multiple continuous whitespaces are treated as one.
const int kNumSeparators = 256;
@@ -92,8 +105,10 @@
}
TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Validates that the input strings are not copied
const std::string text = "Hello World";
const char* word1_address = text.c_str();
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
new file mode 100644
index 0000000..3707f95
--- /dev/null
+++ b/icing/util/character-iterator.cc
@@ -0,0 +1,127 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/character-iterator.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Returns the lead byte of the UTF-8 character that includes the byte at
+// current_byte_index within it.
+int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
+ while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
+ --current_byte_index;
+ }
+ return current_byte_index;
+}
+
+} // namespace
+
+bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work forwards.
+ while (utf8_index_ < desired_utf8_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > desired_utf8_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index < 0) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work backwards.
+ while (utf8_index_ > desired_utf8_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at
+ // requested_byte_index or an earlier byte.
+ return false;
+ }
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
+ while (utf16_index_ < desired_utf16_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+ if (utf16_index_ + utf16_length > desired_utf16_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += utf16_length;
+ }
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
+ if (desired_utf16_index < 0) {
+ return false;
+ }
+ while (utf16_index_ > desired_utf16_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
new file mode 100644
index 0000000..22de6c5
--- /dev/null
+++ b/icing/util/character-iterator.h
@@ -0,0 +1,70 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_CHARACTER_ITERATOR_H_
+#define ICING_UTIL_CHARACTER_ITERATOR_H_
+
+#include "icing/util/i18n-utils.h"
+
+namespace icing {
+namespace lib {
+
+class CharacterIterator {
+ public:
+ explicit CharacterIterator(std::string_view text)
+ : CharacterIterator(text, 0, 0) {}
+
+ CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
+ : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: desired_utf8_index <= text_.length()
+ // desired_utf8_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf8(int desired_utf8_index);
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: 0 <= desired_utf8_index
+ bool RewindToUtf8(int desired_utf8_index);
+
+ // Advances current position to desired_utf16_index.
+ // REQUIRES: desired_utf16_index <= text_.utf16_length()
+ // desired_utf16_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf16(int desired_utf16_index);
+
+ // Rewinds current position to desired_utf16_index.
+ // REQUIRES: 0 <= desired_utf16_index
+ bool RewindToUtf16(int desired_utf16_index);
+
+ int utf8_index() const { return utf8_index_; }
+ int utf16_index() const { return utf16_index_; }
+
+ bool operator==(const CharacterIterator& rhs) const {
+ return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ &&
+ utf16_index_ == rhs.utf16_index_;
+ }
+
+ private:
+ std::string_view text_;
+ int utf8_index_;
+ int utf16_index_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_CHARACTER_ITERATOR_H_
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index 9cf992f..d6754d5 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -99,16 +99,17 @@
return;
}
- while (truncate_to_length > 0) {
- if (IsLeadUtf8Byte(str->at(truncate_to_length))) {
- str->resize(truncate_to_length);
- return;
- }
- truncate_to_length--;
- }
+ str->resize(SafeTruncateUtf8Length(str->c_str(), truncate_to_length));
+}
- // Truncates to an empty string
- str->resize(0);
+int SafeTruncateUtf8Length(const char* str, int desired_length) {
+ while (desired_length > 0) {
+ if (IsLeadUtf8Byte(str[desired_length])) {
+ break;
+ }
+ --desired_length;
+ }
+ return desired_length;
}
bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index e103bab..82ae828 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -50,6 +50,13 @@
// Returns the char at the given position.
UChar32 GetUChar32At(const char* data, int length, int position);
+// Returns the safe position to truncate a UTF8 string at so that multi-byte
+// UTF8 characters are not cut in the middle. The returned value will always be
+// 0 <= val <= desired_length.
+//
+// REQUIRES: 0 <= desired_length < strlen(str)
+int SafeTruncateUtf8Length(const char* str, int desired_length);
+
// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
// in the middle. The string will be truncated in place.
void SafeTruncateUtf8(std::string* str, int truncate_to_length);
diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java
index f4e301d..125da12 100644
--- a/java/src/com/google/android/icing/IcingSearchEngine.java
+++ b/java/src/com/google/android/icing/IcingSearchEngine.java
@@ -328,6 +328,27 @@
}
@NonNull
+ public DeleteResultProto deleteByQuery(@NonNull SearchSpecProto searchSpec) {
+ byte[] deleteResultBytes = nativeDeleteByQuery(nativePointer, searchSpec.toByteArray());
+ if (deleteResultBytes == null) {
+ Log.e(TAG, "Received null DeleteResultProto from native.");
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DeleteResultProto.parseFrom(
+ deleteResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DeleteResultProto.", e);
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
public PersistToDiskResultProto persistToDisk() {
byte[] persistToDiskResultBytes = nativePersistToDisk(nativePointer);
if (persistToDiskResultBytes == null) {
@@ -438,6 +459,8 @@
private static native byte[] nativeDeleteBySchemaType(long nativePointer, String schemaType);
+ private static native byte[] nativeDeleteByQuery(long nativePointer, byte[] searchSpecBytes);
+
private static native byte[] nativePersistToDisk(long nativePointer);
private static native byte[] nativeOptimize(long nativePointer);
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index d907d4e..ed7e318 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -335,6 +335,58 @@
assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
}
+
+ @Test
+ public void testDeleteByQuery() throws Exception {
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
+ IcingSearchEngine icing = new IcingSearchEngine(options);
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+            .build();
+ assertThat(icing.put(emailDocument1).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ DocumentProto emailDocument2 =
+ createEmailDocument("namespace", "uri2").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("bar"))
+            .build();
+ assertThat(icing.put(emailDocument2).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ // .setQuery("")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+
+ SearchResultProto searchResultProto =
+ icing.search(
+ searchSpec,
+ ScoringSpecProto.getDefaultInstance(),
+ ResultSpecProto.getDefaultInstance());
+ assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(2);
+ // assertThat(searchResultProto.getResults(0).getDocument()).isEqualTo(emailDocument1);
+
+ DeleteResultProto deleteResultProto = icing.deleteByQuery(searchSpec);
+ assertThat(deleteResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ GetResultProto getResultProto = icing.get("namespace", "uri1");
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
+ getResultProto = icing.get("namespace", "uri2");
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
+ }
+
@Test
public void testPersistToDisk() throws Exception {
IcingSearchEngineOptions options =
diff --git a/proto/icing/proto/status.proto b/proto/icing/proto/status.proto
index 2733a15..08677b0 100644
--- a/proto/icing/proto/status.proto
+++ b/proto/icing/proto/status.proto
@@ -24,7 +24,7 @@
// Canonical status to indicate the results of API calls.
// Next tag: 3
message StatusProto {
- // Next tag: 9
+ // Next tag: 10
enum Code {
// A default for all other use-cases. Should never be used in practice. This
// may happen if there are backwards-compatibility issues.
@@ -62,6 +62,12 @@
// make some space on the underlying filesystem.
OUT_OF_SPACE = 8;
+ // An operation is invalid because the resource already exists and can't be
+ // replaced. For example, this status is used when a SchemaProto contains
+ // multiple definitions of the same type or multiple properties with the
+ // same name within a type.
+ ALREADY_EXISTS = 9;
+
// Any future status codes.
}
optional Code code = 1;