Merge remote-tracking branch 'goog/androidx-platform-dev' into sc-dev am: b7ee27c61c am: e448a1cf86 am: 4f874a1690
Original change: https://googleplex-android-review.googlesource.com/c/platform/external/icing/+/14347420
Change-Id: Id950c75049b286698c8264be057c64d72ba91183
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70f6852..01ee8eb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,6 +15,9 @@
cmake_minimum_required(VERSION 3.10.2)
add_definitions("-DICING_REVERSE_JNI_SEGMENTATION=1")
+set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/icing/jni.lds")
+set(CMAKE_SHARED_LINKER_FLAGS
+ "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections -Wl,--version-script=${VERSION_SCRIPT}")
set(
Protobuf_PREBUILTS_DIR
diff --git a/build.gradle b/build.gradle
index 437f57f..882a929 100644
--- a/build.gradle
+++ b/build.gradle
@@ -69,6 +69,9 @@
generateProtoTasks {
all().each { task ->
+ project.tasks.named("extractReleaseAnnotations").configure {
+ it.dependsOn(task)
+ }
task.builtins {
java {
option 'lite'
diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc
index 6a596f5..0655cb9 100644
--- a/icing/file/filesystem.cc
+++ b/icing/file/filesystem.cc
@@ -466,7 +466,13 @@
bool Filesystem::CopyFile(const char* src, const char* dst) const {
ScopedFd src_fd(OpenForRead(src));
+
+ std::string dir = GetDirname(dst);
+ if (!CreateDirectoryRecursively(dir.c_str())) {
+ return false;
+ }
ScopedFd dst_fd(OpenForWrite(dst));
+
if (!src_fd.is_valid() || !dst_fd.is_valid()) {
return false;
}
@@ -478,6 +484,49 @@
return Write(*dst_fd, buf.get(), size);
}
+// Copies every non-directory entry of src_dir into dst_dir and, when
+// `recursive` is true, descends into subdirectories. Returns false if src_dir
+// cannot be opened or if any copy fails. Once opened, the directory stream is
+// closed on every path, including failures.
+bool Filesystem::CopyDirectory(const char* src_dir, const char* dst_dir,
+                               bool recursive) const {
+  DIR* dir = opendir(src_dir);
+  if (!dir) {
+    LogOpenError("Unable to open directory ", src_dir, ": ", errno);
+    return false;
+  }
+
+  // Stop at the first failure, but fall through to closedir() below instead of
+  // returning early so that `dir` is never leaked.
+  bool success = true;
+  dirent* p;
+  // readdir's implementation seems to be thread safe.
+  while (success && (p = readdir(dir)) != nullptr) {
+    std::string file_name(p->d_name);
+    if (file_name == "." || file_name == "..") {
+      continue;
+    }
+
+    std::string full_src_path = absl_ports::StrCat(src_dir, "/", p->d_name);
+    std::string full_dst_path = absl_ports::StrCat(dst_dir, "/", p->d_name);
+
+    if (p->d_type != DT_DIR) {
+      // Directories are created when writing a non-directory file, so no
+      // explicit copying of a directory is required.
+      success = CopyFile(full_src_path.c_str(), full_dst_path.c_str());
+    } else if (recursive) {
+      // Recurse down directories, if requested.
+      success = CopyDirectory(full_src_path.c_str(), full_dst_path.c_str(),
+                              recursive);
+    }
+  }
+  if (closedir(dir) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Error closing %s: %s",
+                                                      src_dir, strerror(errno));
+  }
+  return success;
+}
+
bool Filesystem::PWrite(int fd, off_t offset, const void* data,
size_t data_size) const {
size_t write_len = data_size;
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index d3c7787..6bed8e6 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h
@@ -86,8 +86,12 @@
// Copies the src file to the dst file.
virtual bool CopyFile(const char* src, const char* dst) const;
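+ // Copies the files under src_dir to the same relative paths under dst_dir.
+ // Destination directories are created as files are written into them, so
+ // directories that contain no files are not created. Subdirectories of
+ // src_dir are only copied when recursive is true.
+ //
+ // Returns false if src_dir cannot be opened or if copying any file fails.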
+ virtual bool CopyDirectory(const char* src_dir, const char* dst_dir,
+ bool recursive) const;
+
// Returns true if a file exists. False if the file doesn't exist.
- // If there is an error getting stat on the file, it logs the error and //
+ // If there is an error getting stat on the file, it logs the error and
// asserts.
virtual bool FileExists(const char* file_name) const;
diff --git a/icing/file/filesystem_test.cc b/icing/file/filesystem_test.cc
index 492a50d..214180e 100644
--- a/icing/file/filesystem_test.cc
+++ b/icing/file/filesystem_test.cc
@@ -38,6 +38,7 @@
using ::testing::Le;
using ::testing::Ne;
using ::testing::UnorderedElementsAre;
+using ::testing::UnorderedElementsAreArray;
namespace icing {
namespace lib {
@@ -450,5 +451,47 @@
EXPECT_THAT(hello, Eq("hello"));
}
+// Verifies that a recursive CopyDirectory reproduces the source tree's file
+// listing in the destination directory.
+TEST_F(FilesystemTest, CopyDirectory) {
+  Filesystem filesystem;
+
+  // File structure:
+  // <temp_dir>/
+  //   src_dir/
+  //     file1
+  //     file2
+  //     sub_dir/
+  //       file3
+  const std::string src_dir = temp_dir_ + "/src_dir";
+  const std::string dst_dir = temp_dir_ + "/dst_dir";
+  const std::string sub_dir = "sub_dir";
+  const std::string sub_dir_path = src_dir + "/" + sub_dir;
+  vector<std::string> some_files = {"file1", "file2", sub_dir + "/file3"};
+
+  // Make sure there is no pre-existing test-dir structure. dst_dir is cleared
+  // as well so that leftovers from an earlier run cannot skew the listing
+  // comparison below.
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str()));
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(dst_dir.c_str()));
+
+  // Setup a test-dir structure
+  ASSERT_TRUE(filesystem.CreateDirectoryRecursively(
+      sub_dir_path.c_str()));  // deepest path for test
+  CreateTestFiles(some_files, src_dir);
+
+  EXPECT_TRUE(filesystem.CopyDirectory(src_dir.c_str(), dst_dir.c_str(),
+                                       /*recursive=*/true));
+
+  vector<std::string> src_dir_files;
+  EXPECT_TRUE(filesystem.ListDirectory(src_dir.c_str(), /*exclude=*/{},
+                                       /*recursive=*/true, &src_dir_files));
+
+  vector<std::string> dst_dir_files;
+  EXPECT_TRUE(filesystem.ListDirectory(dst_dir.c_str(), /*exclude=*/{},
+                                       /*recursive=*/true, &dst_dir_files));
+
+  EXPECT_THAT(dst_dir_files, UnorderedElementsAreArray(src_dir_files));
+
+  // Clean up
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str()));
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(dst_dir.c_str()));
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/file/mock-filesystem.h b/icing/file/mock-filesystem.h
index 88475cd..32817d4 100644
--- a/icing/file/mock-filesystem.h
+++ b/icing/file/mock-filesystem.h
@@ -44,6 +44,17 @@
return real_filesystem_.DeleteDirectoryRecursively(dir_name);
});
+ ON_CALL(*this, CopyFile)
+ .WillByDefault([this](const char* src, const char* dst) {
+ return real_filesystem_.CopyFile(src, dst);
+ });
+
+ ON_CALL(*this, CopyDirectory)
+ .WillByDefault(
+ [this](const char* src, const char* dst, bool recursive) {
+ return real_filesystem_.CopyDirectory(src, dst, recursive);
+ });
+
ON_CALL(*this, FileExists).WillByDefault([this](const char* file_name) {
return real_filesystem_.FileExists(file_name);
});
@@ -227,6 +238,9 @@
MOCK_METHOD(bool, CopyFile, (const char* src, const char* dst), (const));
+ MOCK_METHOD(bool, CopyDirectory,
+ (const char* src, const char* dst, bool recursive), (const));
+
MOCK_METHOD(bool, FileExists, (const char* file_name), (const));
MOCK_METHOD(bool, DirectoryExists, (const char* dir_name), (const));
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
index 95c3949..000ab3d 100644
--- a/icing/file/portable-file-backed-proto-log.h
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -147,80 +147,92 @@
Crc32 crc;
// Get a string_view of all the fields of the Header, excluding the
- // magic_nbytes and header_checksum_nbytes
- std::string_view header_str(reinterpret_cast<const char*>(this) +
- offsetof(Header, header_checksum_nbytes) +
- sizeof(header_checksum_nbytes),
- sizeof(Header) - sizeof(magic_nbytes) -
- sizeof(header_checksum_nbytes));
+ // magic_nbytes_ and header_checksum_nbytes_
+ std::string_view header_str(
+ reinterpret_cast<const char*>(this) +
+ offsetof(Header, header_checksum_nbytes_) +
+ sizeof(header_checksum_nbytes_),
+ sizeof(Header) - sizeof(magic_nbytes_) -
+ sizeof(header_checksum_nbytes_));
crc.Append(header_str);
return crc.Get();
}
- int32_t GetMagic() const { return gntohl(magic_nbytes); }
+ int32_t GetMagic() const { return gntohl(magic_nbytes_); }
- void SetMagic(int32_t magic_in) { magic_nbytes = ghtonl(magic_in); }
+ void SetMagic(int32_t magic_in) { magic_nbytes_ = ghtonl(magic_in); }
int32_t GetFileFormatVersion() const {
- return gntohl(file_format_version_nbytes);
+ return gntohl(file_format_version_nbytes_);
}
void SetFileFormatVersion(int32_t file_format_version_in) {
- file_format_version_nbytes = ghtonl(file_format_version_in);
+ file_format_version_nbytes_ = ghtonl(file_format_version_in);
}
- int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes); }
+ int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes_); }
void SetMaxProtoSize(int32_t max_proto_size_in) {
- max_proto_size_nbytes = ghtonl(max_proto_size_in);
+ max_proto_size_nbytes_ = ghtonl(max_proto_size_in);
}
- int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes); }
+ int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes_); }
void SetLogChecksum(int32_t log_checksum_in) {
- log_checksum_nbytes = ghtonl(log_checksum_in);
+ log_checksum_nbytes_ = ghtonl(log_checksum_in);
}
- int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes); }
+ int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes_); }
void SetRewindOffset(int64_t rewind_offset_in) {
- rewind_offset_nbytes = ghtonll(rewind_offset_in);
+ rewind_offset_nbytes_ = ghtonll(rewind_offset_in);
}
- int32_t GetHeaderChecksum() const { return gntohl(header_checksum_nbytes); }
+ int32_t GetHeaderChecksum() const {
+ return gntohl(header_checksum_nbytes_);
+ }
void SetHeaderChecksum(int32_t header_checksum_in) {
- header_checksum_nbytes = ghtonl(header_checksum_in);
+ header_checksum_nbytes_ = ghtonl(header_checksum_in);
}
- bool GetCompressFlag() const {
- uint16_t host_order_flags = gntohs(flags_nbytes);
- return bit_util::BitfieldGet(host_order_flags, kCompressBit, /*len=*/1);
- }
+ bool GetCompressFlag() const { return GetFlag(kCompressBit); }
- void SetCompressFlag(bool compress) {
- uint16_t host_order_flags = gntohs(flags_nbytes);
- bit_util::BitfieldSet(compress, kCompressBit,
- /*len=*/1, &host_order_flags);
- flags_nbytes = ghtons(host_order_flags);
- }
+ void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
+
+ // Returns whether the dirty bit (kDirtyBit) is set in the header's flags.
+ // Const, like GetCompressFlag, so it can be called on a const Header.
+ bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
+
+ void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
private:
// The least-significant bit offset at which the compress flag is stored in
- // 'flags_nbytes'. Represents whether the protos in the log are compressed
+ // 'flags_nbytes_'. Represents whether the protos in the log are compressed
// or not.
static constexpr int32_t kCompressBit = 0;
+ // The least-significant bit offset at which the dirty flag is stored in
+ // 'flags_'. Represents whether the checksummed portion of the log has been
+ // modified after the last checksum was computed.
+ static constexpr int32_t kDirtyBit = 1;
+
+ bool GetFlag(int offset) const {
+ return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
+ }
+
+ void SetFlag(int offset, bool value) {
+ bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
+ }
+
// Holds the magic as a quick sanity check against file corruption.
//
// Field is in network-byte order.
- int32_t magic_nbytes = ghtonl(kMagic);
+ int32_t magic_nbytes_ = ghtonl(kMagic);
// Must be at the beginning after kMagic. Contains the crc checksum of
// the following fields.
//
// Field is in network-byte order.
- uint32_t header_checksum_nbytes = 0;
+ uint32_t header_checksum_nbytes_ = 0;
// Last known good offset at which the log and its checksum were updated.
// If we crash between writing to the log and updating the checksum, we can
@@ -228,7 +240,7 @@
// valid instead of throwing away the entire log.
//
// Field is in network-byte order.
- int64_t rewind_offset_nbytes = ghtonll(kHeaderReservedBytes);
+ int64_t rewind_offset_nbytes_ = ghtonll(kHeaderReservedBytes);
// Version number tracking how we serialize the file to disk. If we change
// how/what we write to disk, this version should be updated and this class
@@ -237,23 +249,23 @@
// Currently at kFileFormatVersion.
//
// Field is in network-byte order.
- int32_t file_format_version_nbytes = 0;
+ int32_t file_format_version_nbytes_ = 0;
// The maximum proto size that can be written to the log.
//
// Field is in network-byte order.
- int32_t max_proto_size_nbytes = 0;
+ int32_t max_proto_size_nbytes_ = 0;
// Checksum of the log elements, doesn't include the header fields.
//
// Field is in network-byte order.
- uint32_t log_checksum_nbytes = 0;
+ uint32_t log_checksum_nbytes_ = 0;
// Bits are used to hold various flags.
// Lowest bit is whether the protos are compressed or not.
//
- // Field is in network-byte order.
- uint16_t flags_nbytes = 0;
+ // Field is only 1 byte, so is byte-order agnostic.
+ uint8_t flags_ = 0;
// NOTE: New fields should *almost always* be added to the end here. Since
// this class may have already been written to disk, appending fields
@@ -270,7 +282,14 @@
// happen if the file is corrupted or some previously added data was
// unpersisted. This may be used to signal that any derived data off of the
// proto log may need to be regenerated.
- DataLoss data_loss;
+ DataLoss data_loss = DataLoss::NONE;
+
+ // Whether the proto log had to recalculate the checksum to check its
+ // integrity. This can be avoided if no changes were made or the log was
+ // able to update its checksum before shutting down. But it may have to
+ // recalculate if it's unclear if we crashed after updating the log, but
+ // before updating our checksum.
+ bool recalculated_checksum = false;
bool has_data_loss() {
return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
@@ -638,7 +657,7 @@
std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
std::move(header))),
- /*data_loss=*/DataLoss::NONE};
+ /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
return create_result;
}
@@ -649,6 +668,7 @@
PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
const Filesystem* filesystem, const std::string& file_path,
const Options& options, int64_t file_size) {
+ bool header_changed = false;
if (file_size < kHeaderReservedBytes) {
return absl_ports::InternalError(
absl_ports::StrCat("File header too short for: ", file_path));
@@ -687,61 +707,85 @@
header->GetCompressFlag(), options.compress));
}
- if (header->GetMaxProtoSize() > options.max_proto_size) {
+ int32_t existing_max_proto_size = header->GetMaxProtoSize();
+ if (existing_max_proto_size > options.max_proto_size) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Max proto size cannot be smaller than previous "
"instantiations, previous size %d, wanted size %d",
header->GetMaxProtoSize(), options.max_proto_size));
+ } else if (existing_max_proto_size < options.max_proto_size) {
+ // It's fine if our new max size is greater than our previous one. Existing
+ // data is still valid.
+ header->SetMaxProtoSize(options.max_proto_size);
+ header_changed = true;
}
- header->SetMaxProtoSize(options.max_proto_size);
DataLoss data_loss = DataLoss::NONE;
- ICING_ASSIGN_OR_RETURN(
- Crc32 calculated_log_checksum,
- ComputeChecksum(filesystem, file_path, Crc32(),
- /*start=*/kHeaderReservedBytes, /*end=*/file_size));
- // Double check that the log checksum is the same as the one that was
- // persisted last time. If not, we start recovery logic.
- if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
- // Need to rewind the proto log since the checksums don't match.
- // Worst case, we have to rewind the entire log back to just the header
- int64_t last_known_good = kHeaderReservedBytes;
+ // If we have any documents in our tail, get rid of them since they're not in
+ // our checksum. Our checksum reflects content up to the rewind offset.
+ if (file_size > header->GetRewindOffset()) {
+ if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Failed to truncate '%s' to size %lld", file_path.data(),
+ static_cast<long long>(header->GetRewindOffset())));
+ };
+ data_loss = DataLoss::PARTIAL;
+ }
- // Calculate the checksum of the log contents just up to the last rewind
- // offset point. This will be valid if we just appended contents to the log
- // without updating the checksum, and we can rewind back to this point
- // safely.
- ICING_ASSIGN_OR_RETURN(calculated_log_checksum,
- ComputeChecksum(filesystem, file_path, Crc32(),
- /*start=*/kHeaderReservedBytes,
- /*end=*/header->GetRewindOffset()));
- if (header->GetLogChecksum() == calculated_log_checksum.Get()) {
- // Check if it matches our last rewind state. If so, this becomes our last
- // good state and we can safely truncate and recover from here.
- last_known_good = header->GetRewindOffset();
- data_loss = DataLoss::PARTIAL;
- } else {
- // Otherwise, we're going to truncate the entire log and this resets the
- // checksum to an empty log state.
- header->SetLogChecksum(0);
- data_loss = DataLoss::COMPLETE;
+ bool recalculated_checksum = false;
+
+ // If our dirty flag is set, that means we might have crashed in the middle of
+ // erasing a proto. This could have happened anywhere between:
+ // A. Set dirty flag to true and update header checksum
+ // B. Erase the proto
+ // C. Set dirty flag to false, update log checksum, update header checksum
+ //
+ // Scenario 1: We went down between A and B. Maybe our dirty flag is a
+ // false alarm and we can keep all our data.
+ //
+ // Scenario 2: We went down between B and C. Our data is compromised and
+ // we need to throw everything out.
+ if (header->GetDirtyFlag()) {
+ // Recompute the log's checksum to detect which scenario we're in.
+ ICING_ASSIGN_OR_RETURN(
+ Crc32 calculated_log_checksum,
+ ComputeChecksum(filesystem, file_path, Crc32(),
+ /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+
+ if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
+ // Still doesn't match, we're in Scenario 2. Throw out all our data now
+ // and initialize as a new instance.
+ ICING_ASSIGN_OR_RETURN(CreateResult create_result,
+ InitializeNewFile(filesystem, file_path, options));
+ create_result.data_loss = DataLoss::COMPLETE;
+ create_result.recalculated_checksum = true;
+ return create_result;
}
+ // Otherwise we're good, checksum matches our contents so continue
+ // initializing like normal.
+ recalculated_checksum = true;
- if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
+ // Update our header.
+ header->SetDirtyFlag(false);
+ header_changed = true;
+ }
+
+ if (header_changed) {
+ header->SetHeaderChecksum(header->CalculateHeaderChecksum());
+
+ if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
+ sizeof(Header))) {
return absl_ports::InternalError(
- absl_ports::StrCat("Error truncating file: ", file_path));
+ absl_ports::StrCat("Failed to update header to: ", file_path));
}
-
- ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
- << last_known_good;
}
CreateResult create_result = {
std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
std::move(header))),
- data_loss};
+ data_loss, recalculated_checksum};
return create_result;
}
@@ -963,7 +1007,18 @@
// We need to update the crc checksum if the erased area is before the
// rewind position.
- if (file_offset + sizeof(metadata) < header_->GetRewindOffset()) {
+ int32_t new_crc;
+ int64_t erased_proto_offset = file_offset + sizeof(metadata);
+ if (erased_proto_offset < header_->GetRewindOffset()) {
+ // Set to "dirty" before we start writing anything.
+ header_->SetDirtyFlag(true);
+ header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to update dirty bit of header to: ", file_path_));
+ }
+
// We need to calculate [original string xor 0s].
// The xored string is the same as the original string because 0 xor 0 =
// 0, 1 xor 0 = 1.
@@ -972,13 +1027,20 @@
Crc32 crc(header_->GetLogChecksum());
ICING_ASSIGN_OR_RETURN(
- uint32_t new_crc,
- crc.UpdateWithXor(xored_str,
- /*full_data_size=*/header_->GetRewindOffset() -
- kHeaderReservedBytes,
- /*position=*/file_offset + sizeof(metadata) -
- kHeaderReservedBytes));
+ new_crc, crc.UpdateWithXor(
+ xored_str,
+ /*full_data_size=*/header_->GetRewindOffset() -
+ kHeaderReservedBytes,
+ /*position=*/erased_proto_offset - kHeaderReservedBytes));
+ }
+ // Clear the region.
+ memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+
+ // If we cleared something in our checksummed area, we should update our
+ // checksum and reset our dirty bit.
+ if (erased_proto_offset < header_->GetRewindOffset()) {
+ header_->SetDirtyFlag(false);
header_->SetLogChecksum(new_crc);
header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -989,7 +1051,6 @@
}
}
- memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
return libtextclassifier3::Status::OK;
}
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
index dfb67aa..69b8a1a 100644
--- a/icing/file/portable-file-backed-proto-log_test.cc
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -42,6 +42,20 @@
using ::testing::Pair;
using ::testing::Return;
+using Header = PortableFileBackedProtoLog<DocumentProto>::Header;
+
+// Test helper: reads sizeof(Header) bytes from offset 0 of file_path into a
+// Header and returns it. A failed read is reported as a (non-fatal) test
+// failure rather than silently returning a Header that was never filled in.
+Header ReadHeader(Filesystem filesystem, const std::string& file_path) {
+  Header header;
+  EXPECT_TRUE(filesystem.PRead(file_path.c_str(), &header, sizeof(Header),
+                               /*offset=*/0));
+  return header;
+}
+
+// Test helper: writes the raw bytes of header to file_path via
+// Filesystem::Write. A failed write is reported as a (non-fatal) test failure
+// rather than being ignored.
+void WriteHeader(Filesystem filesystem, const std::string& file_path,
+                 Header& header) {
+  EXPECT_TRUE(filesystem.Write(file_path.c_str(), &header, sizeof(Header)));
+}
+
class PortableFileBackedProtoLogTest : public ::testing::Test {
protected:
// Adds a user-defined default construct because a const member variable may
@@ -79,6 +93,7 @@
max_proto_size_)));
EXPECT_THAT(create_result.proto_log, NotNull());
EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
// Can't recreate the same file with different options.
ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
@@ -300,12 +315,12 @@
EXPECT_FALSE(create_result.has_data_loss());
}
- int corrupt_value = 24;
+ int corrupt_checksum = 24;
- // Offset after the kMagic and the header_checksum.
- int offset_after_checksum = 8;
- filesystem_.PWrite(file_path_.c_str(), offset_after_checksum, &corrupt_value,
- sizeof(corrupt_value));
+ // Write the corrupted header
+ Header header = ReadHeader(filesystem_, file_path_);
+ header.SetHeaderChecksum(corrupt_checksum);
+ WriteHeader(filesystem_, file_path_, header);
{
// Reinitialize the same proto_log
@@ -331,8 +346,12 @@
// Corrupt the magic that's stored at the beginning of the header.
int invalid_magic = -1;
- filesystem_.PWrite(file_path_.c_str(), /*offset=*/0, &invalid_magic,
- sizeof(invalid_magic));
+ ASSERT_THAT(invalid_magic, Not(Eq(Header::kMagic)));
+
+ // Write the corrupted header
+ Header header = ReadHeader(filesystem_, file_path_);
+ header.SetMagic(invalid_magic);
+ WriteHeader(filesystem_, file_path_, header);
}
{
@@ -346,7 +365,17 @@
}
}
-TEST_F(PortableFileBackedProtoLogTest, CorruptContent) {
+TEST_F(PortableFileBackedProtoLogTest,
+ UnableToDetectCorruptContentWithoutDirtyBit) {
+ // This is intentional that we can't detect corruption. We're trading off
+ // earlier corruption detection for lower initialization latency. By not
+ // calculating the checksum on initialization, we can initialize much faster,
+ // but at the cost of detecting corruption. Note that even if we did detect
+ // corruption, there was nothing we could've done except throw an error to
+ // clients. We'll still do that, but at some later point when the log is
+ // attempting to be accessed and we can't actually deserialize a proto from
+ // it. See the description in cl/374278280 for more details.
+
{
ICING_ASSERT_OK_AND_ASSIGN(
PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
@@ -361,19 +390,20 @@
DocumentBuilder().SetKey("namespace1", "uri1").Build();
// Write and persist an document.
- ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
proto_log->WriteProto(document));
ICING_ASSERT_OK(proto_log->PersistToDisk());
// "Corrupt" the content written in the log.
document.set_uri("invalid");
std::string serialized_document = document.SerializeAsString();
- filesystem_.PWrite(file_path_.c_str(), document_offset,
- serialized_document.data(), serialized_document.size());
+ ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), document_offset,
+ serialized_document.data(),
+ serialized_document.size()));
}
{
- // We can recover, but we have data loss.
+ // We can recover, and we don't have data loss.
ICING_ASSERT_OK_AND_ASSIGN(
PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
PortableFileBackedProtoLog<DocumentProto>::Create(
@@ -381,17 +411,147 @@
PortableFileBackedProtoLog<DocumentProto>::Options(
compress_, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_TRUE(create_result.has_data_loss());
- ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
+ EXPECT_FALSE(create_result.recalculated_checksum);
- // Lost everything in the log since the rewind position doesn't help if
- // there's been data corruption within the persisted region
- ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
- kHeaderReservedBytes);
+ // We still have the corrupted content in our file, we didn't throw
+ // everything out.
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Gt(kHeaderReservedBytes));
}
}
-TEST_F(PortableFileBackedProtoLogTest, PersistToDisk) {
+TEST_F(PortableFileBackedProtoLogTest,
+ DetectAndThrowOutCorruptContentWithDirtyBit) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .AddStringProperty("string_property", "foo", "bar")
+ .Build();
+
+ // Write the proto; it is persisted implicitly when proto_log is destroyed.
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+ }
+
+ {
+ // "Corrupt" the content written in the log. Make the corrupt document
+ // smaller than our original one so we don't accidentally write past our
+ // file.
+ DocumentProto document =
+ DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build();
+ std::string serialized_document = document.SerializeAsString();
+ ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), kHeaderReservedBytes,
+ serialized_document.data(),
+ serialized_document.size()));
+
+ Header header = ReadHeader(filesystem_, file_path_);
+
+ // Set dirty bit to true to reflect that something changed in the log.
+ header.SetDirtyFlag(true);
+ header.SetHeaderChecksum(header.CalculateHeaderChecksum());
+
+ WriteHeader(filesystem_, file_path_, header);
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_TRUE(create_result.has_data_loss());
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+
+ // We had to recalculate the checksum to detect the corruption.
+ EXPECT_TRUE(create_result.recalculated_checksum);
+
+ // We lost everything, file size is back down to the header.
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(kHeaderReservedBytes));
+
+ // At least the log is no longer dirty.
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DirtyBitFalseAlarmKeepsData) {
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+ int64_t document_offset;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the proto; it is persisted implicitly when proto_log is destroyed.
+ ICING_ASSERT_OK_AND_ASSIGN(document_offset,
+ proto_log->WriteProto(document));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+ }
+
+ {
+ Header header = ReadHeader(filesystem_, file_path_);
+
+ // Simulate the dirty flag set as true, but no data has been changed yet.
+ // Maybe we crashed between writing the dirty flag and erasing a proto.
+ header.SetDirtyFlag(true);
+ header.SetHeaderChecksum(header.CalculateHeaderChecksum());
+
+ WriteHeader(filesystem_, file_path_, header);
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+
+ // Even though nothing changed, the false alarm dirty bit should have
+ // triggered us to recalculate our checksum.
+ EXPECT_TRUE(create_result.recalculated_checksum);
+
+ // Check that our document still exists even though dirty bit was true.
+ EXPECT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ PersistToDiskKeepsPersistedDataAndTruncatesExtraData) {
DocumentProto document1 =
DocumentBuilder().SetKey("namespace1", "uri1").Build();
DocumentProto document2 =
@@ -426,6 +586,8 @@
log_size = filesystem_.GetFileSize(file_path_.c_str());
ASSERT_GT(log_size, 0);
+
+ // PersistToDisk happens implicitly during the destructor.
}
{
@@ -453,6 +615,7 @@
auto proto_log = std::move(create_result.proto_log);
ASSERT_TRUE(create_result.has_data_loss());
ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
+ ASSERT_FALSE(create_result.recalculated_checksum);
// Check that everything was persisted across instances
ASSERT_THAT(proto_log->ReadProto(document1_offset),
@@ -465,6 +628,183 @@
}
}
+TEST_F(PortableFileBackedProtoLogTest,
+ DirtyBitIsFalseAfterPutAndPersistToDisk) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ // Write and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ DirtyBitIsFalseAfterDeleteAndPersistToDisk) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ // Write, delete, and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->EraseProto(document_offset));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // The proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DirtyBitIsFalseAfterPutAndDestructor) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ // Write the first proto without explicitly persisting it
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+
+ // PersistToDisk is implicitly called as part of the destructor and
+ // PersistToDisk will clear the dirty bit.
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ DirtyBitIsFalseAfterDeleteAndDestructor) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ // Write and delete the first proto without explicitly persisting it
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->EraseProto(document_offset));
+
+ // The proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // PersistToDisk is implicitly called as part of the destructor and
+ // PersistToDisk will clear the dirty bit.
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
TEST_F(PortableFileBackedProtoLogTest, Iterator) {
DocumentProto document1 =
DocumentBuilder().SetKey("namespace", "uri1").Build();
@@ -508,7 +848,7 @@
{
// Iterator with bad filesystem
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
+ ON_CALL(mock_filesystem, GetFileSize(A<const char*>()))
.WillByDefault(Return(Filesystem::kBadFileSize));
PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
mock_filesystem, file_path_, /*initial_offset=*/0);
diff --git a/icing/result/snippet-retriever-test-jni-layer.cc b/icing/result/snippet-retriever-test-jni-layer.cc
new file mode 100644
index 0000000..707d9ee
--- /dev/null
+++ b/icing/result/snippet-retriever-test-jni-layer.cc
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_SnippetRetrieverJniTest_testsMain(JNIEnv* env, jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index dc9f8be..2a138ec 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -157,61 +157,58 @@
}
// Finds the start position of a valid token that is after
-// window_start_min_exclusive
+// window_start_min_exclusive_utf32
//
// Returns:
// the position of the window start if successful
// INTERNAL_ERROR - if a tokenizer error is encountered
-libtextclassifier3::StatusOr<int> DetermineWindowStart(
+libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
- std::string_view value, int window_start_min_exclusive,
+ std::string_view value, int window_start_min_exclusive_utf32,
Tokenizer::Iterator* iterator) {
- if (!iterator->ResetToTokenAfter(window_start_min_exclusive)) {
+ if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
- return iterator->GetToken().text.data() - value.data();
+ return iterator->CalculateTokenStart();
}
// Increments window_end_exclusive so long as the character at the position
// of window_end_exclusive is punctuation and does not exceed
-// window_end_max_exclusive.
-int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive,
- int window_end_max_exclusive) {
- while (window_end_exclusive < window_end_max_exclusive) {
+// window_end_max_exclusive_utf32.
+CharacterIterator IncludeTrailingPunctuation(
+ std::string_view value, CharacterIterator window_end_exclusive,
+ int window_end_max_exclusive_utf32) {
+ while (window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
int char_len = 0;
- if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) {
- break;
- }
- if (window_end_exclusive + char_len > window_end_max_exclusive) {
- // This is punctuation, but it goes beyond the window end max. Don't
- // include it.
+ if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
+ &char_len)) {
break;
}
// Expand window by char_len and check the next character.
- window_end_exclusive += char_len;
+ window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1);
}
return window_end_exclusive;
}
// Finds the end position of a valid token that is before the
-// window_end_max_exclusive.
+// window_end_max_exclusive_utf32.
//
// Returns:
// the position of the window end if successful
// INTERNAL_ERROR - if a tokenizer error is encountered
-libtextclassifier3::StatusOr<int> DetermineWindowEnd(
+libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
- std::string_view value, int window_end_max_exclusive,
+ std::string_view value, int window_end_max_exclusive_utf32,
Tokenizer::Iterator* iterator) {
- if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) {
+ if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
- int window_end_exclusive = iterator->GetToken().text.data() - value.data() +
- iterator->GetToken().text.length();
- return IncludeTrailingPunctuation(value, window_end_exclusive,
- window_end_max_exclusive);
+ ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive,
+ iterator->CalculateTokenEndExclusive());
+ return IncludeTrailingPunctuation(value, end_exclusive,
+ window_end_max_exclusive_utf32);
}
struct SectionData {
@@ -232,8 +229,10 @@
const SectionData& value, Tokenizer::Iterator* iterator,
const CharacterIterator& char_iterator) {
SnippetMatchProto snippet_match;
- Token match = iterator->GetToken();
- int match_pos = char_iterator.utf8_index();
+ ICING_ASSIGN_OR_RETURN(CharacterIterator start_itr,
+ iterator->CalculateTokenStart());
+ ICING_ASSIGN_OR_RETURN(CharacterIterator end_itr,
+ iterator->CalculateTokenEndExclusive());
// When finding boundaries, we have a few cases:
//
@@ -262,70 +261,65 @@
// window = |-----|
//
// We have do +1/-1 below to get the math to match up.
- int match_mid = match_pos + match.text.length() / 2;
- int window_start_min_exclusive =
- (match_mid - snippet_spec.max_window_bytes() / 2) - 1;
- int window_end_max_exclusive =
- match_mid + (snippet_spec.max_window_bytes() + 1) / 2;
+ int match_pos_utf32 = start_itr.utf32_index();
+ int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
+ int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
+ int window_start_min_exclusive_utf32 =
+ (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1;
+ int window_end_max_exclusive_utf32 =
+ match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2;
- snippet_match.set_exact_match_byte_position(match_pos);
- snippet_match.set_exact_match_utf16_position(char_iterator.utf16_index());
-
- // Create character iterators to find the beginning and end of the window.
- CharacterIterator forward_char_iterator(char_iterator);
- CharacterIterator backwards_char_iterator(char_iterator);
-
- if (!backwards_char_iterator.AdvanceToUtf8(match_pos + match.text.length())) {
- return absl_ports::AbortedError("Could not retrieve valid utf8 character!");
- }
- snippet_match.set_exact_match_byte_length(match.text.length());
- snippet_match.set_exact_match_utf16_length(
- backwards_char_iterator.utf16_index() - char_iterator.utf16_index());
+ snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
+ snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
+ snippet_match.set_exact_match_byte_length(end_itr.utf8_index() -
+ start_itr.utf8_index());
+ snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() -
+ start_itr.utf16_index());
// Only include windows if it'll at least include the matched text. Otherwise,
// it'll just be an empty string anyways.
- if (snippet_spec.max_window_bytes() >= match.text.length()) {
+ if (snippet_spec.max_window_bytes() >= match_len_utf32) {
// Find the beginning of the window.
- int window_start;
- int window_start_utf16;
- if (window_start_min_exclusive < 0) {
- window_start = 0;
- window_start_utf16 = 0;
- } else {
+ ICING_ASSIGN_OR_RETURN(
+ CharacterIterator window_start,
+ DetermineWindowStart(snippet_spec, value.section_subcontent,
+ window_start_min_exclusive_utf32, iterator));
+
+ // Check. Did we get fewer characters than we requested? If so, then add it
+ // on to the window_end.
+ int extra_window_space =
+ window_start.utf32_index() - 1 - window_start_min_exclusive_utf32;
+ window_end_max_exclusive_utf32 += extra_window_space;
+
+ // Find the end of the window.
+ ICING_ASSIGN_OR_RETURN(
+ CharacterIterator window_end,
+ DetermineWindowEnd(snippet_spec, value.section_subcontent,
+ window_end_max_exclusive_utf32, iterator));
+
+ // Check one more time. Did we get fewer characters than we requested? If
+ // so, then see if we can push the start back again.
+ extra_window_space =
+ window_end_max_exclusive_utf32 - window_end.utf32_index();
+ if (extra_window_space > 0) {
+ window_start_min_exclusive_utf32 =
+ window_start.utf32_index() - 1 - extra_window_space;
ICING_ASSIGN_OR_RETURN(
window_start,
DetermineWindowStart(snippet_spec, value.section_subcontent,
- window_start_min_exclusive, iterator));
- if (!forward_char_iterator.RewindToUtf8(window_start)) {
- return absl_ports::AbortedError(
- "Could not retrieve valid utf8 character!");
- }
- window_start_utf16 = forward_char_iterator.utf16_index();
+ window_start_min_exclusive_utf32, iterator));
}
- snippet_match.set_window_byte_position(window_start);
- snippet_match.set_window_utf16_position(window_start_utf16);
- // Find the end of the window.
- int window_end_exclusive;
- if (window_end_max_exclusive >= value.section_subcontent.length()) {
- window_end_exclusive = value.section_subcontent.length();
- } else {
- ICING_ASSIGN_OR_RETURN(
- window_end_exclusive,
- DetermineWindowEnd(snippet_spec, value.section_subcontent,
- window_end_max_exclusive, iterator));
- }
- if (!backwards_char_iterator.AdvanceToUtf8(window_end_exclusive)) {
- return absl_ports::AbortedError(
- "Could not retrieve valid utf8 character!");
- }
- snippet_match.set_window_byte_length(window_end_exclusive - window_start);
- snippet_match.set_window_utf16_length(
- backwards_char_iterator.utf16_index() - window_start_utf16);
+ snippet_match.set_window_byte_position(window_start.utf8_index());
+ snippet_match.set_window_utf16_position(window_start.utf16_index());
+ snippet_match.set_window_byte_length(window_end.utf8_index() -
+ window_start.utf8_index());
+ snippet_match.set_window_utf16_length(window_end.utf16_index() -
+ window_start.utf16_index());
// DetermineWindowStart/End may change the position of the iterator. So,
// reset the iterator back to the original position.
- bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1)
+ bool success = (match_pos_utf32 > 0) ? iterator->ResetToTokenAfter(match_pos_utf32 - 1)
: iterator->ResetToStart();
if (!success) {
return absl_ports::InternalError(
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index c052a9e..e7988ae 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -37,6 +37,7 @@
#include "icing/store/key-mapper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -88,7 +89,9 @@
GetTestFilePath("icing/icu.dat")));
}
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ jni_cache_ = GetTestJniCache();
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
language_segmenter_,
language_segmenter_factory::Create(std::move(options)));
@@ -140,6 +143,7 @@
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<SnippetRetriever> snippet_retriever_;
std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<const JniCache> jni_cache_;
ResultSpecProto::SnippetSpecProto snippet_spec_;
std::string test_dir_;
};
@@ -248,9 +252,15 @@
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window starts at the space between "one" and "two". Window ends in the
- // middle of "four".
- // len=14, orig_window=" two three fou"
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (2,17).
+ // 2. trimmed, no-shifting window [4,13) "two three"
+ // 3. trimmed, shifted window [4,18) "two three four"
snippet_spec_.set_max_window_bytes(14);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -260,7 +270,7 @@
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
- ElementsAre("two three"));
+ ElementsAre("two three four"));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
@@ -275,8 +285,15 @@
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window starts in the middle of "one" and ends at the end of "four".
- // len=16, orig_window="e two three four"
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (1,18).
+ // 2. trimmed, no-shifting window [4,18) "two three four"
+ // 3. trimmed, shifted window [4,20) "two three four.."
snippet_spec_.set_max_window_bytes(16);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -286,7 +303,7 @@
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
- ElementsAre("two three four"));
+ ElementsAre("two three four.."));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
@@ -316,7 +333,7 @@
}
TEST_F(SnippetRetrieverTest,
- SnippetingWindowMaxWindowEndsInMiddleOfMultiBytePunctuation) {
+ SnippetingWindowMaxWindowEndsMultiBytePunctuation) {
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -330,7 +347,7 @@
SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
// Window ends in the middle of all the punctuation and window starts at 0.
- // len=26, orig_window="pside down in Australia\xC2"
+ // len=24, orig_window="pside down in Australia¿"
snippet_spec_.set_max_window_bytes(24);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -340,11 +357,11 @@
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
- ElementsAre("down in Australia"));
+ ElementsAre("down in Australia¿"));
}
TEST_F(SnippetRetrieverTest,
- SnippetingWindowMaxWindowEndsInMultiBytePunctuation) {
+ SnippetingWindowMaxWindowBeyondMultiBytePunctuation) {
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -358,7 +375,7 @@
SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
// Window ends in the middle of all the punctuation and window starts at 0.
- // len=26, orig_window="upside down in Australia\xC2\xBF"
+ // len=26, orig_window="upside down in Australia¿ "
snippet_spec_.set_max_window_bytes(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -383,8 +400,15 @@
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window starts before 0.
- // len=22, orig_window="one two three four..."
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-2,21).
+ // 2. trimmed, no-shifting window [0,21) "one two three four..."
+ // 3. trimmed, shifted window [0,22) "one two three four...."
snippet_spec_.set_max_window_bytes(22);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -394,7 +418,7 @@
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
- ElementsAre("one two three four..."));
+ ElementsAre("one two three four...."));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
@@ -435,8 +459,15 @@
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window ends in the middle of "five"
- // len=32, orig_window="one two three four.... fiv"
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-7,26).
+ // 2. trimmed, no-shifting window [0,22) "one two three four...."
+ // 3. trimmed, shifted window [0,27) "one two three four.... five"
snippet_spec_.set_max_window_bytes(32);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -446,7 +477,7 @@
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
- ElementsAre("one two three four...."));
+ ElementsAre("one two three four.... five"));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
@@ -501,6 +532,142 @@
ElementsAre("one two three four.... five"));
}
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five six")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};
+
+ // String: "one two three four.... five six"
+ // ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 28 31
+ // UTF-32 idx: 0 4 8 14 23 28 31
+ //
+ // Window size will go past the start of the window.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-10,19).
+ // 2. trimmed, no-shifting window [0,19) "one two three four."
+ // 3. trimmed, shifted window [0,27) "one two three four.... five"
+ snippet_spec_.set_max_window_bytes(28);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four.... five"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five six")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"five"}}};
+
+ // String: "one two three four.... five six"
+ // ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 28 31
+ // UTF-32 idx: 0 4 8 14 23 28 31
+ //
+ // Window size will go past the end of the window.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (10,39).
+ // 2. trimmed, no-shifting window [14,31) "four.... five six"
+ // 3. trimmed, shifted window [4,31) "two three four.... five six"
+ snippet_spec_.set_max_window_bytes(28);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("two three four.... five six"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four....")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};
+
+ // String: "one two three four...."
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 22
+ // UTF-32 idx: 0 4 8 14 22
+ //
+ // Window size will go past the start of the window.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-10,19).
+ // 2. trimmed, no-shifting window [0, 19) "one two three four."
+ // 3. trimmed, shifted window [0, 22) "one two three four...."
+ snippet_spec_.set_max_window_bytes(28);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four...."));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four....")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};
+
+ // String: "one two three four...."
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 22
+ // UTF-32 idx: 0 4 8 14 22
+ //
+ // Window size will go past the end of the window.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (1,30).
+ // 2. trimmed, no-shifting window [4, 22) "two three four...."
+ // 3. trimmed, shifted window [0, 22) "one two three four...."
+ snippet_spec_.set_max_window_bytes(28);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four...."));
+}
+
TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
DocumentProto document =
DocumentBuilder()
@@ -578,6 +745,15 @@
"Concerning the subject of foo, we need to begin "
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
@@ -588,10 +764,19 @@
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
+ // The first window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
+ // The second window will be:
+ // 1. untrimmed, no-shifting window will be (54,91).
+ // 2. trimmed, no-shifting window [60, 91) "our... bar.".
+ // 3. trimmed, shifted window [31, 91) "we... bar."
EXPECT_THAT(
GetWindows(content, snippet.entries(0)),
- ElementsAre("Concerning the subject of foo, we need to begin considering",
- "our options regarding body bar."));
+ ElementsAre(
+ "Concerning the subject of foo, we need to begin considering our",
+ "we need to begin considering our options regarding body bar."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("foo", "bar"));
@@ -612,6 +797,16 @@
"Concerning the subject of foo, we need to begin "
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
+ //
// Section 1 "subject" is not in the section_mask, so no snippet information
// from that section should be returned by the SnippetRetriever.
SectionIdMask section_mask = 0b00000001;
@@ -624,10 +819,19 @@
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
+ // The first window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
+ // The second window will be:
+ // 1. untrimmed, no-shifting window will be (54,91).
+ // 2. trimmed, no-shifting window [60, 91) "our... bar.".
+ // 3. trimmed, shifted window [31, 91) "we... bar."
EXPECT_THAT(
GetWindows(content, snippet.entries(0)),
- ElementsAre("Concerning the subject of foo, we need to begin considering",
- "our options regarding body bar."));
+ ElementsAre(
+ "Concerning the subject of foo, we need to begin considering our",
+ "we need to begin considering our options regarding body bar."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("foo", "bar"));
}
@@ -642,6 +846,15 @@
"Concerning the subject of foo, we need to begin "
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
SectionIdMask section_mask = 0b00000011;
// "subject" should match in both sections, but "foo" is restricted to "body"
// so it should only match in the 'body' section and not the 'subject'
@@ -656,11 +869,19 @@
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
+ // The first window will be:
+ // 1. untrimmed, no-shifting window will be (-15,50).
+ // 2. trimmed, no-shifting window [0, 47) "Concerning... begin".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
+ // The second window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
EXPECT_THAT(
GetWindows(content, snippet.entries(0)),
ElementsAre(
- "Concerning the subject of foo, we need to begin",
- "Concerning the subject of foo, we need to begin considering"));
+ "Concerning the subject of foo, we need to begin considering our",
+ "Concerning the subject of foo, we need to begin considering our"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("subject", "foo"));
@@ -682,6 +903,15 @@
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
snippet_spec_.set_num_matches_per_property(1);
SectionIdMask section_mask = 0b00000011;
@@ -694,10 +924,14 @@
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
std::string_view content =
GetString(&document, snippet.entries(0).property_name());
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
EXPECT_THAT(
GetWindows(content, snippet.entries(0)),
ElementsAre(
- "Concerning the subject of foo, we need to begin considering"));
+ "Concerning the subject of foo, we need to begin considering our"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
@@ -1177,7 +1411,8 @@
}
TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
- language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE);
+ language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
language_segmenter_,
language_segmenter_factory::Create(std::move(options)));
@@ -1190,6 +1425,7 @@
// ^ ^ ^ ^^
// UTF8 idx: 0 3 9 15 18
// UTF16 idx: 0 1 3 5 6
+ // UTF32 idx: 0 1 3 5 6
// Breaks into segments: "我", "每天", "走路", "去", "上班"
constexpr std::string_view kChinese = "我每天走路去上班。";
DocumentProto document =
@@ -1205,12 +1441,11 @@
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
- // Set a twenty byte window. This will produce a window like this:
- // String: "我每天走路去上班。"
- // ^ ^
- // UTF8 idx: 3 18
- // UTF16 idx: 1 6
- snippet_spec_.set_max_window_bytes(20);
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (0,7).
+ // 2. trimmed, no-shifting window [1, 6) "每天走路去".
+ // 3. trimmed, shifted window [0, 6) "我每天走路去"
+ snippet_spec_.set_max_window_bytes(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1227,11 +1462,11 @@
const SnippetMatchProto& match_proto = entry->snippet_matches(0);
// Ensure that the match is correct.
- EXPECT_THAT(GetWindows(content, *entry), ElementsAre("每天走路去"));
+ EXPECT_THAT(GetWindows(content, *entry), ElementsAre("我每天走路去"));
// Ensure that the utf-16 values are also as expected
- EXPECT_THAT(match_proto.window_utf16_position(), Eq(1));
- EXPECT_THAT(match_proto.window_utf16_length(), Eq(5));
+ EXPECT_THAT(match_proto.window_utf16_position(), Eq(0));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(6));
}
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
@@ -1285,6 +1520,7 @@
// ^ ^ ^
// UTF8 idx: 0 9 18
// UTF16 idx: 0 5 10
+ // UTF32 idx: 0 3 6
// Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
DocumentProto document =
@@ -1300,12 +1536,13 @@
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
- // Set a twenty byte window. This will produce a window like this:
+ // Set a six character window. This will produce a window like this:
// String: "𐀀𐀁 𐀂𐀃 𐀄"
// ^ ^
// UTF8 idx: 9 22
// UTF16 idx: 5 12
- snippet_spec_.set_max_window_bytes(20);
+ // UTF32 idx: 3 7
+ snippet_spec_.set_max_window_bytes(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 5f478fa..4e63b90 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -1119,6 +1119,11 @@
libtextclassifier3::StatusOr<DocumentFilterData>
DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
+ if (!DoesDocumentExist(document_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Can't get filter data, document id '%d' doesn't exist", document_id));
+ }
+
auto filter_data_or = filter_cache_->GetCopy(document_id);
if (!filter_data_or.ok()) {
ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
@@ -1127,10 +1132,6 @@
}
DocumentFilterData document_filter_data =
std::move(filter_data_or).ValueOrDie();
- if (document_filter_data.namespace_id() == kInvalidNamespaceId) {
- // An invalid namespace id means that the filter data has been deleted.
- return absl_ports::NotFoundError("Document filter data not found.");
- }
return document_filter_data;
}
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 9e1b3ec..b0cd1ce 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -231,6 +231,7 @@
//
// Returns:
// OK on success
+ // NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
// INTERNAL_ERROR on IO error
// INVALID_ARGUMENT if document_id is invalid.
libtextclassifier3::Status Delete(DocumentId document_id);
@@ -278,16 +279,11 @@
// Returns the DocumentFilterData of the document specified by the DocumentId.
//
- // NOTE: This does not check if the document exists and will return the
- // DocumentFilterData of the document even if it has been deleted. Users
- // should check DoesDocumentExist(document_id) if they only want existing
- // documents' DocumentFilterData.
- //
// Returns:
// DocumentFilterData on success
// OUT_OF_RANGE if document_id is negative or exceeds previously seen
// DocumentIds
- // NOT_FOUND if no filter data is found
+ // NOT_FOUND if the document or the filter data is not found
libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
DocumentId document_id) const;
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index b37c6de..ad3b7c4 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -1595,7 +1595,7 @@
/*length_in_tokens=*/7)));
}
-TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataDifferentCorpus) {
+TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) {
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1651,6 +1651,18 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
+TEST_F(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(doc_store->GetDocumentFilterData(/*document_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
TEST_F(DocumentStoreTest, DeleteClearsFilterCache) {
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
@@ -3099,36 +3111,39 @@
#define DISABLE_BACKWARDS_COMPAT_TEST
#ifndef DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
- // The directory testdata/v0/document_store contains only the scoring_cache
- // and the document_store_header (holding the crc for the scoring_cache). If
- // the current code is compatible with the format of the v0 scoring_cache,
- // then an empty document store should be initialized, but the non-empty
- // scoring_cache should be retained.
- // The current document-asscoiated-score-data has a new field with respect to
- // the ones stored in testdata/v0, hence the document store's initialization
- // requires regenerating its derived files.
+ // The directory testdata/score_cache_without_length_in_tokens/document_store
+ // contains only the scoring_cache and the document_store_header (holding the
+ // crc for the scoring_cache). If the current code is compatible with the
+ // format of the v0 scoring_cache, then an empty document store should be
+ // initialized, but the non-empty scoring_cache should be retained. The
+ // current document-associated-score-data has a new field with respect to the
+ // ones stored in testdata/score_cache_without_length_in_tokens, hence the
+ // document store's initialization requires regenerating its derived files.
// Create dst directory
ASSERT_THAT(filesystem_.CreateDirectory(document_store_dir_.c_str()), true);
// Get src files
- std::string document_store_v0;
+ std::string document_store_without_length_in_tokens;
if (IsAndroidPlatform() || IsIosPlatform()) {
- document_store_v0 = GetTestFilePath(
- "icing/testdata/v0/document_store_android_ios_compatible");
+ document_store_without_length_in_tokens = GetTestFilePath(
+ "icing/testdata/score_cache_without_length_in_tokens/"
+ "document_store_android_ios_compatible");
} else {
- document_store_v0 =
- GetTestFilePath("icing/testdata/v0/document_store");
+ document_store_without_length_in_tokens = GetTestFilePath(
+ "icing/testdata/score_cache_without_length_in_tokens/"
+ "document_store");
}
std::vector<std::string> document_store_files;
Filesystem filesystem;
- filesystem.ListDirectory(document_store_v0.c_str(), &document_store_files);
+ filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(),
+ &document_store_files);
- VLOG(1) << "Copying files " << document_store_v0 << ' '
- << document_store_files.size();
+ ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens
+ << ' ' << document_store_files.size();
for (size_t i = 0; i != document_store_files.size(); i++) {
- std::string src =
- absl_ports::StrCat(document_store_v0, "/", document_store_files[i]);
+ std::string src = absl_ports::StrCat(
+ document_store_without_length_in_tokens, "/", document_store_files[i]);
std::string dst =
absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]);
ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true);
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index 74d22cd..cb31441 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -25,6 +25,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
#include "unicode/ubrk.h"
@@ -101,59 +102,149 @@
return text_.substr(term_start_index_, term_length);
}
- libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
+ override {
+ if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ return offset_iterator_;
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
+ override {
+ if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ return offset_iterator_;
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Illegal offset provided! Offset %d is not within bounds of string "
- "of length %zu",
- offset, text_.length()));
+ if (offset < 0) {
+ // Very simple. The first term start after a negative offset is the first
+ // term. So just reset to start and Advance.
+ return ResetToStartUtf32();
}
- term_start_index_ = ubrk_following(break_iterator_, offset);
- if (term_start_index_ == UBRK_DONE) {
- MarkAsDone();
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No segments begin after provided offset %d.", offset));
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
- if (term_end_index_exclusive_ == UBRK_DONE) {
- MarkAsDone();
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No segments begin after provided offset %d.", offset));
- }
- if (!IsValidSegment()) {
- if (!Advance()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No segments begin after provided offset %d.", offset));
+
+ // 1. Find the unicode character at UTF-32 index offset.
+ if (!offset_iterator_.MoveToUtf32(offset)) {
+ // MoveToUtf32 returned false. Determine whether we simply hit the end.
+ if (offset_iterator_.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
}
}
- return term_start_index_;
+ if (offset_iterator_.utf8_index() == text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
+ "bounds of string of length %zu",
+ offset_iterator_.utf32_index(), offset_iterator_.utf8_index(),
+ text_.length()));
+ }
+
+ // 2. We've got the unicode character at UTF-32 index offset. Now, we need
+ // to point to the segment that starts after this character.
+ int following_utf8_index =
+ ubrk_following(break_iterator_, offset_iterator_.utf8_index());
+ if (following_utf8_index == UBRK_DONE) {
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ term_end_index_exclusive_ = following_utf8_index;
+
+ // 3. term_end_index_exclusive_ points to the start of the term that we
+ // want to return. We need to Advance so that term_start_index_ will now
+ // point to this term.
+ if (!Advance()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ return offset_iterator_.utf32_index();
}
- libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
+ if (offset < 0) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Illegal offset provided! Offset %d is not within bounds of string "
"of length %zu",
offset, text_.length()));
}
- ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset));
- if (term_end_index_exclusive_ > offset) {
- // This term ends after offset. So we need to get the term just before
- // this one.
- ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(term_start_index_));
+
+ if (!offset_iterator_.MoveToUtf32(offset)) {
+ // MoveToUtf32 returned false. Determine whether we simply hit the end.
+ if (offset_iterator_.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ // If it returned false because we hit the end. Then that's fine. We'll
+ // just treat it as if the request was for the end.
}
- return term_start_index_;
+
+ // 2. We've got the unicode character at UTF-32 index offset (or the end
+ // of the text). Now, we need to point to the segment that ends before it.
+ int starting_utf8_index =
+ ubrk_preceding(break_iterator_, offset_iterator_.utf8_index());
+ if (starting_utf8_index == UBRK_DONE) {
+ // Rewind the end indices.
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments end before provided offset %d.", offset));
+ }
+ term_start_index_ = starting_utf8_index;
+
+ // 3. We've correctly set the start index and the iterator currently points
+ // to that position. Now we need to find the correct end position and
+ // advance the iterator to that position.
+ int ending_utf8_index = ubrk_next(break_iterator_);
+ if (ending_utf8_index == UBRK_DONE) {
+ // This shouldn't ever happen.
+ MarkAsDone();
+ return absl_ports::AbortedError(IcingStringUtil::StringPrintf(
+ "No segments end before provided offset %d.", offset));
+ }
+ term_end_index_exclusive_ = ending_utf8_index;
+
+ // 4. The start and end indices point to a segment, but we need to ensure
+ // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
+ // need a segment prior to this one.
+ CharacterIterator term_start_iterator = offset_iterator_;
+ if (!term_start_iterator.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ if (term_end_index_exclusive_ > offset_iterator_.utf8_index() ||
+ !IsValidSegment()) {
+ return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
+ }
+ return term_start_iterator.utf32_index();
}
- libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
term_start_index_ = 0;
term_end_index_exclusive_ = 0;
if (!Advance()) {
- return absl_ports::NotFoundError("");
+ return absl_ports::NotFoundError(
+ "Unable to find any valid terms in text.");
}
- return term_start_index_;
+ if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ return offset_iterator_.utf32_index();
}
private:
@@ -163,6 +254,7 @@
text_(text),
locale_(locale),
u_text_(UTEXT_INITIALIZER),
+ offset_iterator_(text),
term_start_index_(0),
term_end_index_exclusive_(0) {}
@@ -232,6 +324,15 @@
// utext_close() must be called after using.
UText u_text_;
+ // Offset iterator. This iterator is not guaranteed to point to any particular
+ // character, but is guaranteed to point to a valid UTF character sequence.
+ //
+ // This iterator is used to save some amount of linear traversal when seeking
+ // to a specific UTF-32 offset. Each function that uses it could just create
+ // a CharacterIterator starting at the beginning of the text and traverse
+ // forward from there.
+ CharacterIterator offset_iterator_;
+
// The start and end indices are used to track the positions of current
// term.
int term_start_index_;
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index c0d6d43..01eb7d8 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -12,24 +12,39 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <memory>
+#include <string_view>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "unicode/uloc.h"
namespace icing {
namespace lib {
-namespace {
+
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
+namespace {
+
+language_segmenter_factory::SegmenterOptions GetSegmenterOptions(
+ const std::string& locale, const JniCache* jni_cache) {
+ return language_segmenter_factory::SegmenterOptions(locale, jni_cache);
+}
+
// Returns a vector containing all terms retrieved by Advancing on the iterator.
std::vector<std::string_view> GetAllTermsAdvance(
LanguageSegmenter::Iterator* itr) {
@@ -40,70 +55,61 @@
return terms;
}
-// Returns a vector containing all terms retrieved by calling
-// ResetToStart/ResetAfter with the current position to simulate Advancing on
-// the iterator.
-std::vector<std::string_view> GetAllTermsResetAfter(
+// Returns a vector containing all terms retrieved by calling ResetAfter with
+// the UTF-32 position of the current term start to simulate Advancing on the
+// iterator.
+std::vector<std::string_view> GetAllTermsResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
std::vector<std::string_view> terms;
- if (!itr->ResetToStart().ok()) {
- return terms;
- }
- terms.push_back(itr->GetTerm());
- const char* text_begin = itr->GetTerm().data();
- // Calling ResetToTermStartingAfter with the current position should get the
- // very next term in the sequence.
- for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
+ // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in
+ // the sequence.
+ bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
// Returns a vector containing all terms retrieved by alternating calls to
-// Advance and calls to ResetAfter with the current position to simulate
-// Advancing.
-std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+// Advance and calls to ResetAfter with the UTF-32 position of the current term
+// start to simulate Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
std::vector<std::string_view> terms;
-
- bool is_ok = true;
- int current_pos = 0;
+ bool is_ok = itr->Advance();
while (is_ok) {
+ terms.push_back(itr->GetTerm());
// Alternate between using Advance and ResetToTermAfter.
if (terms.size() % 2 == 0) {
is_ok = itr->Advance();
} else {
- // Calling ResetToTermStartingAfter with the current position should get
- // the very next term in the sequence.
- current_pos = itr->GetTerm().data() - text_begin;
- is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
- }
- if (is_ok) {
- terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
}
return terms;
}
// Returns a vector containing all terms retrieved by calling ResetBefore with
-// the current position, starting at the end of the text. This vector should be
-// in reverse order of GetAllTerms and missing the last term.
-std::vector<std::string_view> GetAllTermsResetBefore(
+// the UTF-32 position of the current term start, starting at the end of the
+// text. This vector should be in reverse order of GetAllTerms and missing the
+// last term.
+std::vector<std::string_view> GetAllTermsResetBeforeUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
- int last_pos = 0;
- while (itr->Advance()) {
- last_pos = itr->GetTerm().data() - text_begin;
- }
std::vector<std::string_view> terms;
- // Calling ResetToTermEndingBefore with the current position should get the
- // previous term in the sequence.
- for (int current_pos = last_pos;
- itr->ResetToTermEndingBefore(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
+ bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermEndingBeforeUtf32 with the current position should get
+ // the previous term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
@@ -119,27 +125,34 @@
}
static std::string GetLocale() { return GetParam(); }
- static language_segmenter_factory::SegmenterOptions GetOptions() {
- return language_segmenter_factory::SegmenterOptions(GetLocale());
- }
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
};
+} // namespace
+
TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
IsOkAndHolds(ElementsAre("Hello", " ", "World")));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// ASCII punctuation marks are kept
EXPECT_THAT(
language_segmenter->GetAllTerms("Hello, World!!!"),
@@ -153,8 +166,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// ASCII special characters are kept
EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
@@ -169,8 +184,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Full-width (non-ASCII) punctuation marks and special characters are left
// out.
EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
@@ -178,10 +195,12 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
- EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
- IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"),
+ IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank")));
EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
IsOkAndHolds(ElementsAre("I.B.M", ".")));
EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
@@ -191,8 +210,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// According to unicode word break rules
// WB6(https://unicode.org/reports/tr29/#WB6),
// WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
@@ -274,8 +295,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
@@ -295,8 +318,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
IsOkAndHolds(ElementsAre("(", "Hello", ")")));
@@ -306,8 +331,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
@@ -317,8 +344,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Alphanumeric terms are allowed
EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
@@ -326,8 +355,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Alphanumeric terms are allowed
EXPECT_THAT(
@@ -342,8 +373,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
const int kNumSeparators = 256;
std::string text_with_spaces =
@@ -367,8 +400,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
// have whitespaces as word delimiter.
@@ -389,15 +424,19 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Turkish
EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
@@ -408,8 +447,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
"吗", "お", "元気", "です", "か")));
@@ -420,8 +461,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Validates that the input strings are not copied
const std::string text = "Hello World";
const char* word1_address = text.c_str();
@@ -437,127 +480,141 @@
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartWordConnector) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "com:google:android is package";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "com:google:android is package"
- // ^ ^^ ^^
- // Bytes: 0 18 19 21 22
- auto position_or = itr->ResetToStart();
+ // String: "com:google:android is package"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 18 19 21 22
+ // UTF-32 idx: 0 18 19 21 22
+ auto position_or = itr->ResetToStartUtf32();
EXPECT_THAT(position_or, IsOk());
ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
- EXPECT_THAT(itr->GetTerm(), Eq("How"));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
- constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
-
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_TRUE(itr->Advance()); // itr points to 'How'
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- IteratorMultipleAdvancesResetToStart) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ IteratorOneAdvanceResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ IteratorMultipleAdvancesResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
ASSERT_TRUE(itr->Advance());
ASSERT_TRUE(itr->Advance());
ASSERT_TRUE(itr->Advance());
ASSERT_TRUE(itr->Advance()); // itr points to ' '
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
while (itr->Advance()) {
// Do nothing.
}
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterWordConnector) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "package com:google:android name";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "package com:google:android name"
- // ^ ^^ ^^
- // Bytes: 0 7 8 26 27
- auto position_or = itr->ResetToTermStartingAfter(8);
+ // String: "package com:google:android name"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 7 8 26 27
+ // UTF-32 idx: 0 7 8 26 27
+ auto position_or = itr->ResetToTermStartingAfterUtf32(8);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(26));
ASSERT_THAT(itr->GetTerm(), Eq(" "));
- position_or = itr->ResetToTermStartingAfter(7);
+ position_or = itr->ResetToTermStartingAfterUtf32(7);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
ASSERT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
// Tests that ResetToTermAfter and Advance produce the same output. With the
@@ -566,9 +623,10 @@
// terms produced by ResetToTermAfter calls with the current position
// provided as the argument.
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- MixedLanguagesResetToTermAfterEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -580,16 +638,17 @@
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ThaiResetToTermAfterEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ThaiResetToTermAfterUtf32EquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -601,16 +660,17 @@
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- KoreanResetToTermAfterEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ KoreanResetToTermAfterUtf32EquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -622,7 +682,7 @@
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
@@ -633,9 +693,10 @@
// should be able to mix ResetToTermAfter(current_position) calls and Advance
// calls to mimic calling Advance.
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -647,7 +708,7 @@
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kText));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -655,9 +716,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ThaiResetToTermAfterInteroperableWithAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ThaiResetToTermAfterUtf32InteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -669,7 +731,7 @@
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -677,9 +739,10 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- KoreanResetToTermAfterInteroperableWithAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ KoreanResetToTermAfterUtf32InteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -691,211 +754,234 @@
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11)));
EXPECT_THAT(itr->GetTerm(), Eq("你好"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19)));
EXPECT_THAT(itr->GetTerm(), Eq("か"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
EXPECT_THAT(itr->GetTerm(), Eq("吗"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ContinuousWhitespacesResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ContinuousWhitespacesResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't have whitespaces as word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每天"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("走路"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeWordConnector) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ResetToTermBeforeWordConnectorUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "package name com:google:android!";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "package name com:google:android!"
- // ^ ^^ ^^ ^
- // Bytes: 0 7 8 12 13 31
- auto position_or = itr->ResetToTermEndingBefore(31);
+ // String: "package name com:google:android!"
+ // ^ ^^ ^^ ^
+ // UTF-8 idx: 0 7 8 12 13 31
+ // UTF-32 idx: 0 7 8 12 13 31
+ auto position_or = itr->ResetToTermEndingBeforeUtf32(31);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(13));
ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
- position_or = itr->ResetToTermEndingBefore(21);
+ position_or = itr->ResetToTermEndingBeforeUtf32(21);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(12));
ASSERT_THAT(itr->GetTerm(), Eq(" "));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBoundsUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
ASSERT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("are"));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
}
// Tests that ResetToTermBefore and Advance produce the same output. With the
@@ -904,26 +990,22 @@
// terms produced by ResetToTermBefore calls with the current position
// provided as the argument (after their order has been reversed).
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
segmenter->Segment(kText));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -932,26 +1014,22 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ThaiResetToTermBeforeEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ThaiResetToTermBeforeEquivalentToAdvanceUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -959,192 +1037,209 @@
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- KoreanResetToTermBeforeEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ KoreanResetToTermBeforeEquivalentToAdvanceUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("元気"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17)));
EXPECT_THAT(itr->GetTerm(), Eq("です"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ContinuousWhitespacesResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ContinuousWhitespacesResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't have whitespaces as word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("我"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq("去"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9)));
EXPECT_THAT(itr->GetTerm(), Eq("てい"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Validates that the input strings are not copied
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<std::string_view> terms,
@@ -1174,6 +1269,5 @@
"" // Will fall back to ICU default locale
));
-} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc
new file mode 100644
index 0000000..3a94af3
--- /dev/null
+++ b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc
@@ -0,0 +1,37 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_LanguageSegmenterIteratorJniTest_testsMain(JNIEnv* env,
+ jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index 317da04..d293581 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -18,6 +18,7 @@
#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
@@ -43,10 +44,13 @@
GetTestFilePath("icing/icu.dat")));
}
}
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
};
TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -66,85 +70,91 @@
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithOffsetInText) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ResetToTermStartingAfterUtf32WithOffsetInText) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/0),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/0),
IsOkAndHolds(3)); // The term " "
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/3),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/3),
IsOkAndHolds(4)); // The term "bar"
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/4),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/4),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithNegativeOffsetNotOk) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ResetToTermStartingAfterUtf32WithNegativeOffsetNotOk) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-1), IsOk());
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-100), IsOk());
- EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0));
+ EXPECT_THAT(iterator->ResetToStartUtf32(), IsOkAndHolds(0));
EXPECT_THAT(iterator->GetTerm(), Eq("foo"));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
+ ResetToTermStartingAfterUtf32WithTextLengthOffsetInvalidArgument) {
std::string text = "foo bar";
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/text.length()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
+ ResetToTermStartingAfterUtf32WithOffsetPastTextLengthInvalidArgument) {
std::string text = "foo bar";
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/100),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeUtf32WithOffsetInText) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/6),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/6),
IsOkAndHolds(3)); // The term " "
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/3),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/3),
IsOkAndHolds(0)); // The term "foo"
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/2),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithZeroNotFound) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeUtf32WithZeroNotFound) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -152,40 +162,43 @@
language_segmenter->Segment("foo bar"));
// Zero is a valid argument, but there aren't any terms that end before it.
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ResetToTermEndingBeforeUtf32WithNegativeOffsetInvalidArgument) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-100),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
+ ResetToTermEndingBeforeUtf32WithOffsetPastTextEndInvalidArgument) {
std::string text = "foo bar";
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length()),
+ IsOk());
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length() + 1),
+ IsOk());
}
} // namespace
diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h
index 7ca31d1..913386a 100644
--- a/icing/tokenization/language-segmenter.h
+++ b/icing/tokenization/language-segmenter.h
@@ -21,6 +21,8 @@
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -56,51 +58,81 @@
// true.
virtual std::string_view GetTerm() const = 0;
- // Resets the iterator to point to the first term that starts after offset.
+ // RETURNS:
+ // On success, a CharacterIterator pointing to the beginning of the
+ // current term.
+ // ABORTED if an invalid unicode character is encountered while
+ // calculating the term start.
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTermStart() {
+ return absl_ports::UnimplementedError("");
+ }
+
+ // RETURNS:
+ // On success, a CharacterIterator pointing just past the end of the
+ // current term.
+ // ABORTED if an invalid unicode character is encountered while
+ // calculating the term end.
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTermEndExclusive() {
+ return absl_ports::UnimplementedError("");
+ }
+
+ // Resets the iterator to point to the first term that starts after UTF-32
+ // offset.
// GetTerm will now return that term. For example:
//
// language_segmenter = language_segmenter_factory::Create(type);
// iterator = language_segmenter->Segment("foo bar baz");
- // iterator.ResetToTermStartingAfter(4);
+ // iterator.ResetToTermStartingAfterUtf32(4);
// iterator.GetTerm() // returns "baz";
//
// Return types of OK and NOT_FOUND indicate that the function call was
// valid and the state of the iterator has changed. Return type of
- // INVALID_ARGUMENT will leave the iterator unchanged.
+ // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
+ // of ABORTED means that the iterator may be left in an undefined state and
+ // no longer be usable.
//
// Returns:
- // On success, the starting position of the first term that starts after
+ // On success, the UTF-32 offset of the first term that starts after
// offset.
// NOT_FOUND if an error occurred or there are no terms that start after
// offset.
- // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // INVALID_ARGUMENT if offset is beyond the end of the text.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
- int32_t offset) = 0;
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
+ int32_t offset) {
+ return absl_ports::UnimplementedError("");
+ }
- // Resets the iterator to point to the first term that ends before offset.
+ // Resets the iterator to point to the first term that ends before UTF-32
+ // offset.
// GetTerm will now return that term. For example:
//
// language_segmenter = language_segmenter_factory::Create(type);
// iterator = language_segmenter->Segment("foo bar baz");
- // iterator.ResetToTermEndingBefore(7);
+ // iterator.ResetToTermEndingBeforeUtf32(7);
// iterator.GetTerm() // returns "bar";
//
// Return types of OK and NOT_FOUND indicate that the function call was
// valid and the state of the iterator has changed. Return type of
- // INVALID_ARGUMENT will leave the iterator unchanged.
+ // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
+ // of ABORTED means that the iterator may be left in an undefined state and
+ // no longer be usable.
//
// Returns:
- // On success, the starting position of the first term that ends before
+ // On success, the UTF-32 offset of the first term that ends before
// offset.
// NOT_FOUND if an error occurred or there are no terms that ends before
// offset.
- // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // INVALID_ARGUMENT if offset is negative.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
- int32_t offset) = 0;
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
+ int32_t offset) {
+ return absl_ports::UnimplementedError("");
+ }
// Resets the iterator to point to the first term.
// GetTerm will now return that term. For example:
@@ -108,7 +140,7 @@
// language_segmenter = language_segmenter_factory::Create(type);
// iterator = language_segmenter->Segment("foo bar baz");
// iterator.Advance();
- // iterator.ResetToStart();
+ // iterator.ResetToStartUtf32();
// iterator.GetTerm() // returns "foo";
//
// Return types of OK and NOT_FOUND indicate that the function call was
@@ -119,7 +151,7 @@
// NOT_FOUND if an error occurred or there are no valid terms in the text.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0;
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() = 0;
};
// Segments the input text into terms.
diff --git a/icing/tokenization/plain-tokenizer-test-jni-layer.cc b/icing/tokenization/plain-tokenizer-test-jni-layer.cc
new file mode 100644
index 0000000..efa6427
--- /dev/null
+++ b/icing/tokenization/plain-tokenizer-test-jni-layer.cc
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_PlainTokenizerJniTest_testsMain(JNIEnv* env, jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index 6e54af9..13fe550 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -18,6 +18,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
@@ -70,8 +71,18 @@
return Token(Token::REGULAR, current_term_);
}
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
+ override {
+ return base_iterator_->CalculateTermStart();
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
+ override {
+ return base_iterator_->CalculateTermEndExclusive();
+ }
+
bool ResetToTokenAfter(int32_t offset) override {
- if (!base_iterator_->ResetToTermStartingAfter(offset).ok()) {
+ if (!base_iterator_->ResetToTermStartingAfterUtf32(offset).ok()) {
return false;
}
current_term_ = base_iterator_->GetTerm();
@@ -84,20 +95,20 @@
bool ResetToTokenBefore(int32_t offset) override {
ICING_ASSIGN_OR_RETURN(
- offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+ offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
current_term_ = base_iterator_->GetTerm();
while (!IsValidTerm(current_term_)) {
// Haven't found a valid term yet. Retrieve the term prior to this one
// from the segmenter.
ICING_ASSIGN_OR_RETURN(
- offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+ offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
current_term_ = base_iterator_->GetTerm();
}
return true;
}
bool ResetToStart() override {
- if (!base_iterator_->ResetToStart().ok()) {
+ if (!base_iterator_->ResetToStartUtf32().ok()) {
return false;
}
current_term_ = base_iterator_->GetTerm();
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index 2fb9750..7490bfa 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -22,6 +22,7 @@
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
@@ -43,6 +44,8 @@
GetTestFilePath("icing/icu.dat")));
}
}
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
};
TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) {
@@ -53,7 +56,8 @@
}
TEST_F(PlainTokenizerTest, Simple) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -87,7 +91,8 @@
}
TEST_F(PlainTokenizerTest, Whitespace) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -115,7 +120,8 @@
}
TEST_F(PlainTokenizerTest, Punctuation) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -161,7 +167,8 @@
}
TEST_F(PlainTokenizerTest, SpecialCharacters) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -187,7 +194,8 @@
// In plain tokenizer, CJKT characters are handled the same way as non-CJKT
// characters, just add these tests as sanity checks.
// Chinese
- language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE);
+ language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -202,7 +210,8 @@
EqualsToken(Token::REGULAR, "去"),
EqualsToken(Token::REGULAR, "上班"))));
// Japanese
- options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE);
+ options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -272,7 +281,8 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -291,7 +301,8 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -310,7 +321,8 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
@@ -360,7 +372,8 @@
}
TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
new file mode 100644
index 0000000..6b1cb3a
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
@@ -0,0 +1,187 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h"
+
+#include <jni.h>
+#include <math.h>
+
+#include <cassert>
+#include <cctype>
+#include <map>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Chosen based on results in go/reverse-jni-benchmarks
+static constexpr int kBatchSize = 100;
+} // namespace
+
+// -----------------------------------------------------------------------------
+// Implementations that call out to JVM. Behold the beauty.
+// -----------------------------------------------------------------------------
+libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+ReverseJniBreakIterator::Create(const JniCache* jni_cache,
+ std::string_view text,
+ std::string_view locale) {
+ if (jni_cache == nullptr) {
+ return absl_ports::InvalidArgumentError(
+ "Create must be called with a valid JniCache pointer!");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> java_text,
+ jni_cache->ConvertToJavaString(text.data(), text.length()));
+ if (java_text.get() == nullptr) {
+ return absl_ports::AbortedError("Failed to create Java String from input.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> java_locale_string,
+ jni_cache->ConvertToJavaString(locale.data(), locale.length()));
+ if (java_locale_string.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java String from locale.");
+ }
+
+ JNIEnv* jenv = jni_cache->GetEnv();
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jobject> java_locale,
+ libtextclassifier3::JniHelper::NewObject(
+ jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor,
+ java_locale_string.get()));
+ if (java_locale.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java Locale from locale.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher,
+ libtextclassifier3::JniHelper::NewObject(
+ jenv, jni_cache->breakiterator_class.get(),
+ jni_cache->breakiterator_constructor, java_locale.get()));
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher =
+ libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv,
+ jni_cache->jvm);
+ if (iterator_batcher.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java BreakIteratorBatcher.");
+ }
+
+ ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod(
+ jenv, iterator_batcher.get(), jni_cache->breakiterator_settext,
+ java_text.get()));
+ return std::unique_ptr<ReverseJniBreakIterator>(
+ new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher)));
+}
+
+ReverseJniBreakIterator::ReverseJniBreakIterator(
+ const JniCache* jni_cache,
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher)
+ : jni_cache_(jni_cache),
+ iterator_batcher_(std::move(iterator_batcher)),
+ is_done_(false),
+ is_almost_done_(false) {}
+
+int ReverseJniBreakIterator::Next() {
+ if (is_done_) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ if (break_indices_cache_.empty()) {
+ if (FetchNextBatch() == ReverseJniBreakIterator::kDone) {
+ // Either there were no more results or an error occurred. Either way,
+ // mark ourselves as done and return.
+ is_done_ = true;
+ return ReverseJniBreakIterator::kDone;
+ }
+ is_almost_done_ = break_indices_cache_.size() < kBatchSize;
+ }
+ int break_index = break_indices_cache_.front();
+ break_indices_cache_.pop();
+ is_done_ = is_almost_done_ && break_indices_cache_.empty();
+ return break_index;
+}
+
+int ReverseJniBreakIterator::First() {
+ const int first_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_first);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return first_index;
+}
+
+int ReverseJniBreakIterator::Preceding(int offset) {
+ const int preceding_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return preceding_index;
+}
+
+int ReverseJniBreakIterator::Following(int offset) {
+ const int following_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_following, offset);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return following_index;
+}
+
+int ReverseJniBreakIterator::FetchNextBatch() {
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jintArray> break_indices,
+ libtextclassifier3::JniHelper::CallObjectMethod<jintArray>(
+ jni_cache_->GetEnv(), iterator_batcher_.get(),
+ jni_cache_->breakiterator_next, kBatchSize),
+ ReverseJniBreakIterator::kDone);
+ if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get());
+ if (num_indices == 0) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ jint* break_indices_arr =
+ static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical(
+ break_indices.get(), nullptr));
+ for (int i = 0; i < num_indices; ++i) {
+ break_indices_cache_.push(break_indices_arr[i]);
+ }
+ jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(),
+ break_indices_arr,
+ /*mode=*/0);
+ return num_indices;
+}
+
+void ReverseJniBreakIterator::ClearCache() {
+ break_indices_cache_ = std::queue<int>();
+ is_done_ = false;
+ is_almost_done_ = false;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
new file mode 100644
index 0000000..41b470c
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
@@ -0,0 +1,124 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+
+#include <jni.h>
+
+#include <queue>
+#include <string>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+namespace icing {
+namespace lib {
+
+// A class that handles the cross-JNI interactions with BreakIteratorBatcher and
+// hides the batching element to provide an interface akin to
+// java.text.BreakIterator.
+//
+// Example:
+// std::string text = "我每天走路去上班。";
+// ASSERT_THAT(text, SizeIs(27));
+// ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<ReverseJniBreakIterator> itr,
+// ReverseJniBreakIterator::Create(jni_cache, text, locale));
+// std::vector<int> nexts;
+// int next = itr->Next();
+// while (next != ReverseJniBreakIterator::kDone) {
+// nexts.push_back(next);
+// next = itr->Next();
+// }
+// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8));
+class ReverseJniBreakIterator {
+ public:
+ static constexpr int kDone = -1;
+
+ // Creates a ReverseJniBreakIterator with the given text and locale.
+ //
+ // Returns:
+ // A ReverseJniBreakIterator on success
+ // INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer
+ // ABORTED if unable to create any of the required Java objects
+ static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+ Create(const JniCache* jni_cache, std::string_view text,
+ std::string_view locale);
+
+ // Returns the UTF-16 boundary following the current boundary. If the current
+ // boundary is the last text boundary, it returns
+ // ReverseJniBreakIterator::kDone.
+ //
+ // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8
+ // boundary. Callers interested in the UTF-8 boundary are required to maintain
+ // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries.
+ int Next();
+
+ // Returns the first UTF-16 boundary. The iterator's current position is set
+ // to the first text boundary and any cached data is cleared.
+ int First();
+
+ // Returns the position of the first UTF-16 boundary preceding the UTF-16
+ // offset. If there is no boundary preceding the specified offset, then
+ // ReverseJniBreakIterator::kDone is returned.
+ //
+ // The iterator's current position is set to the segment whose boundary was
+ // returned and any cached data is cleared.
+ int Preceding(int offset);
+
+ // Returns the position of the first UTF-16 boundary following the UTF-16
+ // offset. If there is no boundary following the specified offset, then
+ // ReverseJniBreakIterator::kDone is returned.
+ //
+ // The iterator's current position is set to the segment whose boundary
+ // was returned and any cached data is cleared.
+ int Following(int offset);
+
+ private:
+ ReverseJniBreakIterator(
+ const JniCache* jni_cache,
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher);
+
+ // Fetches the results of up to kBatchSize next calls and stores them in
+ // break_indices_cache_. Returns the number of results or kDone if no more
+ // results could be fetched.
+ int FetchNextBatch();
+
+ // Empties the cache and sets is_done_ and is_almost_done_ to false.
+ void ClearCache();
+
+ // Keeps track of references to Java classes and methods. Does NOT own.
+ const JniCache* jni_cache_;
+
+ // The reference to the actual instance of BreakIteratorBatcher that
+ // this class interacts with.
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_;
+
+ // The cache holding the most recent batch of return values from
+ // BreakIteratorBatcher#next.
+ std::queue<int> break_indices_cache_;
+
+ bool is_done_;
+
+ // The last batch was incomplete (< kBatchSize results were returned). The
+ // next call to BreakIteratorBatcher#next is guaranteed to return an
+ // empty array. Once the results from the last batch are evicted from
+ // break_indices_cache_, ReverseJniBreakIterator will transition to is_done_.
+ bool is_almost_done_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index bb26364..76219b5 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -19,11 +19,11 @@
#include <string>
#include <string_view>
-#include "icing/jni/reverse-jni-break-iterator.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h"
#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
@@ -44,13 +44,13 @@
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
// Prerequisite check
- if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ if (IsDone()) {
return false;
}
if (term_end_exclusive_.utf16_index() == 0) {
int first = break_iterator_->First();
- if (!term_start_.AdvanceToUtf16(first)) {
+ if (!term_start_.MoveToUtf16(first)) {
// First is guaranteed to succeed and return a position within bonds. So
// the only possible failure could be an invalid sequence. Mark as DONE
// and return.
@@ -67,7 +67,7 @@
MarkAsDone();
return false;
}
- if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) {
+ if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
// next_utf16_index_exclusive is guaranteed to be within bonds thanks to
// the check for kDone above. So the only possible failure could be an
// invalid sequence. Mark as DONE and return.
@@ -87,6 +87,9 @@
// Returns the current term. It can be called only when Advance() returns
// true.
std::string_view GetTerm() const override {
+ if (IsDone()) {
+ return text_.substr(0, 0);
+ }
int term_length =
term_end_exclusive_.utf8_index() - term_start_.utf8_index();
if (term_length > 0 && std::isspace(text_[term_start_.utf8_index()])) {
@@ -96,6 +99,16 @@
return text_.substr(term_start_.utf8_index(), term_length);
}
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
+ override {
+ return term_start_;
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
+ override {
+ return term_end_exclusive_;
+ }
+
// Resets the iterator to point to the first term that starts after offset.
// GetTerm will now return that term.
//
@@ -107,15 +120,14 @@
// INVALID_ARGUMENT if offset is out of bounds for the provided text.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Illegal offset provided! Offset %d is not within bounds of string "
- "of length %zu",
- offset, text_.length()));
+ if (offset < 0) {
+ // Very simple. The first term start after a negative offset is the first
+ // term. So just reset to start.
+ return ResetToStartUtf32();
}
- if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ if (IsDone()) {
// We're done. Need to start from the beginning if we're going to reset
// properly.
term_start_ = CharacterIterator(text_);
@@ -123,43 +135,48 @@
}
// 1. Find the unicode character that contains the byte at offset.
- CharacterIterator offset_iterator = term_end_exclusive_;
- bool success = (offset > offset_iterator.utf8_index())
- ? offset_iterator.AdvanceToUtf8(offset)
- : offset_iterator.RewindToUtf8(offset);
- if (!success) {
- // Offset is guaranteed to be within bounds thanks to the check above. So
- // the only possible failure could be an invalid sequence. Mark as DONE
- // and return.
- MarkAsDone();
- return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ CharacterIterator offset_iterator = (offset < term_start_.utf32_index())
+ ? term_start_
+ : term_end_exclusive_;
+ if (!offset_iterator.MoveToUtf32(offset)) {
+ if (offset_iterator.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ }
+ // Check to see if offset is past the end of the text. If it is, then
+ // there's no term starting after it. Return an invalid argument.
+ if (offset_iterator.utf8_index() == text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
+ "bounds of string of length %zu",
+ offset_iterator.utf32_index(), offset_iterator.utf8_index(),
+ text_.length()));
}
// 2. We've got the unicode character containing byte offset. Now, we need
// to point to the segment that starts after this character.
int following_utf16_index =
break_iterator_->Following(offset_iterator.utf16_index());
- if (following_utf16_index == ReverseJniBreakIterator::kDone) {
+ if (following_utf16_index == ReverseJniBreakIterator::kDone ||
+ !offset_iterator.MoveToUtf16(following_utf16_index)) {
MarkAsDone();
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments begin after provided offset %d.", offset));
}
- if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) {
- // following_utf16_index is guaranteed to be within bonds thanks to the
- // check for kDone above. So the only possible failure could be an invalid
- // sequence. Mark as DONE and return.
- MarkAsDone();
- return absl_ports::AbortedError("Encountered invalid UTF sequence!");
- }
term_end_exclusive_ = offset_iterator;
- // 3. The term_end_exclusive_ points to the term that we want to return. We
- // need to Advance so that term_start_ will now point to this term.
+ // 3. The term_end_exclusive_ points to the start of the term that we want
+ // to return. We need to Advance so that term_start_ will now point to this
+ // term.
if (!Advance()) {
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments begin after provided offset %d.", offset));
}
- return term_start_.utf8_index();
+ return term_start_.utf32_index();
}
// Resets the iterator to point to the first term that ends before offset.
@@ -173,52 +190,48 @@
// INVALID_ARGUMENT if offset is out of bounds for the provided text.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
+ if (offset < 0) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Illegal offset provided! Offset %d is not within bounds of string "
"of length %zu",
offset, text_.length()));
}
- if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ if (IsDone()) {
// We're done. Need to start from the beginning if we're going to reset
// properly.
term_start_ = CharacterIterator(text_);
term_end_exclusive_ = CharacterIterator(text_);
}
- // 1. Find the unicode character that contains the byte at offset.
- CharacterIterator offset_iterator = term_end_exclusive_;
- bool success = (offset > offset_iterator.utf8_index())
- ? offset_iterator.AdvanceToUtf8(offset)
- : offset_iterator.RewindToUtf8(offset);
- if (!success) {
- // Offset is guaranteed to be within bounds thanks to the check above. So
- // the only possible failure could be an invalid sequence. Mark as DONE
- // and return.
- MarkAsDone();
- return absl_ports::AbortedError(
- "Could not retrieve valid utf8 character!");
+ CharacterIterator offset_iterator = (offset < term_start_.utf32_index())
+ ? term_start_
+ : term_end_exclusive_;
+ if (!offset_iterator.MoveToUtf32(offset)) {
+ // MoveToUtf32 failed. This is only an error if we stopped short of the end.
+ if (offset_iterator.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ // If it returned false because we hit the end, then that's fine. We'll
+ // just treat it as if the request was for the end.
}
// 2. We've got the unicode character containing byte offset. Now, we need
- // to point to the segment that starts before this character.
+ // to point to the segment that ends before this character.
int starting_utf16_index =
break_iterator_->Preceding(offset_iterator.utf16_index());
- if (starting_utf16_index == ReverseJniBreakIterator::kDone) {
+ if (starting_utf16_index == ReverseJniBreakIterator::kDone ||
+ !offset_iterator.MoveToUtf16(starting_utf16_index)) {
// Rewind the end indices.
MarkAsDone();
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments end before provided offset %d.", offset));
}
- if (!offset_iterator.RewindToUtf16(starting_utf16_index)) {
- // starting_utf16_index is guaranteed to be within bonds thanks to the
- // check for kDone above. So the only possible failure could be an invalid
- // sequence. Mark as DONE and return.
- MarkAsDone();
- return absl_ports::AbortedError("Encountered invalid UTF sequence!");
- }
term_start_ = offset_iterator;
// 3. We've correctly set the start index and the iterator currently points
@@ -226,24 +239,25 @@
// advance the iterator to that position.
int end_utf16_index = break_iterator_->Next();
term_end_exclusive_ = term_start_;
- term_end_exclusive_.AdvanceToUtf16(end_utf16_index);
+ term_end_exclusive_.MoveToUtf16(end_utf16_index);
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf8_index() > offset || !IsValidTerm()) {
- return ResetToTermEndingBefore(term_start_.utf8_index());
+ if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) {
+ return ResetToTermEndingBeforeUtf32(term_start_.utf32_index());
}
- return term_start_.utf8_index();
+ return term_start_.utf32_index();
}
- libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
term_start_ = CharacterIterator(text_);
term_end_exclusive_ = CharacterIterator(text_);
if (!Advance()) {
- return absl_ports::NotFoundError("");
+ return absl_ports::NotFoundError(
+ "Unable to find any valid terms in text.");
}
- return term_start_.utf8_index();
+ return term_start_.utf32_index();
}
private:
@@ -255,11 +269,19 @@
// break_iterator_ may be in any state.
void MarkAsDone() {
term_start_ =
- CharacterIterator(text_, /*utf8_index=*/0,
- /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone,
+ /*utf32_index=*/ReverseJniBreakIterator::kDone);
term_end_exclusive_ =
- CharacterIterator(text_, /*utf8_index=*/0,
- /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone,
+ /*utf32_index=*/ReverseJniBreakIterator::kDone);
+ }
+ bool IsDone() const {
+ // We could just as easily check the other utf indices or the values in
+ // term_start_ to check for done. There's no particular reason to choose any
+ // one since they should all hold kDone.
+ return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone;
}
bool IsValidTerm() const {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 72c3180..b1a8f72 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -27,6 +27,7 @@
#include "icing/testing/jni-test-helpers.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "unicode/uloc.h"
namespace icing {
@@ -56,68 +57,60 @@
}
// Returns a vector containing all terms retrieved by calling ResetAfter with
-// the current position to simulate Advancing on the iterator.
-std::vector<std::string_view> GetAllTermsResetAfter(
+// the UTF-32 position of the current term start to simulate Advancing on the
+// iterator.
+std::vector<std::string_view> GetAllTermsResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
std::vector<std::string_view> terms;
- if (!itr->ResetToStart().ok()) {
- return terms;
- }
- terms.push_back(itr->GetTerm());
- const char* text_begin = itr->GetTerm().data();
- // Calling ResetToTermStartingAfter with the current position should get the
- // very next term in the sequence.
- for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
+ // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in
+ // the sequence.
+ bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
// Returns a vector containing all terms retrieved by alternating calls to
-// Advance and calls to ResetAfter with the current position to simulate
-// Advancing.
-std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+// Advance and calls to ResetAfter with the UTF-32 position of the current term
+// start to simulate Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
std::vector<std::string_view> terms;
-
- bool is_ok = true;
- int current_pos = 0;
+ bool is_ok = itr->Advance();
while (is_ok) {
+ terms.push_back(itr->GetTerm());
// Alternate between using Advance and ResetToTermAfter.
if (terms.size() % 2 == 0) {
is_ok = itr->Advance();
} else {
- // Calling ResetToTermStartingAfter with the current position should get
- // the very next term in the sequence.
- current_pos = itr->GetTerm().data() - text_begin;
- is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
- }
- if (is_ok) {
- terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
}
return terms;
}
// Returns a vector containing all terms retrieved by calling ResetBefore with
-// the current position, starting at the end of the text. This vector should be
-// in reverse order of GetAllTerms and missing the last term.
-std::vector<std::string_view> GetAllTermsResetBefore(
+// the UTF-32 position of the current term start, starting at the end of the
+// text. This vector should be in reverse order of GetAllTerms and missing the
+// last term.
+std::vector<std::string_view> GetAllTermsResetBeforeUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
- int last_pos = 0;
- while (itr->Advance()) {
- last_pos = itr->GetTerm().data() - text_begin;
- }
std::vector<std::string_view> terms;
- // Calling ResetToTermEndingBefore with the current position should get the
- // previous term in the sequence.
- for (int current_pos = last_pos;
- itr->ResetToTermEndingBefore(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
+ bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermEndingBeforeUtf32 with the current position should get
+ // the previous term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
@@ -481,7 +474,7 @@
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartWordConnector) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartUtf32WordConnector) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -489,15 +482,16 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "com:google:android is package"
- // ^ ^^ ^^
- // Bytes: 0 18 19 21 22
- auto position_or = itr->ResetToStart();
+ // String: "com:google:android is package"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 18 19 21 22
+ // UTF-32 idx: 0 18 19 21 22
+ auto position_or = itr->ResetToStartUtf32();
EXPECT_THAT(position_or, IsOk());
ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStartUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -505,14 +499,15 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStartUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -520,15 +515,17 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
ASSERT_TRUE(itr->Advance()); // itr points to 'How'
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest,
+ IteratorMultipleAdvancesResetToStartUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -536,18 +533,19 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
ASSERT_TRUE(itr->Advance());
ASSERT_TRUE(itr->Advance());
ASSERT_TRUE(itr->Advance());
ASSERT_TRUE(itr->Advance()); // itr points to ' '
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStartUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -555,17 +553,18 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
while (itr->Advance()) {
// Do nothing.
}
- EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterWordConnector) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32WordConnector) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -573,21 +572,22 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "package com:google:android name"
- // ^ ^^ ^^
- // Bytes: 0 7 8 26 27
- auto position_or = itr->ResetToTermStartingAfter(8);
+ // String: "package com:google:android name"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 7 8 26 27
+ // UTF-32 idx: 0 7 8 26 27
+ auto position_or = itr->ResetToTermStartingAfterUtf32(8);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(26));
ASSERT_THAT(itr->GetTerm(), Eq(" "));
- position_or = itr->ResetToTermStartingAfter(7);
+ position_or = itr->ResetToTermStartingAfterUtf32(7);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32OutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -595,19 +595,19 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
ASSERT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
// Tests that ResetToTermAfter and Advance produce the same output. With the
@@ -616,7 +616,7 @@
// terms produced by ResetToTermAfter calls with the current position
// provided as the argument.
TEST_P(ReverseJniLanguageSegmenterTest,
- MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+ MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -631,14 +631,14 @@
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ThaiResetToTermAfterEquivalentToAdvance) {
+ ThaiResetToTermAfterUtf32EquivalentToAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -653,14 +653,14 @@
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(ReverseJniLanguageSegmenterTest,
- KoreanResetToTermAfterEquivalentToAdvance) {
+ KoreanResetToTermAfterUtf32EquivalentToAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -675,7 +675,7 @@
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
@@ -686,7 +686,7 @@
// should be able to mix ResetToTermAfter(current_position) calls and Advance
// calls to mimic calling Advance.
TEST_P(ReverseJniLanguageSegmenterTest,
- MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+ MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -701,7 +701,7 @@
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kText));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -709,7 +709,7 @@
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ThaiResetToTermAfterInteroperableWithAdvance) {
+ ThaiResetToTermAfterUtf32InteroperableWithAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -724,7 +724,7 @@
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -732,7 +732,7 @@
}
TEST_P(ReverseJniLanguageSegmenterTest,
- KoreanResetToTermAfterInteroperableWithAdvance) {
+ KoreanResetToTermAfterUtf32InteroperableWithAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -747,14 +747,14 @@
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -763,33 +763,35 @@
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11)));
EXPECT_THAT(itr->GetTerm(), Eq("你好"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19)));
EXPECT_THAT(itr->GetTerm(), Eq("か"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
EXPECT_THAT(itr->GetTerm(), Eq("吗"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ContinuousWhitespacesResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -799,35 +801,36 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -837,21 +840,22 @@
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每天"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("走路"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -860,21 +864,22 @@
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -882,21 +887,22 @@
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -905,24 +911,25 @@
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnector) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnectorUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -930,21 +937,22 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "package name com:google:android!"
- // ^ ^^ ^^ ^
- // Bytes: 0 7 8 12 13 31
- auto position_or = itr->ResetToTermEndingBefore(31);
+ // String: "package name com:google:android!"
+ // ^ ^^ ^^ ^
+ // UTF-8 idx: 0 7 8 12 13 31
+ // UTF-32 idx: 0 7 8 12 13 31
+ auto position_or = itr->ResetToTermEndingBeforeUtf32(31);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(13));
ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
- position_or = itr->ResetToTermEndingBefore(21);
+ position_or = itr->ResetToTermEndingBeforeUtf32(21);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(12));
ASSERT_THAT(itr->GetTerm(), Eq(" "));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBoundsUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -952,19 +960,19 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
ASSERT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("are"));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
}
// Tests that ResetToTermBefore and Advance produce the same output. With the
@@ -973,7 +981,7 @@
// terms produced by ResetToTermBefore calls with the current position
// provided as the argument (after their order has been reversed).
TEST_P(ReverseJniLanguageSegmenterTest,
- MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+ MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -983,17 +991,12 @@
segmenter->Segment(kText));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -1002,7 +1005,7 @@
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ThaiResetToTermBeforeEquivalentToAdvance) {
+ ThaiResetToTermBeforeEquivalentToAdvanceUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -1012,17 +1015,12 @@
segmenter->Segment(kThai));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -1030,7 +1028,7 @@
}
TEST_P(ReverseJniLanguageSegmenterTest,
- KoreanResetToTermBeforeEquivalentToAdvance) {
+ KoreanResetToTermBeforeEquivalentToAdvanceUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -1040,24 +1038,19 @@
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1066,35 +1059,36 @@
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("元気"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17)));
EXPECT_THAT(itr->GetTerm(), Eq("です"));
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ContinuousWhitespacesResetToTermBefore) {
+ ContinuousWhitespacesResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1104,34 +1098,35 @@
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1141,21 +1136,22 @@
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("我"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq("去"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1164,21 +1160,22 @@
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9)));
EXPECT_THAT(itr->GetTerm(), Eq("てい"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1186,21 +1183,22 @@
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1209,20 +1207,21 @@
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc
deleted file mode 100644
index 856ba0a..0000000
--- a/icing/tokenization/simple/space-language-segmenter-factory.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/simple/space-language-segmenter.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-
-namespace language_segmenter_factory {
-
-// Creates a language segmenter with the given locale.
-//
-// Returns:
-// A LanguageSegmenter on success
-// INVALID_ARGUMENT if locale string is invalid
-//
-// TODO(b/156383798): Figure out if we want to verify locale strings and notify
-// users. Right now illegal locale strings will be ignored by ICU. ICU
-// components will be created with its default locale.
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterOptions) {
- return std::make_unique<SpaceLanguageSegmenter>();
-}
-
-} // namespace language_segmenter_factory
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/simple/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc
deleted file mode 100644
index 7e301ec..0000000
--- a/icing/tokenization/simple/space-language-segmenter.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/simple/space-language-segmenter.h"
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <utility>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-constexpr char kASCIISpace = ' ';
-} // namespace
-
-class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
- public:
- SpaceLanguageSegmenterIterator(std::string_view text)
- : text_(text), term_start_index_(0), term_end_index_exclusive_(0) {}
-
- // Advances to the next term. Returns false if it has reached the end.
- bool Advance() override {
- if (term_end_index_exclusive_ >= text_.size() ||
- term_start_index_ >= text_.size()) {
- // Reached the end
- return false;
- }
-
- // Next term starts where we left off.
- term_start_index_ = term_end_index_exclusive_;
-
- // We know a term is at least one length, so we can +1 first.
- term_end_index_exclusive_++;
-
- // We alternate terms between space and non-space. Figure out what type of
- // term we're currently on so we know how to stop.
- bool is_space = text_[term_start_index_] == kASCIISpace;
-
- while (term_end_index_exclusive_ < text_.size()) {
- bool end_is_space = text_[term_end_index_exclusive_] == kASCIISpace;
- if (is_space != end_is_space) {
- // We finally see a different type of character, reached the end.
- break;
- }
- // We're still seeing the same types of characters (saw a space and
- // still seeing spaces, or saw a non-space and still seeing non-spaces).
- // Haven't reached the next term yet, keep advancing.
- term_end_index_exclusive_++;
- }
-
- return true;
- }
-
- // Returns the current term. It can be called only when Advance() returns
- // true.
- std::string_view GetTerm() const override {
- if (text_[term_start_index_] == kASCIISpace) {
- // Rule: multiple continuous whitespaces are treated as one.
- return std::string_view(&text_[term_start_index_], 1);
- }
- return text_.substr(term_start_index_,
- term_end_index_exclusive_ - term_start_index_);
- }
-
- libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
- int32_t offset) override {
- if (offset < 0) {
- // Start over from the beginning to find the first term.
- term_start_index_ = 0;
- term_end_index_exclusive_ = 0;
- } else {
- // Offset points to a term right now. Advance to get past the current
- // term.
- term_end_index_exclusive_ = offset;
- if (!Advance()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that starts after offset %d",
- std::string(text_).c_str(), offset));
- }
- }
-
- // Advance again so we can point to the next term.
- if (!Advance()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that starts after offset %d",
- std::string(text_).c_str(), offset));
- }
-
- return term_start_index_;
- }
-
- libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
- int32_t offset) override {
- if (offset <= 0 || offset > text_.size()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that ends before offset %d",
- std::string(text_).c_str(), offset));
- }
-
- if (offset == text_.size()) {
- // Special-case if the offset is the text length, this is the last term in
- // the text, which is also considered to be "ending before" the offset.
- term_end_index_exclusive_ = offset;
- ICING_ASSIGN_OR_RETURN(term_start_index_, GetTermStartingBefore(offset));
- return term_start_index_;
- }
-
- // Otherwise, this is just the end of the previous term and we still need to
- // find the start of the previous term.
- ICING_ASSIGN_OR_RETURN(term_end_index_exclusive_,
- GetTermStartingBefore(offset));
-
- if (term_end_index_exclusive_ == 0) {
- // The current term starts at the beginning of the underlying text_.
- // There is no term before this.
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that ends before offset %d",
- std::string(text_).c_str(), offset));
- }
-
- // Reset ourselves to find the term before the end.
- ICING_ASSIGN_OR_RETURN(
- term_start_index_,
- GetTermStartingBefore(term_end_index_exclusive_ - 1));
- return term_start_index_;
- }
-
- libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
- term_start_index_ = 0;
- term_end_index_exclusive_ = 0;
- if (!Advance()) {
- return absl_ports::NotFoundError("");
- }
- return term_start_index_;
- }
-
- private:
- // Return the start offset of the term starting right before the given offset.
- libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) {
- bool is_space = text_[offset] == kASCIISpace;
-
- // Special-case that if offset was the text length, then we're already at
- // the "end" of our current term.
- if (offset == text_.size()) {
- is_space = text_[--offset] == kASCIISpace;
- }
-
- // While it's the same type of character (space vs non-space), we're in the
- // same term. So keep iterating backwards until we see a change.
- while (offset >= 0 && (text_[offset] == kASCIISpace) == is_space) {
- --offset;
- }
-
- // +1 is because offset was off-by-one to exit the while-loop.
- return ++offset;
- }
-
- // Text to be segmented
- std::string_view text_;
-
- // The start and end indices are used to track the positions of current
- // term.
- int term_start_index_;
- int term_end_index_exclusive_;
-};
-
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-SpaceLanguageSegmenter::Segment(const std::string_view text) const {
- return std::make_unique<SpaceLanguageSegmenterIterator>(text);
-}
-
-libtextclassifier3::StatusOr<std::vector<std::string_view>>
-SpaceLanguageSegmenter::GetAllTerms(const std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
- Segment(text));
- std::vector<std::string_view> terms;
- while (iterator->Advance()) {
- terms.push_back(iterator->GetTerm());
- }
- return terms;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/simple/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h
deleted file mode 100644
index de0a6d3..0000000
--- a/icing/tokenization/simple/space-language-segmenter.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
-#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/tokenization/language-segmenter.h"
-
-namespace icing {
-namespace lib {
-
-// Simple segmenter that splits on spaces, regardless of language. Continuous
-// whitespaces will be returned as a single whitespace character.
-class SpaceLanguageSegmenter : public LanguageSegmenter {
- public:
- SpaceLanguageSegmenter() = default;
- SpaceLanguageSegmenter(const SpaceLanguageSegmenter&) = delete;
- SpaceLanguageSegmenter& operator=(const SpaceLanguageSegmenter&) = delete;
-
- // Segmentation is based purely on whitespace; does not take into account the
- // language of the text.
- //
- // Returns:
- // An iterator of terms on success
- libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
- Segment(std::string_view text) const override;
-
- // Does not take into account the language of the text.
- //
- // Returns:
- // A list of terms on success
- // INTERNAL_ERROR if any error occurs
- libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
- std::string_view text) const override;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
deleted file mode 100644
index 6c5e3f6..0000000
--- a/icing/tokenization/simple/space-language-segmenter_test.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/language-segmenter.h"
-#include "unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-TEST(SpaceLanguageSegmenterTest, EmptyText) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
- EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
-}
-
-TEST(SpaceLanguageSegmenterTest, SimpleText) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
- EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Punctuation) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
- IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
- EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
- IsOkAndHolds(ElementsAre("Open-source", " ", "project")));
- EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
- IsOkAndHolds(ElementsAre("100%")));
- EXPECT_THAT(language_segmenter->GetAllTerms("(A&B)"),
- IsOkAndHolds(ElementsAre("(A&B)")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
- IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Number) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(
- language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
- IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
- IsOkAndHolds(ElementsAre("3,456.789")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
- IsOkAndHolds(ElementsAre("-123")));
-}
-
-TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
-
- // Multiple continuous whitespaces are treated as one.
- const int kNumSeparators = 256;
- const std::string text_with_spaces =
- absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
- EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
- language_segmenter_factory::SegmenterOptions options(ULOC_US);
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(std::move(options)));
- // Validates that the input strings are not copied
- const std::string text = "Hello World";
- const char* word1_address = text.c_str();
- const char* word2_address = text.c_str() + 6;
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
- language_segmenter->GetAllTerms(text));
- ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
- const char* word1_result_address = terms.at(0).data();
- const char* word2_result_address = terms.at(2).data();
-
- // The underlying char* should be the same
- EXPECT_THAT(word1_address, Eq(word1_result_address));
- EXPECT_THAT(word2_address, Eq(word2_result_address));
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 38c4745..b4f0c6e 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -20,7 +20,9 @@
#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
#include "icing/tokenization/token.h"
+#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -64,6 +66,18 @@
// true, otherwise an invalid token could be returned.
virtual Token GetToken() const = 0;
+ // Default implementation: unconditionally returns an UNIMPLEMENTED error with
+ // the message below. The method is virtual, so subclasses may override it.
+ // NOTE(review): judging by the name and return type, an override is
+ // presumably expected to return a CharacterIterator positioned at the first
+ // character of the current token - confirm against the overriding classes.
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTokenStart() {
+ return absl_ports::UnimplementedError(
+ "CalculateTokenStart is not implemented!");
+ }
+
+ // Default implementation: unconditionally returns an UNIMPLEMENTED error with
+ // the message below. The method is virtual, so subclasses may override it.
+ // NOTE(review): judging by the name and return type, an override is
+ // presumably expected to return a CharacterIterator positioned one past the
+ // last character of the current token - confirm against the overriding
+ // classes.
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTokenEndExclusive() {
+ return absl_ports::UnimplementedError(
+ "CalculateTokenEndExclusive is not implemented!");
+ }
+
// Sets the tokenizer to point at the first token that *starts* *after*
// offset. Returns false if there are no valid tokens starting after
// offset.
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
index 3707f95..6c5faef 100644
--- a/icing/util/character-iterator.cc
+++ b/icing/util/character-iterator.cc
@@ -30,6 +30,11 @@
} // namespace
+// Repositions the iterator at desired_utf8_index. The direction is picked by
+// comparing against the current UTF-8 index: a strictly greater target goes
+// through AdvanceToUtf8, anything else (including the current index itself)
+// goes through RewindToUtf8. The chosen helper's result is returned as-is.
+bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
+  if (desired_utf8_index > utf8_index_) {
+    return AdvanceToUtf8(desired_utf8_index);
+  }
+  return RewindToUtf8(desired_utf8_index);
+}
+
bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
if (desired_utf8_index > text_.length()) {
// Enforce the requirement.
@@ -50,6 +55,7 @@
}
utf8_index_ += utf8_length;
utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+ ++utf32_index_;
}
return true;
}
@@ -76,10 +82,17 @@
return false;
}
utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ --utf32_index_;
}
return true;
}
+// Repositions the iterator at desired_utf16_index. The direction is picked by
+// comparing against the current UTF-16 index: a strictly greater target goes
+// through AdvanceToUtf16, anything else (including the current index itself)
+// goes through RewindToUtf16. The chosen helper's result is returned as-is.
+bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
+  if (desired_utf16_index > utf16_index_) {
+    return AdvanceToUtf16(desired_utf16_index);
+  }
+  return RewindToUtf16(desired_utf16_index);
+}
+
bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
while (utf16_index_ < desired_utf16_index) {
UChar32 uchar32 =
@@ -100,6 +113,7 @@
}
utf8_index_ += utf8_length;
utf16_index_ += utf16_length;
+ ++utf32_index_;
}
return true;
}
@@ -111,6 +125,11 @@
while (utf16_index_ > desired_utf16_index) {
--utf8_index_;
utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at
+ // requested_byte_index or an earlier byte.
+ return false;
+ }
// We've found the start of a unicode char!
UChar32 uchar32 =
i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
@@ -119,6 +138,59 @@
return false;
}
utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ --utf32_index_;
+ }
+ return true;
+}
+
+// Repositions the iterator at desired_utf32_index. The direction is picked by
+// comparing against the current UTF-32 index: a strictly greater target goes
+// through AdvanceToUtf32, anything else (including the current index itself)
+// goes through RewindToUtf32. The chosen helper's result is returned as-is.
+bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
+  if (desired_utf32_index > utf32_index_) {
+    return AdvanceToUtf32(desired_utf32_index);
+  }
+  return RewindToUtf32(desired_utf32_index);
+}
+
+// Advances the iterator one character at a time until utf32_index_ reaches
+// desired_utf32_index, keeping the UTF-8 and UTF-16 indices in step with it.
+//
+// Returns:
+//   true once utf32_index_ >= desired_utf32_index (including when no movement
+//     was needed)
+//   false if the end of text_ is reached first or a character cannot be
+//     decoded. All three indices are only updated together after a character
+//     has been fully validated, so on failure the iterator still points at
+//     the last character boundary it successfully reached.
+bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
+  while (utf32_index_ < desired_utf32_index) {
+    if (utf8_index_ >= static_cast<int>(text_.length())) {
+      // Already at the end of text_ but the target has not been reached.
+      // There is nothing left to decode, so fail here instead of asking
+      // GetUChar32At to read at an index past the last byte of the view.
+      return false;
+    }
+    UChar32 uchar32 =
+        i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+    if (uchar32 == i18n_utils::kInvalidUChar32) {
+      // Unable to retrieve a valid UTF-32 character at the current position.
+      return false;
+    }
+    int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+    int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+    if (utf8_index_ + utf8_length > text_.length()) {
+      // The decoded character would extend beyond the end of text_; enforce
+      // the requirement that we never step past the end.
+      return false;
+    }
+    utf8_index_ += utf8_length;
+    utf16_index_ += utf16_length;
+    ++utf32_index_;
+  }
+  return true;
+}
+
+// Steps the iterator backwards one character at a time until utf32_index_ is
+// no greater than desired_utf32_index, updating the UTF-8 and UTF-16 indices
+// along the way.
+// Returns false when desired_utf32_index is negative, when no character start
+// can be located while stepping back, or when the character found there cannot
+// be decoded; returns true otherwise (including when no movement is needed).
+// NOTE(review): on the two mid-loop failure paths utf8_index_ has already been
+// changed while utf16_index_ and utf32_index_ have not, so the three indices
+// no longer describe one position - callers presumably discard the iterator
+// after a failure; confirm.
+bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
+ if (desired_utf32_index < 0) {
+ return false;
+ }
+ while (utf32_index_ > desired_utf32_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
+ // GetUTF8StartPosition reported a negative index: there wasn't a single
+ // UTF-8 lead byte at the byte we stepped back to or at any earlier byte.
+ return false;
+ }
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ --utf32_index_;
}
return true;
}
diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
index 22de6c5..9df7bee 100644
--- a/icing/util/character-iterator.h
+++ b/icing/util/character-iterator.h
@@ -15,6 +15,7 @@
#ifndef ICING_UTIL_CHARACTER_ITERATOR_H_
#define ICING_UTIL_CHARACTER_ITERATOR_H_
+#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/i18n-utils.h"
namespace icing {
@@ -23,23 +24,35 @@
class CharacterIterator {
public:
explicit CharacterIterator(std::string_view text)
- : CharacterIterator(text, 0, 0) {}
+ : CharacterIterator(text, 0, 0, 0) {}
- CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
- : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
+ CharacterIterator(std::string_view text, int utf8_index, int utf16_index,
+ int utf32_index)
+ : text_(text),
+ utf8_index_(utf8_index),
+ utf16_index_(utf16_index),
+ utf32_index_(utf32_index) {}
- // Moves from current position to the character that includes the specified
+ // Moves current position to desired_utf8_index.
+ // REQUIRES: 0 <= desired_utf8_index <= text_.length()
+ bool MoveToUtf8(int desired_utf8_index);
+
+ // Advances from current position to the character that includes the specified
// UTF-8 index.
// REQUIRES: desired_utf8_index <= text_.length()
// desired_utf8_index is allowed to point one index past the end, but no
// further.
bool AdvanceToUtf8(int desired_utf8_index);
- // Moves from current position to the character that includes the specified
+ // Rewinds from current position to the character that includes the specified
// UTF-8 index.
// REQUIRES: 0 <= desired_utf8_index
bool RewindToUtf8(int desired_utf8_index);
+ // Moves current position to desired_utf16_index.
+ // REQUIRES: 0 <= desired_utf16_index <= text_.utf16_length()
+ bool MoveToUtf16(int desired_utf16_index);
+
// Advances current position to desired_utf16_index.
// REQUIRES: desired_utf16_index <= text_.utf16_length()
// desired_utf16_index is allowed to point one index past the end, but no
@@ -50,18 +63,39 @@
// REQUIRES: 0 <= desired_utf16_index
bool RewindToUtf16(int desired_utf16_index);
+ // Moves current position to desired_utf32_index.
+ // Delegates to AdvanceToUtf32 when desired_utf32_index is greater than the
+ // current UTF-32 index and to RewindToUtf32 otherwise, returning that call's
+ // result.
+ // REQUIRES: 0 <= desired_utf32_index <= text_.utf32_length()
+ bool MoveToUtf32(int desired_utf32_index);
+
+ // Advances current position to desired_utf32_index.
+ // Returns false if a character cannot be decoded or would extend past the
+ // end of the text before desired_utf32_index is reached; true otherwise.
+ // REQUIRES: desired_utf32_index <= text_.utf32_length()
+ // desired_utf32_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf32(int desired_utf32_index);
+
+ // Rewinds current position to desired_utf32_index.
+ // Returns false if desired_utf32_index is negative, or if a character start
+ // cannot be found or decoded while stepping backwards; true otherwise.
+ // REQUIRES: 0 <= desired_utf32_index
+ bool RewindToUtf32(int desired_utf32_index);
+
int utf8_index() const { return utf8_index_; }
int utf16_index() const { return utf16_index_; }
+ int utf32_index() const { return utf32_index_; }
bool operator==(const CharacterIterator& rhs) const {
return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ &&
- utf16_index_ == rhs.utf16_index_;
+ utf16_index_ == rhs.utf16_index_ && utf32_index_ == rhs.utf32_index_;
+ }
+
+ // Returns the iterator's three indices formatted as "(u8:<utf8 index>,
+ // u16:<utf16 index>,u32:<utf32 index>)" with no spaces, e.g. "(u8:3,u16:1,u32:1)".
+ // The text being iterated over is not included in the output.
+ std::string DebugString() const {
+ return IcingStringUtil::StringPrintf("(u8:%d,u16:%d,u32:%d)", utf8_index_,
+ utf16_index_, utf32_index_);
}
private:
std::string_view text_;
int utf8_index_;
int utf16_index_;
+ int utf32_index_;
};
} // namespace lib
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index 2019033..64f98f6 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -59,6 +59,7 @@
import java.util.Map;
import org.junit.After;
import org.junit.Before;
+import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
@@ -489,6 +490,7 @@
}
@Test
+ @Ignore("b/190845688")
public void testCJKTSnippets() throws Exception {
assertStatusOk(icingSearchEngine.initialize().getStatus());
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 4069810..35ad6d9 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=375495869)
+set(synced_AOSP_CL_number=378695940)