Merge remote-tracking branch 'goog/upstream-master' into androidx-platform-dev
* goog/upstream-master:
Sync from upstream.
Change-Id: I9f2e45c0b2438c91aacb9ebc32c6176deadfb808
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 3258d64..a281f22 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -7035,6 +7035,68 @@
ASSERT_THAT(result->snippet().entries(), IsEmpty());
}
+TEST_F(IcingSearchEngineTest, CJKSnippetTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kChinese)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto search_spec;
+ search_spec.set_query("走");
+ search_spec.set_term_match_type(MATCH_PREFIX);
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(
+ std::numeric_limits<int>::max());
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(
+ std::numeric_limits<int>::max());
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto search_results = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ ASSERT_THAT(search_results.status(), ProtoIsOk());
+ ASSERT_THAT(search_results.results(), SizeIs(1));
+ const SearchResultProto::ResultProto* result = &search_results.results(0);
+ EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+ // Ensure that one and only one property was matched and it was "body"
+ ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("body"));
+
+ // Get the content for "body" and see what the match is.
+ std::string_view content = GetString(&result->document(), "body");
+ ASSERT_THAT(content, Eq(kChinese));
+
+ // Ensure that there is one and only one match within "body"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(9));
+ EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(6));
+ std::string_view match =
+ content.substr(match_proto.exact_match_byte_position(),
+ match_proto.exact_match_byte_length());
+ ASSERT_THAT(match, Eq("走路"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/portable/endian.h b/icing/portable/endian.h
new file mode 100644
index 0000000..42f6c02
--- /dev/null
+++ b/icing/portable/endian.h
@@ -0,0 +1,206 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Utility functions that depend on bytesex. We define htonll and ntohll,
+// as well as "Google" versions of all the standards: ghtonl, ghtons, and
+// so on. These functions do exactly the same as their standard variants,
+// but don't require including the dangerous netinet/in.h.
+
+#ifndef ICING_PORTABLE_ENDIAN_H_
+#define ICING_PORTABLE_ENDIAN_H_
+
+#include <cstdint>
+
+// IS_LITTLE_ENDIAN, IS_BIG_ENDIAN
+#if defined OS_LINUX || defined OS_ANDROID || defined(__ANDROID__)
+// _BIG_ENDIAN
+#include <endian.h>
+
+#elif defined(__APPLE__)
+
+// BIG_ENDIAN
+#include <machine/endian.h> // NOLINT(build/include)
+
+/* Let's try and follow the Linux convention */
+#define __BYTE_ORDER BYTE_ORDER
+#define __LITTLE_ENDIAN LITTLE_ENDIAN
+#define __BIG_ENDIAN BIG_ENDIAN
+
+#endif // operating system
+
+// defines __BYTE_ORDER for MSVC
+#ifdef COMPILER_MSVC
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#else // COMPILER_MSVC
+
+// define the macros IS_LITTLE_ENDIAN or IS_BIG_ENDIAN
+// using the above endian definitions from endian.h if
+// endian.h was included
+#ifdef __BYTE_ORDER
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#endif // __BYTE_ORDER == __LITTLE_ENDIAN
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IS_BIG_ENDIAN
+#endif // __BYTE_ORDER == __BIG_ENDIAN
+
+#else // __BYTE_ORDER
+
+#if defined(__LITTLE_ENDIAN__)
+#define IS_LITTLE_ENDIAN
+#elif defined(__BIG_ENDIAN__)
+#define IS_BIG_ENDIAN
+#endif // __LITTLE_ENDIAN__ or __BIG_ENDIAN__
+
+#endif // __BYTE_ORDER
+#endif // COMPILER_MSVC
+
+// byte swap functions (bswap_16, bswap_32, bswap_64).
+// byte swap functions reverse the order of bytes, e.g.
+// byteswap of 102030405060708 = 807060504030201
+// byteswap of 1020304 = 4030201
+
+// The following guarantees declaration of the byte swap functions
+#ifdef COMPILER_MSVC
+#include <stdlib.h> // NOLINT(build/include)
+
+#define bswap_16(x) _byteswap_ushort(x)
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+
+#define bswap_16(x) OSSwapInt16(x)
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#elif defined(__GLIBC__) || defined(__BIONIC__) || defined(__ASYLO__)
+#include <byteswap.h> // IWYU pragma: export
+
+#else // built-in byteswap functions
+
+static inline uint16 bswap_16(uint16 x) {
+#ifdef __cplusplus
+ return static_cast<uint16>(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));
+#else // __cplusplus
+ return (uint16)(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8)); // NOLINT
+#endif // __cplusplus
+}
+#define bswap_16(x) bswap_16(x)
+static inline uint32 bswap_32(uint32 x) {
+ return (((x & 0xFF) << 24) | ((x & 0xFF00) << 8) | ((x & 0xFF0000) >> 8) |
+ ((x & 0xFF000000) >> 24));
+}
+#define bswap_32(x) bswap_32(x)
+static inline uint64 bswap_64(uint64 x) {
+ return (((x & (uint64_t)0xFF) << 56) | ((x & (uint64_t)0xFF00) << 40) |
+ ((x & (uint64_t)0xFF0000) << 24) | ((x & (uint64_t)0xFF000000) << 8) |
+ ((x & (uint64_t)0xFF00000000) >> 8) |
+ ((x & (uint64_t)0xFF0000000000) >> 24) |
+ ((x & (uint64_t)0xFF000000000000) >> 40) |
+ ((x & (uint64_t)0xFF00000000000000) >> 56));
+}
+#define bswap_64(x) bswap_64(x)
+
+#endif // end byteswap functions
+
+// Use compiler byte-swapping intrinsics if they are available. 32-bit
+// and 64-bit versions are available in Clang and GCC as of GCC 4.3.0.
+// The 16-bit version is available in Clang and GCC only as of GCC 4.8.0.
+// For simplicity, we enable them all only for GCC 4.8.0 or later.
+#if defined(__clang__) || \
+ (defined(__GNUC__) && \
+ ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ >= 5))
+
+inline uint64_t gbswap_64(uint64_t host_int) {
+ return __builtin_bswap64(host_int);
+}
+inline uint32_t gbswap_32(uint32_t host_int) {
+ return __builtin_bswap32(host_int);
+}
+inline uint16_t gbswap_16(uint16_t host_int) {
+ return __builtin_bswap16(host_int);
+}
+
+#else // intrinsics available
+
+inline uint64 gbswap_64(uint64 host_int) {
+#if defined(__GNUC__) && defined(__x86_64__) && \
+ !(defined(__APPLE__) && defined(__MACH__))
+ // Adapted from /usr/include/byteswap.h. Not available on Mac.
+ if (__builtin_constant_p(host_int)) {
+ return __bswap_constant_64(host_int);
+ } else {
+ uint64 result;
+ __asm__("bswap %0" : "=r"(result) : "0"(host_int));
+ return result;
+ }
+#elif defined(bswap_64)
+ return bswap_64(host_int);
+#else // bswap_64
+ return static_cast<uint64>(bswap_32(static_cast<uint32>(host_int >> 32))) |
+ (static_cast<uint64>(bswap_32(static_cast<uint32>(host_int))) << 32);
+#endif // bswap_64
+}
+inline uint32 gbswap_32(uint32 host_int) { return bswap_32(host_int); }
+inline uint16 gbswap_16(uint16 host_int) { return bswap_16(host_int); }
+
+#endif // intrinsics available
+
+#ifdef IS_LITTLE_ENDIAN
+
+// Definitions for ntohl etc. that don't require us to include
+// netinet/in.h. We wrap gbswap_32 and gbswap_16 in functions rather
+// than just #defining them because in debug mode, gcc doesn't
+// correctly handle the (rather involved) definitions of bswap_32.
+// gcc guarantees that inline functions are as fast as macros, so
+// this isn't a performance hit.
+inline uint16_t ghtons(uint16_t x) { return gbswap_16(x); }
+inline uint32_t ghtonl(uint32_t x) { return gbswap_32(x); }
+inline uint64_t ghtonll(uint64_t x) { return gbswap_64(x); }
+
+#elif defined IS_BIG_ENDIAN
+
+// These definitions are simpler on big-endian machines
+// These are functions instead of macros to avoid self-assignment warnings
+// on calls such as "i = ghtnol(i);". This also provides type checking.
+inline uint16 ghtons(uint16 x) { return x; }
+inline uint32 ghtonl(uint32 x) { return x; }
+inline uint64 ghtonll(uint64 x) { return x; }
+
+#else // bytesex
+#error \
+ "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" // NOLINT
+#endif // bytesex
+
+#ifndef htonll
+// With the rise of 64-bit, some systems are beginning to define this.
+#define htonll(x) ghtonll(x)
+#endif // htonll
+
+// ntoh* and hton* are the same thing for any size and bytesex,
+// since the function is an involution, i.e., its own inverse.
+inline uint16_t gntohs(uint16_t x) { return ghtons(x); }
+inline uint32_t gntohl(uint32_t x) { return ghtonl(x); }
+inline uint64_t gntohll(uint64_t x) { return ghtonll(x); }
+
+#ifndef ntohll
+#define ntohll(x) htonll(x)
+#endif // ntohll
+
+#endif // ICING_PORTABLE_ENDIAN_H_
diff --git a/icing/portable/platform.h b/icing/portable/platform.h
index 0cccd57..8712835 100644
--- a/icing/portable/platform.h
+++ b/icing/portable/platform.h
@@ -15,8 +15,6 @@
#ifndef ICING_PORTABLE_PLATFORM_H_
#define ICING_PORTABLE_PLATFORM_H_
-// This file is meant to hold util functions for tests that help the test
-// determine which platform-specific configuration it may be running in.
namespace icing {
namespace lib {
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 31a2e5f..dc9f8be 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -39,6 +39,7 @@
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
@@ -218,12 +219,21 @@
std::string_view section_subcontent;
};
+// Creates a snippet match proto for the match pointed to by the iterator and
+// char_iterator
+//
+// Returns:
+// the position of the window start if successful
+// INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
+// in an invalid state
+// ABORTED_ERROR - if an invalid utf-8 sequence is encountered
libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
- const SectionData& value, Tokenizer::Iterator* iterator) {
+ const SectionData& value, Tokenizer::Iterator* iterator,
+ const CharacterIterator& char_iterator) {
SnippetMatchProto snippet_match;
Token match = iterator->GetToken();
- int match_pos = match.text.data() - value.section_subcontent.data();
+ int match_pos = char_iterator.utf8_index();
// When finding boundaries, we have a few cases:
//
@@ -258,23 +268,42 @@
int window_end_max_exclusive =
match_mid + (snippet_spec.max_window_bytes() + 1) / 2;
- snippet_match.set_exact_match_position(match_pos);
- snippet_match.set_exact_match_bytes(match.text.length());
+ snippet_match.set_exact_match_byte_position(match_pos);
+ snippet_match.set_exact_match_utf16_position(char_iterator.utf16_index());
+
+ // Create character iterators to find the beginning and end of the window.
+ CharacterIterator forward_char_iterator(char_iterator);
+ CharacterIterator backwards_char_iterator(char_iterator);
+
+ if (!backwards_char_iterator.AdvanceToUtf8(match_pos + match.text.length())) {
+ return absl_ports::AbortedError("Could not retrieve valid utf8 character!");
+ }
+ snippet_match.set_exact_match_byte_length(match.text.length());
+ snippet_match.set_exact_match_utf16_length(
+ backwards_char_iterator.utf16_index() - char_iterator.utf16_index());
// Only include windows if it'll at least include the matched text. Otherwise,
// it'll just be an empty string anyways.
if (snippet_spec.max_window_bytes() >= match.text.length()) {
// Find the beginning of the window.
int window_start;
+ int window_start_utf16;
if (window_start_min_exclusive < 0) {
window_start = 0;
+ window_start_utf16 = 0;
} else {
ICING_ASSIGN_OR_RETURN(
window_start,
DetermineWindowStart(snippet_spec, value.section_subcontent,
window_start_min_exclusive, iterator));
+ if (!forward_char_iterator.RewindToUtf8(window_start)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ window_start_utf16 = forward_char_iterator.utf16_index();
}
- snippet_match.set_window_position(window_start);
+ snippet_match.set_window_byte_position(window_start);
+ snippet_match.set_window_utf16_position(window_start_utf16);
// Find the end of the window.
int window_end_exclusive;
@@ -286,7 +315,13 @@
DetermineWindowEnd(snippet_spec, value.section_subcontent,
window_end_max_exclusive, iterator));
}
- snippet_match.set_window_bytes(window_end_exclusive - window_start);
+ if (!backwards_char_iterator.AdvanceToUtf8(window_end_exclusive)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ snippet_match.set_window_byte_length(window_end_exclusive - window_start);
+ snippet_match.set_window_utf16_length(
+ backwards_char_iterator.utf16_index() - window_start_utf16);
// DetermineWindowStart/End may change the position of the iterator. So,
// reset the iterator back to the original position.
@@ -332,16 +367,38 @@
std::string_view value = current_property->string_values(i);
std::unique_ptr<Tokenizer::Iterator> iterator =
tokenizer->Tokenize(value).ValueOrDie();
+ CharacterIterator char_iterator(value);
while (iterator->Advance()) {
Token token = iterator->GetToken();
if (matcher->Matches(token)) {
- // If there was an error while retrieving the match, the tokenizer
- // iterator is probably in an invalid state. There's nothing we can do
- // here, so just return.
+ if (!char_iterator.AdvanceToUtf8(token.text.data() - value.data())) {
+ // We can't get the char_iterator to a valid position, so there's no
+ // way for us to provide valid utf-16 indices. There's nothing more we
+ // can do here, so just return whatever we've built up so far.
+ if (!snippet_entry.snippet_matches().empty()) {
+ *snippet_proto->add_entries() = std::move(snippet_entry);
+ }
+ return;
+ }
SectionData data = {property_path, value};
- SnippetMatchProto match =
- RetrieveMatch(match_options->snippet_spec, data, iterator.get())
- .ValueOrDie();
+ auto match_or = RetrieveMatch(match_options->snippet_spec, data,
+ iterator.get(), char_iterator);
+ if (!match_or.ok()) {
+ if (absl_ports::IsAborted(match_or.status())) {
+ // Only an aborted. We can't get this match, but we might be able to
+ // retrieve others. Just continue.
+ continue;
+ } else {
+ // Probably an internal error. The tokenizer iterator is probably in
+ // an invalid state. There's nothing more we can do here, so just
+ // return whatever we've built up so far.
+ if (!snippet_entry.snippet_matches().empty()) {
+ *snippet_proto->add_entries() = std::move(snippet_entry);
+ }
+ return;
+ }
+ }
+ SnippetMatchProto match = std::move(match_or).ValueOrDie();
snippet_entry.mutable_snippet_matches()->Add(std::move(match));
if (--match_options->max_matches_remaining <= 0) {
*snippet_proto->add_entries() = std::move(snippet_entry);
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index ff38372..c052a9e 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -1134,6 +1134,201 @@
"B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
}
+TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kChinese)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
+TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kChinese)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+ // Set a twenty byte window. This will produce a window like this:
+ // String: "我每天走路去上班。"
+ // ^ ^
+ // UTF8 idx: 3 18
+ // UTF16 idx: 1 6
+ snippet_spec_.set_max_window_bytes(20);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetWindows(content, *entry), ElementsAre("每天走路去"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.window_utf16_position(), Eq(1));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(5));
+}
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
+ // The following string has four-byte UTF-8 characters. Most importantly, it
+ // is also two code units in UTF-16.
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^ ^
+ // UTF8 idx: 0 9 18
+ // UTF16 idx: 0 5 10
+ // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+ constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kText)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
+}
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
+ // The following string has four-byte UTF-8 characters. Most importantly, it
+ // is also two code units in UTF-16.
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^ ^
+ // UTF8 idx: 0 9 18
+ // UTF16 idx: 0 5 10
+ // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+ constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kText)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+ // Set a twenty byte window. This will produce a window like this:
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^
+ // UTF8 idx: 9 22
+ // UTF16 idx: 5 12
+ snippet_spec_.set_max_window_bytes(20);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetWindows(content, *entry), ElementsAre("𐀂𐀃 𐀄"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 9631e29..afcae86 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -459,44 +459,42 @@
schema_type_id = schema_type_id_or.ValueOrDie();
}
- ICING_ASSIGN_OR_RETURN(
- NamespaceId namespace_id,
- namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
- namespace_mapper_->num_keys()));
+ ICING_ASSIGN_OR_RETURN(
+ NamespaceId namespace_id,
+ namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
+ namespace_mapper_->num_keys()));
- // Update corpus maps
- std::string corpus =
- MakeFingerprint(document_wrapper.document().namespace_(),
- document_wrapper.document().schema());
- ICING_ASSIGN_OR_RETURN(
- CorpusId corpusId,
- corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
+ // Update corpus maps
+ std::string corpus =
+ MakeFingerprint(document_wrapper.document().namespace_(),
+ document_wrapper.document().schema());
+ ICING_ASSIGN_OR_RETURN(
+ CorpusId corpusId,
+ corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
- ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
- GetCorpusAssociatedScoreDataToUpdate(corpusId));
- scoring_data.AddDocument(
- document_wrapper.document().internal_fields().length_in_tokens());
+ ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
+ GetCorpusAssociatedScoreDataToUpdate(corpusId));
+ scoring_data.AddDocument(
+ document_wrapper.document().internal_fields().length_in_tokens());
- ICING_RETURN_IF_ERROR(
- UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
+ ICING_RETURN_IF_ERROR(
+ UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
- ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
- new_document_id,
- DocumentAssociatedScoreData(
- corpusId, document_wrapper.document().score(),
- document_wrapper.document().creation_timestamp_ms(),
- document_wrapper.document()
- .internal_fields()
- .length_in_tokens())));
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ new_document_id,
+ DocumentAssociatedScoreData(
+ corpusId, document_wrapper.document().score(),
+ document_wrapper.document().creation_timestamp_ms(),
+ document_wrapper.document().internal_fields().length_in_tokens())));
- int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
- document_wrapper.document().creation_timestamp_ms(),
- document_wrapper.document().ttl_ms());
+ int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
+ document_wrapper.document().creation_timestamp_ms(),
+ document_wrapper.document().ttl_ms());
- ICING_RETURN_IF_ERROR(UpdateFilterCache(
- new_document_id, DocumentFilterData(namespace_id, schema_type_id,
- expiration_timestamp_ms)));
- iterator_status = iterator.Advance();
+ ICING_RETURN_IF_ERROR(UpdateFilterCache(
+ new_document_id, DocumentFilterData(namespace_id, schema_type_id,
+ expiration_timestamp_ms)));
+ iterator_status = iterator.Advance();
}
if (!absl_ports::IsOutOfRange(iterator_status)) {
@@ -833,19 +831,16 @@
expiration_timestamp_ms)));
if (old_document_id_or.ok()) {
+ // The old document exists, copy over the usage scores and delete the old
+ // document.
DocumentId old_document_id = old_document_id_or.ValueOrDie();
- auto offset_or = DoesDocumentExistAndGetFileOffset(old_document_id);
- if (offset_or.ok()) {
- // The old document exists, copy over the usage scores.
- ICING_RETURN_IF_ERROR(
- usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
- /*to_document_id=*/new_document_id));
+ ICING_RETURN_IF_ERROR(
+ usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
+ /*to_document_id=*/new_document_id));
- // Delete the old document.
- ICING_RETURN_IF_ERROR(document_log_->EraseProto(offset_or.ValueOrDie()));
- ICING_RETURN_IF_ERROR(ClearDerivedData(old_document_id));
- }
+ // Delete the old document.
+ ICING_RETURN_IF_ERROR(Delete(old_document_id));
}
if (put_document_stats != nullptr) {
@@ -886,8 +881,15 @@
libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
DocumentId document_id, bool clear_internal_fields) const {
- ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
- DoesDocumentExistAndGetFileOffset(document_id));
+ ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
+
+ auto document_log_offset_or = document_id_mapper_->Get(document_id);
+ if (!document_log_offset_or.ok()) {
+ // Since we've just checked that our document_id is valid a few lines above,
+ // there's no reason this should fail and an error should never happen.
+ return absl_ports::InternalError("Failed to find document offset.");
+ }
+ int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
// TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
@@ -938,7 +940,7 @@
}
const DocumentFilterData* data = status_or_data.ValueOrDie();
- if (DoesDocumentExist(document_id)) {
+ if (InternalDoesDocumentExist(document_id)) {
existing_namespace_ids.insert(data->namespace_id());
}
}
@@ -951,40 +953,74 @@
return existing_namespaces;
}
-libtextclassifier3::StatusOr<int64_t>
-DocumentStore::DoesDocumentExistAndGetFileOffset(DocumentId document_id) const {
+bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
if (!IsDocumentIdValid(document_id)) {
- return absl_ports::InvalidArgumentError(
- IcingStringUtil::StringPrintf("DocumentId %d is invalid", document_id));
+ return false;
}
- auto file_offset_or = document_id_mapper_->Get(document_id);
-
- bool deleted =
- file_offset_or.ok() && *file_offset_or.ValueOrDie() == kDocDeletedFlag;
- if (deleted || absl_ports::IsOutOfRange(file_offset_or.status())) {
- // Document has been deleted or doesn't exist
- return absl_ports::NotFoundError(
- IcingStringUtil::StringPrintf("Document %d not found", document_id));
+ if (document_id >= document_id_mapper_->num_elements()) {
+ // Somehow got a validly constructed document_id that the document store
+ // doesn't know about
+ return false;
}
- ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
- filter_cache_->Get(document_id));
- if (clock_.GetSystemTimeMilliseconds() >=
- filter_data->expiration_timestamp_ms()) {
- // Past the expiration time, so also return NOT FOUND since it *shouldn't*
- // exist anymore.
- return absl_ports::NotFoundError(
- IcingStringUtil::StringPrintf("Document %d not found", document_id));
- }
-
- ICING_RETURN_IF_ERROR(file_offset_or.status());
- return *file_offset_or.ValueOrDie();
+ return InternalDoesDocumentExist(document_id);
}
-bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
- // If we can successfully get the document log offset, the document exists.
- return DoesDocumentExistAndGetFileOffset(document_id).ok();
+libtextclassifier3::Status DocumentStore::DoesDocumentExistWithStatus(
+ DocumentId document_id) const {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id '%d' invalid.", document_id));
+ }
+
+ if (document_id >= document_id_mapper_->num_elements()) {
+ // Somehow got a validly constructed document_id that the document store
+ // doesn't know about.
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Unknown document id '%d'.", document_id));
+ }
+
+ if (!InternalDoesDocumentExist(document_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Document id '%d' doesn't exist", document_id));
+ };
+ return libtextclassifier3::Status::OK;
+}
+
+bool DocumentStore::InternalDoesDocumentExist(DocumentId document_id) const {
+ return !IsDeleted(document_id) && !IsExpired(document_id);
+}
+
+bool DocumentStore::IsDeleted(DocumentId document_id) const {
+ auto file_offset_or = document_id_mapper_->Get(document_id);
+ if (!file_offset_or.ok()) {
+ // This would only happen if document_id is out of range of the
+ // document_id_mapper, meaning we got some invalid document_id. Callers
+ // should already have checked that their document_id is valid or used
+ // DoesDocumentExist(WithStatus). Regardless, return true since the document
+ // doesn't exist.
+ return true;
+ }
+ int64_t file_offset = *file_offset_or.ValueOrDie();
+ return file_offset == kDocDeletedFlag;
+}
+
+bool DocumentStore::IsExpired(DocumentId document_id) const {
+ auto filter_data_or = filter_cache_->Get(document_id);
+ if (!filter_data_or.ok()) {
+ // This would only happen if document_id is out of range of the
+ // filter_cache, meaning we got some invalid document_id. Callers should
+ // already have checked that their document_id is valid or used
+ // DoesDocumentExist(WithStatus). Regardless, return true since the document
+ // doesn't exist.
+ return true;
+ }
+ const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
+
+ // Check if it's past the expiration time
+ return clock_.GetSystemTimeMilliseconds() >=
+ filter_data->expiration_timestamp_ms();
}
libtextclassifier3::Status DocumentStore::Delete(
@@ -1001,9 +1037,14 @@
}
libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id) {
- // Copy out the document to get namespace and uri.
- ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
- DoesDocumentExistAndGetFileOffset(document_id));
+ ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
+
+ auto document_log_offset_or = document_id_mapper_->Get(document_id);
+ if (!document_log_offset_or.ok()) {
+ return absl_ports::InternalError("Failed to find document offset.");
+ }
+ int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
+
// Erases document proto.
ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
return ClearDerivedData(document_id);
@@ -1262,15 +1303,9 @@
for (DocumentId document_id = 0;
document_id < document_id_mapper_->num_elements(); ++document_id) {
// Check if it's deleted first.
- auto location_or = document_id_mapper_->Get(document_id);
- if (!location_or.ok()) {
- ICING_VLOG(1) << "Error trying to get document offsets for document "
- "store storage info counts.";
- continue;
- }
- if (*location_or.ValueOrDie() == kDocDeletedFlag) {
- // We don't have the namespace id of hard deleted documents anymore, so we
- // can't add to our namespace storage info.
+ if (IsDeleted(document_id)) {
+ // We don't have the namespace id of hard deleted documents anymore, so
+ // we can't add to our namespace storage info.
++total_num_deleted;
continue;
}
@@ -1308,23 +1343,7 @@
UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
// Update our stats
- if (DoesDocumentExist(document_id)) {
- ++total_num_alive;
- namespace_storage_info.set_num_alive_documents(
- namespace_storage_info.num_alive_documents() + 1);
- if (usage_scores.usage_type1_count > 0) {
- namespace_storage_info.set_num_alive_documents_usage_type1(
- namespace_storage_info.num_alive_documents_usage_type1() + 1);
- }
- if (usage_scores.usage_type2_count > 0) {
- namespace_storage_info.set_num_alive_documents_usage_type2(
- namespace_storage_info.num_alive_documents_usage_type2() + 1);
- }
- if (usage_scores.usage_type3_count > 0) {
- namespace_storage_info.set_num_alive_documents_usage_type3(
- namespace_storage_info.num_alive_documents_usage_type3() + 1);
- }
- } else {
+ if (IsExpired(document_id)) {
++total_num_expired;
namespace_storage_info.set_num_expired_documents(
namespace_storage_info.num_expired_documents() + 1);
@@ -1340,6 +1359,22 @@
namespace_storage_info.set_num_expired_documents_usage_type3(
namespace_storage_info.num_expired_documents_usage_type3() + 1);
}
+ } else {
+ ++total_num_alive;
+ namespace_storage_info.set_num_alive_documents(
+ namespace_storage_info.num_alive_documents() + 1);
+ if (usage_scores.usage_type1_count > 0) {
+ namespace_storage_info.set_num_alive_documents_usage_type1(
+ namespace_storage_info.num_alive_documents_usage_type1() + 1);
+ }
+ if (usage_scores.usage_type2_count > 0) {
+ namespace_storage_info.set_num_alive_documents_usage_type2(
+ namespace_storage_info.num_alive_documents_usage_type2() + 1);
+ }
+ if (usage_scores.usage_type3_count > 0) {
+ namespace_storage_info.set_num_alive_documents_usage_type3(
+ namespace_storage_info.num_alive_documents_usage_type3() + 1);
+ }
}
}
@@ -1422,16 +1457,9 @@
int size = document_id_mapper_->num_elements();
for (DocumentId document_id = 0; document_id < size; document_id++) {
- auto exists_or = DoesDocumentExistAndGetFileOffset(document_id);
- if (absl_ports::IsNotFound(exists_or.status())) {
+ if (!InternalDoesDocumentExist(document_id)) {
// Skip nonexistent documents
continue;
- } else if (!exists_or.ok()) {
- // Real error, pass up
- return absl_ports::Annotate(
- exists_or.status(),
- IcingStringUtil::StringPrintf("Failed to retrieve DocumentId %d",
- document_id));
}
// Guaranteed that the document exists now.
@@ -1508,11 +1536,9 @@
for (DocumentId document_id = 0; document_id < size; document_id++) {
auto document_or = Get(document_id, /*clear_internal_fields=*/false);
if (absl_ports::IsNotFound(document_or.status())) {
- // Don't optimize nonexistent documents, but collect stats
- auto location_or = document_id_mapper_->Get(document_id);
- if (location_or.ok() && *location_or.ValueOrDie() == kDocDeletedFlag) {
+ if (IsDeleted(document_id)) {
++num_deleted;
- } else {
+ } else if (IsExpired(document_id)) {
++num_expired;
}
continue;
@@ -1576,7 +1602,7 @@
int32_t num_documents = document_id_mapper_->num_elements();
for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
++document_id) {
- if (!DoesDocumentExist(document_id)) {
+ if (!InternalDoesDocumentExist(document_id)) {
++optimize_info.optimizable_docs;
}
@@ -1614,10 +1640,10 @@
ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
document_key_mapper_->GetElementsSize());
- // We don't include the namespace_mapper or the corpus_mapper because it's not
- // clear if we could recover any space even if Optimize were called. Deleting
- // 100s of documents could still leave a few documents of a namespace, and
- // then there would be no change.
+ // We don't include the namespace_mapper or the corpus_mapper because it's
+ // not clear if we could recover any space even if Optimize were called.
+ // Deleting 100s of documents could still leave a few documents of a
+ // namespace, and then there would be no change.
int64_t total_size = document_log_file_size + document_key_mapper_size +
document_id_mapper_file_size + score_cache_file_size +
@@ -1647,8 +1673,8 @@
libtextclassifier3::Status DocumentStore::ClearDerivedData(
DocumentId document_id) {
// We intentionally leave the data in key_mapper_ because locating that data
- // requires fetching namespace and uri. Leaving data in key_mapper_ should be
- // fine because the data is hashed.
+ // requires fetching namespace and uri. Leaving data in key_mapper_ should
+ // be fine because the data is hashed.
ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 832c470..a8d87c8 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -198,6 +198,12 @@
// Check if a document exists. Existence means it hasn't been deleted and it
// hasn't expired yet.
//
+ // NOTE: This should be used when callers don't care about error messages,
+ // expect documents to be deleted/not found, or in frequently called code
+ // paths that could cause performance issues. A significant amount of CPU
+ // cycles can be saved if we don't construct strings and create new Status
+ // objects on the heap. See b/185822483.
+ //
// Returns:
// boolean whether a document exists or not
bool DoesDocumentExist(DocumentId document_id) const;
@@ -625,22 +631,46 @@
libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
- // Helper method to validate the document id and return the file offset of the
- // associated document in document_log_.
- //
- // This can be a more informative call than just DoesDocumentExist because it
- // can return more status errors on whether the Document actually doesn't
- // exist or if there was an internal error while accessing files.
+ // Check if a document exists. Existence means it hasn't been deleted and it
+ // hasn't expired yet.
//
// Returns:
- // The file offset on success
+ // OK if the document exists
// INVALID_ARGUMENT if document_id is less than 0 or greater than the
// maximum value
// NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> DoesDocumentExistAndGetFileOffset(
+ libtextclassifier3::Status DoesDocumentExistWithStatus(
DocumentId document_id) const;
+ // Check if a document exists. Existence means it hasn't been deleted and it
+ // hasn't expired yet.
+ //
+ // This is for internal-use only because we assume that the document_id is
+ // already valid. If you're unsure if the document_id is valid, use
+ // DoesDocumentExist(document_id) instead, which will perform those additional
+ // checks.
+ //
+ // Returns:
+ // boolean whether a document exists or not
+ bool InternalDoesDocumentExist(DocumentId document_id) const;
+
+ // Checks if a document has been deleted.
+ //
+ // This is for internal-use only because we assume that the document_id is
+ // already valid. If you're unsure if the document_id is valid, use
+ // DoesDocumentExist(document_id) instead, which will perform those additional
+ // checks.
+ bool IsDeleted(DocumentId document_id) const;
+
+ // Checks if a document has expired.
+ //
+ // This is for internal-use only because we assume that the document_id is
+ // already valid. If you're unsure if the document_id is valid, use
+ // DoesDocumentExist(document_id) instead, which will perform those additional
+ // checks.
+ bool IsExpired(DocumentId document_id) const;
+
// Updates the entry in the score cache for document_id.
libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
DocumentId document_id, const DocumentAssociatedScoreData& score_data);
diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc
new file mode 100644
index 0000000..f68e115
--- /dev/null
+++ b/icing/store/document-store_benchmark.cc
@@ -0,0 +1,174 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/clock.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/store:document-store_benchmark
+//
+// $ blaze-bin/icing/store/document-store_benchmark
+// --benchmarks=all --benchmark_memory_usage
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/store:document-store_benchmark
+//
+// $ adb push blaze-bin/icing/store/document-store_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/document-store_benchmark
+// --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto_Cardinality_Code_OPTIONAL;
+
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+ StringIndexingConfig_TokenizerType_Code_PLAIN;
+
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+
+class DestructibleDirectory {
+ public:
+ explicit DestructibleDirectory(const Filesystem& filesystem,
+ const std::string& dir)
+ : filesystem_(filesystem), dir_(dir) {
+ filesystem_.CreateDirectoryRecursively(dir_.c_str());
+ }
+ ~DestructibleDirectory() {
+ filesystem_.DeleteDirectoryRecursively(dir_.c_str());
+ }
+
+ private:
+ Filesystem filesystem_;
+ std::string dir_;
+};
+
+DocumentProto CreateDocument(const std::string namespace_,
+ const std::string uri) {
+ return DocumentBuilder()
+ .SetKey(namespace_, uri)
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .Build();
+}
+
+SchemaProto CreateSchema() {
+ return SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+}
+
+std::unique_ptr<SchemaStore> CreateSchemaStore(Filesystem filesystem,
+ const std::string directory,
+ const Clock* clock) {
+ const std::string schema_store_dir = directory + "/schema";
+ filesystem.CreateDirectoryRecursively(schema_store_dir.data());
+ std::unique_ptr<SchemaStore> schema_store =
+ SchemaStore::Create(&filesystem, schema_store_dir, clock).ValueOrDie();
+
+ auto set_schema_status = schema_store->SetSchema(CreateSchema());
+ if (!set_schema_status.ok()) {
+ ICING_LOG(ERROR) << set_schema_status.status().error_message();
+ }
+
+ return schema_store;
+}
+
+void BM_DoesDocumentExistBenchmark(benchmark::State& state) {
+ Filesystem filesystem;
+ Clock clock;
+
+ std::string directory = GetTestTempDir() + "/icing";
+ DestructibleDirectory ddir(filesystem, directory);
+
+ std::string document_store_dir = directory + "/store";
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, directory, &clock);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ int max_document_id = 300000;
+ for (int i = 0; i < max_document_id; ++i) {
+ // Put and delete a lot of documents to fill up our derived files with
+ // stuff.
+ ICING_ASSERT_OK(document_store->Put(
+ CreateDocument("namespace", /*uri=*/std::to_string(i))));
+ document_store->Delete("namespace", /*uri=*/std::to_string(i));
+ }
+
+ std::default_random_engine random;
+ std::uniform_int_distribution<> dist(1, max_document_id);
+ for (auto s : state) {
+ // Check random document ids to see if they exist. Hopefully to simulate
+ // page faulting in different sections of our mmapped derived files.
+ int document_id = dist(random);
+ benchmark::DoNotOptimize(document_store->DoesDocumentExist(document_id));
+ }
+}
+BENCHMARK(BM_DoesDocumentExistBenchmark);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 42aabde..076d642 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -327,7 +327,7 @@
EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1)));
}
-TEST_F(DocumentStoreTest, IsDocumentExisting) {
+TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) {
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc
index 6a017ef..cfd20c2 100644
--- a/icing/testing/snippet-helpers.cc
+++ b/icing/testing/snippet-helpers.cc
@@ -61,8 +61,8 @@
std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
std::vector<std::string_view> windows;
for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
- windows.push_back(
- content.substr(match.window_position(), match.window_bytes()));
+ windows.push_back(content.substr(match.window_byte_position(),
+ match.window_byte_length()));
}
return windows;
}
@@ -71,8 +71,8 @@
std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
std::vector<std::string_view> matches;
for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
- matches.push_back(content.substr(match.exact_match_position(),
- match.exact_match_bytes()));
+ matches.push_back(content.substr(match.exact_match_byte_position(),
+ match.exact_match_byte_length()));
}
return matches;
}
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index bca0223..2019033 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -45,6 +45,8 @@
import com.google.android.icing.proto.SearchResultProto;
import com.google.android.icing.proto.SearchSpecProto;
import com.google.android.icing.proto.SetSchemaResultProto;
+import com.google.android.icing.proto.SnippetMatchProto;
+import com.google.android.icing.proto.SnippetProto;
import com.google.android.icing.proto.StatusProto;
import com.google.android.icing.proto.StorageInfoResultProto;
import com.google.android.icing.proto.StringIndexingConfig;
@@ -486,6 +488,140 @@
assertStatusOk(reportUsageResultProto.getStatus());
}
+ @Test
+ public void testCJKTSnippets() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+ assertStatusOk(
+ icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF16 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ String chinese = "我每天走路去上班。";
+ assertThat(chinese.length()).isEqualTo(9);
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese))
+ .build();
+ assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("每")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+ ResultSpecProto resultSpecProto =
+ ResultSpecProto.newBuilder()
+ .setSnippetSpec(
+ ResultSpecProto.SnippetSpecProto.newBuilder()
+ .setNumToSnippet(Integer.MAX_VALUE)
+ .setNumMatchesPerProperty(Integer.MAX_VALUE))
+ .build();
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto searchResultProto =
+ icingSearchEngine.search(
+ searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+ assertStatusOk(searchResultProto.getStatus());
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+ assertThat(snippetProto.getEntriesList()).hasSize(1);
+ SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+ assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+ // Get the content for "subject" and see what the match is.
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument.getPropertiesList()).hasSize(1);
+ PropertyProto subjectProperty = resultDocument.getProperties(0);
+ assertThat(subjectProperty.getName()).isEqualTo("subject");
+ assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+ String content = subjectProperty.getStringValues(0);
+
+ // Ensure that there is one and only one match within "subject"
+ assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+ SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+ int matchStart = matchProto.getExactMatchUtf16Position();
+ int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+ assertThat(matchStart).isEqualTo(1);
+ assertThat(matchEnd).isEqualTo(3);
+ String match = content.substring(matchStart, matchEnd);
+ assertThat(match).isEqualTo("每天");
+ }
+
+ @Test
+ public void testUtf16MultiByteSnippets() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+ assertStatusOk(
+ icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^ ^
+ // UTF16 idx: 0 5 10
+ // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+ String text = "𐀀𐀁 𐀂𐀃 𐀄";
+ assertThat(text.length()).isEqualTo(12);
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(text))
+ .build();
+ assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("𐀂")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+ ResultSpecProto resultSpecProto =
+ ResultSpecProto.newBuilder()
+ .setSnippetSpec(
+ ResultSpecProto.SnippetSpecProto.newBuilder()
+ .setNumToSnippet(Integer.MAX_VALUE)
+ .setNumMatchesPerProperty(Integer.MAX_VALUE))
+ .build();
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto searchResultProto =
+ icingSearchEngine.search(
+ searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+ assertStatusOk(searchResultProto.getStatus());
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+ assertThat(snippetProto.getEntriesList()).hasSize(1);
+ SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+ assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+ // Get the content for "subject" and see what the match is.
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument.getPropertiesList()).hasSize(1);
+ PropertyProto subjectProperty = resultDocument.getProperties(0);
+ assertThat(subjectProperty.getName()).isEqualTo("subject");
+ assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+ String content = subjectProperty.getStringValues(0);
+
+ // Ensure that there is one and only one match within "subject"
+ assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+ SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+ int matchStart = matchProto.getExactMatchUtf16Position();
+ int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+ assertThat(matchStart).isEqualTo(5);
+ assertThat(matchEnd).isEqualTo(9);
+ String match = content.substring(matchStart, matchEnd);
+ assertThat(match).isEqualTo("𐀂𐀃");
+ }
+
private static void assertStatusOk(StatusProto status) {
assertWithMessage(status.getMessage()).that(status.getCode()).isEqualTo(StatusProto.Code.OK);
}
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index 4e48ad7..66fdbe6 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -136,18 +136,29 @@
}
// The representation of a single match within a DocumentProto property.
-// Next tag: 6
+// Next tag: 10
message SnippetMatchProto {
- // The position and length within the matched string at which the exact
- // match begins.
- optional int32 exact_match_position = 2;
+ // The index of the byte in the string at which the match begins and the
+ // length in bytes of the match.
+ optional int32 exact_match_byte_position = 2;
+ optional int32 exact_match_byte_length = 3;
- optional int32 exact_match_bytes = 3;
+ // The index of the UTF-16 code unit in the string at which the match begins
+ // and the length in UTF-16 code units of the match. This is for use with
+ // UTF-16 encoded strings like Java.lang.String.
+ optional int32 exact_match_utf16_position = 6;
+ optional int32 exact_match_utf16_length = 7;
- // The position and length of the suggested snippet window.
- optional int32 window_position = 4;
+ // The index of the byte in the string at which the suggested snippet window
+ // begins and the length in bytes of the window.
+ optional int32 window_byte_position = 4;
+ optional int32 window_byte_length = 5;
- optional int32 window_bytes = 5;
+ // The index of the UTF-16 code unit in the string at which the suggested
+ // snippet window begins and the length in UTF-16 code units of the window.
+ // This is for use with UTF-16 encoded strings like Java.lang.String.
+ optional int32 window_utf16_position = 8;
+ optional int32 window_utf16_length = 9;
reserved 1;
}
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 85538b5..e8a1601 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=370944273)
+set(synced_AOSP_CL_number=372608866)