Merge remote-tracking branch 'goog/androidx-platform-dev' into sc-dev am: b7ee27c61c am: f44c7456c4 am: 221895a694

Original change: https://googleplex-android-review.googlesource.com/c/platform/external/icing/+/14347420

Change-Id: I8254f99378e1ac65fb1fc7d758486d024b29498f
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a740924..70f6852 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,7 +45,7 @@
 # Compile libandroidicu
 set(ICU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../icu/libandroidicu")
 set(ICU_TARGET_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/icu-target")
-add_subdirectory(${ICU_SOURCE_DIR} ${ICU_TARGET_BINARY_DIR})
+add_subdirectory("${ICU_SOURCE_DIR}/static_shim" ${ICU_TARGET_BINARY_DIR})
 
 # Glob Icing proto sources. Results look like this: icing/proto/document.proto
 file(
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 3258d64..a281f22 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -7035,6 +7035,68 @@
   ASSERT_THAT(result->snippet().entries(), IsEmpty());
 }
 
+TEST_F(IcingSearchEngineTest, CJKSnippetTest) {
+  IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+  ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+  ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+  // String:     "我每天走路去上班。"
+  //              ^ ^  ^   ^^
+  // UTF8 idx:    0 3  9  15 18
+  // UTF16 idx:   0 1  3   5 6
+  // Breaks into segments: "我", "每天", "走路", "去", "上班"
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace", "uri1")
+                               .SetSchema("Message")
+                               .AddStringProperty("body", kChinese)
+                               .Build();
+  ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+  // Search and request snippet matching but no windowing.
+  SearchSpecProto search_spec;
+  search_spec.set_query("走");
+  search_spec.set_term_match_type(MATCH_PREFIX);
+
+  ResultSpecProto result_spec;
+  result_spec.mutable_snippet_spec()->set_num_to_snippet(
+      std::numeric_limits<int>::max());
+  result_spec.mutable_snippet_spec()->set_num_matches_per_property(
+      std::numeric_limits<int>::max());
+
+  // Search and make sure that we got a single successful result
+  SearchResultProto search_results = icing.Search(
+      search_spec, ScoringSpecProto::default_instance(), result_spec);
+  ASSERT_THAT(search_results.status(), ProtoIsOk());
+  ASSERT_THAT(search_results.results(), SizeIs(1));
+  const SearchResultProto::ResultProto* result = &search_results.results(0);
+  EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+  // Ensure that one and only one property was matched and it was "body"
+  ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("body"));
+
+  // Get the content for "body" and see what the match is.
+  std::string_view content = GetString(&result->document(), "body");
+  ASSERT_THAT(content, Eq(kChinese));
+
+  // Ensure that there is one and only one match within "body"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(9));
+  EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(6));
+  std::string_view match =
+      content.substr(match_proto.exact_match_byte_position(),
+                     match_proto.exact_match_byte_length());
+  ASSERT_THAT(match, Eq("走路"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
 }  // namespace
 }  // namespace lib
 }  // namespace icing
diff --git a/icing/portable/endian.h b/icing/portable/endian.h
new file mode 100644
index 0000000..42f6c02
--- /dev/null
+++ b/icing/portable/endian.h
@@ -0,0 +1,206 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Utility functions that depend on bytesex. We define htonll and ntohll,
+// as well as "Google" versions of all the standards: ghtonl, ghtons, and
+// so on. These functions do exactly the same as their standard variants,
+// but don't require including the dangerous netinet/in.h.
+
+#ifndef ICING_PORTABLE_ENDIAN_H_
+#define ICING_PORTABLE_ENDIAN_H_
+
+#include <cstdint>
+
+// IS_LITTLE_ENDIAN, IS_BIG_ENDIAN
+#if defined OS_LINUX || defined OS_ANDROID || defined(__ANDROID__)
+// _BIG_ENDIAN
+#include <endian.h>
+
+#elif defined(__APPLE__)
+
+// BIG_ENDIAN
+#include <machine/endian.h>  // NOLINT(build/include)
+
+/* Let's try and follow the Linux convention */
+#define __BYTE_ORDER BYTE_ORDER
+#define __LITTLE_ENDIAN LITTLE_ENDIAN
+#define __BIG_ENDIAN BIG_ENDIAN
+
+#endif  // operating system
+
+// defines __BYTE_ORDER for MSVC
+#ifdef COMPILER_MSVC
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#else  // COMPILER_MSVC
+
+// define the macros IS_LITTLE_ENDIAN or IS_BIG_ENDIAN
+// using the above endian definitions from endian.h if
+// endian.h was included
+#ifdef __BYTE_ORDER
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#endif  // __BYTE_ORDER == __LITTLE_ENDIAN
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IS_BIG_ENDIAN
+#endif  // __BYTE_ORDER == __BIG_ENDIAN
+
+#else  // __BYTE_ORDER
+
+#if defined(__LITTLE_ENDIAN__)
+#define IS_LITTLE_ENDIAN
+#elif defined(__BIG_ENDIAN__)
+#define IS_BIG_ENDIAN
+#endif  // __LITTLE_ENDIAN__ or __BIG_ENDIAN__
+
+#endif  // __BYTE_ORDER
+#endif  // COMPILER_MSVC
+
+// byte swap functions (bswap_16, bswap_32, bswap_64).
+// byte swap functions reverse the order of bytes, e.g.
+//   byteswap of 102030405060708 = 807060504030201
+//   byteswap of 1020304 = 4030201
+
+// The following guarantees declaration of the byte swap functions
+#ifdef COMPILER_MSVC
+#include <stdlib.h>  // NOLINT(build/include)
+
+#define bswap_16(x) _byteswap_ushort(x)
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+
+#define bswap_16(x) OSSwapInt16(x)
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#elif defined(__GLIBC__) || defined(__BIONIC__) || defined(__ASYLO__)
+#include <byteswap.h>  // IWYU pragma: export
+
+#else  // built-in byteswap functions
+
+static inline uint16 bswap_16(uint16 x) {
+#ifdef __cplusplus
+  return static_cast<uint16>(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));
+#else   // __cplusplus
+  return (uint16)(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));  // NOLINT
+#endif  // __cplusplus
+}
+#define bswap_16(x) bswap_16(x)
+static inline uint32 bswap_32(uint32 x) {
+  return (((x & 0xFF) << 24) | ((x & 0xFF00) << 8) | ((x & 0xFF0000) >> 8) |
+          ((x & 0xFF000000) >> 24));
+}
+#define bswap_32(x) bswap_32(x)
+static inline uint64 bswap_64(uint64 x) {
+  return (((x & (uint64_t)0xFF) << 56) | ((x & (uint64_t)0xFF00) << 40) |
+          ((x & (uint64_t)0xFF0000) << 24) | ((x & (uint64_t)0xFF000000) << 8) |
+          ((x & (uint64_t)0xFF00000000) >> 8) |
+          ((x & (uint64_t)0xFF0000000000) >> 24) |
+          ((x & (uint64_t)0xFF000000000000) >> 40) |
+          ((x & (uint64_t)0xFF00000000000000) >> 56));
+}
+#define bswap_64(x) bswap_64(x)
+
+#endif  // end byteswap functions
+
+// Use compiler byte-swapping intrinsics if they are available.  32-bit
+// and 64-bit versions are available in Clang and GCC as of GCC 4.3.0.
+// The 16-bit version is available in Clang and GCC only as of GCC 4.8.0.
+// For simplicity, we enable them all only for GCC 4.8.0 or later.
+#if defined(__clang__) || \
+    (defined(__GNUC__) && \
+     ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ >= 5))
+
+inline uint64_t gbswap_64(uint64_t host_int) {
+  return __builtin_bswap64(host_int);
+}
+inline uint32_t gbswap_32(uint32_t host_int) {
+  return __builtin_bswap32(host_int);
+}
+inline uint16_t gbswap_16(uint16_t host_int) {
+  return __builtin_bswap16(host_int);
+}
+
+#else  // intrinsics available
+
+inline uint64 gbswap_64(uint64 host_int) {
+#if defined(__GNUC__) && defined(__x86_64__) && \
+    !(defined(__APPLE__) && defined(__MACH__))
+  // Adapted from /usr/include/byteswap.h.  Not available on Mac.
+  if (__builtin_constant_p(host_int)) {
+    return __bswap_constant_64(host_int);
+  } else {
+    uint64 result;
+    __asm__("bswap %0" : "=r"(result) : "0"(host_int));
+    return result;
+  }
+#elif defined(bswap_64)
+  return bswap_64(host_int);
+#else   // bswap_64
+  return static_cast<uint64>(bswap_32(static_cast<uint32>(host_int >> 32))) |
+         (static_cast<uint64>(bswap_32(static_cast<uint32>(host_int))) << 32);
+#endif  // bswap_64
+}
+inline uint32 gbswap_32(uint32 host_int) { return bswap_32(host_int); }
+inline uint16 gbswap_16(uint16 host_int) { return bswap_16(host_int); }
+
+#endif  // intrinsics available
+
+#ifdef IS_LITTLE_ENDIAN
+
+// Definitions for ntohl etc. that don't require us to include
+// netinet/in.h. We wrap gbswap_32 and gbswap_16 in functions rather
+// than just #defining them because in debug mode, gcc doesn't
+// correctly handle the (rather involved) definitions of bswap_32.
+// gcc guarantees that inline functions are as fast as macros, so
+// this isn't a performance hit.
+inline uint16_t ghtons(uint16_t x) { return gbswap_16(x); }
+inline uint32_t ghtonl(uint32_t x) { return gbswap_32(x); }
+inline uint64_t ghtonll(uint64_t x) { return gbswap_64(x); }
+
+#elif defined IS_BIG_ENDIAN
+
+// These definitions are simpler on big-endian machines
+// These are functions instead of macros to avoid self-assignment warnings
+// on calls such as "i = ghtnol(i);".  This also provides type checking.
+inline uint16 ghtons(uint16 x) { return x; }
+inline uint32 ghtonl(uint32 x) { return x; }
+inline uint64 ghtonll(uint64 x) { return x; }
+
+#else  // bytesex
+#error \
+    "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined"  // NOLINT
+#endif  // bytesex
+
+#ifndef htonll
+// With the rise of 64-bit, some systems are beginning to define this.
+#define htonll(x) ghtonll(x)
+#endif  // htonll
+
+// ntoh* and hton* are the same thing for any size and bytesex,
+// since the function is an involution, i.e., its own inverse.
+inline uint16_t gntohs(uint16_t x) { return ghtons(x); }
+inline uint32_t gntohl(uint32_t x) { return ghtonl(x); }
+inline uint64_t gntohll(uint64_t x) { return ghtonll(x); }
+
+#ifndef ntohll
+#define ntohll(x) htonll(x)
+#endif  // ntohll
+
+#endif  // ICING_PORTABLE_ENDIAN_H_
diff --git a/icing/portable/platform.h b/icing/portable/platform.h
index 0cccd57..8712835 100644
--- a/icing/portable/platform.h
+++ b/icing/portable/platform.h
@@ -15,8 +15,6 @@
 #ifndef ICING_PORTABLE_PLATFORM_H_
 #define ICING_PORTABLE_PLATFORM_H_
 
-// This file is meant to hold util functions for tests that help the test
-// determine which platform-specific configuration it may be running in.
 namespace icing {
 namespace lib {
 
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 31a2e5f..dc9f8be 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -39,6 +39,7 @@
 #include "icing/tokenization/tokenizer-factory.h"
 #include "icing/tokenization/tokenizer.h"
 #include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
 #include "icing/util/i18n-utils.h"
 #include "icing/util/status-macros.h"
 
@@ -218,12 +219,21 @@
   std::string_view section_subcontent;
 };
 
+// Creates a snippet match proto for the match pointed to by the iterator and
+// char_iterator
+//
+// Returns:
+//   the SnippetMatchProto with the match information if successful
+//   INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
+//     in an invalid state
+//   ABORTED_ERROR - if an invalid utf-8 sequence is encountered
 libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
-    const SectionData& value, Tokenizer::Iterator* iterator) {
+    const SectionData& value, Tokenizer::Iterator* iterator,
+    const CharacterIterator& char_iterator) {
   SnippetMatchProto snippet_match;
   Token match = iterator->GetToken();
-  int match_pos = match.text.data() - value.section_subcontent.data();
+  int match_pos = char_iterator.utf8_index();
 
   // When finding boundaries,  we have a few cases:
   //
@@ -258,23 +268,42 @@
   int window_end_max_exclusive =
       match_mid + (snippet_spec.max_window_bytes() + 1) / 2;
 
-  snippet_match.set_exact_match_position(match_pos);
-  snippet_match.set_exact_match_bytes(match.text.length());
+  snippet_match.set_exact_match_byte_position(match_pos);
+  snippet_match.set_exact_match_utf16_position(char_iterator.utf16_index());
+
+  // Create character iterators to find the beginning and end of the window.
+  CharacterIterator forward_char_iterator(char_iterator);
+  CharacterIterator backwards_char_iterator(char_iterator);
+
+  if (!backwards_char_iterator.AdvanceToUtf8(match_pos + match.text.length())) {
+    return absl_ports::AbortedError("Could not retrieve valid utf8 character!");
+  }
+  snippet_match.set_exact_match_byte_length(match.text.length());
+  snippet_match.set_exact_match_utf16_length(
+      backwards_char_iterator.utf16_index() - char_iterator.utf16_index());
 
   // Only include windows if it'll at least include the matched text. Otherwise,
   // it'll just be an empty string anyways.
   if (snippet_spec.max_window_bytes() >= match.text.length()) {
     // Find the beginning of the window.
     int window_start;
+    int window_start_utf16;
     if (window_start_min_exclusive < 0) {
       window_start = 0;
+      window_start_utf16 = 0;
     } else {
       ICING_ASSIGN_OR_RETURN(
           window_start,
           DetermineWindowStart(snippet_spec, value.section_subcontent,
                                window_start_min_exclusive, iterator));
+      if (!forward_char_iterator.RewindToUtf8(window_start)) {
+        return absl_ports::AbortedError(
+            "Could not retrieve valid utf8 character!");
+      }
+      window_start_utf16 = forward_char_iterator.utf16_index();
     }
-    snippet_match.set_window_position(window_start);
+    snippet_match.set_window_byte_position(window_start);
+    snippet_match.set_window_utf16_position(window_start_utf16);
 
     // Find the end of the window.
     int window_end_exclusive;
@@ -286,7 +315,13 @@
           DetermineWindowEnd(snippet_spec, value.section_subcontent,
                              window_end_max_exclusive, iterator));
     }
-    snippet_match.set_window_bytes(window_end_exclusive - window_start);
+    if (!backwards_char_iterator.AdvanceToUtf8(window_end_exclusive)) {
+      return absl_ports::AbortedError(
+          "Could not retrieve valid utf8 character!");
+    }
+    snippet_match.set_window_byte_length(window_end_exclusive - window_start);
+    snippet_match.set_window_utf16_length(
+        backwards_char_iterator.utf16_index() - window_start_utf16);
 
     // DetermineWindowStart/End may change the position of the iterator. So,
     // reset the iterator back to the original position.
@@ -332,16 +367,38 @@
     std::string_view value = current_property->string_values(i);
     std::unique_ptr<Tokenizer::Iterator> iterator =
         tokenizer->Tokenize(value).ValueOrDie();
+    CharacterIterator char_iterator(value);
     while (iterator->Advance()) {
       Token token = iterator->GetToken();
       if (matcher->Matches(token)) {
-        // If there was an error while retrieving the match, the tokenizer
-        // iterator is probably in an invalid state. There's nothing we can do
-        // here, so just return.
+        if (!char_iterator.AdvanceToUtf8(token.text.data() - value.data())) {
+          // We can't get the char_iterator to a valid position, so there's no
+          // way for us to provide valid utf-16 indices. There's nothing more we
+          // can do here, so just return whatever we've built up so far.
+          if (!snippet_entry.snippet_matches().empty()) {
+            *snippet_proto->add_entries() = std::move(snippet_entry);
+          }
+          return;
+        }
         SectionData data = {property_path, value};
-        SnippetMatchProto match =
-            RetrieveMatch(match_options->snippet_spec, data, iterator.get())
-                .ValueOrDie();
+        auto match_or = RetrieveMatch(match_options->snippet_spec, data,
+                                      iterator.get(), char_iterator);
+        if (!match_or.ok()) {
+          if (absl_ports::IsAborted(match_or.status())) {
+            // Only an aborted. We can't get this match, but we might be able to
+            // retrieve others. Just continue.
+            continue;
+          } else {
+            // Probably an internal error. The tokenizer iterator is probably in
+            // an invalid state. There's nothing more we can do here, so just
+            // return whatever we've built up so far.
+            if (!snippet_entry.snippet_matches().empty()) {
+              *snippet_proto->add_entries() = std::move(snippet_entry);
+            }
+            return;
+          }
+        }
+        SnippetMatchProto match = std::move(match_or).ValueOrDie();
         snippet_entry.mutable_snippet_matches()->Add(std::move(match));
         if (--match_options->max_matches_remaining <= 0) {
           *snippet_proto->add_entries() = std::move(snippet_entry);
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index ff38372..c052a9e 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -1134,6 +1134,201 @@
                   "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
 }
 
+TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
+  // String:     "我每天走路去上班。"
+  //              ^ ^  ^   ^^
+  // UTF8 idx:    0 3  9  15 18
+  // UTF16 idx:   0 1  3   5 6
+  // Breaks into segments: "我", "每天", "走路", "去", "上班"
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kChinese)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "subject"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
+TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
+  language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE);
+  ICING_ASSERT_OK_AND_ASSIGN(
+      language_segmenter_,
+      language_segmenter_factory::Create(std::move(options)));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      snippet_retriever_,
+      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+                               normalizer_.get()));
+
+  // String:     "我每天走路去上班。"
+  //              ^ ^  ^   ^^
+  // UTF8 idx:    0 3  9  15 18
+  // UTF16 idx:   0 1  3   5 6
+  // Breaks into segments: "我", "每天", "走路", "去", "上班"
+  constexpr std::string_view kChinese = "我每天走路去上班。";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kChinese)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+  // Set a twenty byte window. This will produce a window like this:
+  // String:     "我每天走路去上班。"
+  //                ^       ^
+  // UTF8 idx:      3       18
+  // UTF16 idx:     1       6
+  snippet_spec_.set_max_window_bytes(20);
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "subject"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetWindows(content, *entry), ElementsAre("每天走路去"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.window_utf16_position(), Eq(1));
+  EXPECT_THAT(match_proto.window_utf16_length(), Eq(5));
+}
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
+  // The following string has four-byte UTF-8 characters. Most importantly, it
+  // is also two code units in UTF-16.
+  // String:     "𐀀𐀁 𐀂𐀃 𐀄"
+  //              ^  ^  ^
+  // UTF8 idx:    0  9  18
+  // UTF16 idx:   0  5  10
+  // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+  constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kText)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "subject"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
+  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
+}
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
+  // The following string has four-byte UTF-8 characters. Most importantly, it
+  // is also two code units in UTF-16.
+  // String:     "𐀀𐀁 𐀂𐀃 𐀄"
+  //              ^  ^  ^
+  // UTF8 idx:    0  9  18
+  // UTF16 idx:   0  5  10
+  // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+  constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", kText)
+          .AddStringProperty("body",
+                             "Concerning the subject of foo, we need to begin "
+                             "considering our options regarding body bar.")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+  // Set a twenty byte window. This will produce a window like this:
+  // String:     "𐀀𐀁 𐀂𐀃 𐀄"
+  //                 ^   ^
+  // UTF8 idx:       9   22
+  // UTF16 idx:      5   12
+  snippet_spec_.set_max_window_bytes(20);
+
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // Ensure that one and only one property was matched and it was "subject"
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  EXPECT_THAT(entry->property_name(), Eq("subject"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+
+  // Ensure that there is one and only one match within "subject"
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+  // Ensure that the match is correct.
+  EXPECT_THAT(GetWindows(content, *entry), ElementsAre("𐀂𐀃 𐀄"));
+
+  // Ensure that the utf-16 values are also as expected
+  EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
+  EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
+}
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 9631e29..d79c861 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -459,44 +459,42 @@
       schema_type_id = schema_type_id_or.ValueOrDie();
     }
 
-      ICING_ASSIGN_OR_RETURN(
-          NamespaceId namespace_id,
-          namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
-                                      namespace_mapper_->num_keys()));
+    ICING_ASSIGN_OR_RETURN(
+        NamespaceId namespace_id,
+        namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
+                                    namespace_mapper_->num_keys()));
 
-      // Update corpus maps
-      std::string corpus =
-          MakeFingerprint(document_wrapper.document().namespace_(),
-                          document_wrapper.document().schema());
-      ICING_ASSIGN_OR_RETURN(
-          CorpusId corpusId,
-          corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
+    // Update corpus maps
+    std::string corpus =
+        MakeFingerprint(document_wrapper.document().namespace_(),
+                        document_wrapper.document().schema());
+    ICING_ASSIGN_OR_RETURN(
+        CorpusId corpusId,
+        corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
 
-      ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
-                             GetCorpusAssociatedScoreDataToUpdate(corpusId));
-      scoring_data.AddDocument(
-          document_wrapper.document().internal_fields().length_in_tokens());
+    ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
+                           GetCorpusAssociatedScoreDataToUpdate(corpusId));
+    scoring_data.AddDocument(
+        document_wrapper.document().internal_fields().length_in_tokens());
 
-      ICING_RETURN_IF_ERROR(
-          UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
+    ICING_RETURN_IF_ERROR(
+        UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
 
-      ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
-          new_document_id,
-          DocumentAssociatedScoreData(
-              corpusId, document_wrapper.document().score(),
-              document_wrapper.document().creation_timestamp_ms(),
-              document_wrapper.document()
-                  .internal_fields()
-                  .length_in_tokens())));
+    ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+        new_document_id,
+        DocumentAssociatedScoreData(
+            corpusId, document_wrapper.document().score(),
+            document_wrapper.document().creation_timestamp_ms(),
+            document_wrapper.document().internal_fields().length_in_tokens())));
 
-      int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
-          document_wrapper.document().creation_timestamp_ms(),
-          document_wrapper.document().ttl_ms());
+    int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
+        document_wrapper.document().creation_timestamp_ms(),
+        document_wrapper.document().ttl_ms());
 
-      ICING_RETURN_IF_ERROR(UpdateFilterCache(
-          new_document_id, DocumentFilterData(namespace_id, schema_type_id,
-                                              expiration_timestamp_ms)));
-      iterator_status = iterator.Advance();
+    ICING_RETURN_IF_ERROR(UpdateFilterCache(
+        new_document_id, DocumentFilterData(namespace_id, schema_type_id,
+                                            expiration_timestamp_ms)));
+    iterator_status = iterator.Advance();
   }
 
   if (!absl_ports::IsOutOfRange(iterator_status)) {
@@ -833,18 +831,20 @@
                                           expiration_timestamp_ms)));
 
   if (old_document_id_or.ok()) {
+    // The old document exists, copy over the usage scores and delete the old
+    // document.
     DocumentId old_document_id = old_document_id_or.ValueOrDie();
-    auto offset_or = DoesDocumentExistAndGetFileOffset(old_document_id);
 
-    if (offset_or.ok()) {
-      // The old document exists, copy over the usage scores.
-      ICING_RETURN_IF_ERROR(
-          usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
-                                         /*to_document_id=*/new_document_id));
+    ICING_RETURN_IF_ERROR(
+        usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
+                                       /*to_document_id=*/new_document_id));
 
-      // Delete the old document.
-      ICING_RETURN_IF_ERROR(document_log_->EraseProto(offset_or.ValueOrDie()));
-      ICING_RETURN_IF_ERROR(ClearDerivedData(old_document_id));
+    // Delete the old document. It's fine if it's not found since it might have
+    // been deleted previously.
+    auto delete_status = Delete(old_document_id);
+    if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
+      // Real error, pass it up.
+      return delete_status;
     }
   }
 
@@ -886,8 +886,16 @@
 
 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
     DocumentId document_id, bool clear_internal_fields) const {
-  ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
-                         DoesDocumentExistAndGetFileOffset(document_id));
+  ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
+
+  auto document_log_offset_or = document_id_mapper_->Get(document_id);
+  if (!document_log_offset_or.ok()) {
+    // Since we've just checked that our document_id is valid a few lines
+    // above, there's no reason this should fail and an error should never
+    // happen.
+    return absl_ports::InternalError("Failed to find document offset.");
+  }
+  int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
 
   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
   // that can support error logging.
@@ -938,7 +946,7 @@
     }
     const DocumentFilterData* data = status_or_data.ValueOrDie();
 
-    if (DoesDocumentExist(document_id)) {
+    if (InternalDoesDocumentExist(document_id)) {
       existing_namespace_ids.insert(data->namespace_id());
     }
   }
@@ -951,40 +959,74 @@
   return existing_namespaces;
 }
 
-libtextclassifier3::StatusOr<int64_t>
-DocumentStore::DoesDocumentExistAndGetFileOffset(DocumentId document_id) const {
+bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
   if (!IsDocumentIdValid(document_id)) {
-    return absl_ports::InvalidArgumentError(
-        IcingStringUtil::StringPrintf("DocumentId %d is invalid", document_id));
+    return false;
   }
 
-  auto file_offset_or = document_id_mapper_->Get(document_id);
-
-  bool deleted =
-      file_offset_or.ok() && *file_offset_or.ValueOrDie() == kDocDeletedFlag;
-  if (deleted || absl_ports::IsOutOfRange(file_offset_or.status())) {
-    // Document has been deleted or doesn't exist
-    return absl_ports::NotFoundError(
-        IcingStringUtil::StringPrintf("Document %d not found", document_id));
+  if (document_id >= document_id_mapper_->num_elements()) {
+    // Somehow got a validly constructed document_id that the document store
+    // doesn't know about
+    return false;
   }
 
-  ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
-                         filter_cache_->Get(document_id));
-  if (clock_.GetSystemTimeMilliseconds() >=
-      filter_data->expiration_timestamp_ms()) {
-    // Past the expiration time, so also return NOT FOUND since it *shouldn't*
-    // exist anymore.
-    return absl_ports::NotFoundError(
-        IcingStringUtil::StringPrintf("Document %d not found", document_id));
-  }
-
-  ICING_RETURN_IF_ERROR(file_offset_or.status());
-  return *file_offset_or.ValueOrDie();
+  return InternalDoesDocumentExist(document_id);
 }
 
-bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
-  // If we can successfully get the document log offset, the document exists.
-  return DoesDocumentExistAndGetFileOffset(document_id).ok();
+libtextclassifier3::Status DocumentStore::DoesDocumentExistWithStatus(
+    DocumentId document_id) const {
+  if (!IsDocumentIdValid(document_id)) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Document id '%d' invalid.", document_id));
+  }
+
+  if (document_id >= document_id_mapper_->num_elements()) {
+    // Somehow got a validly constructed document_id that the document store
+    // doesn't know about.
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Unknown document id '%d'.", document_id));
+  }
+
+  if (!InternalDoesDocumentExist(document_id)) {
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Document id '%d' doesn't exist", document_id));
+  };
+  return libtextclassifier3::Status::OK;
+}
+
+bool DocumentStore::InternalDoesDocumentExist(DocumentId document_id) const {
+  return !IsDeleted(document_id) && !IsExpired(document_id);
+}
+
+bool DocumentStore::IsDeleted(DocumentId document_id) const {
+  auto file_offset_or = document_id_mapper_->Get(document_id);
+  if (!file_offset_or.ok()) {
+    // This would only happen if document_id is out of range of the
+    // document_id_mapper, meaning we got some invalid document_id. Callers
+    // should already have checked that their document_id is valid or used
+    // DoesDocumentExist(WithStatus). Regardless, return true since the
+    // document doesn't exist.
+    return true;
+  }
+  int64_t file_offset = *file_offset_or.ValueOrDie();
+  return file_offset == kDocDeletedFlag;
+}
+
+bool DocumentStore::IsExpired(DocumentId document_id) const {
+  auto filter_data_or = filter_cache_->Get(document_id);
+  if (!filter_data_or.ok()) {
+    // This would only happen if document_id is out of range of the
+    // filter_cache, meaning we got some invalid document_id. Callers should
+    // already have checked that their document_id is valid or used
+    // DoesDocumentExist(WithStatus). Regardless, return true since the
+    // document doesn't exist.
+    return true;
+  }
+  const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
+
+  // Check if it's past the expiration time
+  return clock_.GetSystemTimeMilliseconds() >=
+         filter_data->expiration_timestamp_ms();
 }
 
 libtextclassifier3::Status DocumentStore::Delete(
@@ -1001,9 +1043,14 @@
 }
 
 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id) {
-  // Copy out the document to get namespace and uri.
-  ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
-                         DoesDocumentExistAndGetFileOffset(document_id));
+  ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id));
+
+  auto document_log_offset_or = document_id_mapper_->Get(document_id);
+  if (!document_log_offset_or.ok()) {
+    return absl_ports::InternalError("Failed to find document offset.");
+  }
+  int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
+
   // Erases document proto.
   ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
   return ClearDerivedData(document_id);
@@ -1185,8 +1232,8 @@
       continue;
     }
 
-    // The document has the desired namespace and schema type, it either exists
-    // or has expired.
+    // The document has the desired namespace and schema type, it either
+    // exists or has expired.
     libtextclassifier3::Status delete_status = Delete(document_id);
     if (absl_ports::IsNotFound(delete_status)) {
       continue;
@@ -1262,15 +1309,9 @@
   for (DocumentId document_id = 0;
        document_id < document_id_mapper_->num_elements(); ++document_id) {
     // Check if it's deleted first.
-    auto location_or = document_id_mapper_->Get(document_id);
-    if (!location_or.ok()) {
-      ICING_VLOG(1) << "Error trying to get document offsets for document "
-                       "store storage info counts.";
-      continue;
-    }
-    if (*location_or.ValueOrDie() == kDocDeletedFlag) {
-      // We don't have the namespace id of hard deleted documents anymore, so we
-      // can't add to our namespace storage info.
+    if (IsDeleted(document_id)) {
+      // We don't have the namespace id of hard deleted documents anymore, so
+      // we can't add to our namespace storage info.
       ++total_num_deleted;
       continue;
     }
@@ -1308,23 +1349,7 @@
     UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
 
     // Update our stats
-    if (DoesDocumentExist(document_id)) {
-      ++total_num_alive;
-      namespace_storage_info.set_num_alive_documents(
-          namespace_storage_info.num_alive_documents() + 1);
-      if (usage_scores.usage_type1_count > 0) {
-        namespace_storage_info.set_num_alive_documents_usage_type1(
-            namespace_storage_info.num_alive_documents_usage_type1() + 1);
-      }
-      if (usage_scores.usage_type2_count > 0) {
-        namespace_storage_info.set_num_alive_documents_usage_type2(
-            namespace_storage_info.num_alive_documents_usage_type2() + 1);
-      }
-      if (usage_scores.usage_type3_count > 0) {
-        namespace_storage_info.set_num_alive_documents_usage_type3(
-            namespace_storage_info.num_alive_documents_usage_type3() + 1);
-      }
-    } else {
+    if (IsExpired(document_id)) {
       ++total_num_expired;
       namespace_storage_info.set_num_expired_documents(
           namespace_storage_info.num_expired_documents() + 1);
@@ -1340,6 +1365,22 @@
         namespace_storage_info.set_num_expired_documents_usage_type3(
             namespace_storage_info.num_expired_documents_usage_type3() + 1);
       }
+    } else {
+      ++total_num_alive;
+      namespace_storage_info.set_num_alive_documents(
+          namespace_storage_info.num_alive_documents() + 1);
+      if (usage_scores.usage_type1_count > 0) {
+        namespace_storage_info.set_num_alive_documents_usage_type1(
+            namespace_storage_info.num_alive_documents_usage_type1() + 1);
+      }
+      if (usage_scores.usage_type2_count > 0) {
+        namespace_storage_info.set_num_alive_documents_usage_type2(
+            namespace_storage_info.num_alive_documents_usage_type2() + 1);
+      }
+      if (usage_scores.usage_type3_count > 0) {
+        namespace_storage_info.set_num_alive_documents_usage_type3(
+            namespace_storage_info.num_alive_documents_usage_type3() + 1);
+      }
     }
   }
 
@@ -1422,16 +1463,9 @@
 
   int size = document_id_mapper_->num_elements();
   for (DocumentId document_id = 0; document_id < size; document_id++) {
-    auto exists_or = DoesDocumentExistAndGetFileOffset(document_id);
-    if (absl_ports::IsNotFound(exists_or.status())) {
+    if (!InternalDoesDocumentExist(document_id)) {
       // Skip nonexistent documents
       continue;
-    } else if (!exists_or.ok()) {
-      // Real error, pass up
-      return absl_ports::Annotate(
-          exists_or.status(),
-          IcingStringUtil::StringPrintf("Failed to retrieve DocumentId %d",
-                                        document_id));
     }
 
     // Guaranteed that the document exists now.
@@ -1508,11 +1542,9 @@
   for (DocumentId document_id = 0; document_id < size; document_id++) {
     auto document_or = Get(document_id, /*clear_internal_fields=*/false);
     if (absl_ports::IsNotFound(document_or.status())) {
-      // Don't optimize nonexistent documents, but collect stats
-      auto location_or = document_id_mapper_->Get(document_id);
-      if (location_or.ok() && *location_or.ValueOrDie() == kDocDeletedFlag) {
+      if (IsDeleted(document_id)) {
         ++num_deleted;
-      } else {
+      } else if (IsExpired(document_id)) {
         ++num_expired;
       }
       continue;
@@ -1576,7 +1608,7 @@
   int32_t num_documents = document_id_mapper_->num_elements();
   for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
        ++document_id) {
-    if (!DoesDocumentExist(document_id)) {
+    if (!InternalDoesDocumentExist(document_id)) {
       ++optimize_info.optimizable_docs;
     }
 
@@ -1614,10 +1646,10 @@
   ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
                          document_key_mapper_->GetElementsSize());
 
-  // We don't include the namespace_mapper or the corpus_mapper because it's not
-  // clear if we could recover any space even if Optimize were called. Deleting
-  // 100s of documents could still leave a few documents of a namespace, and
-  // then there would be no change.
+  // We don't include the namespace_mapper or the corpus_mapper because it's
+  // not clear if we could recover any space even if Optimize were called.
+  // Deleting 100s of documents could still leave a few documents of a
+  // namespace, and then there would be no change.
 
   int64_t total_size = document_log_file_size + document_key_mapper_size +
                        document_id_mapper_file_size + score_cache_file_size +
@@ -1647,8 +1679,8 @@
 libtextclassifier3::Status DocumentStore::ClearDerivedData(
     DocumentId document_id) {
   // We intentionally leave the data in key_mapper_ because locating that data
-  // requires fetching namespace and uri. Leaving data in key_mapper_ should be
-  // fine because the data is hashed.
+  // requires fetching namespace and uri. Leaving data in key_mapper_ should
+  // be fine because the data is hashed.
 
   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
 
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 832c470..a8d87c8 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -198,6 +198,12 @@
   // Check if a document exists. Existence means it hasn't been deleted and it
   // hasn't expired yet.
   //
+  // NOTE: This should be used when callers don't care about error messages,
+  // expect documents to be deleted/not found, or in frequently called code
+  // paths that could cause performance issues. A significant amount of CPU
+  // cycles can be saved if we don't construct strings and create new Status
+  // objects on the heap. See b/185822483.
+  //
   // Returns:
   //   boolean whether a document exists or not
   bool DoesDocumentExist(DocumentId document_id) const;
@@ -625,22 +631,46 @@
   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
 
-  // Helper method to validate the document id and return the file offset of the
-  // associated document in document_log_.
-  //
-  // This can be a more informative call than just DoesDocumentExist because it
-  // can return more status errors on whether the Document actually doesn't
-  // exist or if there was an internal error while accessing files.
+  // Check if a document exists. Existence means it hasn't been deleted and it
+  // hasn't expired yet.
   //
   // Returns:
-  //   The file offset on success
+  //   OK if the document exists
   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
   //                    maximum value
   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
   //   INTERNAL_ERROR on IO error
-  libtextclassifier3::StatusOr<int64_t> DoesDocumentExistAndGetFileOffset(
+  libtextclassifier3::Status DoesDocumentExistWithStatus(
       DocumentId document_id) const;
 
+  // Check if a document exists. Existence means it hasn't been deleted and it
+  // hasn't expired yet.
+  //
+  // This is for internal-use only because we assume that the document_id is
+  // already valid. If you're unsure if the document_id is valid, use
+  // DoesDocumentExist(document_id) instead, which will perform those additional
+  // checks.
+  //
+  // Returns:
+  //   boolean whether a document exists or not
+  bool InternalDoesDocumentExist(DocumentId document_id) const;
+
+  // Checks if a document has been deleted
+  //
+  // This is for internal-use only because we assume that the document_id is
+  // already valid. If you're unsure if the document_id is valid, use
+  // DoesDocumentExist(document_id) instead, which will perform those additional
+  // checks.
+  bool IsDeleted(DocumentId document_id) const;
+
+  // Checks if a document has expired.
+  //
+  // This is for internal-use only because we assume that the document_id is
+  // already valid. If you're unsure if the document_id is valid, use
+  // DoesDocumentExist(document_id) instead, which will perform those additional
+  // checks.
+  bool IsExpired(DocumentId document_id) const;
+
   // Updates the entry in the score cache for document_id.
   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
       DocumentId document_id, const DocumentAssociatedScoreData& score_data);
diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc
new file mode 100644
index 0000000..f68e115
--- /dev/null
+++ b/icing/store/document-store_benchmark.cc
@@ -0,0 +1,174 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/clock.h"
+
+// Run on a Linux workstation:
+//    $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+//    //icing/store:document-store_benchmark
+//
+//    $ blaze-bin/icing/store/document-store_benchmark
+//    --benchmarks=all --benchmark_memory_usage
+//
+// Run on an Android device:
+//    $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+//    --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+//    //icing/store:document-store_benchmark
+//
+//    $ adb push blaze-bin/icing/store/document-store_benchmark
+//    /data/local/tmp/
+//
+//    $ adb shell /data/local/tmp/document-store_benchmark
+//    --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
+    PropertyConfigProto_Cardinality_Code_OPTIONAL;
+
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
+    StringIndexingConfig_TokenizerType_Code_PLAIN;
+
+constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+
+class DestructibleDirectory {
+ public:
+  explicit DestructibleDirectory(const Filesystem& filesystem,
+                                 const std::string& dir)
+      : filesystem_(filesystem), dir_(dir) {
+    filesystem_.CreateDirectoryRecursively(dir_.c_str());
+  }
+  ~DestructibleDirectory() {
+    filesystem_.DeleteDirectoryRecursively(dir_.c_str());
+  }
+
+ private:
+  Filesystem filesystem_;
+  std::string dir_;
+};
+
+DocumentProto CreateDocument(const std::string namespace_,
+                             const std::string uri) {
+  return DocumentBuilder()
+      .SetKey(namespace_, uri)
+      .SetSchema("email")
+      .AddStringProperty("subject", "subject foo")
+      .AddStringProperty("body", "body bar")
+      .Build();
+}
+
+SchemaProto CreateSchema() {
+  return SchemaBuilder()
+      .AddType(
+          SchemaTypeConfigBuilder()
+              .SetType("email")
+              .AddProperty(PropertyConfigBuilder()
+                               .SetName("subject")
+                               .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                               .SetCardinality(CARDINALITY_OPTIONAL))
+              .AddProperty(PropertyConfigBuilder()
+                               .SetName("body")
+                               .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+                               .SetCardinality(CARDINALITY_OPTIONAL)))
+      .Build();
+}
+
+std::unique_ptr<SchemaStore> CreateSchemaStore(Filesystem filesystem,
+                                               const std::string directory,
+                                               const Clock* clock) {
+  const std::string schema_store_dir = directory + "/schema";
+  filesystem.CreateDirectoryRecursively(schema_store_dir.data());
+  std::unique_ptr<SchemaStore> schema_store =
+      SchemaStore::Create(&filesystem, schema_store_dir, clock).ValueOrDie();
+
+  auto set_schema_status = schema_store->SetSchema(CreateSchema());
+  if (!set_schema_status.ok()) {
+    ICING_LOG(ERROR) << set_schema_status.status().error_message();
+  }
+
+  return schema_store;
+}
+
+void BM_DoesDocumentExistBenchmark(benchmark::State& state) {
+  Filesystem filesystem;
+  Clock clock;
+
+  std::string directory = GetTestTempDir() + "/icing";
+  DestructibleDirectory ddir(filesystem, directory);
+
+  std::string document_store_dir = directory + "/store";
+  std::unique_ptr<SchemaStore> schema_store =
+      CreateSchemaStore(filesystem, directory, &clock);
+
+  filesystem.CreateDirectoryRecursively(document_store_dir.data());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentStore::CreateResult create_result,
+      DocumentStore::Create(&filesystem, document_store_dir, &clock,
+                            schema_store.get()));
+  std::unique_ptr<DocumentStore> document_store =
+      std::move(create_result.document_store);
+
+  int max_document_id = 300000;
+  for (int i = 0; i < max_document_id; ++i) {
+    // Put and delete a lot of documents to fill up our derived files with
+    // stuff.
+    ICING_ASSERT_OK(document_store->Put(
+        CreateDocument("namespace", /*uri=*/std::to_string(i))));
+    document_store->Delete("namespace", /*uri=*/std::to_string(i));
+  }
+
+  std::default_random_engine random;
+  std::uniform_int_distribution<> dist(1, max_document_id);
+  for (auto s : state) {
+    // Check random document ids to see if they exist. Hopefully to simulate
+    // page faulting in different sections of our mmapped derived files.
+    int document_id = dist(random);
+    benchmark::DoNotOptimize(document_store->DoesDocumentExist(document_id));
+  }
+}
+BENCHMARK(BM_DoesDocumentExistBenchmark);
+
+}  // namespace
+
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 42aabde..ebc5ec3 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -327,7 +327,7 @@
   EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1)));
 }
 
-TEST_F(DocumentStoreTest, IsDocumentExisting) {
+TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -832,6 +832,19 @@
               IsOkAndHolds(EqualsProto(message_document)));
 }
 
+TEST_F(DocumentStoreTest, PutDeleteThenPut) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentStore::CreateResult create_result,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  std::unique_ptr<DocumentStore> doc_store =
+      std::move(create_result.document_store);
+  ICING_EXPECT_OK(doc_store->Put(test_document1_));
+  ICING_EXPECT_OK(
+      doc_store->Delete(test_document1_.namespace_(), test_document1_.uri()));
+  ICING_EXPECT_OK(doc_store->Put(test_document1_));
+}
+
 TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
   SchemaProto schema =
       SchemaBuilder()
diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc
index 6a017ef..cfd20c2 100644
--- a/icing/testing/snippet-helpers.cc
+++ b/icing/testing/snippet-helpers.cc
@@ -61,8 +61,8 @@
     std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
   std::vector<std::string_view> windows;
   for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
-    windows.push_back(
-        content.substr(match.window_position(), match.window_bytes()));
+    windows.push_back(content.substr(match.window_byte_position(),
+                                     match.window_byte_length()));
   }
   return windows;
 }
@@ -71,8 +71,8 @@
     std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
   std::vector<std::string_view> matches;
   for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
-    matches.push_back(content.substr(match.exact_match_position(),
-                                     match.exact_match_bytes()));
+    matches.push_back(content.substr(match.exact_match_byte_position(),
+                                     match.exact_match_byte_length()));
   }
   return matches;
 }
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index bca0223..2019033 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -45,6 +45,8 @@
 import com.google.android.icing.proto.SearchResultProto;
 import com.google.android.icing.proto.SearchSpecProto;
 import com.google.android.icing.proto.SetSchemaResultProto;
+import com.google.android.icing.proto.SnippetMatchProto;
+import com.google.android.icing.proto.SnippetProto;
 import com.google.android.icing.proto.StatusProto;
 import com.google.android.icing.proto.StorageInfoResultProto;
 import com.google.android.icing.proto.StringIndexingConfig;
@@ -486,6 +488,140 @@
     assertStatusOk(reportUsageResultProto.getStatus());
   }
 
+  @Test
+  public void testCJKTSnippets() throws Exception {
+    assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+    SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+    assertStatusOk(
+        icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+    // String:     "我每天走路去上班。"
+    //              ^ ^  ^   ^^
+    // UTF16 idx:   0 1  3   5 6
+    // Breaks into segments: "我", "每天", "走路", "去", "上班"
+    String chinese = "我每天走路去上班。";
+    assertThat(chinese.length()).isEqualTo(9);
+    DocumentProto emailDocument1 =
+        createEmailDocument("namespace", "uri1").toBuilder()
+            .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese))
+            .build();
+    assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+    // Search and request snippet matching but no windowing.
+    SearchSpecProto searchSpec =
+        SearchSpecProto.newBuilder()
+            .setQuery("每")
+            .setTermMatchType(TermMatchType.Code.PREFIX)
+            .build();
+    ResultSpecProto resultSpecProto =
+        ResultSpecProto.newBuilder()
+            .setSnippetSpec(
+                ResultSpecProto.SnippetSpecProto.newBuilder()
+                    .setNumToSnippet(Integer.MAX_VALUE)
+                    .setNumMatchesPerProperty(Integer.MAX_VALUE))
+            .build();
+
+    // Search and make sure that we got a single successful result
+    SearchResultProto searchResultProto =
+        icingSearchEngine.search(
+            searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+    assertStatusOk(searchResultProto.getStatus());
+    assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+    // Ensure that one and only one property was matched and it was "subject"
+    SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+    assertThat(snippetProto.getEntriesList()).hasSize(1);
+    SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+    assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+    // Get the content for "subject" and see what the match is.
+    DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+    assertThat(resultDocument.getPropertiesList()).hasSize(1);
+    PropertyProto subjectProperty = resultDocument.getProperties(0);
+    assertThat(subjectProperty.getName()).isEqualTo("subject");
+    assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+    String content = subjectProperty.getStringValues(0);
+
+    // Ensure that there is one and only one match within "subject"
+    assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+    SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+    int matchStart = matchProto.getExactMatchUtf16Position();
+    int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+    assertThat(matchStart).isEqualTo(1);
+    assertThat(matchEnd).isEqualTo(3);
+    String match = content.substring(matchStart, matchEnd);
+    assertThat(match).isEqualTo("每天");
+  }
+
+  @Test
+  public void testUtf16MultiByteSnippets() throws Exception {
+    assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+    SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+    assertStatusOk(
+        icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+    // String:    "𐀀𐀁 𐀂𐀃 𐀄"
+    //             ^  ^  ^
+    // UTF16 idx:  0  5  10
+    // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+    String text = "𐀀𐀁 𐀂𐀃 𐀄";
+    assertThat(text.length()).isEqualTo(12);
+    DocumentProto emailDocument1 =
+        createEmailDocument("namespace", "uri1").toBuilder()
+            .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(text))
+            .build();
+    assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+    // Search and request snippet matching but no windowing.
+    SearchSpecProto searchSpec =
+        SearchSpecProto.newBuilder()
+            .setQuery("𐀂")
+            .setTermMatchType(TermMatchType.Code.PREFIX)
+            .build();
+    ResultSpecProto resultSpecProto =
+        ResultSpecProto.newBuilder()
+            .setSnippetSpec(
+                ResultSpecProto.SnippetSpecProto.newBuilder()
+                    .setNumToSnippet(Integer.MAX_VALUE)
+                    .setNumMatchesPerProperty(Integer.MAX_VALUE))
+            .build();
+
+    // Search and make sure that we got a single successful result
+    SearchResultProto searchResultProto =
+        icingSearchEngine.search(
+            searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+    assertStatusOk(searchResultProto.getStatus());
+    assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+    // Ensure that one and only one property was matched and it was "subject"
+    SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+    assertThat(snippetProto.getEntriesList()).hasSize(1);
+    SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+    assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+    // Get the content for "subject" and see what the match is.
+    DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+    assertThat(resultDocument.getPropertiesList()).hasSize(1);
+    PropertyProto subjectProperty = resultDocument.getProperties(0);
+    assertThat(subjectProperty.getName()).isEqualTo("subject");
+    assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+    String content = subjectProperty.getStringValues(0);
+
+    // Ensure that there is one and only one match within "subject"
+    assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+    SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+    int matchStart = matchProto.getExactMatchUtf16Position();
+    int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+    assertThat(matchStart).isEqualTo(5);
+    assertThat(matchEnd).isEqualTo(9);
+    String match = content.substring(matchStart, matchEnd);
+    assertThat(match).isEqualTo("𐀂𐀃");
+  }
+
   private static void assertStatusOk(StatusProto status) {
     assertWithMessage(status.getMessage()).that(status.getCode()).isEqualTo(StatusProto.Code.OK);
   }
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index 4e48ad7..66fdbe6 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -136,18 +136,29 @@
 }
 
 // The representation of a single match within a DocumentProto property.
-// Next tag: 6
+// Next tag: 10
 message SnippetMatchProto {
-  // The position and length within the matched string at which the exact
-  // match begins.
-  optional int32 exact_match_position = 2;
+  // The index of the byte in the string at which the match begins and the
+  // length in bytes of the match.
+  optional int32 exact_match_byte_position = 2;
+  optional int32 exact_match_byte_length = 3;
 
-  optional int32 exact_match_bytes = 3;
+  // The index of the UTF-16 code unit in the string at which the match begins
+  // and the length in UTF-16 code units of the match. This is for use with
+  // UTF-16 encoded strings like Java.lang.String.
+  optional int32 exact_match_utf16_position = 6;
+  optional int32 exact_match_utf16_length = 7;
 
-  // The position and length of the suggested snippet window.
-  optional int32 window_position = 4;
+  // The index of the byte in the string at which the suggested snippet window
+  // begins and the length in bytes of the window.
+  optional int32 window_byte_position = 4;
+  optional int32 window_byte_length = 5;
 
-  optional int32 window_bytes = 5;
+  // The index of the UTF-16 code unit in the string at which the suggested
+  // snippet window begins and the length in UTF-16 code units of the window.
+  // This is for use with UTF-16 encoded strings like Java.lang.String.
+  optional int32 window_utf16_position = 8;
+  optional int32 window_utf16_length = 9;
 
   reserved 1;
 }
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 85538b5..6f5faa0 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=370944273)
+set(synced_AOSP_CL_number=373174102)