Merge remote-tracking branch 'goog/androidx-platform-dev' am: 77207b21c2 Original change: https://googleplex-android-review.googlesource.com/c/platform/external/icing/+/15027062 Change-Id: I5a113ed426158cc7229523ab125cbb9e677efe21

commit: 058975937a9a12c3da9b7e099ef58ad2f942cf7b [log] [tgz]
author: Tim Barron <tjbarron@google.com> Mon Jun 21 20:17:44 2021 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> Mon Jun 21 20:17:44 2021 +0000
tree: 28fec0e533e72f993e6e183af3675d87a1462d59
parent: 1a698115fe367b4e3907b31ca4dfa5d6ae430469 [diff]
parent: 77207b21c25fce96d03cc1a1d4f294a99b6868a6 [diff]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70f6852..01ee8eb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -15,6 +15,9 @@
 cmake_minimum_required(VERSION 3.10.2)
 
 add_definitions("-DICING_REVERSE_JNI_SEGMENTATION=1")
+set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/icing/jni.lds")
+set(CMAKE_SHARED_LINKER_FLAGS
+    "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections -Wl,--version-script=${VERSION_SCRIPT}")
 
 set(
     Protobuf_PREBUILTS_DIR

diff --git a/build.gradle b/build.gradle
index 437f57f..882a929 100644
--- a/build.gradle
+++ b/build.gradle

@@ -69,6 +69,9 @@
 
     generateProtoTasks {
         all().each { task ->
+            project.tasks.named("extractReleaseAnnotations").configure {
+                it.dependsOn(task)
+            }
             task.builtins {
                 java {
                     option 'lite'

diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc
index 6a596f5..0655cb9 100644
--- a/icing/file/filesystem.cc
+++ b/icing/file/filesystem.cc

@@ -466,7 +466,13 @@
 
 bool Filesystem::CopyFile(const char* src, const char* dst) const {
   ScopedFd src_fd(OpenForRead(src));
+
+  std::string dir = GetDirname(dst);
+  if (!CreateDirectoryRecursively(dir.c_str())) {
+    return false;
+  }
   ScopedFd dst_fd(OpenForWrite(dst));
+
   if (!src_fd.is_valid() || !dst_fd.is_valid()) {
     return false;
   }
@@ -478,6 +484,49 @@
   return Write(*dst_fd, buf.get(), size);
 }
 
+// Copies every entry of src_dir into dst_dir. Regular files are copied with
+// CopyFile, which creates missing destination directories on demand;
+// subdirectories are descended into only when `recursive` is true.
+//
+// Returns false if src_dir cannot be opened or as soon as any copy fails. The
+// directory handle is closed on every path once it has been opened.
+bool Filesystem::CopyDirectory(const char* src_dir, const char* dst_dir,
+                               bool recursive) const {
+  DIR* dir = opendir(src_dir);
+  if (!dir) {
+    LogOpenError("Unable to open directory ", src_dir, ": ", errno);
+    return false;
+  }
+
+  bool success = true;
+  dirent* p;
+  // readdir's implementation seems to be thread safe.
+  while (success && (p = readdir(dir)) != nullptr) {
+    std::string file_name(p->d_name);
+    if (file_name == "." || file_name == "..") {
+      continue;
+    }
+
+    std::string full_src_path = absl_ports::StrCat(src_dir, "/", p->d_name);
+    std::string full_dst_path = absl_ports::StrCat(dst_dir, "/", p->d_name);
+
+    if (p->d_type != DT_DIR) {
+      // Directories are created when writing a non-directory file, so no
+      // explicit copying of a directory is required.
+      success = CopyFile(full_src_path.c_str(), full_dst_path.c_str());
+    } else if (recursive) {
+      // Recurse down directories, if requested.
+      success = CopyDirectory(full_src_path.c_str(), full_dst_path.c_str(),
+                              recursive);
+    }
+  }
+  if (closedir(dir) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Error closing %s: %s",
+                                                      src_dir, strerror(errno));
+  }
+  return success;
+}
+
 bool Filesystem::PWrite(int fd, off_t offset, const void* data,
                         size_t data_size) const {
   size_t write_len = data_size;

diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index d3c7787..6bed8e6 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h

@@ -86,8 +86,12 @@
   // Copies the src file to the dst file.
   virtual bool CopyFile(const char* src, const char* dst) const;
 
+  // Copies the entries of the src directory into the dst dir. Subdirectories
+  // are only descended into when `recursive` is true. Returns false if the src
+  // directory cannot be opened or if copying any entry fails.
+  virtual bool CopyDirectory(const char* src_dir, const char* dst_dir,
+                             bool recursive) const;
+
   // Returns true if a file exists.  False if the file doesn't exist.
-  // If there is an error getting stat on the file, it logs the error and //
+  // If there is an error getting stat on the file, it logs the error and
   // asserts.
   virtual bool FileExists(const char* file_name) const;
 

diff --git a/icing/file/filesystem_test.cc b/icing/file/filesystem_test.cc
index 492a50d..214180e 100644
--- a/icing/file/filesystem_test.cc
+++ b/icing/file/filesystem_test.cc

@@ -38,6 +38,7 @@
 using ::testing::Le;
 using ::testing::Ne;
 using ::testing::UnorderedElementsAre;
+using ::testing::UnorderedElementsAreArray;
 
 namespace icing {
 namespace lib {
@@ -450,5 +451,47 @@
   EXPECT_THAT(hello, Eq("hello"));
 }
 
+TEST_F(FilesystemTest, CopyDirectory) {
+  Filesystem filesystem;
+
+  // File structure:
+  // <temp_dir>/
+  //   src_dir/
+  //     file1
+  //     file2
+  //     sub_dir/
+  //       file3
+  const std::string src_dir = temp_dir_ + "/src_dir";
+  const std::string dst_dir = temp_dir_ + "/dst_dir";
+  const std::string sub_dir = "sub_dir";
+  const std::string sub_dir_path = src_dir + "/" + sub_dir;
+  vector<std::string> some_files = {"file1", "file2", sub_dir + "/file3"};
+
+  // Make sure there is no pre-existing test-dir structure. This includes the
+  // destination: entries left there by an earlier, aborted run would make the
+  // listing comparison below fail.
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str()));
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(dst_dir.c_str()));
+
+  // Setup a test-dir structure
+  ASSERT_TRUE(filesystem.CreateDirectoryRecursively(
+      sub_dir_path.c_str()));  // deepest path for test
+  CreateTestFiles(some_files, src_dir);
+
+  EXPECT_TRUE(filesystem.CopyDirectory(src_dir.c_str(), dst_dir.c_str(),
+                                       /*recursive=*/true));
+
+  // The recursive listings of both trees must contain the same entries.
+  vector<std::string> src_dir_files;
+  EXPECT_TRUE(filesystem.ListDirectory(src_dir.c_str(), /*exclude=*/{},
+                                       /*recursive=*/true, &src_dir_files));
+
+  vector<std::string> dst_dir_files;
+  EXPECT_TRUE(filesystem.ListDirectory(dst_dir.c_str(), /*exclude=*/{},
+                                       /*recursive=*/true, &dst_dir_files));
+
+  EXPECT_THAT(dst_dir_files, UnorderedElementsAreArray(src_dir_files));
+
+  // Clean up
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str()));
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(dst_dir.c_str()));
+}
+
 }  // namespace lib
 }  // namespace icing

diff --git a/icing/file/mock-filesystem.h b/icing/file/mock-filesystem.h
index 88475cd..32817d4 100644
--- a/icing/file/mock-filesystem.h
+++ b/icing/file/mock-filesystem.h

@@ -44,6 +44,17 @@
           return real_filesystem_.DeleteDirectoryRecursively(dir_name);
         });
 
+    ON_CALL(*this, CopyFile)
+        .WillByDefault([this](const char* src, const char* dst) {
+          return real_filesystem_.CopyFile(src, dst);
+        });
+
+    ON_CALL(*this, CopyDirectory)
+        .WillByDefault(
+            [this](const char* src, const char* dst, bool recursive) {
+              return real_filesystem_.CopyDirectory(src, dst, recursive);
+            });
+
     ON_CALL(*this, FileExists).WillByDefault([this](const char* file_name) {
       return real_filesystem_.FileExists(file_name);
     });
@@ -227,6 +238,9 @@
 
   MOCK_METHOD(bool, CopyFile, (const char* src, const char* dst), (const));
 
+  MOCK_METHOD(bool, CopyDirectory,
+              (const char* src, const char* dst, bool recursive), (const));
+
   MOCK_METHOD(bool, FileExists, (const char* file_name), (const));
 
   MOCK_METHOD(bool, DirectoryExists, (const char* dir_name), (const));

diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
index 95c3949..000ab3d 100644
--- a/icing/file/portable-file-backed-proto-log.h
+++ b/icing/file/portable-file-backed-proto-log.h

@@ -147,80 +147,92 @@
       Crc32 crc;
 
       // Get a string_view of all the fields of the Header, excluding the
-      // magic_nbytes and header_checksum_nbytes
-      std::string_view header_str(reinterpret_cast<const char*>(this) +
-                                      offsetof(Header, header_checksum_nbytes) +
-                                      sizeof(header_checksum_nbytes),
-                                  sizeof(Header) - sizeof(magic_nbytes) -
-                                      sizeof(header_checksum_nbytes));
+      // magic_nbytes_ and header_checksum_nbytes_
+      std::string_view header_str(
+          reinterpret_cast<const char*>(this) +
+              offsetof(Header, header_checksum_nbytes_) +
+              sizeof(header_checksum_nbytes_),
+          sizeof(Header) - sizeof(magic_nbytes_) -
+              sizeof(header_checksum_nbytes_));
       crc.Append(header_str);
       return crc.Get();
     }
 
-    int32_t GetMagic() const { return gntohl(magic_nbytes); }
+    int32_t GetMagic() const { return gntohl(magic_nbytes_); }
 
-    void SetMagic(int32_t magic_in) { magic_nbytes = ghtonl(magic_in); }
+    void SetMagic(int32_t magic_in) { magic_nbytes_ = ghtonl(magic_in); }
 
     int32_t GetFileFormatVersion() const {
-      return gntohl(file_format_version_nbytes);
+      return gntohl(file_format_version_nbytes_);
     }
 
     void SetFileFormatVersion(int32_t file_format_version_in) {
-      file_format_version_nbytes = ghtonl(file_format_version_in);
+      file_format_version_nbytes_ = ghtonl(file_format_version_in);
     }
 
-    int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes); }
+    int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes_); }
 
     void SetMaxProtoSize(int32_t max_proto_size_in) {
-      max_proto_size_nbytes = ghtonl(max_proto_size_in);
+      max_proto_size_nbytes_ = ghtonl(max_proto_size_in);
     }
 
-    int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes); }
+    int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes_); }
 
     void SetLogChecksum(int32_t log_checksum_in) {
-      log_checksum_nbytes = ghtonl(log_checksum_in);
+      log_checksum_nbytes_ = ghtonl(log_checksum_in);
     }
 
-    int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes); }
+    int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes_); }
 
     void SetRewindOffset(int64_t rewind_offset_in) {
-      rewind_offset_nbytes = ghtonll(rewind_offset_in);
+      rewind_offset_nbytes_ = ghtonll(rewind_offset_in);
     }
 
-    int32_t GetHeaderChecksum() const { return gntohl(header_checksum_nbytes); }
+    int32_t GetHeaderChecksum() const {
+      return gntohl(header_checksum_nbytes_);
+    }
 
     void SetHeaderChecksum(int32_t header_checksum_in) {
-      header_checksum_nbytes = ghtonl(header_checksum_in);
+      header_checksum_nbytes_ = ghtonl(header_checksum_in);
     }
 
-    bool GetCompressFlag() const {
-      uint16_t host_order_flags = gntohs(flags_nbytes);
-      return bit_util::BitfieldGet(host_order_flags, kCompressBit, /*len=*/1);
-    }
+    bool GetCompressFlag() const { return GetFlag(kCompressBit); }
 
-    void SetCompressFlag(bool compress) {
-      uint16_t host_order_flags = gntohs(flags_nbytes);
-      bit_util::BitfieldSet(compress, kCompressBit,
-                            /*len=*/1, &host_order_flags);
-      flags_nbytes = ghtons(host_order_flags);
-    }
+    void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
+
+    // Returns whether the dirty bit is set. Const, like GetCompressFlag(),
+    // because it only reads the flags.
+    bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
+
+    void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
 
    private:
     // The least-significant bit offset at which the compress flag is stored in
-    // 'flags_nbytes'. Represents whether the protos in the log are compressed
+    // 'flags_nbytes_'. Represents whether the protos in the log are compressed
     // or not.
     static constexpr int32_t kCompressBit = 0;
 
+    // The least-significant bit offset at which the dirty flag is stored in
+    // 'flags_'. Represents whether the checksummed portion of the log has been
+    // modified after the last checksum was computed.
+    static constexpr int32_t kDirtyBit = 1;
+
+    // Reads the 1-bit field of 'flags_' at the given least-significant bit
+    // offset (e.g. kCompressBit or kDirtyBit) and returns it as a bool.
+    bool GetFlag(int offset) const {
+      return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
+    }
+
+    // Stores 'value' in the 1-bit field of 'flags_' at the given
+    // least-significant bit offset (e.g. kCompressBit or kDirtyBit). The header
+    // checksum is not recomputed here; callers that persist the header update
+    // it separately.
+    void SetFlag(int offset, bool value) {
+      bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
+    }
+
     // Holds the magic as a quick sanity check against file corruption.
     //
     // Field is in network-byte order.
-    int32_t magic_nbytes = ghtonl(kMagic);
+    int32_t magic_nbytes_ = ghtonl(kMagic);
 
     // Must be at the beginning after kMagic. Contains the crc checksum of
     // the following fields.
     //
     // Field is in network-byte order.
-    uint32_t header_checksum_nbytes = 0;
+    uint32_t header_checksum_nbytes_ = 0;
 
     // Last known good offset at which the log and its checksum were updated.
     // If we crash between writing to the log and updating the checksum, we can
@@ -228,7 +240,7 @@
     // valid instead of throwing away the entire log.
     //
     // Field is in network-byte order.
-    int64_t rewind_offset_nbytes = ghtonll(kHeaderReservedBytes);
+    int64_t rewind_offset_nbytes_ = ghtonll(kHeaderReservedBytes);
 
     // Version number tracking how we serialize the file to disk. If we change
     // how/what we write to disk, this version should be updated and this class
@@ -237,23 +249,23 @@
     // Currently at kFileFormatVersion.
     //
     // Field is in network-byte order.
-    int32_t file_format_version_nbytes = 0;
+    int32_t file_format_version_nbytes_ = 0;
 
     // The maximum proto size that can be written to the log.
     //
     // Field is in network-byte order.
-    int32_t max_proto_size_nbytes = 0;
+    int32_t max_proto_size_nbytes_ = 0;
 
     // Checksum of the log elements, doesn't include the header fields.
     //
     // Field is in network-byte order.
-    uint32_t log_checksum_nbytes = 0;
+    uint32_t log_checksum_nbytes_ = 0;
 
     // Bits are used to hold various flags.
     //   Lowest bit is whether the protos are compressed or not.
     //
-    // Field is in network-byte order.
-    uint16_t flags_nbytes = 0;
+    // Field is only 1 byte, so is byte-order agnostic.
+    uint8_t flags_ = 0;
 
     // NOTE: New fields should *almost always* be added to the end here. Since
     // this class may have already been written to disk, appending fields
@@ -270,7 +282,14 @@
     // happen if the file is corrupted or some previously added data was
     // unpersisted. This may be used to signal that any derived data off of the
     // proto log may need to be regenerated.
-    DataLoss data_loss;
+    DataLoss data_loss = DataLoss::NONE;
+
+    // Whether the proto log had to recalculate the checksum to check its
+    // integrity. This can be avoided if no changes were made or the log was
+    // able to update its checksum before shutting down. But it may have to
+    // recalculate if it's unclear if we crashed after updating the log, but
+    // before updating our checksum.
+    bool recalculated_checksum = false;
 
     bool has_data_loss() {
       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
@@ -638,7 +657,7 @@
       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
           new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
                                                  std::move(header))),
-      /*data_loss=*/DataLoss::NONE};
+      /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
 
   return create_result;
 }
@@ -649,6 +668,7 @@
 PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
     const Filesystem* filesystem, const std::string& file_path,
     const Options& options, int64_t file_size) {
+  bool header_changed = false;
   if (file_size < kHeaderReservedBytes) {
     return absl_ports::InternalError(
         absl_ports::StrCat("File header too short for: ", file_path));
@@ -687,61 +707,85 @@
         header->GetCompressFlag(), options.compress));
   }
 
-  if (header->GetMaxProtoSize() > options.max_proto_size) {
+  int32_t existing_max_proto_size = header->GetMaxProtoSize();
+  if (existing_max_proto_size > options.max_proto_size) {
     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
         "Max proto size cannot be smaller than previous "
         "instantiations, previous size %d, wanted size %d",
         header->GetMaxProtoSize(), options.max_proto_size));
+  } else if (existing_max_proto_size < options.max_proto_size) {
+    // It's fine if our new max size is greater than our previous one. Existing
+    // data is still valid.
+    header->SetMaxProtoSize(options.max_proto_size);
+    header_changed = true;
   }
-  header->SetMaxProtoSize(options.max_proto_size);
 
   DataLoss data_loss = DataLoss::NONE;
-  ICING_ASSIGN_OR_RETURN(
-      Crc32 calculated_log_checksum,
-      ComputeChecksum(filesystem, file_path, Crc32(),
-                      /*start=*/kHeaderReservedBytes, /*end=*/file_size));
 
-  // Double check that the log checksum is the same as the one that was
-  // persisted last time. If not, we start recovery logic.
-  if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
-    // Need to rewind the proto log since the checksums don't match.
-    // Worst case, we have to rewind the entire log back to just the header
-    int64_t last_known_good = kHeaderReservedBytes;
+  // If we have any documents in our tail, get rid of them since they're not in
+  // our checksum. Our checksum reflects content up to the rewind offset.
+  if (file_size > header->GetRewindOffset()) {
+    if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
+      return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+          "Failed to truncate '%s' to size %lld", file_path.data(),
+          static_cast<long long>(header->GetRewindOffset())));
+    };
+    data_loss = DataLoss::PARTIAL;
+  }
 
-    // Calculate the checksum of the log contents just up to the last rewind
-    // offset point. This will be valid if we just appended contents to the log
-    // without updating the checksum, and we can rewind back to this point
-    // safely.
-    ICING_ASSIGN_OR_RETURN(calculated_log_checksum,
-                           ComputeChecksum(filesystem, file_path, Crc32(),
-                                           /*start=*/kHeaderReservedBytes,
-                                           /*end=*/header->GetRewindOffset()));
-    if (header->GetLogChecksum() == calculated_log_checksum.Get()) {
-      // Check if it matches our last rewind state. If so, this becomes our last
-      // good state and we can safely truncate and recover from here.
-      last_known_good = header->GetRewindOffset();
-      data_loss = DataLoss::PARTIAL;
-    } else {
-      // Otherwise, we're going to truncate the entire log and this resets the
-      // checksum to an empty log state.
-      header->SetLogChecksum(0);
-      data_loss = DataLoss::COMPLETE;
+  bool recalculated_checksum = false;
+
+  // If our dirty flag is set, that means we might have crashed in the middle of
+  // erasing a proto. This could have happened anywhere between:
+  //   A. Set dirty flag to true and update header checksum
+  //   B. Erase the proto
+  //   C. Set dirty flag to false, update log checksum, update header checksum
+  //
+  // Scenario 1: We went down between A and B. Maybe our dirty flag is a
+  // false alarm and we can keep all our data.
+  //
+  // Scenario 2: We went down between B and C. Our data is compromised and
+  // we need to throw everything out.
+  if (header->GetDirtyFlag()) {
+    // Recompute the log's checksum to detect which scenario we're in.
+    ICING_ASSIGN_OR_RETURN(
+        Crc32 calculated_log_checksum,
+        ComputeChecksum(filesystem, file_path, Crc32(),
+                        /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+
+    if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
+      // Still doesn't match, we're in Scenario 2. Throw out all our data now
+      // and initialize as a new instance.
+      ICING_ASSIGN_OR_RETURN(CreateResult create_result,
+                             InitializeNewFile(filesystem, file_path, options));
+      create_result.data_loss = DataLoss::COMPLETE;
+      create_result.recalculated_checksum = true;
+      return create_result;
     }
+    // Otherwise we're good, checksum matches our contents so continue
+    // initializing like normal.
+    recalculated_checksum = true;
 
-    if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
+    // Update our header.
+    header->SetDirtyFlag(false);
+    header_changed = true;
+  }
+
+  if (header_changed) {
+    header->SetHeaderChecksum(header->CalculateHeaderChecksum());
+
+    if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
+                            sizeof(Header))) {
       return absl_ports::InternalError(
-          absl_ports::StrCat("Error truncating file: ", file_path));
+          absl_ports::StrCat("Failed to update header to: ", file_path));
     }
-
-    ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
-                    << last_known_good;
   }
 
   CreateResult create_result = {
       std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
           new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
                                                  std::move(header))),
-      data_loss};
+      data_loss, recalculated_checksum};
 
   return create_result;
 }
@@ -963,7 +1007,18 @@
 
   // We need to update the crc checksum if the erased area is before the
   // rewind position.
-  if (file_offset + sizeof(metadata) < header_->GetRewindOffset()) {
+  int32_t new_crc;
+  int64_t erased_proto_offset = file_offset + sizeof(metadata);
+  if (erased_proto_offset < header_->GetRewindOffset()) {
+    // Set to "dirty" before we start writing anything.
+    header_->SetDirtyFlag(true);
+    header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+    if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+                             sizeof(Header))) {
+      return absl_ports::InternalError(absl_ports::StrCat(
+          "Failed to update dirty bit of header to: ", file_path_));
+    }
+
     // We need to calculate [original string xor 0s].
     // The xored string is the same as the original string because 0 xor 0 =
     // 0, 1 xor 0 = 1.
@@ -972,13 +1027,20 @@
 
     Crc32 crc(header_->GetLogChecksum());
     ICING_ASSIGN_OR_RETURN(
-        uint32_t new_crc,
-        crc.UpdateWithXor(xored_str,
-                          /*full_data_size=*/header_->GetRewindOffset() -
-                              kHeaderReservedBytes,
-                          /*position=*/file_offset + sizeof(metadata) -
-                              kHeaderReservedBytes));
+        new_crc, crc.UpdateWithXor(
+                     xored_str,
+                     /*full_data_size=*/header_->GetRewindOffset() -
+                         kHeaderReservedBytes,
+                     /*position=*/erased_proto_offset - kHeaderReservedBytes));
+  }
 
+  // Clear the region.
+  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+
+  // If we cleared something in our checksummed area, we should update our
+  // checksum and reset our dirty bit.
+  if (erased_proto_offset < header_->GetRewindOffset()) {
+    header_->SetDirtyFlag(false);
     header_->SetLogChecksum(new_crc);
     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
 
@@ -989,7 +1051,6 @@
     }
   }
 
-  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
   return libtextclassifier3::Status::OK;
 }
 

diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
index dfb67aa..69b8a1a 100644
--- a/icing/file/portable-file-backed-proto-log_test.cc
+++ b/icing/file/portable-file-backed-proto-log_test.cc

@@ -42,6 +42,20 @@
 using ::testing::Pair;
 using ::testing::Return;
 
+using Header = PortableFileBackedProtoLog<DocumentProto>::Header;
+
+// Test helper: reads sizeof(Header) bytes from offset 0 of file_path into a
+// Header and returns it. The filesystem is taken by const reference so that it
+// is neither copied nor sliced, and a failed read is reported as a test
+// failure instead of silently yielding a default-constructed Header.
+Header ReadHeader(const Filesystem& filesystem, const std::string& file_path) {
+  Header header;
+  EXPECT_TRUE(filesystem.PRead(file_path.c_str(), &header, sizeof(Header),
+                               /*offset=*/0));
+  return header;
+}
+
+// Test helper: writes the sizeof(Header) bytes of `header` to file_path via
+// Filesystem::Write. Both arguments are only read, so they are taken by const
+// reference, and a failed write is reported as a test failure instead of
+// being ignored.
+void WriteHeader(const Filesystem& filesystem, const std::string& file_path,
+                 const Header& header) {
+  EXPECT_TRUE(filesystem.Write(file_path.c_str(), &header, sizeof(Header)));
+}
+
 class PortableFileBackedProtoLogTest : public ::testing::Test {
  protected:
   // Adds a user-defined default construct because a const member variable may
@@ -79,6 +93,7 @@
                                                              max_proto_size_)));
   EXPECT_THAT(create_result.proto_log, NotNull());
   EXPECT_FALSE(create_result.has_data_loss());
+  EXPECT_FALSE(create_result.recalculated_checksum);
 
   // Can't recreate the same file with different options.
   ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
@@ -300,12 +315,12 @@
     EXPECT_FALSE(create_result.has_data_loss());
   }
 
-  int corrupt_value = 24;
+  int corrupt_checksum = 24;
 
-  // Offset after the kMagic and the header_checksum.
-  int offset_after_checksum = 8;
-  filesystem_.PWrite(file_path_.c_str(), offset_after_checksum, &corrupt_value,
-                     sizeof(corrupt_value));
+  // Write the corrupted header
+  Header header = ReadHeader(filesystem_, file_path_);
+  header.SetHeaderChecksum(corrupt_checksum);
+  WriteHeader(filesystem_, file_path_, header);
 
   {
     // Reinitialize the same proto_log
@@ -331,8 +346,12 @@
 
     // Corrupt the magic that's stored at the beginning of the header.
     int invalid_magic = -1;
-    filesystem_.PWrite(file_path_.c_str(), /*offset=*/0, &invalid_magic,
-                       sizeof(invalid_magic));
+    ASSERT_THAT(invalid_magic, Not(Eq(Header::kMagic)));
+
+    // Write the corrupted header
+    Header header = ReadHeader(filesystem_, file_path_);
+    header.SetMagic(invalid_magic);
+    WriteHeader(filesystem_, file_path_, header);
   }
 
   {
@@ -346,7 +365,17 @@
   }
 }
 
-TEST_F(PortableFileBackedProtoLogTest, CorruptContent) {
+TEST_F(PortableFileBackedProtoLogTest,
+       UnableToDetectCorruptContentWithoutDirtyBit) {
+  // This is intentional that we can't detect corruption. We're trading off
+  // earlier corruption detection for lower initialization latency. By not
+  // calculating the checksum on initialization, we can initialize much faster,
+  // but at the cost of detecting corruption. Note that even if we did detect
+  // corruption, there was nothing we could've done except throw an error to
+  // clients. We'll still do that, but at some later point when the log is
+  // attempting to be accessed and we can't actually deserialize a proto from
+  // it. See the description in cl/374278280 for more details.
+
   {
     ICING_ASSERT_OK_AND_ASSIGN(
         PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
@@ -361,19 +390,20 @@
         DocumentBuilder().SetKey("namespace1", "uri1").Build();
 
     // Write and persist an document.
-    ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
+    ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
                                proto_log->WriteProto(document));
     ICING_ASSERT_OK(proto_log->PersistToDisk());
 
     // "Corrupt" the content written in the log.
     document.set_uri("invalid");
     std::string serialized_document = document.SerializeAsString();
-    filesystem_.PWrite(file_path_.c_str(), document_offset,
-                       serialized_document.data(), serialized_document.size());
+    ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), document_offset,
+                                   serialized_document.data(),
+                                   serialized_document.size()));
   }
 
   {
-    // We can recover, but we have data loss.
+    // We can recover, and we don't have data loss.
     ICING_ASSERT_OK_AND_ASSIGN(
         PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
         PortableFileBackedProtoLog<DocumentProto>::Create(
@@ -381,17 +411,147 @@
             PortableFileBackedProtoLog<DocumentProto>::Options(
                 compress_, max_proto_size_)));
     auto proto_log = std::move(create_result.proto_log);
-    ASSERT_TRUE(create_result.has_data_loss());
-    ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+    EXPECT_FALSE(create_result.has_data_loss());
+    EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
+    EXPECT_FALSE(create_result.recalculated_checksum);
 
-    // Lost everything in the log since the rewind position doesn't help if
-    // there's been data corruption within the persisted region
-    ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
-              kHeaderReservedBytes);
+    // We still have the corrupted content in our file, we didn't throw
+    // everything out.
+    EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+                Gt(kHeaderReservedBytes));
   }
 }
 
-TEST_F(PortableFileBackedProtoLogTest, PersistToDisk) {
+TEST_F(PortableFileBackedProtoLogTest,
+       DetectAndThrowOutCorruptContentWithDirtyBit) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  // Step 1: start a fresh log and store one document in it.
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult fresh_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto log = std::move(fresh_result.proto_log);
+    ASSERT_FALSE(fresh_result.has_data_loss());
+
+    DocumentProto original_doc =
+        DocumentBuilder()
+            .SetKey("namespace1", "uri1")
+            .AddStringProperty("string_property", "foo", "bar")
+            .Build();
+
+    // Store the proto in the log.
+    ICING_ASSERT_OK_AND_ASSIGN(int64_t offset, log->WriteProto(original_doc));
+
+    // Reading it back must give us exactly what we stored.
+    ASSERT_THAT(log->ReadProto(offset),
+                IsOkAndHolds(EqualsProto(original_doc)));
+  }
+
+  // Step 2: tamper with the file behind the log's back.
+  {
+    // Overwrite the beginning of the log content with a different document.
+    // It is deliberately smaller than the original one so that the write
+    // cannot run past the end of the file.
+    DocumentProto bogus_doc =
+        DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build();
+    std::string bogus_bytes = bogus_doc.SerializeAsString();
+    ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), kHeaderReservedBytes,
+                                   bogus_bytes.data(), bogus_bytes.size()));
+
+    // Mark the header dirty, as if a change to the log had been in flight,
+    // and keep the header checksum consistent with the new flag value.
+    Header dirty_header = ReadHeader(filesystem_, file_path_);
+    dirty_header.SetDirtyFlag(true);
+    dirty_header.SetHeaderChecksum(dirty_header.CalculateHeaderChecksum());
+    WriteHeader(filesystem_, file_path_, dirty_header);
+  }
+
+  // Step 3: reopen the log and check that the corruption was noticed.
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult reopened_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto log = std::move(reopened_result.proto_log);
+    EXPECT_TRUE(reopened_result.has_data_loss());
+    EXPECT_THAT(reopened_result.data_loss, Eq(DataLoss::COMPLETE));
+
+    // Detecting the corruption required recalculating the checksum.
+    EXPECT_TRUE(reopened_result.recalculated_checksum);
+
+    // Everything was thrown out: only the header is left in the file.
+    EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+                Eq(kHeaderReservedBytes));
+
+    // The persisted header must no longer be marked dirty.
+    Header clean_header = ReadHeader(filesystem_, file_path_);
+    EXPECT_FALSE(clean_header.GetDirtyFlag());
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DirtyBitFalseAlarmKeepsData) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  DocumentProto stored_doc =
+      DocumentBuilder().SetKey("namespace1", "uri1").Build();
+  int64_t stored_offset;
+
+  // Step 1: start a fresh log and store one document in it.
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult fresh_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto log = std::move(fresh_result.proto_log);
+    ASSERT_FALSE(fresh_result.has_data_loss());
+
+    // Store the proto in the log.
+    ICING_ASSERT_OK_AND_ASSIGN(stored_offset, log->WriteProto(stored_doc));
+
+    // Reading it back must give us exactly what we stored.
+    ASSERT_THAT(log->ReadProto(stored_offset),
+                IsOkAndHolds(EqualsProto(stored_doc)));
+  }
+
+  // Step 2: flip only the dirty flag in the header; the log content itself is
+  // left untouched. This mimics a crash after the dirty flag was written but
+  // before any proto was erased.
+  {
+    Header dirty_header = ReadHeader(filesystem_, file_path_);
+    dirty_header.SetDirtyFlag(true);
+    dirty_header.SetHeaderChecksum(dirty_header.CalculateHeaderChecksum());
+    WriteHeader(filesystem_, file_path_, dirty_header);
+  }
+
+  // Step 3: reopen the log; the data must survive the false alarm.
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult reopened_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto log = std::move(reopened_result.proto_log);
+    EXPECT_FALSE(reopened_result.has_data_loss());
+
+    // The content did not change, but the dirty flag alone must have forced
+    // a checksum recalculation.
+    EXPECT_TRUE(reopened_result.recalculated_checksum);
+
+    // The document is still readable despite the dirty flag having been set.
+    EXPECT_THAT(log->ReadProto(stored_offset),
+                IsOkAndHolds(EqualsProto(stored_doc)));
+
+    // The persisted header must no longer be marked dirty.
+    Header clean_header = ReadHeader(filesystem_, file_path_);
+    EXPECT_FALSE(clean_header.GetDirtyFlag());
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+       PersistToDiskKeepsPersistedDataAndTruncatesExtraData) {
   DocumentProto document1 =
       DocumentBuilder().SetKey("namespace1", "uri1").Build();
   DocumentProto document2 =
@@ -426,6 +586,8 @@
 
     log_size = filesystem_.GetFileSize(file_path_.c_str());
     ASSERT_GT(log_size, 0);
+
+    // PersistToDisk happens implicitly during the destructor.
   }
 
   {
@@ -453,6 +615,7 @@
     auto proto_log = std::move(create_result.proto_log);
     ASSERT_TRUE(create_result.has_data_loss());
     ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
+    ASSERT_FALSE(create_result.recalculated_checksum);
 
     // Check that everything was persisted across instances
     ASSERT_THAT(proto_log->ReadProto(document1_offset),
@@ -465,6 +628,183 @@
   }
 }
 
+TEST_F(PortableFileBackedProtoLogTest,
+       DirtyBitIsFalseAfterPutAndPersistToDisk) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    DocumentProto document =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write and persist the first proto
+    ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+                               proto_log->WriteProto(document));
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(document_offset),
+                IsOkAndHolds(EqualsProto(document)));
+  }
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+
+    // We previously persisted to disk so everything should be in a perfect
+    // state.
+    EXPECT_FALSE(create_result.has_data_loss());
+    EXPECT_FALSE(create_result.recalculated_checksum);
+
+    Header header = ReadHeader(filesystem_, file_path_);
+    EXPECT_FALSE(header.GetDirtyFlag());
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+       DirtyBitIsFalseAfterDeleteAndPersistToDisk) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    DocumentProto document =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write, delete, and persist the first proto
+    ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+                               proto_log->WriteProto(document));
+    ICING_ASSERT_OK(proto_log->EraseProto(document_offset));
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+    // The proto has been erased.
+    ASSERT_THAT(proto_log->ReadProto(document_offset),
+                StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  }
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+
+    // We previously persisted to disk so everything should be in a perfect
+    // state.
+    EXPECT_FALSE(create_result.has_data_loss());
+    EXPECT_FALSE(create_result.recalculated_checksum);
+
+    Header header = ReadHeader(filesystem_, file_path_);
+    EXPECT_FALSE(header.GetDirtyFlag());
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DirtyBitIsFalseAfterPutAndDestructor) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    DocumentProto document =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write the first proto without explicitly persisting it
+    ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+                               proto_log->WriteProto(document));
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(document_offset),
+                IsOkAndHolds(EqualsProto(document)));
+
+    // PersistToDisk is implicitly called as part of the destructor and
+    // PersistToDisk will clear the dirty bit.
+  }
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+
+    // We previously persisted to disk so everything should be in a perfect
+    // state.
+    EXPECT_FALSE(create_result.has_data_loss());
+    EXPECT_FALSE(create_result.recalculated_checksum);
+
+    Header header = ReadHeader(filesystem_, file_path_);
+    EXPECT_FALSE(header.GetDirtyFlag());
+  }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+       DirtyBitIsFalseAfterDeleteAndDestructor) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    DocumentProto document =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write and delete the first proto without explicitly persisting it
+    ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+                               proto_log->WriteProto(document));
+    ICING_ASSERT_OK(proto_log->EraseProto(document_offset));
+
+    // The proto has been erased.
+    ASSERT_THAT(proto_log->ReadProto(document_offset),
+                StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+    // PersistToDisk is implicitly called as part of the destructor and
+    // PersistToDisk will clear the dirty bit.
+  }
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        PortableFileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            PortableFileBackedProtoLog<DocumentProto>::Options(
+                compress_, max_proto_size_)));
+
+    // We previously persisted to disk so everything should be in a perfect
+    // state.
+    EXPECT_FALSE(create_result.has_data_loss());
+    EXPECT_FALSE(create_result.recalculated_checksum);
+
+    Header header = ReadHeader(filesystem_, file_path_);
+    EXPECT_FALSE(header.GetDirtyFlag());
+  }
+}
+
 TEST_F(PortableFileBackedProtoLogTest, Iterator) {
   DocumentProto document1 =
       DocumentBuilder().SetKey("namespace", "uri1").Build();
@@ -508,7 +848,7 @@
   {
     // Iterator with bad filesystem
     MockFilesystem mock_filesystem;
-    ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
+    ON_CALL(mock_filesystem, GetFileSize(A<const char*>()))
         .WillByDefault(Return(Filesystem::kBadFileSize));
     PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
         mock_filesystem, file_path_, /*initial_offset=*/0);

diff --git a/icing/result/snippet-retriever-test-jni-layer.cc b/icing/result/snippet-retriever-test-jni-layer.cc
new file mode 100644
index 0000000..707d9ee
--- /dev/null
+++ b/icing/result/snippet-retriever-test-jni-layer.cc

@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_SnippetRetrieverJniTest_testsMain(JNIEnv* env, jclass ignored) {
+  g_jenv = env;
+
+  std::vector<char*> my_argv;
+  char arg[] = "jni-test-lib";
+  my_argv.push_back(arg);
+  int argc = 1;
+  char** argv = &(my_argv[0]);
+  testing::InitGoogleTest(&argc, argv);
+  testing::UnitTest::GetInstance()->listeners().Append(
+      new icing::lib::LoggingEventListener());
+  return RUN_ALL_TESTS() == 0;
+}

diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index dc9f8be..2a138ec 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc

@@ -157,61 +157,58 @@
 }
 
 // Finds the start position of a valid token that is after
-// window_start_min_exclusive
+// window_start_min_exclusive_utf32
 //
 // Returns:
 //   the position of the window start if successful
 //   INTERNAL_ERROR - if a tokenizer error is encountered
-libtextclassifier3::StatusOr<int> DetermineWindowStart(
+libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
-    std::string_view value, int window_start_min_exclusive,
+    std::string_view value, int window_start_min_exclusive_utf32,
     Tokenizer::Iterator* iterator) {
-  if (!iterator->ResetToTokenAfter(window_start_min_exclusive)) {
+  if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) {
     return absl_ports::InternalError(
         "Couldn't reset tokenizer to determine snippet window!");
   }
-  return iterator->GetToken().text.data() - value.data();
+  return iterator->CalculateTokenStart();
 }
 
 // Increments window_end_exclusive so long as the character at the position
 // of window_end_exclusive is punctuation and does not exceed
-// window_end_max_exclusive.
-int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive,
-                               int window_end_max_exclusive) {
-  while (window_end_exclusive < window_end_max_exclusive) {
+// window_end_max_exclusive_utf32.
+CharacterIterator IncludeTrailingPunctuation(
+    std::string_view value, CharacterIterator window_end_exclusive,
+    int window_end_max_exclusive_utf32) {
+  while (window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
     int char_len = 0;
-    if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) {
-      break;
-    }
-    if (window_end_exclusive + char_len > window_end_max_exclusive) {
-      // This is punctuation, but it goes beyond the window end max. Don't
-      // include it.
+    if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
+                                     &char_len)) {
       break;
     }
     // Expand window by char_len and check the next character.
-    window_end_exclusive += char_len;
+    window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1);
   }
   return window_end_exclusive;
 }
 
 // Finds the end position of a valid token that is before the
-// window_end_max_exclusive.
+// window_end_max_exclusive_utf32.
 //
 // Returns:
 //   the position of the window end if successful
 //   INTERNAL_ERROR - if a tokenizer error is encountered
-libtextclassifier3::StatusOr<int> DetermineWindowEnd(
+libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
-    std::string_view value, int window_end_max_exclusive,
+    std::string_view value, int window_end_max_exclusive_utf32,
     Tokenizer::Iterator* iterator) {
-  if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) {
+  if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) {
     return absl_ports::InternalError(
         "Couldn't reset tokenizer to determine snippet window!");
   }
-  int window_end_exclusive = iterator->GetToken().text.data() - value.data() +
-                             iterator->GetToken().text.length();
-  return IncludeTrailingPunctuation(value, window_end_exclusive,
-                                    window_end_max_exclusive);
+  ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive,
+                         iterator->CalculateTokenEndExclusive());
+  return IncludeTrailingPunctuation(value, end_exclusive,
+                                    window_end_max_exclusive_utf32);
 }
 
 struct SectionData {
@@ -232,8 +229,10 @@
     const SectionData& value, Tokenizer::Iterator* iterator,
     const CharacterIterator& char_iterator) {
   SnippetMatchProto snippet_match;
-  Token match = iterator->GetToken();
-  int match_pos = char_iterator.utf8_index();
+  ICING_ASSIGN_OR_RETURN(CharacterIterator start_itr,
+                         iterator->CalculateTokenStart());
+  ICING_ASSIGN_OR_RETURN(CharacterIterator end_itr,
+                         iterator->CalculateTokenEndExclusive());
 
   // When finding boundaries,  we have a few cases:
   //
@@ -262,70 +261,65 @@
   //     window =               |-----|
   //
   // We have do +1/-1 below to get the math to match up.
-  int match_mid = match_pos + match.text.length() / 2;
-  int window_start_min_exclusive =
-      (match_mid - snippet_spec.max_window_bytes() / 2) - 1;
-  int window_end_max_exclusive =
-      match_mid + (snippet_spec.max_window_bytes() + 1) / 2;
+  int match_pos_utf32 = start_itr.utf32_index();
+  int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
+  int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
+  int window_start_min_exclusive_utf32 =
+      (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1;
+  int window_end_max_exclusive_utf32 =
+      match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2;
 
-  snippet_match.set_exact_match_byte_position(match_pos);
-  snippet_match.set_exact_match_utf16_position(char_iterator.utf16_index());
-
-  // Create character iterators to find the beginning and end of the window.
-  CharacterIterator forward_char_iterator(char_iterator);
-  CharacterIterator backwards_char_iterator(char_iterator);
-
-  if (!backwards_char_iterator.AdvanceToUtf8(match_pos + match.text.length())) {
-    return absl_ports::AbortedError("Could not retrieve valid utf8 character!");
-  }
-  snippet_match.set_exact_match_byte_length(match.text.length());
-  snippet_match.set_exact_match_utf16_length(
-      backwards_char_iterator.utf16_index() - char_iterator.utf16_index());
+  snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
+  snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
+  snippet_match.set_exact_match_byte_length(end_itr.utf8_index() -
+                                            start_itr.utf8_index());
+  snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() -
+                                             start_itr.utf16_index());
 
   // Only include windows if it'll at least include the matched text. Otherwise,
   // it'll just be an empty string anyways.
-  if (snippet_spec.max_window_bytes() >= match.text.length()) {
+  if (snippet_spec.max_window_bytes() >= match_len_utf32) {
     // Find the beginning of the window.
-    int window_start;
-    int window_start_utf16;
-    if (window_start_min_exclusive < 0) {
-      window_start = 0;
-      window_start_utf16 = 0;
-    } else {
+    ICING_ASSIGN_OR_RETURN(
+        CharacterIterator window_start,
+        DetermineWindowStart(snippet_spec, value.section_subcontent,
+                             window_start_min_exclusive_utf32, iterator));
+
+    // Check. Did we get fewer characters than we requested? If so, then add it
+    // on to the window_end.
+    int extra_window_space =
+        window_start.utf32_index() - 1 - window_start_min_exclusive_utf32;
+    window_end_max_exclusive_utf32 += extra_window_space;
+
+    // Find the end of the window.
+    ICING_ASSIGN_OR_RETURN(
+        CharacterIterator window_end,
+        DetermineWindowEnd(snippet_spec, value.section_subcontent,
+                           window_end_max_exclusive_utf32, iterator));
+
+    // Check one more time. Did we get fewer characters than we requested? If
+    // so, then see if we can push the start back again.
+    extra_window_space =
+        window_end_max_exclusive_utf32 - window_end.utf32_index();
+    if (extra_window_space > 0) {
+      window_start_min_exclusive_utf32 =
+          window_start.utf32_index() - 1 - extra_window_space;
       ICING_ASSIGN_OR_RETURN(
           window_start,
           DetermineWindowStart(snippet_spec, value.section_subcontent,
-                               window_start_min_exclusive, iterator));
-      if (!forward_char_iterator.RewindToUtf8(window_start)) {
-        return absl_ports::AbortedError(
-            "Could not retrieve valid utf8 character!");
-      }
-      window_start_utf16 = forward_char_iterator.utf16_index();
+                               window_start_min_exclusive_utf32, iterator));
     }
-    snippet_match.set_window_byte_position(window_start);
-    snippet_match.set_window_utf16_position(window_start_utf16);
 
-    // Find the end of the window.
-    int window_end_exclusive;
-    if (window_end_max_exclusive >= value.section_subcontent.length()) {
-      window_end_exclusive = value.section_subcontent.length();
-    } else {
-      ICING_ASSIGN_OR_RETURN(
-          window_end_exclusive,
-          DetermineWindowEnd(snippet_spec, value.section_subcontent,
-                             window_end_max_exclusive, iterator));
-    }
-    if (!backwards_char_iterator.AdvanceToUtf8(window_end_exclusive)) {
-      return absl_ports::AbortedError(
-          "Could not retrieve valid utf8 character!");
-    }
-    snippet_match.set_window_byte_length(window_end_exclusive - window_start);
-    snippet_match.set_window_utf16_length(
-        backwards_char_iterator.utf16_index() - window_start_utf16);
+    snippet_match.set_window_byte_position(window_start.utf8_index());
+    snippet_match.set_window_utf16_position(window_start.utf16_index());
+    snippet_match.set_window_byte_length(window_end.utf8_index() -
+                                         window_start.utf8_index());
+    snippet_match.set_window_utf16_length(window_end.utf16_index() -
+                                          window_start.utf16_index());
 
     // DetermineWindowStart/End may change the position of the iterator. So,
     // reset the iterator back to the original position.
-    bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1)
+    bool success = (match_pos_utf32 > 0) ? iterator->ResetToTokenAfter(match_pos_utf32 - 1)
                                    : iterator->ResetToStart();
     if (!success) {
       return absl_ports::InternalError(

diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index c052a9e..e7988ae 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc

@@ -37,6 +37,7 @@
 #include "icing/store/key-mapper.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/snippet-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
@@ -88,7 +89,9 @@
               GetTestFilePath("icing/icu.dat")));
     }
 
-    language_segmenter_factory::SegmenterOptions options(ULOC_US);
+    jni_cache_ = GetTestJniCache();
+    language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                         jni_cache_.get());
     ICING_ASSERT_OK_AND_ASSIGN(
         language_segmenter_,
         language_segmenter_factory::Create(std::move(options)));
@@ -140,6 +143,7 @@
   std::unique_ptr<LanguageSegmenter> language_segmenter_;
   std::unique_ptr<SnippetRetriever> snippet_retriever_;
   std::unique_ptr<Normalizer> normalizer_;
+  std::unique_ptr<const JniCache> jni_cache_;
   ResultSpecProto::SnippetSpecProto snippet_spec_;
   std::string test_dir_;
 };
@@ -248,9 +252,15 @@
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
 
-  // Window starts at the space between "one" and "two". Window ends in the
-  // middle of "four".
-  // len=14, orig_window=" two three fou"
+  // String:      "one two three four.... five"
+  //               ^   ^   ^     ^        ^   ^
+  // UTF-8 idx:    0   4   8     14       23  27
+  // UTF-32 idx:   0   4   8     14       23  27
+  //
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (2,17).
+  //   2. trimmed, no-shifting window [4,13) "two three"
+  //   3. trimmed, shifted window [4,18) "two three four"
   snippet_spec_.set_max_window_bytes(14);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -260,7 +270,7 @@
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
-              ElementsAre("two three"));
+              ElementsAre("two three four"));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
@@ -275,8 +285,15 @@
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
 
-  // Window starts in the middle of "one" and ends at the end of "four".
-  // len=16, orig_window="e two three four"
+  // String:      "one two three four.... five"
+  //               ^   ^   ^     ^        ^   ^
+  // UTF-8 idx:    0   4   8     14       23  27
+  // UTF-32 idx:   0   4   8     14       23  27
+  //
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (1,18).
+  //   2. trimmed, no-shifting window [4,18) "two three four"
+  //   3. trimmed, shifted window [4,20) "two three four.."
   snippet_spec_.set_max_window_bytes(16);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -286,7 +303,7 @@
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
-              ElementsAre("two three four"));
+              ElementsAre("two three four.."));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
@@ -316,7 +333,7 @@
 }
 
 TEST_F(SnippetRetrieverTest,
-       SnippetingWindowMaxWindowEndsInMiddleOfMultiBytePunctuation) {
+       SnippetingWindowMaxWindowEndsMultiBytePunctuation) {
   DocumentProto document =
       DocumentBuilder()
           .SetKey("icing", "email/1")
@@ -330,7 +347,7 @@
   SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
 
   // Window ends in the middle of all the punctuation and window starts at 0.
-  // len=26, orig_window="pside down in Australia\xC2"
+  // len=24, orig_window="pside down in Australia¿"
   snippet_spec_.set_max_window_bytes(24);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -340,11 +357,11 @@
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
-              ElementsAre("down in Australia"));
+              ElementsAre("down in Australia¿"));
 }
 
 TEST_F(SnippetRetrieverTest,
-       SnippetingWindowMaxWindowEndsInMultiBytePunctuation) {
+       SnippetingWindowMaxWindowBeyondMultiBytePunctuation) {
   DocumentProto document =
       DocumentBuilder()
           .SetKey("icing", "email/1")
@@ -358,7 +375,7 @@
   SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
 
   // Window ends in the middle of all the punctuation and window starts at 0.
-  // len=26, orig_window="upside down in Australia\xC2\xBF"
+  // len=26, orig_window="upside down in Australia¿ "
   snippet_spec_.set_max_window_bytes(26);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -383,8 +400,15 @@
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
 
-  // Window starts before 0.
-  // len=22, orig_window="one two three four..."
+  // String:      "one two three four.... five"
+  //               ^   ^   ^     ^        ^   ^
+  // UTF-8 idx:    0   4   8     14       23  27
+  // UTF-32 idx:   0   4   8     14       23  27
+  //
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (-2,21).
+  //   2. trimmed, no-shifting window [0,21) "one two three four..."
+  //   3. trimmed, shifted window [0,22) "one two three four...."
   snippet_spec_.set_max_window_bytes(22);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -394,7 +418,7 @@
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
-              ElementsAre("one two three four..."));
+              ElementsAre("one two three four...."));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
@@ -435,8 +459,15 @@
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
 
-  // Window ends in the middle of "five"
-  // len=32, orig_window="one two three four.... fiv"
+  // String:      "one two three four.... five"
+  //               ^   ^   ^     ^        ^   ^
+  // UTF-8 idx:    0   4   8     14       23  27
+  // UTF-32 idx:   0   4   8     14       23  27
+  //
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (-7,26).
+  //   2. trimmed, no-shifting window [0,26) "one two three four...."
+  //   3. trimmed, shifted window [0,27) "one two three four.... five"
   snippet_spec_.set_max_window_bytes(32);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -446,7 +477,7 @@
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
-              ElementsAre("one two three four...."));
+              ElementsAre("one two three four.... five"));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
@@ -501,6 +532,142 @@
               ElementsAre("one two three four.... five"));
 }
 
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "counting")
+          .AddStringProperty("body", "one two three four.... five six")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};
+
+  // String:      "one two three four.... five six"
+  //               ^   ^   ^     ^        ^    ^  ^
+  // UTF-8 idx:    0   4   8     14       23  28  31
+  // UTF-32 idx:   0   4   8     14       23  28  31
+  //
+  // Window size will go past the start of the text.
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (-10,19).
+  //   2. trimmed, no-shifting window [0,19) "one two three four."
+  //   3. trimmed, shifted window [0,27) "one two three four.... five"
+  snippet_spec_.set_max_window_bytes(28);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+  EXPECT_THAT(snippet.entries(), SizeIs(1));
+  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+              ElementsAre("one two three four.... five"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "counting")
+          .AddStringProperty("body", "one two three four.... five six")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"five"}}};
+
+  // String:      "one two three four.... five six"
+  //               ^   ^   ^     ^        ^    ^  ^
+  // UTF-8 idx:    0   4   8     14       23  28  31
+  // UTF-32 idx:   0   4   8     14       23  28  31
+  //
+  // Window size will go past the end of the text.
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (10,39).
+  //   2. trimmed, no-shifting window [14,31) "four.... five six"
+  //   3. trimmed, shifted window [4,31) "two three four.... five six"
+  snippet_spec_.set_max_window_bytes(28);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+  EXPECT_THAT(snippet.entries(), SizeIs(1));
+  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+              ElementsAre("two three four.... five six"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "counting")
+          .AddStringProperty("body", "one two three four....")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};
+
+  // String:      "one two three four...."
+  //               ^   ^   ^     ^       ^
+  // UTF-8 idx:    0   4   8     14      22
+  // UTF-32 idx:   0   4   8     14      22
+  //
+  // Window size will go past the start of the text.
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (-10,19).
+  //   2. trimmed, no-shifting window [0, 19) "one two three four."
+  //   3. trimmed, shifted window [0, 22) "one two three four...."
+  snippet_spec_.set_max_window_bytes(28);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+  EXPECT_THAT(snippet.entries(), SizeIs(1));
+  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+              ElementsAre("one two three four...."));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "counting")
+          .AddStringProperty("body", "one two three four....")
+          .Build();
+
+  SectionIdMask section_mask = 0b00000011;
+  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};
+
+  // String:      "one two three four...."
+  //               ^   ^   ^     ^       ^
+  // UTF-8 idx:    0   4   8     14      22
+  // UTF-32 idx:   0   4   8     14      22
+  //
+  // Window size will go past the end of the text.
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (1,30).
+  //   2. trimmed, no-shifting window [4, 22) "two three four...."
+  //   3. trimmed, shifted window [0, 22) "one two three four...."
+  snippet_spec_.set_max_window_bytes(28);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+  EXPECT_THAT(snippet.entries(), SizeIs(1));
+  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+  std::string_view content =
+      GetString(&document, snippet.entries(0).property_name());
+  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+              ElementsAre("one two three four...."));
+}
+
 TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
   DocumentProto document =
       DocumentBuilder()
@@ -578,6 +745,15 @@
                              "Concerning the subject of foo, we need to begin "
                              "considering our options regarding body bar.")
           .Build();
+  // String:      "Concerning the subject of foo, we need to begin considering "
+  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
+  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
+  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
+  //
+  // String ctd:  "our options regarding body bar."
+  //               ^   ^       ^         ^    ^   ^
+  // UTF-8 idx:    60  64      72        82   87  91
+  // UTF-32 idx:   60  64      72        82   87  91
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
@@ -588,10 +764,19 @@
   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
+  // The first window will be:
+  //   1. untrimmed, no-shifting window will be (-6,59).
+  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+  //   3. trimmed, shifted window [0, 63) "Concerning... our"
+  // The second window will be:
+  //   1. untrimmed, no-shifting window will be (54,91).
+  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
+  //   3. trimmed, shifted window [31, 91) "we... bar."
   EXPECT_THAT(
       GetWindows(content, snippet.entries(0)),
-      ElementsAre("Concerning the subject of foo, we need to begin considering",
-                  "our options regarding body bar."));
+      ElementsAre(
+          "Concerning the subject of foo, we need to begin considering our",
+          "we need to begin considering our options regarding body bar."));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
               ElementsAre("foo", "bar"));
 
@@ -612,6 +797,16 @@
                              "Concerning the subject of foo, we need to begin "
                              "considering our options regarding body bar.")
           .Build();
+  // String:      "Concerning the subject of foo, we need to begin considering "
+  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
+  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
+  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
+  //
+  // String ctd:  "our options regarding body bar."
+  //               ^   ^       ^         ^    ^   ^
+  // UTF-8 idx:    60  64      72        82   87  91
+  // UTF-32 idx:   60  64      72        82   87  91
+  //
   // Section 1 "subject" is not in the section_mask, so no snippet information
   // from that section should be returned by the SnippetRetriever.
   SectionIdMask section_mask = 0b00000001;
@@ -624,10 +819,19 @@
   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
+  // The first window will be:
+  //   1. untrimmed, no-shifting window will be (-6,59).
+  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+  //   3. trimmed, shifted window [0, 63) "Concerning... our"
+  // The second window will be:
+  //   1. untrimmed, no-shifting window will be (54,91).
+  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
+  //   3. trimmed, shifted window [31, 91) "we... bar."
   EXPECT_THAT(
       GetWindows(content, snippet.entries(0)),
-      ElementsAre("Concerning the subject of foo, we need to begin considering",
-                  "our options regarding body bar."));
+      ElementsAre(
+          "Concerning the subject of foo, we need to begin considering our",
+          "we need to begin considering our options regarding body bar."));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
               ElementsAre("foo", "bar"));
 }
@@ -642,6 +846,15 @@
                              "Concerning the subject of foo, we need to begin "
                              "considering our options regarding body bar.")
           .Build();
+  // String:      "Concerning the subject of foo, we need to begin considering "
+  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
+  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
+  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
+  //
+  // String ctd:  "our options regarding body bar."
+  //               ^   ^       ^         ^    ^   ^
+  // UTF-8 idx:    60  64      72        82   87  91
+  // UTF-32 idx:   60  64      72        82   87  91
   SectionIdMask section_mask = 0b00000011;
   // "subject" should match in both sections, but "foo" is restricted to "body"
   // so it should only match in the 'body' section and not the 'subject'
@@ -656,11 +869,19 @@
   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
+  // The first window will be:
+  //   1. untrimmed, no-shifting window will be (-15,50).
+  //   2. trimmed, no-shifting window [0, 47) "Concerning... begin".
+  //   3. trimmed, shifted window [0, 63) "Concerning... our"
+  // The second window will be:
+  //   1. untrimmed, no-shifting window will be (-6,59).
+  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+  //   3. trimmed, shifted window [0, 63) "Concerning... our"
   EXPECT_THAT(
       GetWindows(content, snippet.entries(0)),
       ElementsAre(
-          "Concerning the subject of foo, we need to begin",
-          "Concerning the subject of foo, we need to begin considering"));
+          "Concerning the subject of foo, we need to begin considering our",
+          "Concerning the subject of foo, we need to begin considering our"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
               ElementsAre("subject", "foo"));
 
@@ -682,6 +903,15 @@
                              "considering our options regarding body bar.")
           .Build();
 
+  // String:      "Concerning the subject of foo, we need to begin considering "
+  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
+  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
+  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
+  //
+  // String ctd:  "our options regarding body bar."
+  //               ^   ^       ^         ^    ^   ^
+  // UTF-8 idx:    60  64      72        82   87  91
+  // UTF-32 idx:   60  64      72        82   87  91
   snippet_spec_.set_num_matches_per_property(1);
 
   SectionIdMask section_mask = 0b00000011;
@@ -694,10 +924,14 @@
   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
   std::string_view content =
       GetString(&document, snippet.entries(0).property_name());
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (-6,59).
+  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+  //   3. trimmed, shifted window [0, 63) "Concerning... our"
   EXPECT_THAT(
       GetWindows(content, snippet.entries(0)),
       ElementsAre(
-          "Concerning the subject of foo, we need to begin considering"));
+          "Concerning the subject of foo, we need to begin considering our"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
@@ -1177,7 +1411,8 @@
 }
 
 TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE);
+  language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       language_segmenter_,
       language_segmenter_factory::Create(std::move(options)));
@@ -1190,6 +1425,7 @@
   //              ^ ^  ^   ^^
   // UTF8 idx:    0 3  9  15 18
   // UTF16 idx:   0 1  3   5 6
+  // UTF32 idx:   0 1  3   5 6
   // Breaks into segments: "我", "每天", "走路", "去", "上班"
   constexpr std::string_view kChinese = "我每天走路去上班。";
   DocumentProto document =
@@ -1205,12 +1441,11 @@
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
 
-  // Set a twenty byte window. This will produce a window like this:
-  // String:     "我每天走路去上班。"
-  //                ^       ^
-  // UTF8 idx:      3       18
-  // UTF16 idx:     1       6
-  snippet_spec_.set_max_window_bytes(20);
+  // The window will be:
+  //   1. untrimmed, no-shifting window will be (0,7).
+  //   2. trimmed, no-shifting window [1, 6) "每天走路去".
+  //   3. trimmed, shifted window [0, 6) "我每天走路去"
+  snippet_spec_.set_max_window_bytes(6);
 
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1227,11 +1462,11 @@
   const SnippetMatchProto& match_proto = entry->snippet_matches(0);
 
   // Ensure that the match is correct.
-  EXPECT_THAT(GetWindows(content, *entry), ElementsAre("每天走路去"));
+  EXPECT_THAT(GetWindows(content, *entry), ElementsAre("我每天走路去"));
 
   // Ensure that the utf-16 values are also as expected
-  EXPECT_THAT(match_proto.window_utf16_position(), Eq(1));
-  EXPECT_THAT(match_proto.window_utf16_length(), Eq(5));
+  EXPECT_THAT(match_proto.window_utf16_position(), Eq(0));
+  EXPECT_THAT(match_proto.window_utf16_length(), Eq(6));
 }
 
 TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
@@ -1285,6 +1520,7 @@
   //              ^  ^  ^
   // UTF8 idx:    0  9  18
   // UTF16 idx:   0  5  10
+  // UTF32 idx:   0  3  6
   // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
   constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
   DocumentProto document =
@@ -1300,12 +1536,13 @@
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
 
-  // Set a twenty byte window. This will produce a window like this:
+  // Set a six character window. This will produce a window like this:
   // String:     "𐀀𐀁 𐀂𐀃 𐀄"
   //                 ^   ^
   // UTF8 idx:       9   22
   // UTF16 idx:      5   12
-  snippet_spec_.set_max_window_bytes(20);
+  // UTF32 idx:      3   7
+  snippet_spec_.set_max_window_bytes(6);
 
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);

diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 5f478fa..4e63b90 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc

@@ -1119,6 +1119,11 @@
 
 libtextclassifier3::StatusOr<DocumentFilterData>
 DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
+  if (!DoesDocumentExist(document_id)) {
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Can't get filter data, document id '%d' doesn't exist", document_id));
+  }
+
   auto filter_data_or = filter_cache_->GetCopy(document_id);
   if (!filter_data_or.ok()) {
     ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
@@ -1127,10 +1132,6 @@
   }
   DocumentFilterData document_filter_data =
       std::move(filter_data_or).ValueOrDie();
-  if (document_filter_data.namespace_id() == kInvalidNamespaceId) {
-    // An invalid namespace id means that the filter data has been deleted.
-    return absl_ports::NotFoundError("Document filter data not found.");
-  }
   return document_filter_data;
 }
 

diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 9e1b3ec..b0cd1ce 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h

@@ -231,6 +231,7 @@
   //
   // Returns:
   //   OK on success
+  //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
   //   INTERNAL_ERROR on IO error
   //   INVALID_ARGUMENT if document_id is invalid.
   libtextclassifier3::Status Delete(DocumentId document_id);
@@ -278,16 +279,11 @@
 
   // Returns the DocumentFilterData of the document specified by the DocumentId.
   //
-  // NOTE: This does not check if the document exists and will return the
-  // DocumentFilterData of the document even if it has been deleted. Users
-  // should check DoesDocumentExist(document_id) if they only want existing
-  // documents' DocumentFilterData.
-  //
   // Returns:
   //   DocumentFilterData on success
   //   OUT_OF_RANGE if document_id is negative or exceeds previously seen
   //                DocumentIds
-  //   NOT_FOUND if no filter data is found
+  //   NOT_FOUND if the document or the filter data is not found
   libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
       DocumentId document_id) const;
 

diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index b37c6de..ad3b7c4 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc

@@ -1595,7 +1595,7 @@
           /*length_in_tokens=*/7)));
 }
 
-TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataDifferentCorpus) {
+TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1651,6 +1651,18 @@
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
+TEST_F(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentStore::CreateResult create_result,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  std::unique_ptr<DocumentStore> doc_store =
+      std::move(create_result.document_store);
+
+  EXPECT_THAT(doc_store->GetDocumentFilterData(/*document_id=*/0),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
 TEST_F(DocumentStoreTest, DeleteClearsFilterCache) {
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
@@ -3099,36 +3111,39 @@
 #define DISABLE_BACKWARDS_COMPAT_TEST
 #ifndef DISABLE_BACKWARDS_COMPAT_TEST
 TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
-  // The directory testdata/v0/document_store contains only the scoring_cache
-  // and the document_store_header (holding the crc for the scoring_cache). If
-  // the current code is compatible with the format of the v0 scoring_cache,
-  // then an empty document store should be initialized, but the non-empty
-  // scoring_cache should be retained.
-  // The current document-asscoiated-score-data has a new field with respect to
-  // the ones stored in testdata/v0, hence the document store's initialization
-  // requires regenerating its derived files.
+  // The directory testdata/score_cache_without_length_in_tokens/document_store
+  // contains only the scoring_cache and the document_store_header (holding the
+  // crc for the scoring_cache). If the current code is compatible with the
+  // format of the v0 scoring_cache, then an empty document store should be
+  // initialized, but the non-empty scoring_cache should be retained. The
+  // current document-associated-score-data has a new field with respect to the
+  // ones stored in testdata/score_cache_without_length_in_tokens, hence the
+  // document store's initialization requires regenerating its derived files.
 
   // Create dst directory
   ASSERT_THAT(filesystem_.CreateDirectory(document_store_dir_.c_str()), true);
 
   // Get src files
-  std::string document_store_v0;
+  std::string document_store_without_length_in_tokens;
   if (IsAndroidPlatform() || IsIosPlatform()) {
-    document_store_v0 = GetTestFilePath(
-        "icing/testdata/v0/document_store_android_ios_compatible");
+    document_store_without_length_in_tokens = GetTestFilePath(
+        "icing/testdata/score_cache_without_length_in_tokens/"
+        "document_store_android_ios_compatible");
   } else {
-    document_store_v0 =
-        GetTestFilePath("icing/testdata/v0/document_store");
+    document_store_without_length_in_tokens = GetTestFilePath(
+        "icing/testdata/score_cache_without_length_in_tokens/"
+        "document_store");
   }
   std::vector<std::string> document_store_files;
   Filesystem filesystem;
-  filesystem.ListDirectory(document_store_v0.c_str(), &document_store_files);
+  filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(),
+                           &document_store_files);
 
-  VLOG(1) << "Copying files " << document_store_v0 << ' '
-          << document_store_files.size();
+  ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens
+                  << ' ' << document_store_files.size();
   for (size_t i = 0; i != document_store_files.size(); i++) {
-    std::string src =
-        absl_ports::StrCat(document_store_v0, "/", document_store_files[i]);
+    std::string src = absl_ports::StrCat(
+        document_store_without_length_in_tokens, "/", document_store_files[i]);
     std::string dst =
         absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]);
     ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true);

diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index 74d22cd..cb31441 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc

@@ -25,6 +25,7 @@
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/absl_ports/canonical_errors.h"
 #include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/character-iterator.h"
 #include "icing/util/i18n-utils.h"
 #include "icing/util/status-macros.h"
 #include "unicode/ubrk.h"
@@ -101,59 +102,149 @@
     return text_.substr(term_start_index_, term_length);
   }
 
-  libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+  libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
+      override {
+    if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+      return absl_ports::AbortedError(
+          "Could not retrieve valid utf8 character!");
+    }
+    return offset_iterator_;
+  }
+
+  libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
+      override {
+    if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) {
+      return absl_ports::AbortedError(
+          "Could not retrieve valid utf8 character!");
+    }
+    return offset_iterator_;
+  }
+
+  libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
       int32_t offset) override {
-    if (offset < 0 || offset >= text_.length()) {
-      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
-          "Illegal offset provided! Offset %d is not within bounds of string "
-          "of length %zu",
-          offset, text_.length()));
+    if (offset < 0) {
+      // Very simple. The first term start after a negative offset is the first
+      // term. So just reset to start and Advance.
+      return ResetToStartUtf32();
     }
-    term_start_index_ = ubrk_following(break_iterator_, offset);
-    if (term_start_index_ == UBRK_DONE) {
-      MarkAsDone();
-      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
-          "No segments begin after provided offset %d.", offset));
-    }
-    term_end_index_exclusive_ = ubrk_next(break_iterator_);
-    if (term_end_index_exclusive_ == UBRK_DONE) {
-      MarkAsDone();
-      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
-          "No segments begin after provided offset %d.", offset));
-    }
-    if (!IsValidSegment()) {
-      if (!Advance()) {
-        return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
-            "No segments begin after provided offset %d.", offset));
+
+    // 1. Find the unicode character at UTF-32 index offset.
+    if (!offset_iterator_.MoveToUtf32(offset)) {
+      // An error occurred. Mark as DONE
+      if (offset_iterator_.utf8_index() != text_.length()) {
+        // We returned false for some reason other than hitting the end. This is
+        // a real error. Just return.
+        MarkAsDone();
+        return absl_ports::AbortedError(
+            "Could not retrieve valid utf8 character!");
       }
     }
-    return term_start_index_;
+    if (offset_iterator_.utf8_index() == text_.length()) {
+      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+          "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
+          "bounds of string of length %zu",
+          offset_iterator_.utf32_index(), offset_iterator_.utf8_index(),
+          text_.length()));
+    }
+
+    // 2. We've got the unicode character at UTF-32 index offset. Now, we need
+    // to point to the segment that starts after this character.
+    int following_utf8_index =
+        ubrk_following(break_iterator_, offset_iterator_.utf8_index());
+    if (following_utf8_index == UBRK_DONE) {
+      MarkAsDone();
+      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+          "No segments begin after provided offset %d.", offset));
+    }
+    term_end_index_exclusive_ = following_utf8_index;
+
+    // 3. term_end_index_exclusive_ points to the start of the term that we
+    // want to return. We need to Advance so that term_start_index_ will now
+    // point to this term.
+    if (!Advance()) {
+      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+          "No segments begin after provided offset %d.", offset));
+    }
+    if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+      return absl_ports::AbortedError(
+          "Could not retrieve valid utf8 character!");
+    }
+    return offset_iterator_.utf32_index();
   }
 
-  libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+  libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
       int32_t offset) override {
-    if (offset < 0 || offset >= text_.length()) {
+    if (offset < 0) {
       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
           "Illegal offset provided! Offset %d is not within bounds of string "
           "of length %zu",
           offset, text_.length()));
     }
-    ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset));
-    if (term_end_index_exclusive_ > offset) {
-      // This term ends after offset. So we need to get the term just before
-      // this one.
-      ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(term_start_index_));
+
+    if (!offset_iterator_.MoveToUtf32(offset)) {
+      // An error occurred. Mark as DONE
+      if (offset_iterator_.utf8_index() != text_.length()) {
+        // We returned false for some reason other than hitting the end. This is
+        // a real error. Just return.
+        MarkAsDone();
+        return absl_ports::AbortedError(
+            "Could not retrieve valid utf8 character!");
+      }
+      // If it returned false because we hit the end, then that's fine. We'll
+      // just treat it as if the request was for the end.
     }
-    return term_start_index_;
+
+    // 2. We've got the unicode character at UTF-32 index offset. Now, we need
+    // to point to the segment that ends before this character.
+    int starting_utf8_index =
+        ubrk_preceding(break_iterator_, offset_iterator_.utf8_index());
+    if (starting_utf8_index == UBRK_DONE) {
+      // Rewind the end indices.
+      MarkAsDone();
+      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+          "No segments end before provided offset %d.", offset));
+    }
+    term_start_index_ = starting_utf8_index;
+
+    // 3. We've correctly set the start index and the iterator currently points
+    // to that position. Now we need to find the correct end position and
+    // advance the iterator to that position.
+    int ending_utf8_index = ubrk_next(break_iterator_);
+    if (ending_utf8_index == UBRK_DONE) {
+      // This shouldn't ever happen.
+      MarkAsDone();
+      return absl_ports::AbortedError(IcingStringUtil::StringPrintf(
+          "No segments end before provided offset %d.", offset));
+    }
+    term_end_index_exclusive_ = ending_utf8_index;
+
+    // 4. The start and end indices point to a segment, but we need to ensure
+    // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
+    // need a segment prior to this one.
+    CharacterIterator term_start_iterator = offset_iterator_;
+    if (!term_start_iterator.MoveToUtf8(term_start_index_)) {
+      return absl_ports::AbortedError(
+          "Could not retrieve valid utf8 character!");
+    }
+    if (term_end_index_exclusive_ > offset_iterator_.utf8_index() ||
+        !IsValidSegment()) {
+      return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
+    }
+    return term_start_iterator.utf32_index();
   }
 
-  libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+  libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
     term_start_index_ = 0;
     term_end_index_exclusive_ = 0;
     if (!Advance()) {
-      return absl_ports::NotFoundError("");
+      return absl_ports::NotFoundError(
+          "Unable to find any valid terms in text.");
     }
-    return term_start_index_;
+    if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+      return absl_ports::AbortedError(
+          "Could not retrieve valid utf8 character!");
+    }
+    return offset_iterator_.utf32_index();
   }
 
  private:
@@ -163,6 +254,7 @@
         text_(text),
         locale_(locale),
         u_text_(UTEXT_INITIALIZER),
+        offset_iterator_(text),
         term_start_index_(0),
         term_end_index_exclusive_(0) {}
 
@@ -232,6 +324,15 @@
   // utext_close() must be called after using.
   UText u_text_;
 
+  // Offset iterator. This iterator is not guaranteed to point to any particular
+  // character, but is guaranteed to point to a valid UTF character sequence.
+  //
+  // This iterator is used to save some amount of linear traversal when seeking
+  // to a specific UTF-32 offset. Each function that uses it could just create
+  // a CharacterIterator starting at the beginning of the text and traverse
+  // forward from there.
+  CharacterIterator offset_iterator_;
+
   // The start and end indices are used to track the positions of current
   // term.
   int term_start_index_;

diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index c0d6d43..01eb7d8 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc

@@ -12,24 +12,39 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
+#include <string_view>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "icing/absl_ports/str_cat.h"
 #include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
 #include "unicode/uloc.h"
 
 namespace icing {
 namespace lib {
-namespace {
+
 using ::testing::ElementsAre;
 using ::testing::Eq;
 using ::testing::IsEmpty;
 
+namespace {
+
+language_segmenter_factory::SegmenterOptions GetSegmenterOptions(
+    const std::string& locale, const JniCache* jni_cache) {
+  return language_segmenter_factory::SegmenterOptions(locale, jni_cache);
+}
+
 // Returns a vector containing all terms retrieved by Advancing on the iterator.
 std::vector<std::string_view> GetAllTermsAdvance(
     LanguageSegmenter::Iterator* itr) {
@@ -40,70 +55,61 @@
   return terms;
 }
 
-// Returns a vector containing all terms retrieved by calling
-// ResetToStart/ResetAfter with the current position to simulate Advancing on
-// the iterator.
-std::vector<std::string_view> GetAllTermsResetAfter(
+// Returns a vector containing all terms retrieved by calling ResetAfter with
+// the UTF-32 position of the current term start to simulate Advancing on the
+// iterator.
+std::vector<std::string_view> GetAllTermsResetAfterUtf32(
     LanguageSegmenter::Iterator* itr) {
   std::vector<std::string_view> terms;
-  if (!itr->ResetToStart().ok()) {
-    return terms;
-  }
-  terms.push_back(itr->GetTerm());
-  const char* text_begin = itr->GetTerm().data();
-  // Calling ResetToTermStartingAfter with the current position should get the
-  // very next term in the sequence.
-  for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
-       current_pos = itr->GetTerm().data() - text_begin) {
+  // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in
+  // the sequence.
+  bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok();
+  while (is_ok) {
     terms.push_back(itr->GetTerm());
+    // Calling ResetToTermStartingAfterUtf32 with the current position should
+    // get the very next term in the sequence.
+    CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+    is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
   }
   return terms;
 }
 
 // Returns a vector containing all terms retrieved by alternating calls to
-// Advance and calls to ResetAfter with the current position to simulate
-// Advancing.
-std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+// Advance and calls to ResetAfter with the UTF-32 position of the current term
+// start to simulate Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32(
     LanguageSegmenter::Iterator* itr) {
-  const char* text_begin = itr->GetTerm().data();
   std::vector<std::string_view> terms;
-
-  bool is_ok = true;
-  int current_pos = 0;
+  bool is_ok = itr->Advance();
   while (is_ok) {
+    terms.push_back(itr->GetTerm());
     // Alternate between using Advance and ResetToTermAfter.
     if (terms.size() % 2 == 0) {
       is_ok = itr->Advance();
     } else {
-      // Calling ResetToTermStartingAfter with the current position should get
-      // the very next term in the sequence.
-      current_pos = itr->GetTerm().data() - text_begin;
-      is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
-    }
-    if (is_ok) {
-      terms.push_back(itr->GetTerm());
+      // Calling ResetToTermStartingAfterUtf32 with the current position should
+      // get the very next term in the sequence.
+      CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+      is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
     }
   }
   return terms;
 }
 
 // Returns a vector containing all terms retrieved by calling ResetBefore with
-// the current position, starting at the end of the text. This vector should be
-// in reverse order of GetAllTerms and missing the last term.
-std::vector<std::string_view> GetAllTermsResetBefore(
+// the UTF-32 position of the current term start, starting at the end of the
+// text. This vector should be in reverse order of GetAllTerms and missing the
+// last term.
+std::vector<std::string_view> GetAllTermsResetBeforeUtf32(
     LanguageSegmenter::Iterator* itr) {
-  const char* text_begin = itr->GetTerm().data();
-  int last_pos = 0;
-  while (itr->Advance()) {
-    last_pos = itr->GetTerm().data() - text_begin;
-  }
   std::vector<std::string_view> terms;
-  // Calling ResetToTermEndingBefore with the current position should get the
-  // previous term in the sequence.
-  for (int current_pos = last_pos;
-       itr->ResetToTermEndingBefore(current_pos).ok();
-       current_pos = itr->GetTerm().data() - text_begin) {
+  bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok();
+  while (is_ok) {
     terms.push_back(itr->GetTerm());
+    // Calling ResetToTermEndingBeforeUtf32 with the current position should get
+    // the previous term in the sequence.
+    CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+    is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok();
   }
   return terms;
 }
@@ -119,27 +125,34 @@
   }
 
   static std::string GetLocale() { return GetParam(); }
-  static language_segmenter_factory::SegmenterOptions GetOptions() {
-    return language_segmenter_factory::SegmenterOptions(GetLocale());
-  }
+
+  std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
 };
 
+}  // namespace
+
 TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
               IsOkAndHolds(ElementsAre("Hello", " ", "World")));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // ASCII punctuation marks are kept
   EXPECT_THAT(
       language_segmenter->GetAllTerms("Hello, World!!!"),
@@ -153,8 +166,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // ASCII special characters are kept
   EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
               IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
@@ -169,8 +184,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Full-width (non-ASCII) punctuation marks and special characters are left
   // out.
   EXPECT_THAT(language_segmenter->GetAllTerms("。？·Hello！×"),
@@ -178,10 +195,12 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
-  EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
-              IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+  EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"),
+              IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank")));
   EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
               IsOkAndHolds(ElementsAre("I.B.M", ".")));
   EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
@@ -191,8 +210,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // According to unicode word break rules
   // WB6(https://unicode.org/reports/tr29/#WB6),
   // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
@@ -274,8 +295,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
               IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
   EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
@@ -295,8 +318,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
 
   EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
               IsOkAndHolds(ElementsAre("(", "Hello", ")")));
@@ -306,8 +331,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
 
   EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
               IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
@@ -317,8 +344,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
 
   // Alphanumeric terms are allowed
   EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
@@ -326,8 +355,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
 
   // Alphanumeric terms are allowed
   EXPECT_THAT(
@@ -342,8 +373,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Multiple continuous whitespaces are treated as one.
   const int kNumSeparators = 256;
   std::string text_with_spaces =
@@ -367,8 +400,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
   // have whitespaces as word delimiter.
 
@@ -389,15 +424,19 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
               IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Turkish
   EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
               IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
@@ -408,8 +447,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
               IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
                                        "吗", "お", "元気", "です", "か")));
@@ -420,8 +461,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Validates that the input strings are not copied
   const std::string text = "Hello World";
   const char* word1_address = text.c_str();
@@ -437,127 +480,141 @@
   EXPECT_THAT(word2_address, Eq(word2_result_address));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartWordConnector) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "com:google:android is package";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "com:google:android is package"
-  //          ^                 ^^ ^^
-  // Bytes:   0              18 19 21 22
-  auto position_or = itr->ResetToStart();
+  // String:      "com:google:android is package"
+  //               ^                 ^^ ^^
+  // UTF-8 idx:    0              18 19 21 22
+  // UTF-32 idx:   0              18 19 21 22
+  auto position_or = itr->ResetToStartUtf32();
   EXPECT_THAT(position_or, IsOk());
   ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStartUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
-  EXPECT_THAT(itr->GetTerm(), Eq("How"));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
-  constexpr std::string_view kText = "How are you你好吗お元気ですか";
-  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
-                             segmenter->Segment(kText));
-
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  ASSERT_TRUE(itr->Advance());  // itr points to 'How'
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       IteratorMultipleAdvancesResetToStart) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       IteratorOneAdvanceResetToStartUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
+  ASSERT_TRUE(itr->Advance());  // itr points to 'How'
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+       IteratorMultipleAdvancesResetToStartUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+  constexpr std::string_view kText = "How are you你好吗お元気ですか";
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+                             segmenter->Segment(kText));
+
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
   ASSERT_TRUE(itr->Advance());
   ASSERT_TRUE(itr->Advance());
   ASSERT_TRUE(itr->Advance());
   ASSERT_TRUE(itr->Advance());  // itr points to ' '
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStartUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
   while (itr->Advance()) {
     // Do nothing.
   }
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterWordConnector) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "package com:google:android name";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "package com:google:android name"
-  //          ^      ^^                 ^^
-  // Bytes:   0      7 8               26 27
-  auto position_or = itr->ResetToTermStartingAfter(8);
+  // String:     "package com:google:android name"
+  //              ^      ^^                 ^^
+  // UTF-8 idx:   0      7 8               26 27
+  // UTF-32 idx:  0      7 8               26 27
+  auto position_or = itr->ResetToTermStartingAfterUtf32(8);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(26));
   ASSERT_THAT(itr->GetTerm(), Eq(" "));
 
-  position_or = itr->ResetToTermStartingAfter(7);
+  position_or = itr->ResetToTermStartingAfterUtf32(7);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
   ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
+  ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
   ASSERT_THAT(itr->GetTerm(), Eq("you"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk());
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
 // Tests that ResetToTermAfter and Advance produce the same output. With the
@@ -566,9 +623,10 @@
 // terms produced by ResetToTermAfter calls with the current position
 // provided as the argument.
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       MixedLanguagesResetToTermAfterEquivalentToAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -580,16 +638,17 @@
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kText));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetAfter(reset_to_term_itr.get());
+      GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       ThaiResetToTermAfterEquivalentToAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       ThaiResetToTermAfterUtf32EquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -601,16 +660,17 @@
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kThai));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetAfter(reset_to_term_itr.get());
+      GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       KoreanResetToTermAfterEquivalentToAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       KoreanResetToTermAfterUtf32EquivalentToAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kKorean = "나는 매일 출근합니다.";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -622,7 +682,7 @@
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kKorean));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetAfter(reset_to_term_itr.get());
+      GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
@@ -633,9 +693,10 @@
 // should be able to mix ResetToTermAfter(current_position) calls and Advance
 // calls to mimic calling Advance.
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -647,7 +708,7 @@
       std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
       segmenter->Segment(kText));
   std::vector<std::string_view> advance_and_reset_terms =
-      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+      GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
 
   EXPECT_THAT(advance_and_reset_terms,
               testing::ElementsAreArray(advance_terms));
@@ -655,9 +716,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       ThaiResetToTermAfterInteroperableWithAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       ThaiResetToTermAfterUtf32InteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -669,7 +731,7 @@
       std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
       segmenter->Segment(kThai));
   std::vector<std::string_view> advance_and_reset_terms =
-      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+      GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
 
   EXPECT_THAT(advance_and_reset_terms,
               testing::ElementsAreArray(advance_terms));
@@ -677,9 +739,10 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       KoreanResetToTermAfterInteroperableWithAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       KoreanResetToTermAfterUtf32InteroperableWithAdvance) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kKorean = "나는 매일 출근합니다.";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -691,211 +754,234 @@
       std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
       segmenter->Segment(kKorean));
   std::vector<std::string_view> advance_and_reset_terms =
-      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+      GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
 
   EXPECT_THAT(advance_and_reset_terms,
               testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermAfterUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> itr,
       language_segmenter->Segment("How are you你好吗お元気ですか"));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+  // String:      "How are you你好吗お元気ですか"
+  //               ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:    0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:   0  3 4 7 8 11 131415 17 19
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11)));
   EXPECT_THAT(itr->GetTerm(), Eq("你好"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("you"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19)));
   EXPECT_THAT(itr->GetTerm(), Eq("か"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
   EXPECT_THAT(itr->GetTerm(), Eq("吗"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       ContinuousWhitespacesResetToTermAfter) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       ContinuousWhitespacesResetToTermAfterUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Multiple continuous whitespaces are treated as one.
   constexpr std::string_view kTextWithSpace = "Hello          World";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kTextWithSpace));
 
-  // String: "Hello          World"
-  //          ^    ^         ^
-  // Bytes:   0    5         15
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+  // String:      "Hello          World"
+  //               ^    ^         ^
+  // UTF-8 idx:    0    5         15
+  // UTF-32 idx:   0    5         15
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15)));
   EXPECT_THAT(itr->GetTerm(), Eq("World"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15)));
   EXPECT_THAT(itr->GetTerm(), Eq("World"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
   // don't have whitespaces as word delimiter. Chinese
   constexpr std::string_view kChinese = "我每天走路去上班。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kChinese));
-  // String: "我每天走路去上班。"
-  //          ^ ^  ^   ^^
-  // Bytes:   0 3  9  15 18
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  // String:       "我每天走路去上班。"
+  //                ^ ^  ^   ^^
+  // UTF-8 idx:     0 3  9  15 18
+  // UTF-32 idx:    0 1  3   5 6
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
   EXPECT_THAT(itr->GetTerm(), Eq("每天"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq("走路"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Japanese
   constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kJapanese));
-  // String: "私は毎日仕事に歩いています。"
-  //          ^ ^ ^  ^  ^ ^ ^ ^  ^
-  // Bytes:   0 3 6  12 18212427 33
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  // String:       "私は毎日仕事に歩いています。"
+  //                ^ ^ ^  ^  ^ ^ ^ ^  ^
+  // UTF-8 idx:     0 3 6  12 18212427 33
+  // UTF-32 idx:    0 1 2  4  6 7 8 9  11
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
   EXPECT_THAT(itr->GetTerm(), Eq("は"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
   EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kKhmer));
-  // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
-  //          ^ ^   ^   ^
-  // Bytes:   0 9   24  45
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  // String:            "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+  //                     ^ ^   ^   ^
+  // UTF-8 idx:          0 9   24  45
+  // UTF-32 idx:         0 3   8   15
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfterUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Thai
   constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kThai));
-  // String: "ฉันเดินไปทำงานทุกวัน"
-  //          ^ ^  ^ ^    ^ ^
-  // Bytes:   0 9 21 27  42 51
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  // String:      "ฉันเดินไปทำงานทุกวัน"
+  //               ^ ^  ^ ^    ^ ^
+  // UTF-8 idx:    0 9 21 27  42 51
+  // UTF-32 idx:   0 3  7 9   14 17
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7)));
   EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14)));
   EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeWordConnector) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+       ResetToTermBeforeWordConnectorUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "package name com:google:android!";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "package name com:google:android!"
-  //          ^      ^^   ^^                 ^
-  // Bytes:   0      7 8 12 13               31
-  auto position_or = itr->ResetToTermEndingBefore(31);
+  // String:      "package name com:google:android!"
+  //               ^      ^^   ^^                 ^
+  // UTF-8 idx:    0      7 8 12 13               31
+  // UTF-32 idx:   0      7 8 12 13               31
+  auto position_or = itr->ResetToTermEndingBeforeUtf32(31);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(13));
   ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
 
-  position_or = itr->ResetToTermEndingBefore(21);
+  position_or = itr->ResetToTermEndingBeforeUtf32(21);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(12));
   ASSERT_THAT(itr->GetTerm(), Eq(" "));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBoundsUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+  // String:      "How are you你好吗お元気ですか"
+  //               ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:    0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:   0  3 4 7 8 11 131415 17 19
+  ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
   ASSERT_THAT(itr->GetTerm(), Eq("are"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
   EXPECT_THAT(itr->GetTerm(), Eq("are"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-  EXPECT_THAT(itr->GetTerm(), Eq("are"));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk());
+  EXPECT_THAT(itr->GetTerm(), Eq("か"));
 }
 
 // Tests that ResetToTermBefore and Advance produce the same output. With the
@@ -904,26 +990,22 @@
 // terms produced by ResetToTermBefore calls with the current position
 // provided as the argument (after their order has been reversed).
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
       segmenter->Segment(kText));
   std::vector<std::string_view> advance_terms =
       GetAllTermsAdvance(advance_itr.get());
-  // Can't produce the last term via calls to ResetToTermBefore. So skip
-  // past that one.
-  auto itr = advance_terms.begin();
-  std::advance(itr, advance_terms.size() - 1);
-  advance_terms.erase(itr);
 
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kText));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetBefore(reset_to_term_itr.get());
+      GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
   std::reverse(reset_terms.begin(), reset_terms.end());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -932,26 +1014,22 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       ThaiResetToTermBeforeEquivalentToAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       ThaiResetToTermBeforeEquivalentToAdvanceUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
       segmenter->Segment(kThai));
   std::vector<std::string_view> advance_terms =
       GetAllTermsAdvance(advance_itr.get());
-  // Can't produce the last term via calls to ResetToTermBefore. So skip
-  // past that one.
-  auto itr = advance_terms.begin();
-  std::advance(itr, advance_terms.size() - 1);
-  advance_terms.erase(itr);
 
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kThai));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetBefore(reset_to_term_itr.get());
+      GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
   std::reverse(reset_terms.begin(), reset_terms.end());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -959,192 +1037,209 @@
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       KoreanResetToTermBeforeEquivalentToAdvance) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       KoreanResetToTermBeforeEquivalentToAdvanceUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto segmenter, language_segmenter_factory::Create(
+                          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kKorean = "나는 매일 출근합니다.";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
       segmenter->Segment(kKorean));
   std::vector<std::string_view> advance_terms =
       GetAllTermsAdvance(advance_itr.get());
-  // Can't produce the last term via calls to ResetToTermBefore. So skip
-  // past that one.
-  auto itr = advance_terms.begin();
-  std::advance(itr, advance_terms.size() - 1);
-  advance_terms.erase(itr);
 
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kKorean));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetBefore(reset_to_term_itr.get());
+      GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
   std::reverse(reset_terms.begin(), reset_terms.end());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+       MixedLanguagesResetToTermBeforeUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> itr,
       language_segmenter->Segment("How are you你好吗お元気ですか"));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+  // String:      "How are you你好吗お元気ですか"
+  //               ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:    0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:   0  3 4 7 8 11 131415 17 19
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
   EXPECT_THAT(itr->GetTerm(), Eq("are"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15)));
   EXPECT_THAT(itr->GetTerm(), Eq("元気"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("you"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17)));
   EXPECT_THAT(itr->GetTerm(), Eq("です"));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest,
-       ContinuousWhitespacesResetToTermBefore) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+       ContinuousWhitespacesResetToTermBeforeUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Multiple continuous whitespaces are treated as one.
   constexpr std::string_view kTextWithSpace = "Hello          World";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kTextWithSpace));
 
-  // String: "Hello          World"
-  //          ^    ^         ^
-  // Bytes:   0    5         15
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "Hello          World"
+  //               ^    ^         ^
+  // UTF-8 idx:    0    5         15
+  // UTF-32 idx:   0    5         15
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBeforeUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
   // don't have whitespaces as word delimiter. Chinese
   constexpr std::string_view kChinese = "我每天走路去上班。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kChinese));
-  // String: "我每天走路去上班。"
-  //          ^ ^  ^   ^^
-  // Bytes:   0 3  9  15 18
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "我每天走路去上班。"
+  //               ^ ^  ^   ^^
+  // UTF-8 idx:    0 3  9  15 18
+  // UTF-32 idx:   0 1  3   5 6
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("我"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq("去"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBeforeUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Japanese
   constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kJapanese));
-  // String: "私は毎日仕事に歩いています。"
-  //          ^ ^ ^  ^  ^ ^ ^ ^  ^
-  // Bytes:   0 3 6  12 18212427 33
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "私は毎日仕事に歩いています。"
+  //               ^ ^ ^  ^  ^ ^ ^ ^  ^
+  // UTF-8 idx:    0 3 6  12 18212427 33
+  // UTF-32 idx:   0 1 2  4  6 7 8 9  11
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9)));
   EXPECT_THAT(itr->GetTerm(), Eq("てい"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1)));
   EXPECT_THAT(itr->GetTerm(), Eq("は"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBeforeUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kKhmer));
-  // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
-  //          ^ ^   ^   ^
-  // Bytes:   0 9   24  45
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+  //               ^ ^   ^   ^
+  // UTF-8 idx:    0 9   24  45
+  // UTF-32 idx:   0 3   8   15
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
 }
 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBeforeUtf32) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Thai
   constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kThai));
-  // String: "ฉันเดินไปทำงานทุกวัน"
-  //          ^ ^  ^ ^    ^ ^
-  // Bytes:   0 9 21 27  42 51
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "ฉันเดินไปทำงานทุกวัน"
+  //               ^ ^  ^ ^    ^ ^
+  // UTF-8 idx:    0 9 21 27  42 51
+  // UTF-32 idx:   0 3  7 9   14 17
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14)));
   EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7)));
   EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
 }
 
 TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) {
-  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
-                             language_segmenter_factory::Create(GetOptions()));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
   // Validates that the input strings are not copied
   ICING_ASSERT_OK_AND_ASSIGN(
       std::vector<std::string_view> terms,
@@ -1174,6 +1269,5 @@
                     ""              // Will fall back to ICU default locale
                     ));
 
-}  // namespace
 }  // namespace lib
 }  // namespace icing

diff --git a/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc
new file mode 100644
index 0000000..3a94af3
--- /dev/null
+++ b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc

@@ -0,0 +1,37 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+// JNI entry point: runs every gtest case linked into this library and
+// reports whether RUN_ALL_TESTS() returned 0.
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_LanguageSegmenterIteratorJniTest_testsMain(JNIEnv* env,
+                                                          jclass ignored) {
+  // Publish the environment first so test code can reach it via g_jenv.
+  g_jenv = env;
+
+  // InitGoogleTest needs a mutable argc/argv; supply just a program name.
+  char program_name[] = "jni-test-lib";
+  std::vector<char*> fake_args(1, program_name);
+  int fake_argc = 1;
+  testing::InitGoogleTest(&fake_argc, fake_args.data());
+
+  testing::TestEventListeners& listeners =
+      testing::UnitTest::GetInstance()->listeners();
+  listeners.Append(new icing::lib::LoggingEventListener());
+
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code == 0;
+}

diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index 317da04..d293581 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc

@@ -18,6 +18,7 @@
 #include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/platform.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/language-segmenter.h"
@@ -43,10 +44,13 @@
               GetTestFilePath("icing/icu.dat")));
     }
   }
+
+  std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
 };
 
 TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -66,85 +70,91 @@
 }
 
 TEST_F(LanguageSegmenterIteratorTest,
-       ResetToTermStartingAfterWithOffsetInText) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+       ResetToTermStartingAfterUtf32WithOffsetInText) {
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
   ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
                              language_segmenter->Segment("foo bar"));
 
-  EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/0),
+  EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/0),
               IsOkAndHolds(3));  // The term " "
-  EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/3),
+  EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/3),
               IsOkAndHolds(4));  // The term "bar"
-  EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/4),
+  EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/4),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
 TEST_F(LanguageSegmenterIteratorTest,
-       ResetToTermStartingAfterWithNegativeOffsetNotOk) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+       ResetToTermStartingAfterUtf32WithNegativeOffsetOk) {
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
   ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
                              language_segmenter->Segment("foo bar"));
 
+  // Negative offsets no longer yield INVALID_ARGUMENT; both calls below are
+  // expected to succeed.
-  EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-1), IsOk());
 
-  EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-100), IsOk());
 
-  EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0));
+  EXPECT_THAT(iterator->ResetToStartUtf32(), IsOkAndHolds(0));
   EXPECT_THAT(iterator->GetTerm(), Eq("foo"));
 }
 
 TEST_F(LanguageSegmenterIteratorTest,
-       ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
+       ResetToTermStartingAfterUtf32WithTextLengthOffsetInvalidArgument) {
   std::string text = "foo bar";
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
   ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
 
-  EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
+  EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/text.length()),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 }
 
 TEST_F(LanguageSegmenterIteratorTest,
-       ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
+       ResetToTermStartingAfterUtf32WithOffsetPastTextLengthInvalidArgument) {
   std::string text = "foo bar";
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
   ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
 
-  EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
+  EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/100),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 }
 
-TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+TEST_F(LanguageSegmenterIteratorTest,
+       ResetToTermEndingBeforeUtf32WithOffsetInText) {
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
   ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
                              language_segmenter->Segment("foo bar"));
 
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/6),
+  EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/6),
               IsOkAndHolds(3));  // The term " "
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/3),
+  EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/3),
               IsOkAndHolds(0));  // The term "foo"
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/2),
+  EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/2),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
-TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithZeroNotFound) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+TEST_F(LanguageSegmenterIteratorTest,
+       ResetToTermEndingBeforeUtf32WithZeroNotFound) {
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -152,40 +162,43 @@
                              language_segmenter->Segment("foo bar"));
 
   // Zero is a valid argument, but there aren't any terms that end before it.
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0),
+  EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }
 
 TEST_F(LanguageSegmenterIteratorTest,
-       ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+       ResetToTermEndingBeforeUtf32WithNegativeOffsetInvalidArgument) {
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
   ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
                              language_segmenter->Segment("foo bar"));
 
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1),
+  EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-1),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100),
+  EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-100),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 }
 
 TEST_F(LanguageSegmenterIteratorTest,
-       ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
+       ResetToTermEndingBeforeUtf32WithOffsetPastTextEndOk) {
   std::string text = "foo bar";
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
   ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
 
+  // Offsets at or past the end of the text no longer yield INVALID_ARGUMENT;
+  // both calls below are expected to succeed.
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length()),
+              IsOk());
 
-  EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(
+      iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length() + 1),
+      IsOk());
 }
 
 }  // namespace

diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h
index 7ca31d1..913386a 100644
--- a/icing/tokenization/language-segmenter.h
+++ b/icing/tokenization/language-segmenter.h

@@ -21,6 +21,8 @@
 #include <vector>
 
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/character-iterator.h"
 
 namespace icing {
 namespace lib {
@@ -56,51 +58,81 @@
     // true.
     virtual std::string_view GetTerm() const = 0;
 
-    // Resets the iterator to point to the first term that starts after offset.
+    // Returns:
+    //   On success, a CharacterIterator pointing to the beginning of the
+    //   current term.
+    //   ABORTED if an invalid unicode character is encountered while
+    //   calculating the term start. UNIMPLEMENTED unless overridden.
+    virtual libtextclassifier3::StatusOr<CharacterIterator>
+    CalculateTermStart() {
+      return absl_ports::UnimplementedError("");  // Base-class default.
+    }
+
+    // Returns:
+    //   On success, a CharacterIterator pointing just past the end of the
+    //   current term.
+    //   ABORTED if an invalid unicode character is encountered while
+    //   calculating the term end. UNIMPLEMENTED unless overridden.
+    virtual libtextclassifier3::StatusOr<CharacterIterator>
+    CalculateTermEndExclusive() {
+      return absl_ports::UnimplementedError("");  // Base-class default.
+    }
+
+    // Resets the iterator to point to the first term that starts after UTF-32
+    // offset.
     // GetTerm will now return that term. For example:
     //
     //   language_segmenter = language_segmenter_factory::Create(type);
     //   iterator = language_segmenter->Segment("foo bar baz");
-    //   iterator.ResetToTermStartingAfter(4);
+    //   iterator.ResetToTermStartingAfterUtf32(4);
     //   iterator.GetTerm() // returns "baz";
     //
     // Return types of OK and NOT_FOUND indicate that the function call was
     // valid and the state of the iterator has changed. Return type of
-    // INVALID_ARGUMENT will leave the iterator unchanged.
+    // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
+    // of ABORTED means that the iterator may be left in an undefined state and
+    // no longer be usable.
     //
     // Returns:
-    //   On success, the starting position of the first term that starts after
+    //   On success, the UTF-32 offset of the first term that starts after
     //   offset.
     //   NOT_FOUND if an error occurred or there are no terms that start after
     //   offset.
-    //   INVALID_ARGUMENT if offset is out of bounds for the provided text.
+    //   INVALID_ARGUMENT if offset is at or beyond the end of the text.
     //   ABORTED if an invalid unicode character is encountered while
     //   traversing the text.
-    virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
-        int32_t offset) = 0;
+    virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
+        int32_t offset) {
+      return absl_ports::UnimplementedError("");
+    }
 
-    // Resets the iterator to point to the first term that ends before offset.
+    // Resets the iterator to point to the first term that ends before UTF-32
+    // offset.
     // GetTerm will now return that term. For example:
     //
     //   language_segmenter = language_segmenter_factory::Create(type);
     //   iterator = language_segmenter->Segment("foo bar baz");
-    //   iterator.ResetToTermEndingBefore(7);
+    //   iterator.ResetToTermEndingBeforeUtf32(7);
     //   iterator.GetTerm() // returns "bar";
     //
     // Return types of OK and NOT_FOUND indicate that the function call was
     // valid and the state of the iterator has changed. Return type of
-    // INVALID_ARGUMENT will leave the iterator unchanged.
+    // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
+    // of ABORTED means that the iterator may be left in an undefined state and
+    // no longer be usable.
     //
     // Returns:
-    //   On success, the starting position of the first term that ends before
+    //   On success, the UTF-32 offset of the first term that ends before
     //   offset.
     //   NOT_FOUND if an error occurred or there are no terms that ends before
     //   offset.
-    //   INVALID_ARGUMENT if offset is out of bounds for the provided text.
+    //   INVALID_ARGUMENT if offset is negative
     //   ABORTED if an invalid unicode character is encountered while
     //   traversing the text.
-    virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
-        int32_t offset) = 0;
+    virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
+        int32_t offset) {
+      return absl_ports::UnimplementedError("");
+    }
 
     // Resets the iterator to point to the first term.
     // GetTerm will now return that term. For example:
@@ -108,7 +140,7 @@
     //   language_segmenter = language_segmenter_factory::Create(type);
     //   iterator = language_segmenter->Segment("foo bar baz");
     //   iterator.Advance();
-    //   iterator.ResetToStart();
+    //   iterator.ResetToStartUtf32();
     //   iterator.GetTerm() // returns "foo";
     //
     // Return types of OK and NOT_FOUND indicate that the function call was
@@ -119,7 +151,7 @@
     //   NOT_FOUND if an error occurred or there are no valid terms in the text.
     //   ABORTED if an invalid unicode character is encountered while
     //   traversing the text.
-    virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0;
+    virtual libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() = 0;
   };
 
   // Segments the input text into terms.

diff --git a/icing/tokenization/plain-tokenizer-test-jni-layer.cc b/icing/tokenization/plain-tokenizer-test-jni-layer.cc
new file mode 100644
index 0000000..efa6427
--- /dev/null
+++ b/icing/tokenization/plain-tokenizer-test-jni-layer.cc

@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+// JNI entry point for the plain-tokenizer tests: executes all registered
+// gtest cases and returns true only when RUN_ALL_TESTS() yields 0.
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_PlainTokenizerJniTest_testsMain(JNIEnv* env, jclass ignored) {
+  // Make the JNIEnv reachable from test code through the g_jenv global.
+  g_jenv = env;
+
+  // Build the minimal argc/argv pair that InitGoogleTest expects: a single
+  // writable program-name string.
+  char binary_name[] = "jni-test-lib";
+  std::vector<char*> synthetic_argv(1, binary_name);
+  int synthetic_argc = 1;
+  testing::InitGoogleTest(&synthetic_argc, synthetic_argv.data());
+
+  testing::TestEventListeners& event_listeners =
+      testing::UnitTest::GetInstance()->listeners();
+  event_listeners.Append(new icing::lib::LoggingEventListener());
+
+  const int run_result = RUN_ALL_TESTS();
+  return run_result == 0;
+}

diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index 6e54af9..13fe550 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc

@@ -18,6 +18,7 @@
 
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
 #include "icing/util/i18n-utils.h"
 #include "icing/util/status-macros.h"
 
@@ -70,8 +71,18 @@
     return Token(Token::REGULAR, current_term_);
   }
 
+  libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
+      override {
+    return base_iterator_->CalculateTermStart();  // Forwarded to segmenter.
+  }
+
+  libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
+      override {
+    return base_iterator_->CalculateTermEndExclusive();  // Forwarded as-is.
+  }
+
   bool ResetToTokenAfter(int32_t offset) override {
-    if (!base_iterator_->ResetToTermStartingAfter(offset).ok()) {
+    if (!base_iterator_->ResetToTermStartingAfterUtf32(offset).ok()) {
       return false;
     }
     current_term_ = base_iterator_->GetTerm();
@@ -84,20 +95,20 @@
 
   bool ResetToTokenBefore(int32_t offset) override {
     ICING_ASSIGN_OR_RETURN(
-        offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+        offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
     current_term_ = base_iterator_->GetTerm();
     while (!IsValidTerm(current_term_)) {
       // Haven't found a valid term yet. Retrieve the term prior to this one
       // from the segmenter.
       ICING_ASSIGN_OR_RETURN(
-          offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+          offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
       current_term_ = base_iterator_->GetTerm();
     }
     return true;
   }
 
   bool ResetToStart() override {
-    if (!base_iterator_->ResetToStart().ok()) {
+    if (!base_iterator_->ResetToStartUtf32().ok()) {
       return false;
     }
     current_term_ = base_iterator_->GetTerm();

diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index 2fb9750..7490bfa 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc

@@ -22,6 +22,7 @@
 #include "icing/portable/platform.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/tokenizer-factory.h"
@@ -43,6 +44,8 @@
               GetTestFilePath("icing/icu.dat")));
     }
   }
+
+  std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
 };
 
 TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) {
@@ -53,7 +56,8 @@
 }
 
 TEST_F(PlainTokenizerTest, Simple) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -87,7 +91,8 @@
 }
 
 TEST_F(PlainTokenizerTest, Whitespace) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -115,7 +120,8 @@
 }
 
 TEST_F(PlainTokenizerTest, Punctuation) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -161,7 +167,8 @@
 }
 
 TEST_F(PlainTokenizerTest, SpecialCharacters) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -187,7 +194,8 @@
   // In plain tokenizer, CJKT characters are handled the same way as non-CJKT
   // characters, just add these tests as sanity checks.
   // Chinese
-  language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE);
+  language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -202,7 +210,8 @@
                                        EqualsToken(Token::REGULAR, "去"),
                                        EqualsToken(Token::REGULAR, "上班"))));
   // Japanese
-  options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE);
+  options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE,
+                                                         jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -272,7 +281,8 @@
 }
 
 TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -291,7 +301,8 @@
 }
 
 TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -310,7 +321,8 @@
 }
 
 TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));
@@ -360,7 +372,8 @@
 }
 
 TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
+  language_segmenter_factory::SegmenterOptions options(ULOC_US,
+                                                       jni_cache_.get());
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(std::move(options)));

diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
new file mode 100644
index 0000000..6b1cb3a
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc

@@ -0,0 +1,187 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h"
+
+#include <jni.h>
+#include <math.h>
+
+#include <cassert>
+#include <cctype>
+#include <map>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Chosen based on results in go/reverse-jni-benchmarks
+static constexpr int kBatchSize = 100;
+}  // namespace
+
+// -----------------------------------------------------------------------------
+// Implementations that call out to JVM. Behold the beauty.
+// -----------------------------------------------------------------------------
+libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+ReverseJniBreakIterator::Create(const JniCache* jni_cache,
+                                std::string_view text,
+                                std::string_view locale) {
+  if (jni_cache == nullptr) {  // Every step below goes through jni_cache.
+    return absl_ports::InvalidArgumentError(
+        "Create must be called with a valid JniCache pointer!");
+  }
+
+  ICING_ASSIGN_OR_RETURN(  // Step 1: the text as a Java String.
+      libtextclassifier3::ScopedLocalRef<jstring> java_text,
+      jni_cache->ConvertToJavaString(text.data(), text.length()));
+  if (java_text.get() == nullptr) {
+    return absl_ports::AbortedError("Failed to create Java String from input.");
+  }
+
+  ICING_ASSIGN_OR_RETURN(  // Step 2: the locale name as a Java String.
+      libtextclassifier3::ScopedLocalRef<jstring> java_locale_string,
+      jni_cache->ConvertToJavaString(locale.data(), locale.length()));
+  if (java_locale_string.get() == nullptr) {
+    return absl_ports::AbortedError(
+        "Failed to create Java String from locale.");
+  }
+
+  JNIEnv* jenv = jni_cache->GetEnv();
+  ICING_ASSIGN_OR_RETURN(  // Step 3: a Java Locale built from that name.
+      libtextclassifier3::ScopedLocalRef<jobject> java_locale,
+      libtextclassifier3::JniHelper::NewObject(
+          jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor,
+          java_locale_string.get()));
+  if (java_locale.get() == nullptr) {
+    return absl_ports::AbortedError(
+        "Failed to create Java Locale from locale.");
+  }
+
+  ICING_ASSIGN_OR_RETURN(  // Step 4: the Java batcher for that Locale.
+      libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher,
+      libtextclassifier3::JniHelper::NewObject(
+          jenv, jni_cache->breakiterator_class.get(),
+          jni_cache->breakiterator_constructor, java_locale.get()));
+  libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher =
+      libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv,
+                                        jni_cache->jvm);  // Kept by the result.
+  if (iterator_batcher.get() == nullptr) {
+    return absl_ports::AbortedError(
+        "Failed to create Java BreakIteratorBatcher.");
+  }
+
+  ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod(
+      jenv, iterator_batcher.get(), jni_cache->breakiterator_settext,
+      java_text.get()));  // Step 5: hand the text to the batcher.
+  return std::unique_ptr<ReverseJniBreakIterator>(
+      new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher)));
+}
+
+ReverseJniBreakIterator::ReverseJniBreakIterator(
+    const JniCache* jni_cache,
+    libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher)
+    : jni_cache_(jni_cache),  // Stored as a raw pointer; not owned here.
+      iterator_batcher_(std::move(iterator_batcher)),
+      is_done_(false),  // Next() has not reported kDone yet.
+      is_almost_done_(false) {}  // Set by Next() after a short batch.
+
+// Returns the next break index, or kDone once iteration has finished or a
+// batch fetch has failed. Indices are served from break_indices_cache_, which
+// is refilled through FetchNextBatch() whenever it runs empty.
+int ReverseJniBreakIterator::Next() {
+  if (is_done_) return kDone;
+
+  const bool needs_refill = break_indices_cache_.empty();
+  if (needs_refill) {
+    const int fetched = FetchNextBatch();
+    if (fetched == kDone) {
+      // No more results or an error: both are terminal for this iterator.
+      is_done_ = true;
+      return kDone;
+    }
+    // A batch shorter than kBatchSize is taken to be the final one.
+    is_almost_done_ = break_indices_cache_.size() < kBatchSize;
+  }
+
+  const int next_break = break_indices_cache_.front();
+  break_indices_cache_.pop();
+  // Finished once the final batch has been fully drained.
+  is_done_ = break_indices_cache_.empty() && is_almost_done_;
+  return next_break;
+}
+
+// Calls the Java-side first() method and returns the index it reports.
+// Returns kDone (leaving the cache untouched) if the call raised an
+// exception; otherwise the cached batch is discarded.
+int ReverseJniBreakIterator::First() {
+  JNIEnv* env = jni_cache_->GetEnv();
+  const int result = env->CallIntMethod(iterator_batcher_.get(),
+                                        jni_cache_->breakiterator_first);
+  if (jni_cache_->ExceptionCheckAndClear()) return kDone;
+
+  ClearCache();
+  return result;
+}
+
+// Calls the Java-side preceding(offset) method and returns the index it
+// reports. Returns kDone (leaving the cache untouched) if the call raised an
+// exception; otherwise the cached batch is discarded.
+int ReverseJniBreakIterator::Preceding(int offset) {
+  JNIEnv* env = jni_cache_->GetEnv();
+  const int result = env->CallIntMethod(
+      iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset);
+  if (jni_cache_->ExceptionCheckAndClear()) return kDone;
+
+  ClearCache();
+  return result;
+}
+
+// Calls the Java-side following(offset) method and returns the index it
+// reports. Returns kDone (leaving the cache untouched) if the call raised an
+// exception; otherwise the cached batch is discarded.
+int ReverseJniBreakIterator::Following(int offset) {
+  JNIEnv* env = jni_cache_->GetEnv();
+  const int result = env->CallIntMethod(
+      iterator_batcher_.get(), jni_cache_->breakiterator_following, offset);
+  if (jni_cache_->ExceptionCheckAndClear()) return kDone;
+
+  ClearCache();
+  return result;
+}
+
+// Requests up to kBatchSize further break indices from the Java batcher and
+// appends them to break_indices_cache_.
+//
+// Returns:
+//   The number of indices appended (always > 0) on success.
+//   kDone if the Java call failed or raised an exception, returned no array
+//   or an empty array, or the array contents could not be accessed.
+int ReverseJniBreakIterator::FetchNextBatch() {
+  ICING_ASSIGN_OR_RETURN(
+      libtextclassifier3::ScopedLocalRef<jintArray> break_indices,
+      libtextclassifier3::JniHelper::CallObjectMethod<jintArray>(
+          jni_cache_->GetEnv(), iterator_batcher_.get(),
+          jni_cache_->breakiterator_next, kBatchSize),
+      ReverseJniBreakIterator::kDone);
+  if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) {
+    return ReverseJniBreakIterator::kDone;
+  }
+  jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get());
+  if (num_indices == 0) {
+    return ReverseJniBreakIterator::kDone;
+  }
+  jint* break_indices_arr =
+      static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical(
+          break_indices.get(), nullptr));
+  if (break_indices_arr == nullptr) {
+    // GetPrimitiveArrayCritical may return NULL when the JVM cannot expose
+    // the elements. Clear any exception it left pending and report failure
+    // instead of dereferencing a null pointer. Nothing was acquired, so there
+    // is nothing to release.
+    static_cast<void>(jni_cache_->ExceptionCheckAndClear());
+    return ReverseJniBreakIterator::kDone;
+  }
+  // Only plain copies happen between Get/ReleasePrimitiveArrayCritical; no
+  // other JNI calls are made inside the critical region.
+  for (int i = 0; i < num_indices; ++i) {
+    break_indices_cache_.push(break_indices_arr[i]);
+  }
+  jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(),
+                                                      break_indices_arr,
+                                                      /*mode=*/0);
+  return num_indices;
+}
+
+// Discards any buffered break indices and resets both completion flags, so
+// the next call to Next() has to fetch a fresh batch.
+void ReverseJniBreakIterator::ClearCache() {
+  is_almost_done_ = false;
+  is_done_ = false;
+  break_indices_cache_ = std::queue<int>{};
+}
+
+}  // namespace lib
+}  // namespace icing

diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
new file mode 100644
index 0000000..41b470c
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h

@@ -0,0 +1,124 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+
+#include <jni.h>
+
+#include <queue>
+#include <string>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+namespace icing {
+namespace lib {
+
+// A class that handles the cross-JNI interactions with BreakIteratorBatcher and
+// hides the batching element to provide an interface akin to
+// java.text.BreakIterator.
+//
+// Example:
+// std::string text = "我每天走路去上班。";
+// ASSERT_THAT(text, SizeIs(27));
+// std::unique_ptr<ReverseJniBreakIterator> itr =
+//     ReverseJniBreakIterator::Create(jni_cache, text, locale).ValueOrDie();
+// std::vector<int> nexts;
+// int next = itr->Next();
+// while (next != ReverseJniBreakIterator::kDone) {
+//   nexts.push_back(next);
+//   next = itr->Next();
+// }
+// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8));
+class ReverseJniBreakIterator {
+ public:
+  static constexpr int kDone = -1;  // Sentinel: no boundary to report.
+
+  // Creates a ReverseJniBreakIterator with the given text and locale.
+  //
+  // Returns:
+  //   A ReverseJniBreakIterator on success
+  //   INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer
+  //   INTERNAL if unable to create any of the required Java objects
+  static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+  Create(const JniCache* jni_cache, std::string_view text,
+         std::string_view locale);
+
+  // Returns the UTF-16 boundary following the current boundary. If the current
+  // boundary is the last text boundary, it returns
+  // ReverseJniBreakIterator::kDone.
+  //
+  // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8
+  // boundary. Callers interested in the UTF-8 boundary are required to maintain
+  // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries.
+  int Next();
+
+  // Returns the first UTF-16 boundary. The iterator's current position is set
+  // to the first text boundary and any cached data is cleared.
+  int First();
+
+  // Returns the position of the first UTF-16 boundary preceding the UTF-16
+  // offset. If there is no boundary preceding the specified offset, then
+  // ReverseJniBreakIterator::kDone is returned.
+  //
+  // The iterator's current position is set to the segment whose boundary was
+  // returned and any cached data is cleared.
+  int Preceding(int offset);
+
+  // Returns the position of the first UTF-16 boundary following the UTF-16
+  // offset. If there is no boundary following the specified offset, then
+  // ReverseJniBreakIterator::kDone is returned.
+  //
+  // The iterator's current position is set to the segment whose boundary
+  // was returned and any cached data is cleared.
+  int Following(int offset);
+
+ private:
+  ReverseJniBreakIterator(
+      const JniCache* jni_cache,
+      libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher);
+
+  // Fetches the results of up to kBatchSize next calls and stores them in
+  // break_indices_cache_. Returns the number of results or kDone if no more
+  // results could be fetched (empty/null batch or a Java exception).
+  int FetchNextBatch();
+
+  // Empties the cache and sets is_done_ and is_almost_done_ to false.
+  void ClearCache();
+
+  // Keeps track of references to Java classes and methods. Does NOT own.
+  const JniCache* jni_cache_;
+
+  // The reference to the actual instance of BreakIteratorBatcher that
+  // this class interacts with.
+  libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_;
+
+  // The cache holding the most recent batch of return values from
+  // BreakIteratorBatcher#next.
+  std::queue<int> break_indices_cache_;
+
+  bool is_done_;  // Cleared by ClearCache(); presumably set when exhausted.
+
+  // The last batch was incomplete (< kBatchSize results were returned). The
+  // next call to BreakIteratorBatcher#next is guaranteed to return an
+  // empty array. Once the results from the last batch are evicted from
+  // break_indices_cache_, ReverseJniBreakIterator will transition to is_done_.
+  bool is_almost_done_;
+};
+
+}  // namespace lib
+}  // namespace icing
+
+#endif  // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_

diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index bb26364..76219b5 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc

@@ -19,11 +19,11 @@
 #include <string>
 #include <string_view>
 
-#include "icing/jni/reverse-jni-break-iterator.h"
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/absl_ports/canonical_errors.h"
 #include "icing/legacy/core/icing-string-util.h"
 #include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h"
 #include "icing/util/character-iterator.h"
 #include "icing/util/i18n-utils.h"
 #include "icing/util/status-macros.h"
@@ -44,13 +44,13 @@
   // Advances to the next term. Returns false if it has reached the end.
   bool Advance() override {
     // Prerequisite check
-    if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+    if (IsDone()) {
       return false;
     }
 
     if (term_end_exclusive_.utf16_index() == 0) {
       int first = break_iterator_->First();
-      if (!term_start_.AdvanceToUtf16(first)) {
+      if (!term_start_.MoveToUtf16(first)) {
         // First is guaranteed to succeed and return a position within bonds. So
         // the only possible failure could be an invalid sequence. Mark as DONE
         // and return.
@@ -67,7 +67,7 @@
       MarkAsDone();
       return false;
     }
-    if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) {
+    if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
       // next_utf16_index_exclusive is guaranteed to be within bonds thanks to
       // the check for kDone above. So the only possible failure could be an
       // invalid sequence. Mark as DONE and return.
@@ -87,6 +87,9 @@
   // Returns the current term. It can be called only when Advance() returns
   // true.
   std::string_view GetTerm() const override {
+    if (IsDone()) {
+      return text_.substr(0, 0);
+    }
     int term_length =
         term_end_exclusive_.utf8_index() - term_start_.utf8_index();
     if (term_length > 0 && std::isspace(text_[term_start_.utf8_index()])) {
@@ -96,6 +99,16 @@
     return text_.substr(term_start_.utf8_index(), term_length);
   }
 
+  libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
+      override {
+    return term_start_;  // Snapshot of the current term's start position.
+  }
+
+  libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
+      override {
+    return term_end_exclusive_;  // Snapshot of the position just past the term.
+  }
+
   // Resets the iterator to point to the first term that starts after offset.
   // GetTerm will now return that term.
   //
@@ -107,15 +120,14 @@
   //   INVALID_ARGUMENT if offset is out of bounds for the provided text.
   //   ABORTED if an invalid unicode character is encountered while
   //   traversing the text.
-  libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+  libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
       int32_t offset) override {
-    if (offset < 0 || offset >= text_.length()) {
-      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
-          "Illegal offset provided! Offset %d is not within bounds of string "
-          "of length %zu",
-          offset, text_.length()));
+    if (offset < 0) {
+      // Very simple. The first term starting after a negative offset is the
+      // first term. So just reset to start.
+      return ResetToStartUtf32();
     }
-    if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+    if (IsDone()) {
       // We're done. Need to start from the beginning if we're going to reset
       // properly.
       term_start_ = CharacterIterator(text_);
@@ -123,43 +135,48 @@
     }
 
     // 1. Find the unicode character that contains the byte at offset.
-    CharacterIterator offset_iterator = term_end_exclusive_;
-    bool success = (offset > offset_iterator.utf8_index())
-                       ? offset_iterator.AdvanceToUtf8(offset)
-                       : offset_iterator.RewindToUtf8(offset);
-    if (!success) {
-      // Offset is guaranteed to be within bounds thanks to the check above. So
-      // the only possible failure could be an invalid sequence. Mark as DONE
-      // and return.
-      MarkAsDone();
-      return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+    CharacterIterator offset_iterator = (offset < term_start_.utf32_index())
+                                            ? term_start_
+                                            : term_end_exclusive_;
+    if (!offset_iterator.MoveToUtf32(offset)) {
+      if (offset_iterator.utf8_index() != text_.length()) {
+        // We returned false for some reason other than hitting the end. This is
+        // a real error. Just return.
+        MarkAsDone();
+        return absl_ports::AbortedError(
+            "Could not retrieve valid utf8 character!");
+      }
+    }
+    // Check to see if offset is past the end of the text. If it is, then
+    // there's no term starting after it. Return an invalid argument.
+    if (offset_iterator.utf8_index() == text_.length()) {
+      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+          "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
+          "bounds of string of length %zu",
+          offset_iterator.utf32_index(), offset_iterator.utf8_index(),
+          text_.length()));
     }
 
     // 2. We've got the unicode character containing byte offset. Now, we need
     // to point to the segment that starts after this character.
     int following_utf16_index =
         break_iterator_->Following(offset_iterator.utf16_index());
-    if (following_utf16_index == ReverseJniBreakIterator::kDone) {
+    if (following_utf16_index == ReverseJniBreakIterator::kDone ||
+        !offset_iterator.MoveToUtf16(following_utf16_index)) {
       MarkAsDone();
       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
           "No segments begin after provided offset %d.", offset));
     }
-    if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) {
-      // following_utf16_index is guaranteed to be within bonds thanks to the
-      // check for kDone above. So the only possible failure could be an invalid
-      // sequence. Mark as DONE and return.
-      MarkAsDone();
-      return absl_ports::AbortedError("Encountered invalid UTF sequence!");
-    }
     term_end_exclusive_ = offset_iterator;
 
-    // 3. The term_end_exclusive_ points to the term that we want to return. We
-    // need to Advance so that term_start_ will now point to this term.
+    // 3. The term_end_exclusive_ points to the start of the term that we want
+    // to return. We need to Advance so that term_start_ will now point to this
+    // term.
     if (!Advance()) {
       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
           "No segments begin after provided offset %d.", offset));
     }
-    return term_start_.utf8_index();
+    return term_start_.utf32_index();
   }
 
   // Resets the iterator to point to the first term that ends before offset.
@@ -173,52 +190,48 @@
   //   INVALID_ARGUMENT if offset is out of bounds for the provided text.
   //   ABORTED if an invalid unicode character is encountered while
   //   traversing the text.
-  libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+  libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
       int32_t offset) override {
-    if (offset < 0 || offset >= text_.length()) {
+    if (offset < 0) {
       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
           "Illegal offset provided! Offset %d is not within bounds of string "
           "of length %zu",
           offset, text_.length()));
     }
-    if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+    if (IsDone()) {
       // We're done. Need to start from the beginning if we're going to reset
       // properly.
       term_start_ = CharacterIterator(text_);
       term_end_exclusive_ = CharacterIterator(text_);
     }
 
-    // 1. Find the unicode character that contains the byte at offset.
-    CharacterIterator offset_iterator = term_end_exclusive_;
-    bool success = (offset > offset_iterator.utf8_index())
-                       ? offset_iterator.AdvanceToUtf8(offset)
-                       : offset_iterator.RewindToUtf8(offset);
-    if (!success) {
-      // Offset is guaranteed to be within bounds thanks to the check above. So
-      // the only possible failure could be an invalid sequence. Mark as DONE
-      // and return.
-      MarkAsDone();
-      return absl_ports::AbortedError(
-          "Could not retrieve valid utf8 character!");
+    CharacterIterator offset_iterator = (offset < term_start_.utf32_index())
+                                            ? term_start_
+                                            : term_end_exclusive_;
+    if (!offset_iterator.MoveToUtf32(offset)) {
+      // MoveToUtf32 failed; only a failure short of the end is a real error.
+      if (offset_iterator.utf8_index() != text_.length()) {
+        // We returned false for some reason other than hitting the end. This is
+        // a real error. Just return.
+        MarkAsDone();
+        return absl_ports::AbortedError(
+            "Could not retrieve valid utf8 character!");
+      }
+      // If it returned false because we hit the end, then that's fine. We'll
+      // just treat it as if the request was for the end.
     }
 
     // 2. We've got the unicode character containing byte offset. Now, we need
-    // to point to the segment that starts before this character.
+    // to point to the segment that ends before this character.
     int starting_utf16_index =
         break_iterator_->Preceding(offset_iterator.utf16_index());
-    if (starting_utf16_index == ReverseJniBreakIterator::kDone) {
+    if (starting_utf16_index == ReverseJniBreakIterator::kDone ||
+        !offset_iterator.MoveToUtf16(starting_utf16_index)) {
       // Rewind the end indices.
       MarkAsDone();
       return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
           "No segments end before provided offset %d.", offset));
     }
-    if (!offset_iterator.RewindToUtf16(starting_utf16_index)) {
-      // starting_utf16_index is guaranteed to be within bonds thanks to the
-      // check for kDone above. So the only possible failure could be an invalid
-      // sequence. Mark as DONE and return.
-      MarkAsDone();
-      return absl_ports::AbortedError("Encountered invalid UTF sequence!");
-    }
     term_start_ = offset_iterator;
 
     // 3. We've correctly set the start index and the iterator currently points
@@ -226,24 +239,25 @@
     // advance the iterator to that position.
     int end_utf16_index = break_iterator_->Next();
     term_end_exclusive_ = term_start_;
-    term_end_exclusive_.AdvanceToUtf16(end_utf16_index);
+    term_end_exclusive_.MoveToUtf16(end_utf16_index);
 
     // 4. The start and end indices point to a segment, but we need to ensure
     // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
     // need a segment prior to this one.
-    if (term_end_exclusive_.utf8_index() > offset || !IsValidTerm()) {
-      return ResetToTermEndingBefore(term_start_.utf8_index());
+    if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) {
+      return ResetToTermEndingBeforeUtf32(term_start_.utf32_index());
     }
-    return term_start_.utf8_index();
+    return term_start_.utf32_index();
   }
 
-  libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+  libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
     term_start_ = CharacterIterator(text_);
     term_end_exclusive_ = CharacterIterator(text_);
     if (!Advance()) {
-      return absl_ports::NotFoundError("");
+      return absl_ports::NotFoundError(
+          "Unable to find any valid terms in text.");
     }
-    return term_start_.utf8_index();
+    return term_start_.utf32_index();
   }
 
  private:
@@ -255,11 +269,19 @@
   // break_iterator_ may be in any state.
   void MarkAsDone() {
     term_start_ =
-        CharacterIterator(text_, /*utf8_index=*/0,
-                          /*utf16_index=*/ReverseJniBreakIterator::kDone);
+        CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone,
+                          /*utf16_index=*/ReverseJniBreakIterator::kDone,
+                          /*utf32_index=*/ReverseJniBreakIterator::kDone);
     term_end_exclusive_ =
-        CharacterIterator(text_, /*utf8_index=*/0,
-                          /*utf16_index=*/ReverseJniBreakIterator::kDone);
+        CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone,
+                          /*utf16_index=*/ReverseJniBreakIterator::kDone,
+                          /*utf32_index=*/ReverseJniBreakIterator::kDone);
+  }
+  bool IsDone() const {
+    // MarkAsDone() stores kDone in every index of both iterators, so testing
+    // any single one of them is sufficient; the UTF-16 end index is used here.
+    const auto end_utf16 = term_end_exclusive_.utf16_index();
+    return end_utf16 == ReverseJniBreakIterator::kDone;
   }
 
   bool IsValidTerm() const {

diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 72c3180..b1a8f72 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc

@@ -27,6 +27,7 @@
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/tokenization/language-segmenter-factory.h"
 #include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
 #include "unicode/uloc.h"
 
 namespace icing {
@@ -56,68 +57,60 @@
 }
 
 // Returns a vector containing all terms retrieved by calling ResetAfter with
-// the current position to simulate Advancing on the iterator.
-std::vector<std::string_view> GetAllTermsResetAfter(
+// the UTF-32 position of the current term start to simulate Advancing on the
+// iterator.
+std::vector<std::string_view> GetAllTermsResetAfterUtf32(
     LanguageSegmenter::Iterator* itr) {
   std::vector<std::string_view> terms;
-  if (!itr->ResetToStart().ok()) {
-    return terms;
-  }
-  terms.push_back(itr->GetTerm());
-  const char* text_begin = itr->GetTerm().data();
-  // Calling ResetToTermStartingAfter with the current position should get the
-  // very next term in the sequence.
-  for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
-       current_pos = itr->GetTerm().data() - text_begin) {
+  // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in
+  // the sequence.
+  bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok();
+  while (is_ok) {
     terms.push_back(itr->GetTerm());
+    // Calling ResetToTermStartingAfterUtf32 with the current position should
+    // get the very next term in the sequence.
+    CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+    is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
   }
   return terms;
 }
 
 // Returns a vector containing all terms retrieved by alternating calls to
-// Advance and calls to ResetAfter with the current position to simulate
-// Advancing.
-std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+// Advance and calls to ResetAfter with the UTF-32 position of the current term
+// start to simulate Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32(
     LanguageSegmenter::Iterator* itr) {
-  const char* text_begin = itr->GetTerm().data();
   std::vector<std::string_view> terms;
-
-  bool is_ok = true;
-  int current_pos = 0;
+  bool is_ok = itr->Advance();
   while (is_ok) {
+    terms.push_back(itr->GetTerm());
     // Alternate between using Advance and ResetToTermAfter.
     if (terms.size() % 2 == 0) {
       is_ok = itr->Advance();
     } else {
-      // Calling ResetToTermStartingAfter with the current position should get
-      // the very next term in the sequence.
-      current_pos = itr->GetTerm().data() - text_begin;
-      is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
-    }
-    if (is_ok) {
-      terms.push_back(itr->GetTerm());
+      // Calling ResetToTermStartingAfterUtf32 with the current position should
+      // get the very next term in the sequence.
+      CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+      is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
     }
   }
   return terms;
 }
 
 // Returns a vector containing all terms retrieved by calling ResetBefore with
-// the current position, starting at the end of the text. This vector should be
-// in reverse order of GetAllTerms and missing the last term.
-std::vector<std::string_view> GetAllTermsResetBefore(
+// the UTF-32 position of the current term start, starting at the end of the
+// text. This vector should be in reverse order of GetAllTerms and missing the
+// last term.
+std::vector<std::string_view> GetAllTermsResetBeforeUtf32(
     LanguageSegmenter::Iterator* itr) {
-  const char* text_begin = itr->GetTerm().data();
-  int last_pos = 0;
-  while (itr->Advance()) {
-    last_pos = itr->GetTerm().data() - text_begin;
-  }
   std::vector<std::string_view> terms;
-  // Calling ResetToTermEndingBefore with the current position should get the
-  // previous term in the sequence.
-  for (int current_pos = last_pos;
-       itr->ResetToTermEndingBefore(current_pos).ok();
-       current_pos = itr->GetTerm().data() - text_begin) {
+  bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok();
+  while (is_ok) {
     terms.push_back(itr->GetTerm());
+    // Calling ResetToTermEndingBeforeUtf32 with the current position should get
+    // the previous term in the sequence.
+    CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+    is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok();
   }
   return terms;
 }
@@ -481,7 +474,7 @@
   EXPECT_THAT(word2_address, Eq(word2_result_address));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartWordConnector) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartUtf32WordConnector) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -489,15 +482,16 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "com:google:android is package"
-  //          ^                 ^^ ^^
-  // Bytes:   0              18 19 21 22
-  auto position_or = itr->ResetToStart();
+  // String:      "com:google:android is package"
+  //               ^                 ^^ ^^
+  // UTF-8 idx:    0              18 19 21 22
+  // UTF-32 idx:   0              18 19 21 22
+  auto position_or = itr->ResetToStartUtf32();
   EXPECT_THAT(position_or, IsOk());
   ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStartUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -505,14 +499,15 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStartUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -520,15 +515,17 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
   ASSERT_TRUE(itr->Advance());  // itr points to 'How'
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest,
+       IteratorMultipleAdvancesResetToStartUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -536,18 +533,19 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
   ASSERT_TRUE(itr->Advance());
   ASSERT_TRUE(itr->Advance());
   ASSERT_TRUE(itr->Advance());
   ASSERT_TRUE(itr->Advance());  // itr points to ' '
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) {
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStartUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -555,17 +553,18 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
   while (itr->Advance()) {
     // Do nothing.
   }
-  EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterWordConnector) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32WordConnector) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -573,21 +572,22 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "package com:google:android name"
-  //          ^      ^^                 ^^
-  // Bytes:   0      7 8               26 27
-  auto position_or = itr->ResetToTermStartingAfter(8);
+  // String:     "package com:google:android name"
+  //              ^      ^^                 ^^
+  // UTF-8 idx:   0      7 8               26 27
+  // UTF-32 idx:  0      7 8               26 27
+  auto position_or = itr->ResetToTermStartingAfterUtf32(8);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(26));
   ASSERT_THAT(itr->GetTerm(), Eq(" "));
 
-  position_or = itr->ResetToTermStartingAfter(7);
+  position_or = itr->ResetToTermStartingAfterUtf32(7);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
   ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32OutOfBounds) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -595,19 +595,19 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+  // String:     "How are you你好吗お元気ですか"
+  //              ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:   0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:  0  3 4 7 8 11 131415 17 19
+  ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
   ASSERT_THAT(itr->GetTerm(), Eq("you"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk());
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-  EXPECT_THAT(itr->GetTerm(), Eq("you"));
+  EXPECT_THAT(itr->GetTerm(), Eq("How"));
 }
 
 // Tests that ResetToTermAfter and Advance produce the same output. With the
@@ -616,7 +616,7 @@
 // terms produced by ResetToTermAfter calls with the current position
 // provided as the argument.
 TEST_P(ReverseJniLanguageSegmenterTest,
-       MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+       MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -631,14 +631,14 @@
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kText));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetAfter(reset_to_term_itr.get());
+      GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
 TEST_P(ReverseJniLanguageSegmenterTest,
-       ThaiResetToTermAfterEquivalentToAdvance) {
+       ThaiResetToTermAfterUtf32EquivalentToAdvance) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -653,14 +653,14 @@
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kThai));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetAfter(reset_to_term_itr.get());
+      GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
 TEST_P(ReverseJniLanguageSegmenterTest,
-       KoreanResetToTermAfterEquivalentToAdvance) {
+       KoreanResetToTermAfterUtf32EquivalentToAdvance) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -675,7 +675,7 @@
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kKorean));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetAfter(reset_to_term_itr.get());
+      GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
@@ -686,7 +686,7 @@
 // should be able to mix ResetToTermAfter(current_position) calls and Advance
 // calls to mimic calling Advance.
 TEST_P(ReverseJniLanguageSegmenterTest,
-       MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+       MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -701,7 +701,7 @@
       std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
       segmenter->Segment(kText));
   std::vector<std::string_view> advance_and_reset_terms =
-      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+      GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
 
   EXPECT_THAT(advance_and_reset_terms,
               testing::ElementsAreArray(advance_terms));
@@ -709,7 +709,7 @@
 }
 
 TEST_P(ReverseJniLanguageSegmenterTest,
-       ThaiResetToTermAfterInteroperableWithAdvance) {
+       ThaiResetToTermAfterUtf32InteroperableWithAdvance) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -724,7 +724,7 @@
       std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
       segmenter->Segment(kThai));
   std::vector<std::string_view> advance_and_reset_terms =
-      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+      GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
 
   EXPECT_THAT(advance_and_reset_terms,
               testing::ElementsAreArray(advance_terms));
@@ -732,7 +732,7 @@
 }
 
 TEST_P(ReverseJniLanguageSegmenterTest,
-       KoreanResetToTermAfterInteroperableWithAdvance) {
+       KoreanResetToTermAfterUtf32InteroperableWithAdvance) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -747,14 +747,14 @@
       std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
       segmenter->Segment(kKorean));
   std::vector<std::string_view> advance_and_reset_terms =
-      GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+      GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
 
   EXPECT_THAT(advance_and_reset_terms,
               testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfterUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -763,33 +763,35 @@
       std::unique_ptr<LanguageSegmenter::Iterator> itr,
       language_segmenter->Segment("How are you你好吗お元気ですか"));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+  // String:      "How are you你好吗お元気ですか"
+  //               ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:    0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:   0  3 4 7 8 11 131415 17 19
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11)));
   EXPECT_THAT(itr->GetTerm(), Eq("你好"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("you"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19)));
   EXPECT_THAT(itr->GetTerm(), Eq("か"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
   EXPECT_THAT(itr->GetTerm(), Eq("吗"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest,
+       ContinuousWhitespacesResetToTermAfterUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -799,35 +801,36 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kTextWithSpace));
 
-  // String: "Hello          World"
-  //          ^    ^         ^
-  // Bytes:   0    5         15
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+  // String:      "Hello          World"
+  //               ^    ^         ^
+  // UTF-8 idx:    0    5         15
+  // UTF-32 idx:   0    5         15
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15)));
   EXPECT_THAT(itr->GetTerm(), Eq("World"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15)));
   EXPECT_THAT(itr->GetTerm(), Eq("World"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -837,21 +840,22 @@
   constexpr std::string_view kChinese = "我每天走路去上班。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kChinese));
-  // String: "我每天走路去上班。"
-  //          ^ ^  ^   ^^
-  // Bytes:   0 3  9  15 18
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  // String:       "我每天走路去上班。"
+  //                ^ ^  ^   ^^
+  // UTF-8 idx:     0 3  9  15 18
+  // UTF-32 idx:    0 1  3   5 6
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
   EXPECT_THAT(itr->GetTerm(), Eq("每天"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq("走路"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -860,21 +864,22 @@
   constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kJapanese));
-  // String: "私は毎日仕事に歩いています。"
-  //          ^ ^ ^  ^  ^ ^ ^ ^  ^
-  // Bytes:   0 3 6  12 18212427 33
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+  // String:       "私は毎日仕事に歩いています。"
+  //                ^ ^ ^  ^  ^ ^ ^ ^  ^
+  // UTF-8 idx:     0 3 6  12 18212427 33
+  // UTF-32 idx:    0 1 2  4  6 7 8 9  11
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
   EXPECT_THAT(itr->GetTerm(), Eq("は"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
   EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -882,21 +887,22 @@
   constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kKhmer));
-  // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
-  //          ^ ^   ^   ^
-  // Bytes:   0 9   24  45
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  // String:            "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+  //                     ^ ^   ^   ^
+  // UTF-8 idx:          0 9   24  45
+  // UTF-32 idx:         0 3   8   15
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfterUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -905,24 +911,25 @@
   constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kThai));
-  // String: "ฉันเดินไปทำงานทุกวัน"
-  //          ^ ^  ^ ^    ^ ^
-  // Bytes:   0 9 21 27  42 51
-  EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+  // String:      "ฉันเดินไปทำงานทุกวัน"
+  //               ^ ^  ^ ^    ^ ^
+  // UTF-8 idx:    0 9 21 27  42 51
+  // UTF-32 idx:   0 3  7 9   14 17
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
   EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7)));
   EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
 
-  EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+  EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14)));
   EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnector) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnectorUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -930,21 +937,22 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "package name com:google:android!"
-  //          ^      ^^   ^^                 ^
-  // Bytes:   0      7 8 12 13               31
-  auto position_or = itr->ResetToTermEndingBefore(31);
+  // String:      "package name com:google:android!"
+  //               ^      ^^   ^^                 ^
+  // UTF-8 idx:    0      7 8 12 13               31
+  // UTF-32 idx:   0      7 8 12 13               31
+  auto position_or = itr->ResetToTermEndingBeforeUtf32(31);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(13));
   ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
 
-  position_or = itr->ResetToTermEndingBefore(21);
+  position_or = itr->ResetToTermEndingBeforeUtf32(21);
   EXPECT_THAT(position_or, IsOk());
   EXPECT_THAT(position_or.ValueOrDie(), Eq(12));
   ASSERT_THAT(itr->GetTerm(), Eq(" "));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBoundsUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -952,19 +960,19 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              segmenter->Segment(kText));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+  // String:      "How are you你好吗お元気ですか"
+  //               ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:    0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:   0  3 4 7 8 11 131415 17 19
+  ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
   ASSERT_THAT(itr->GetTerm(), Eq("are"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
   EXPECT_THAT(itr->GetTerm(), Eq("are"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-  EXPECT_THAT(itr->GetTerm(), Eq("are"));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk());
+  EXPECT_THAT(itr->GetTerm(), Eq("か"));
 }
 
 // Tests that ResetToTermBefore and Advance produce the same output. With the
@@ -973,7 +981,7 @@
 // terms produced by ResetToTermBefore calls with the current position
 // provided as the argument (after their order has been reversed).
 TEST_P(ReverseJniLanguageSegmenterTest,
-       MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+       MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -983,17 +991,12 @@
       segmenter->Segment(kText));
   std::vector<std::string_view> advance_terms =
       GetAllTermsAdvance(advance_itr.get());
-  // Can't produce the last term via calls to ResetToTermBefore. So skip
-  // past that one.
-  auto itr = advance_terms.begin();
-  std::advance(itr, advance_terms.size() - 1);
-  advance_terms.erase(itr);
 
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kText));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetBefore(reset_to_term_itr.get());
+      GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
   std::reverse(reset_terms.begin(), reset_terms.end());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -1002,7 +1005,7 @@
 }
 
 TEST_P(ReverseJniLanguageSegmenterTest,
-       ThaiResetToTermBeforeEquivalentToAdvance) {
+       ThaiResetToTermBeforeEquivalentToAdvanceUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -1012,17 +1015,12 @@
       segmenter->Segment(kThai));
   std::vector<std::string_view> advance_terms =
       GetAllTermsAdvance(advance_itr.get());
-  // Can't produce the last term via calls to ResetToTermBefore. So skip
-  // past that one.
-  auto itr = advance_terms.begin();
-  std::advance(itr, advance_terms.size() - 1);
-  advance_terms.erase(itr);
 
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kThai));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetBefore(reset_to_term_itr.get());
+      GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
   std::reverse(reset_terms.begin(), reset_terms.end());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -1030,7 +1028,7 @@
 }
 
 TEST_P(ReverseJniLanguageSegmenterTest,
-       KoreanResetToTermBeforeEquivalentToAdvance) {
+       KoreanResetToTermBeforeEquivalentToAdvanceUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto segmenter, language_segmenter_factory::Create(
                           GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -1040,24 +1038,19 @@
       segmenter->Segment(kKorean));
   std::vector<std::string_view> advance_terms =
       GetAllTermsAdvance(advance_itr.get());
-  // Can't produce the last term via calls to ResetToTermBefore. So skip
-  // past that one.
-  auto itr = advance_terms.begin();
-  std::advance(itr, advance_terms.size() - 1);
-  advance_terms.erase(itr);
 
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
       segmenter->Segment(kKorean));
   std::vector<std::string_view> reset_terms =
-      GetAllTermsResetBefore(reset_to_term_itr.get());
+      GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
   std::reverse(reset_terms.begin(), reset_terms.end());
 
   EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
   EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBeforeUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -1066,35 +1059,36 @@
       std::unique_ptr<LanguageSegmenter::Iterator> itr,
       language_segmenter->Segment("How are you你好吗お元気ですか"));
 
-  // String: "How are you你好吗お元気ですか"
-  //          ^  ^^  ^^  ^  ^ ^ ^  ^  ^
-  // Bytes:   0  3 4 7 8 11 172023 29 35
-  EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+  // String:      "How are you你好吗お元気ですか"
+  //               ^  ^^  ^^  ^  ^ ^ ^  ^  ^
+  // UTF-8 idx:    0  3 4 7 8 11 172023 29 35
+  // UTF-32 idx:   0  3 4 7 8 11 131415 17 19
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
   EXPECT_THAT(itr->GetTerm(), Eq("are"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15)));
   EXPECT_THAT(itr->GetTerm(), Eq("元気"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("you"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17)));
   EXPECT_THAT(itr->GetTerm(), Eq("です"));
 }
 
 TEST_P(ReverseJniLanguageSegmenterTest,
-       ContinuousWhitespacesResetToTermBefore) {
+       ContinuousWhitespacesResetToTermBeforeUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -1104,34 +1098,35 @@
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kTextWithSpace));
 
-  // String: "Hello          World"
-  //          ^    ^         ^
-  // Bytes:   0    5         15
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "Hello          World"
+  //               ^    ^         ^
+  // UTF-8 idx:    0    5         15
+  // UTF-32 idx:   0    5         15
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq(" "));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBeforeUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -1141,21 +1136,22 @@
   constexpr std::string_view kChinese = "我每天走路去上班。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kChinese));
-  // String: "我每天走路去上班。"
-  //          ^ ^  ^   ^^
-  // Bytes:   0 3  9  15 18
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "我每天走路去上班。"
+  //               ^ ^  ^   ^^
+  // UTF-8 idx:    0 3  9  15 18
+  // UTF-32 idx:   0 1  3   5 6
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("我"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5)));
   EXPECT_THAT(itr->GetTerm(), Eq("去"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBeforeUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -1164,21 +1160,22 @@
   constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kJapanese));
-  // String: "私は毎日仕事に歩いています。"
-  //          ^ ^ ^  ^  ^ ^ ^ ^  ^
-  // Bytes:   0 3 6  12 18212427 33
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "私は毎日仕事に歩いています。"
+  //               ^ ^ ^  ^  ^ ^ ^ ^  ^
+  // UTF-8 idx:    0 3 6  12 18212427 33
+  // UTF-32 idx:   0 1 2  4  6 7 8 9  11
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9)));
   EXPECT_THAT(itr->GetTerm(), Eq("てい"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1)));
   EXPECT_THAT(itr->GetTerm(), Eq("は"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBeforeUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -1186,21 +1183,22 @@
   constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kKhmer));
-  // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
-  //          ^ ^   ^   ^
-  // Bytes:   0 9   24  45
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+  //               ^ ^   ^   ^
+  // UTF-8 idx:    0 9   24  45
+  // UTF-32 idx:   0 3   8   15
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8)));
   EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
 }
 
-TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBeforeUtf32) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
       language_segmenter_factory::Create(
@@ -1209,20 +1207,21 @@
   constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                              language_segmenter->Segment(kThai));
-  // String: "ฉันเดินไปทำงานทุกวัน"
-  //          ^ ^  ^ ^    ^ ^
-  // Bytes:   0 9 21 27  42 51
-  EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+  // String:      "ฉันเดินไปทำงานทุกวัน"
+  //               ^ ^  ^ ^    ^ ^
+  // UTF-8 idx:    0 9 21 27  42 51
+  // UTF-32 idx:   0 3  7 9   14 17
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
   EXPECT_THAT(itr->GetTerm(), IsEmpty());
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14)));
   EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0)));
   EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
 
-  EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+  EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7)));
   EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
 }
 

diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc
deleted file mode 100644
index 856ba0a..0000000
--- a/icing/tokenization/simple/space-language-segmenter-factory.cc
+++ /dev/null

@@ -1,41 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/simple/space-language-segmenter.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-
-namespace language_segmenter_factory {
-
-// Creates a language segmenter with the given locale.
-//
-// Returns:
-//   A LanguageSegmenter on success
-//   INVALID_ARGUMENT if locale string is invalid
-//
-// TODO(b/156383798): Figure out if we want to verify locale strings and notify
-// users. Right now illegal locale strings will be ignored by ICU. ICU
-// components will be created with its default locale.
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
-    SegmenterOptions) {
-  return std::make_unique<SpaceLanguageSegmenter>();
-}
-
-}  // namespace language_segmenter_factory
-
-}  // namespace lib
-}  // namespace icing

diff --git a/icing/tokenization/simple/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc
deleted file mode 100644
index 7e301ec..0000000
--- a/icing/tokenization/simple/space-language-segmenter.cc
+++ /dev/null

@@ -1,205 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/simple/space-language-segmenter.h"
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <utility>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-constexpr char kASCIISpace = ' ';
-}  // namespace
-
-class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
- public:
-  SpaceLanguageSegmenterIterator(std::string_view text)
-      : text_(text), term_start_index_(0), term_end_index_exclusive_(0) {}
-
-  // Advances to the next term. Returns false if it has reached the end.
-  bool Advance() override {
-    if (term_end_index_exclusive_ >= text_.size() ||
-        term_start_index_ >= text_.size()) {
-      // Reached the end
-      return false;
-    }
-
-    // Next term starts where we left off.
-    term_start_index_ = term_end_index_exclusive_;
-
-    // We know a term is at least one length, so we can +1 first.
-    term_end_index_exclusive_++;
-
-    // We alternate terms between space and non-space. Figure out what type of
-    // term we're currently on so we know how to stop.
-    bool is_space = text_[term_start_index_] == kASCIISpace;
-
-    while (term_end_index_exclusive_ < text_.size()) {
-      bool end_is_space = text_[term_end_index_exclusive_] == kASCIISpace;
-      if (is_space != end_is_space) {
-        // We finally see a different type of character, reached the end.
-        break;
-      }
-      // We're still seeing the same types of characters (saw a space and
-      // still seeing spaces, or saw a non-space and still seeing non-spaces).
-      // Haven't reached the next term yet, keep advancing.
-      term_end_index_exclusive_++;
-    }
-
-    return true;
-  }
-
-  // Returns the current term. It can be called only when Advance() returns
-  // true.
-  std::string_view GetTerm() const override {
-    if (text_[term_start_index_] == kASCIISpace) {
-      // Rule: multiple continuous whitespaces are treated as one.
-      return std::string_view(&text_[term_start_index_], 1);
-    }
-    return text_.substr(term_start_index_,
-                        term_end_index_exclusive_ - term_start_index_);
-  }
-
-  libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
-      int32_t offset) override {
-    if (offset < 0) {
-      // Start over from the beginning to find the first term.
-      term_start_index_ = 0;
-      term_end_index_exclusive_ = 0;
-    } else {
-      // Offset points to a term right now. Advance to get past the current
-      // term.
-      term_end_index_exclusive_ = offset;
-      if (!Advance()) {
-        return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
-            "No term found in '%s' that starts after offset %d",
-            std::string(text_).c_str(), offset));
-      }
-    }
-
-    // Advance again so we can point to the next term.
-    if (!Advance()) {
-      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
-          "No term found in '%s' that starts after offset %d",
-          std::string(text_).c_str(), offset));
-    }
-
-    return term_start_index_;
-  }
-
-  libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
-      int32_t offset) override {
-    if (offset <= 0 || offset > text_.size()) {
-      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
-          "No term found in '%s' that ends before offset %d",
-          std::string(text_).c_str(), offset));
-    }
-
-    if (offset == text_.size()) {
-      // Special-case if the offset is the text length, this is the last term in
-      // the text, which is also considered to be "ending before" the offset.
-      term_end_index_exclusive_ = offset;
-      ICING_ASSIGN_OR_RETURN(term_start_index_, GetTermStartingBefore(offset));
-      return term_start_index_;
-    }
-
-    // Otherwise, this is just the end of the previous term and we still need to
-    // find the start of the previous term.
-    ICING_ASSIGN_OR_RETURN(term_end_index_exclusive_,
-                           GetTermStartingBefore(offset));
-
-    if (term_end_index_exclusive_ == 0) {
-      // The current term starts at the beginning of the underlying text_.
-      // There is no term before this.
-      return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
-          "No term found in '%s' that ends before offset %d",
-          std::string(text_).c_str(), offset));
-    }
-
-    // Reset ourselves to find the term before the end.
-    ICING_ASSIGN_OR_RETURN(
-        term_start_index_,
-        GetTermStartingBefore(term_end_index_exclusive_ - 1));
-    return term_start_index_;
-  }
-
-  libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
-    term_start_index_ = 0;
-    term_end_index_exclusive_ = 0;
-    if (!Advance()) {
-      return absl_ports::NotFoundError("");
-    }
-    return term_start_index_;
-  }
-
- private:
-  // Return the start offset of the term starting right before the given offset.
-  libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) {
-    bool is_space = text_[offset] == kASCIISpace;
-
-    // Special-case that if offset was the text length, then we're already at
-    // the "end" of our current term.
-    if (offset == text_.size()) {
-      is_space = text_[--offset] == kASCIISpace;
-    }
-
-    // While it's the same type of character (space vs non-space), we're in the
-    // same term. So keep iterating backwards until we see a change.
-    while (offset >= 0 && (text_[offset] == kASCIISpace) == is_space) {
-      --offset;
-    }
-
-    // +1 is because offset was off-by-one to exit the while-loop.
-    return ++offset;
-  }
-
-  // Text to be segmented
-  std::string_view text_;
-
-  // The start and end indices are used to track the positions of current
-  // term.
-  int term_start_index_;
-  int term_end_index_exclusive_;
-};
-
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-SpaceLanguageSegmenter::Segment(const std::string_view text) const {
-  return std::make_unique<SpaceLanguageSegmenterIterator>(text);
-}
-
-libtextclassifier3::StatusOr<std::vector<std::string_view>>
-SpaceLanguageSegmenter::GetAllTerms(const std::string_view text) const {
-  ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
-                         Segment(text));
-  std::vector<std::string_view> terms;
-  while (iterator->Advance()) {
-    terms.push_back(iterator->GetTerm());
-  }
-  return terms;
-}
-
-}  // namespace lib
-}  // namespace icing

diff --git a/icing/tokenization/simple/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h
deleted file mode 100644
index de0a6d3..0000000
--- a/icing/tokenization/simple/space-language-segmenter.h
+++ /dev/null

@@ -1,58 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
-#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/tokenization/language-segmenter.h"
-
-namespace icing {
-namespace lib {
-
-// Simple segmenter that splits on spaces, regardless of language. Continuous
-// whitespaces will be returned as a single whitespace character.
-class SpaceLanguageSegmenter : public LanguageSegmenter {
- public:
-  SpaceLanguageSegmenter() = default;
-  SpaceLanguageSegmenter(const SpaceLanguageSegmenter&) = delete;
-  SpaceLanguageSegmenter& operator=(const SpaceLanguageSegmenter&) = delete;
-
-  // Segmentation is based purely on whitespace; does not take into account the
-  // language of the text.
-  //
-  // Returns:
-  //   An iterator of terms on success
-  libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-  Segment(std::string_view text) const override;
-
-  // Does not take into account the language of the text.
-  //
-  // Returns:
-  //   A list of terms on success
-  //   INTERNAL_ERROR if any error occurs
-  libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
-      std::string_view text) const override;
-};
-
-}  // namespace lib
-}  // namespace icing
-
-#endif  // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_

diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
deleted file mode 100644
index 6c5e3f6..0000000
--- a/icing/tokenization/simple/space-language-segmenter_test.cc
+++ /dev/null

@@ -1,129 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/language-segmenter.h"
-#include "unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-TEST(SpaceLanguageSegmenterTest, EmptyText) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
-  ICING_ASSERT_OK_AND_ASSIGN(
-      auto language_segmenter,
-      language_segmenter_factory::Create(std::move(options)));
-  EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
-}
-
-TEST(SpaceLanguageSegmenterTest, SimpleText) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
-  ICING_ASSERT_OK_AND_ASSIGN(
-      auto language_segmenter,
-      language_segmenter_factory::Create(std::move(options)));
-  EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
-              IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Punctuation) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
-  ICING_ASSERT_OK_AND_ASSIGN(
-      auto language_segmenter,
-      language_segmenter_factory::Create(std::move(options)));
-
-  EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
-              IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
-  EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
-              IsOkAndHolds(ElementsAre("Open-source", " ", "project")));
-  EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
-              IsOkAndHolds(ElementsAre("100%")));
-  EXPECT_THAT(language_segmenter->GetAllTerms("(A&B)"),
-              IsOkAndHolds(ElementsAre("(A&B)")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
-  ICING_ASSERT_OK_AND_ASSIGN(
-      auto language_segmenter,
-      language_segmenter_factory::Create(std::move(options)));
-
-  // Alphanumeric terms are allowed
-  EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
-              IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Number) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
-  ICING_ASSERT_OK_AND_ASSIGN(
-      auto language_segmenter,
-      language_segmenter_factory::Create(std::move(options)));
-
-  // Alphanumeric terms are allowed
-  EXPECT_THAT(
-      language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
-      IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
-
-  EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
-              IsOkAndHolds(ElementsAre("3,456.789")));
-
-  EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
-              IsOkAndHolds(ElementsAre("-123")));
-}
-
-TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
-  ICING_ASSERT_OK_AND_ASSIGN(
-      auto language_segmenter,
-      language_segmenter_factory::Create(std::move(options)));
-
-  // Multiple continuous whitespaces are treated as one.
-  const int kNumSeparators = 256;
-  const std::string text_with_spaces =
-      absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
-  EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
-              IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
-  language_segmenter_factory::SegmenterOptions options(ULOC_US);
-  ICING_ASSERT_OK_AND_ASSIGN(
-      auto language_segmenter,
-      language_segmenter_factory::Create(std::move(options)));
-  // Validates that the input strings are not copied
-  const std::string text = "Hello World";
-  const char* word1_address = text.c_str();
-  const char* word2_address = text.c_str() + 6;
-  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
-                             language_segmenter->GetAllTerms(text));
-  ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
-  const char* word1_result_address = terms.at(0).data();
-  const char* word2_result_address = terms.at(2).data();
-
-  // The underlying char* should be the same
-  EXPECT_THAT(word1_address, Eq(word1_result_address));
-  EXPECT_THAT(word2_address, Eq(word2_result_address));
-}
-
-}  // namespace
-}  // namespace lib
-}  // namespace icing

diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 38c4745..b4f0c6e 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h

@@ -20,7 +20,9 @@
 #include <string_view>
 
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
 #include "icing/tokenization/token.h"
+#include "icing/util/character-iterator.h"
 
 namespace icing {
 namespace lib {
@@ -64,6 +66,18 @@
     // true, otherwise an invalid token could be returned.
     virtual Token GetToken() const = 0;
 
+    virtual libtextclassifier3::StatusOr<CharacterIterator>
+    CalculateTokenStart() {
+      return absl_ports::UnimplementedError(
+          "CalculateTokenStart is not implemented!");
+    }
+
+    virtual libtextclassifier3::StatusOr<CharacterIterator>
+    CalculateTokenEndExclusive() {
+      return absl_ports::UnimplementedError(
+          "CalculateTokenEndExclusive is not implemented!");
+    }
+
     // Sets the tokenizer to point at the first token that *starts* *after*
     // offset. Returns false if there are no valid tokens starting after
     // offset.

diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
index 3707f95..6c5faef 100644
--- a/icing/util/character-iterator.cc
+++ b/icing/util/character-iterator.cc

@@ -30,6 +30,11 @@
 
 }  // namespace
 
+bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
+  return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
+                                            : RewindToUtf8(desired_utf8_index);
+}
+
 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
   if (desired_utf8_index > text_.length()) {
     // Enforce the requirement.
@@ -50,6 +55,7 @@
     }
     utf8_index_ += utf8_length;
     utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+    ++utf32_index_;
   }
   return true;
 }
@@ -76,10 +82,17 @@
       return false;
     }
     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+    --utf32_index_;
   }
   return true;
 }
 
+bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
+  return (desired_utf16_index > utf16_index_)
+             ? AdvanceToUtf16(desired_utf16_index)
+             : RewindToUtf16(desired_utf16_index);
+}
+
 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
   while (utf16_index_ < desired_utf16_index) {
     UChar32 uchar32 =
@@ -100,6 +113,7 @@
     }
     utf8_index_ += utf8_length;
     utf16_index_ += utf16_length;
+    ++utf32_index_;
   }
   return true;
 }
@@ -111,6 +125,11 @@
   while (utf16_index_ > desired_utf16_index) {
     --utf8_index_;
     utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+    if (utf8_index_ < 0) {
+      // Somehow, there wasn't a single UTF-8 lead byte at the decremented
+      // utf8_index_ or at any earlier byte.
+      return false;
+    }
     // We've found the start of a unicode char!
     UChar32 uchar32 =
         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
@@ -119,6 +138,59 @@
       return false;
     }
     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+    --utf32_index_;
+  }
+  return true;
+}
+
+bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
+  return (desired_utf32_index > utf32_index_)
+             ? AdvanceToUtf32(desired_utf32_index)
+             : RewindToUtf32(desired_utf32_index);
+}
+
+bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
+  while (utf32_index_ < desired_utf32_index) {
+    UChar32 uchar32 =
+        i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+    if (uchar32 == i18n_utils::kInvalidUChar32) {
+      // Unable to retrieve a valid UTF-32 character at the current position.
+      return false;
+    }
+    int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+    int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+    if (utf8_index_ + utf8_length > text_.length()) {
+      // Enforce the requirement.
+      return false;
+    }
+    utf8_index_ += utf8_length;
+    utf16_index_ += utf16_length;
+    ++utf32_index_;
+  }
+  return true;
+}
+
+bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
+  if (desired_utf32_index < 0) {
+    return false;
+  }
+  while (utf32_index_ > desired_utf32_index) {
+    --utf8_index_;
+    utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+    if (utf8_index_ < 0) {
+      // Somehow, there wasn't a single UTF-8 lead byte at the decremented
+      // utf8_index_ or at any earlier byte.
+      return false;
+    }
+    // We've found the start of a unicode char!
+    UChar32 uchar32 =
+        i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+    if (uchar32 == i18n_utils::kInvalidUChar32) {
+      // Unable to retrieve a valid UTF-32 character at the previous position.
+      return false;
+    }
+    utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+    --utf32_index_;
   }
   return true;
 }

diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
index 22de6c5..9df7bee 100644
--- a/icing/util/character-iterator.h
+++ b/icing/util/character-iterator.h

@@ -15,6 +15,7 @@
 #ifndef ICING_UTIL_CHARACTER_ITERATOR_H_
 #define ICING_UTIL_CHARACTER_ITERATOR_H_
 
+#include "icing/legacy/core/icing-string-util.h"
 #include "icing/util/i18n-utils.h"
 
 namespace icing {
@@ -23,23 +24,35 @@
 class CharacterIterator {
  public:
   explicit CharacterIterator(std::string_view text)
-      : CharacterIterator(text, 0, 0) {}
+      : CharacterIterator(text, 0, 0, 0) {}
 
-  CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
-      : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
+  CharacterIterator(std::string_view text, int utf8_index, int utf16_index,
+                    int utf32_index)
+      : text_(text),
+        utf8_index_(utf8_index),
+        utf16_index_(utf16_index),
+        utf32_index_(utf32_index) {}
 
-  // Moves from current position to the character that includes the specified
+  // Moves current position to desired_utf8_index.
+  // REQUIRES: 0 <= desired_utf8_index <= text_.length()
+  bool MoveToUtf8(int desired_utf8_index);
+
+  // Advances from current position to the character that includes the specified
   // UTF-8 index.
   // REQUIRES: desired_utf8_index <= text_.length()
   // desired_utf8_index is allowed to point one index past the end, but no
   // further.
   bool AdvanceToUtf8(int desired_utf8_index);
 
-  // Moves from current position to the character that includes the specified
+  // Rewinds from current position to the character that includes the specified
   // UTF-8 index.
   // REQUIRES: 0 <= desired_utf8_index
   bool RewindToUtf8(int desired_utf8_index);
 
+  // Moves current position to desired_utf16_index.
+  // REQUIRES: 0 <= desired_utf16_index <= text_.utf16_length()
+  bool MoveToUtf16(int desired_utf16_index);
+
   // Advances current position to desired_utf16_index.
   // REQUIRES: desired_utf16_index <= text_.utf16_length()
   // desired_utf16_index is allowed to point one index past the end, but no
@@ -50,18 +63,39 @@
   // REQUIRES: 0 <= desired_utf16_index
   bool RewindToUtf16(int desired_utf16_index);
 
+  // Moves current position to desired_utf32_index.
+  // REQUIRES: 0 <= desired_utf32_index <= text_.utf32_length()
+  bool MoveToUtf32(int desired_utf32_index);
+
+  // Advances current position to desired_utf32_index.
+  // REQUIRES: desired_utf32_index <= text_.utf32_length()
+  // desired_utf32_index is allowed to point one index past the end, but no
+  // further.
+  bool AdvanceToUtf32(int desired_utf32_index);
+
+  // Rewinds current position to desired_utf32_index.
+  // REQUIRES: 0 <= desired_utf32_index
+  bool RewindToUtf32(int desired_utf32_index);
+
   int utf8_index() const { return utf8_index_; }
   int utf16_index() const { return utf16_index_; }
+  int utf32_index() const { return utf32_index_; }
 
   bool operator==(const CharacterIterator& rhs) const {
     return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ &&
-           utf16_index_ == rhs.utf16_index_;
+           utf16_index_ == rhs.utf16_index_ && utf32_index_ == rhs.utf32_index_;
+  }
+
+  std::string DebugString() const {
+    return IcingStringUtil::StringPrintf("(u8:%d,u16:%d,u32:%d)", utf8_index_,
+                                         utf16_index_, utf32_index_);
   }
 
  private:
   std::string_view text_;
   int utf8_index_;
   int utf16_index_;
+  int utf32_index_;
 };
 
 }  // namespace lib

diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index 2019033..64f98f6 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java

@@ -59,6 +59,7 @@
 import java.util.Map;
 import org.junit.After;
 import org.junit.Before;
+import org.junit.Ignore;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
@@ -489,6 +490,7 @@
   }
 
   @Test
+  @Ignore("b/190845688")
   public void testCJKTSnippets() throws Exception {
     assertStatusOk(icingSearchEngine.initialize().getStatus());
 

diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 4069810..35ad6d9 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt

@@ -1 +1 @@
-set(synced_AOSP_CL_number=375495869)
+set(synced_AOSP_CL_number=378695940)
commit	058975937a9a12c3da9b7e099ef58ad2f942cf7b	[log] [tgz]
author	Tim Barron <tjbarron@google.com>	Mon Jun 21 20:17:44 2021 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	Mon Jun 21 20:17:44 2021 +0000
tree	28fec0e533e72f993e6e183af3675d87a1462d59
parent	1a698115fe367b4e3907b31ca4dfa5d6ae430469 [diff]
parent	77207b21c25fce96d03cc1a1d4f294a99b6868a6 [diff]