Merge remote-tracking branch 'goog/androidx-platform-dev' into sc-dev am: c586b8680c am: 8541fa7a31 am: e102320065 am: 1673a404e6
Original change: https://googleplex-android-review.googlesource.com/c/platform/external/icing/+/14505837
Change-Id: I201d6fdcde2035657f512d568162d637a08163ef
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
new file mode 100644
index 0000000..95c3949
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -0,0 +1,1173 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// File-backed log of protos with append-only writes and position based reads.
+//
+// There should only be one instance of a PortableFileBackedProtoLog of the same
+// file at a time; using multiple instances at the same time may lead to
+// undefined behavior.
+//
+// The entire checksum is computed on initialization to verify the contents are
+// valid. On failure, the log will be truncated to the last verified state when
+// PersistToDisk() was called. If the log cannot successfully restore the last
+// state due to disk corruption or some other inconsistency, then the entire log
+// will be lost.
+//
+// Each proto written to the file will have a metadata written just before it.
+// The metadata consists of
+// {
+// 1 byte of kProtoMagic;
+// 3 bytes of the proto size
+// n bytes of the proto itself
+// }
+//
+// All metadata is written in a portable format, encoded with htonl before
+// writing to file and decoded with ntohl when reading from file.
+//
+// Example usage:
+// ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+// PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+// file_path_,
+// options));
+// auto proto_log = std::move(create_result.proto_log);
+//
+// DocumentProto document;
+// document.set_namespace("com.google.android.example");
+// document.set_uri("www.google.com");
+//
+// int64_t document_offset = proto_log->WriteProto(document);
+// DocumentProto same_document = proto_log->ReadProto(document_offset);
+// proto_log->PersistToDisk();
+
+#ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
+#define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include <google/protobuf/io/gzip_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/platform.h"
+#include "icing/portable/zlib.h"
+#include "icing/util/bit-util.h"
+#include "icing/util/crc32.h"
+#include "icing/util/data-loss.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Number of bytes reserved for the header at the beginning of the proto log,
+// so the header can grow without forcing a migration of the log contents.
+constexpr int kHeaderReservedBytes = 256;
+
+// Returns true if all `size` bytes of `buffer` are zero (or `size` is <= 0).
+bool IsEmptyBuffer(const char* buffer, int size) {
+  for (int i = 0; i < size; ++i) {
+    if (buffer[i] != 0) return false;
+  }
+  return true;
+}
+
+// Returns the proto size stored in the low 24 bits of the metadata.
+int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
+
+// Returns the proto magic stored in the high 8 bits of the metadata.
+uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
+
+} // namespace
+
+template <typename ProtoT>
+class PortableFileBackedProtoLog {
+ public:
+ struct Options {
+ // Whether to compress each proto before writing to the proto log.
+ bool compress;
+
+ // Byte-size limit for each proto written to the store. This does not
+ // include the bytes needed for the metadata of each proto.
+ //
+ // NOTE: Currently, we only support protos up to 16MiB. We store the proto
+ // size in 3 bytes within the metadata.
+ //
+ // NOTE: This limit is only enforced for future writes. If the store
+ // previously had a higher limit, then reading older entries could return
+ // larger protos.
+ //
+ // NOTE: max_proto_size is checked against both the uncompressed size of the
+ // input proto and, when compression is enabled, its compressed size. A proto
+ // larger than max_proto_size is rejected even if it would compress to a
+ // smaller size, and so is a proto whose compressed size exceeds the limit.
+ const int32_t max_proto_size;
+
+ // Must specify values for options.
+ Options() = delete;
+ explicit Options(bool compress_in,
+ const int32_t max_proto_size_in = kMaxProtoSize)
+ : compress(compress_in), max_proto_size(max_proto_size_in) {}
+ };
+
+ // Header stored at the beginning of the file before the rest of the log
+ // contents. Stores metadata on the log.
+ class Header {
+ public:
+ static constexpr int32_t kMagic = 0xf4c6f67a;
+
+ static constexpr int32_t kFileFormatVersion = 0;
+
+ uint32_t CalculateHeaderChecksum() const {
+   // Checksum everything after header_checksum_nbytes through the end of the
+   // Header (sizeof includes any trailing padding). This relies on
+   // magic_nbytes and header_checksum_nbytes being the first two fields.
+   const char* checksummed_begin = reinterpret_cast<const char*>(this) +
+                                   offsetof(Header, header_checksum_nbytes) +
+                                   sizeof(header_checksum_nbytes);
+   const std::size_t checksummed_size = sizeof(Header) - sizeof(magic_nbytes) -
+                                        sizeof(header_checksum_nbytes);
+   Crc32 crc;
+   crc.Append(std::string_view(checksummed_begin, checksummed_size));
+   return crc.Get();
+ }
+
+ int32_t GetMagic() const { return gntohl(magic_nbytes); }
+
+ void SetMagic(int32_t magic_in) { magic_nbytes = ghtonl(magic_in); }
+
+ int32_t GetFileFormatVersion() const {
+ return gntohl(file_format_version_nbytes);
+ }
+
+ void SetFileFormatVersion(int32_t file_format_version_in) {
+ file_format_version_nbytes = ghtonl(file_format_version_in);
+ }
+
+ int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes); }
+
+ void SetMaxProtoSize(int32_t max_proto_size_in) {
+ max_proto_size_nbytes = ghtonl(max_proto_size_in);
+ }
+
+ int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes); }
+
+ void SetLogChecksum(int32_t log_checksum_in) {
+ log_checksum_nbytes = ghtonl(log_checksum_in);
+ }
+
+ int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes); }
+
+ void SetRewindOffset(int64_t rewind_offset_in) {
+ rewind_offset_nbytes = ghtonll(rewind_offset_in);
+ }
+
+ int32_t GetHeaderChecksum() const { return gntohl(header_checksum_nbytes); }
+
+ void SetHeaderChecksum(int32_t header_checksum_in) {
+ header_checksum_nbytes = ghtonl(header_checksum_in);
+ }
+
+ bool GetCompressFlag() const {
+ uint16_t host_order_flags = gntohs(flags_nbytes);
+ return bit_util::BitfieldGet(host_order_flags, kCompressBit, /*len=*/1);
+ }
+
+ void SetCompressFlag(bool compress) {
+ uint16_t host_order_flags = gntohs(flags_nbytes);
+ bit_util::BitfieldSet(compress, kCompressBit,
+ /*len=*/1, &host_order_flags);
+ flags_nbytes = ghtons(host_order_flags);
+ }
+
+ private:
+ // The least-significant bit offset at which the compress flag is stored in
+ // 'flags_nbytes'. Represents whether the protos in the log are compressed
+ // or not.
+ static constexpr int32_t kCompressBit = 0;
+
+ // Holds the magic as a quick sanity check against file corruption.
+ //
+ // Field is in network-byte order.
+ int32_t magic_nbytes = ghtonl(kMagic);
+
+ // Must be at the beginning after kMagic. Contains the crc checksum of
+ // the following fields.
+ //
+ // Field is in network-byte order.
+ uint32_t header_checksum_nbytes = 0;
+
+ // Last known good offset at which the log and its checksum were updated.
+ // If we crash between writing to the log and updating the checksum, we can
+ // try to rewind the log to this offset and verify the checksum is still
+ // valid instead of throwing away the entire log.
+ //
+ // Field is in network-byte order.
+ int64_t rewind_offset_nbytes = ghtonll(kHeaderReservedBytes);
+
+ // Version number tracking how we serialize the file to disk. If we change
+ // how/what we write to disk, this version should be updated and this class
+ // should handle a migration.
+ //
+ // Currently at kFileFormatVersion.
+ //
+ // Field is in network-byte order.
+ int32_t file_format_version_nbytes = 0;
+
+ // The maximum proto size that can be written to the log.
+ //
+ // Field is in network-byte order.
+ int32_t max_proto_size_nbytes = 0;
+
+ // Checksum of the log elements, doesn't include the header fields.
+ //
+ // Field is in network-byte order.
+ uint32_t log_checksum_nbytes = 0;
+
+ // Bits are used to hold various flags.
+ // Lowest bit is whether the protos are compressed or not.
+ //
+ // Field is in network-byte order.
+ uint16_t flags_nbytes = 0;
+
+ // NOTE: New fields should *almost always* be added to the end here. Since
+ // this class may have already been written to disk, appending fields
+ // increases the chances that changes are backwards-compatible.
+ };
+ static_assert(sizeof(Header) <= kHeaderReservedBytes,
+ "Header has grown past our reserved bytes!");
+
+ struct CreateResult {
+   // A successfully initialized log.
+   std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
+
+   // Whether data was lost while initializing from a previous state, e.g. the
+   // file was corrupted or previously added data was never persisted. Signals
+   // that data derived from the proto log may need to be regenerated.
+   DataLoss data_loss;
+
+   // True on partial or complete loss; const so const results can be queried.
+   bool has_data_loss() const {
+     return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
+   }
+ };
+
+ // Factory method to create, initialize, and return a
+ // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
+ //
+ // If on re-initialization the log detects disk corruption or some previously
+ // added data was unpersisted, the log will rewind to the last-good state. The
+ // log saves these checkpointed "good" states when PersistToDisk() is called
+ // or the log is safely destructed. If the log rewinds successfully to the
+ // last-good state, then the returned CreateResult.data_loss indicates
+ // whether it has a data loss and what kind of data loss it is (partial or
+ // complete) so that any derived data may know that it needs to be updated. If
+ // the log re-initializes successfully without any data loss,
+ // CreateResult.data_loss will be NONE.
+ //
+ // Params:
+ // filesystem: Handles system level calls
+ // file_path: Path of the underlying file. Directory of the file should
+ // already exist
+ // options: Configuration options for the proto log
+ //
+ // Returns:
+ // PortableFileBackedProtoLog::CreateResult on success
+ // INVALID_ARGUMENT on an invalid option
+ // INTERNAL_ERROR on IO error
+ static libtextclassifier3::StatusOr<CreateResult> Create(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options);
+
+ // Not copyable
+ PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
+ PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
+ delete;
+
+ // This will update the checksum of the log as well.
+ ~PortableFileBackedProtoLog();
+
+ // Writes the serialized proto to the underlying file. Writes are applied
+ // directly to the underlying file. Users do not need to sync the file after
+ // writing.
+ //
+ // Returns:
+ // Offset of the newly appended proto in file on success
+ // INVALID_ARGUMENT if proto is too large, as decided by
+ // Options.max_proto_size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
+
+ // Reads out a proto located at file_offset from the file.
+ //
+ // Returns:
+ // A proto on success
+ // NOT_FOUND if the proto at the given offset has been erased
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+
+ // Erases the data of a proto located at file_offset from the file.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status EraseProto(int64_t file_offset);
+
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+ // Returns the file size of all the elements held in the log. File size is in
+ // bytes. This excludes the size of any internal metadata of the log, e.g. the
+ // log's header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
+ // An iterator helping to find offsets of all the protos in file.
+ // Example usage:
+ //
+ // while (iterator.Advance().ok()) {
+ // int64_t offset = iterator.GetOffset();
+ // // Do something
+ // }
+ class Iterator {
+ public:
+ Iterator(const Filesystem& filesystem, const std::string& file_path,
+ int64_t initial_offset);
+
+ // Advances to the position of next proto whether it has been erased or not.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if it reaches the end
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status Advance();
+
+ // Returns the file offset of current proto.
+ int64_t GetOffset();
+
+ private:
+ static constexpr int64_t kInvalidOffset = -1;
+ // Used to read proto metadata
+ MemoryMappedFile mmapped_file_;
+ // Offset of first proto
+ int64_t initial_offset_;
+ int64_t current_offset_;
+ int64_t file_size_;
+ };
+
+ // Returns an iterator of current proto log. The caller needs to keep the
+ // proto log unchanged while using the iterator, otherwise unexpected
+ // behaviors could happen.
+ Iterator GetIterator();
+
+ // Persists all changes since initialization or the last call to
+ // PersistToDisk(). Any changes that aren't persisted may be lost if the
+ // system fails to close safely.
+ //
+ // Example use case:
+ //
+ // Document document;
+ // document.set_namespace("com.google.android.example");
+ // document.set_uri("www.google.com");
+ //
+ // {
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+ // file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // int64_t document_offset = proto_log->WriteProto(document));
+ //
+ // // We lose the document here since it wasn't persisted.
+ // // *SYSTEM CRASH*
+ // }
+ //
+ // {
+ // // Can still successfully create after a crash since the log can
+ // // rewind/truncate to recover into a previously good state
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+ // file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // // Lost the proto since we didn't PersistToDisk before the crash
+ // proto_log->ReadProto(document_offset); // OUT_OF_RANGE error
+ //
+ // int64_t document_offset = proto_log->WriteProto(document));
+ //
+ // // Persisted this time, so we should be ok.
+ // ICING_ASSERT_OK(proto_log->PersistToDisk());
+ // }
+ //
+ // {
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+ // file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // // SUCCESS
+ // Document same_document = proto_log->ReadProto(document_offset));
+ // }
+ //
+ // NOTE: Since all protos are already written to the file directly, this
+ // just updates the checksum and rewind position. Without these updates,
+ // future initializations will truncate the file and discard unpersisted
+ // changes.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status PersistToDisk();
+
+ // Calculates the checksum of the log contents. Excludes the header content.
+ //
+ // Returns:
+ // Crc of the log content
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ private:
+ // Object can only be instantiated via the ::Create factory.
+ PortableFileBackedProtoLog(const Filesystem* filesystem,
+ const std::string& file_path,
+ std::unique_ptr<Header> header);
+
+ // Initializes a new proto log.
+ //
+ // Returns:
+ // CreateResult on success
+ // INTERNAL_ERROR on IO error
+ static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options);
+
+ // Verifies that the existing proto log is in a good state. If not in a good
+ // state, then the proto log may be truncated to the last good state and
+ // content will be lost.
+ //
+ // Returns:
+ // CreateResult on success
+ // INTERNAL_ERROR on IO error or internal inconsistencies in the file
+ // INVALID_ARGUMENT_ERROR if options aren't consistent with previous
+ // instances
+ static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options, int64_t file_size);
+
+ // Takes an initial checksum and updates it with the content between `start`
+ // and `end` offsets in the file.
+ //
+ // Returns:
+ // Crc of the content between `start`, inclusive, and `end`, exclusive.
+ // INTERNAL_ERROR on IO error
+ // INVALID_ARGUMENT_ERROR if start and end aren't within the file size
+ static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
+ const Filesystem* filesystem, const std::string& file_path,
+ Crc32 initial_crc, int64_t start, int64_t end);
+
+ // Reads out the metadata of a proto located at file_offset from the file.
+ // Metadata will be returned in host byte order endianness.
+ //
+ // Returns:
+ // Proto's metadata on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
+ // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
+ static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
+ MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+
+ // Writes metadata of a proto to the fd. Takes in a host byte order endianness
+ // metadata and converts it into a portable metadata before writing.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on any IO errors
+ static libtextclassifier3::Status WriteProtoMetadata(
+ const Filesystem* filesystem, int fd, int32_t host_order_metadata);
+
+ // Magic number added in front of every proto. Used when reading out protos
+ // as a first check for corruption in each entry in the file. Even if there is
+ // a corruption, the best we can do is roll back to our last recovery point
+ // and throw away un-flushed data. We can discard/reuse this byte if needed so
+ // that we have 4 bytes to store the size of protos, and increase the size of
+ // protos we support.
+ static constexpr uint8_t kProtoMagic = 0x5C;
+
+ // Our internal max for protos.
+ //
+ // WARNING: Changing this to a larger number may invalidate our assumption
+ // that the proto size can safely be stored in the last 3 bytes of the proto
+ // metadata.
+ static constexpr int kMaxProtoSize = (1 << 24) - 1; // 16MiB
+ static_assert(kMaxProtoSize <= 0x00FFFFFF,
+ "kMaxProtoSize doesn't fit in 3 bytes");
+
+ // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
+ static constexpr int kDeflateCompressionLevel = 3;
+
+ // Chunks of the file to mmap at a time, so we don't mmap the entire file.
+ // Only used on 32-bit devices
+ static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
+
+ ScopedFd fd_;
+ const Filesystem* const filesystem_;
+ const std::string file_path_;
+ std::unique_ptr<Header> header_;
+};
+
+template <typename ProtoT>
+constexpr uint8_t PortableFileBackedProtoLog<ProtoT>::kProtoMagic;
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
+    const Filesystem* filesystem, const std::string& file_path,
+    std::unique_ptr<Header> header)
+    : filesystem_(filesystem),
+      file_path_(file_path), header_(std::move(header)) {
+  // Open the log file in append mode; later writes go through fd_.
+  fd_.reset(filesystem_->OpenForAppend(file_path_.c_str()));
+}
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
+  // Best-effort checkpoint: a destructor cannot return an error, so only log.
+  const libtextclassifier3::Status persist_status = PersistToDisk();
+  if (persist_status.ok()) return;
+  ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
+                        "PortableFileBackedProtoLog: " << file_path_;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+    typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
+                                           const std::string& file_path,
+                                           const Options& options) {
+  // The proto size is stored in 3 bytes of metadata, so the configured limit
+  // must lie in (0, kMaxProtoSize].
+  const int32_t max_proto_size = options.max_proto_size;
+  if (max_proto_size <= 0) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "options.max_proto_size must be greater than 0, was %d",
+        max_proto_size));
+  }
+  if (max_proto_size > kMaxProtoSize) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "options.max_proto_size must be under 16MiB, was %d",
+        max_proto_size));
+  }
+
+  // A file that does not exist yet is initialized from scratch.
+  const char* path = file_path.c_str();
+  if (!filesystem->FileExists(path)) {
+    return InitializeNewFile(filesystem, file_path, options);
+  }
+
+  const int64_t file_size = filesystem->GetFileSize(path);
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Bad file size '", file_path, "'"));
+  }
+
+  // An existing but empty file is initialized like a missing one.
+  if (file_size == 0) return InitializeNewFile(filesystem, file_path, options);
+  return InitializeExistingFile(filesystem, file_path, options, file_size);
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+    typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
+    const Filesystem* filesystem, const std::string& file_path,
+    const Options& options) {
+  // Resize the file to the number of bytes reserved for the header.
+  const char* path = file_path.c_str();
+  if (!filesystem->Truncate(path, kHeaderReservedBytes)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to initialize file size: ", file_path));
+  }
+
+  // Record the caller's options, then checksum the header so it covers them.
+  auto new_header = std::make_unique<Header>();
+  new_header->SetCompressFlag(options.compress);
+  new_header->SetMaxProtoSize(options.max_proto_size);
+  new_header->SetHeaderChecksum(new_header->CalculateHeaderChecksum());
+
+  if (!filesystem->Write(path, new_header.get(), sizeof(Header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write header for file: ", file_path));
+  }
+
+  // The constructor is private, so the log is created with `new` directly.
+  std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> new_log(
+      new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
+                                             std::move(new_header)));
+  CreateResult result = {std::move(new_log), /*data_loss=*/DataLoss::NONE};
+  return result;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+ typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options, int64_t file_size) {
+ if (file_size < kHeaderReservedBytes) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("File header too short for: ", file_path));
+ }
+
+ std::unique_ptr<Header> header = std::make_unique<Header>();
+ if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
+ /*offset=*/0)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to read header for file: ", file_path));
+ }
+
+ // Check the magic before trusting any other header value. The header
+ // checksum check below would also catch a bad magic, but this cheap
+ // comparison can save us from an extra crc computation.
+ if (header->GetMagic() != Header::kMagic) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
+ }
+
+ if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header checksum for: ", file_path));
+ }
+
+ if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
+ // Only kFileFormatVersion is understood. If the format ever changes, a
+ // migration should be handled here instead of returning an error.
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header file format version: ", file_path));
+ }
+
+ if (header->GetCompressFlag() != options.compress) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Inconsistent compress option, expected %d, actual %d",
+ header->GetCompressFlag(), options.compress));
+ }
+
+ if (header->GetMaxProtoSize() > options.max_proto_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Max proto size cannot be smaller than previous "
+ "instantiations, previous size %d, wanted size %d",
+ header->GetMaxProtoSize(), options.max_proto_size));
+ }
+ header->SetMaxProtoSize(options.max_proto_size);
+
+ DataLoss data_loss = DataLoss::NONE;
+ ICING_ASSIGN_OR_RETURN(
+ Crc32 calculated_log_checksum,
+ ComputeChecksum(filesystem, file_path, Crc32(),
+ /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+
+ // Compare the checksum of the whole log against the one stored in the
+ // header. If they differ, we start the recovery logic.
+ if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
+ // Need to rewind the proto log since the checksums don't match.
+ // Worst case, we have to rewind the entire log back to just the header
+ int64_t last_known_good = kHeaderReservedBytes;
+
+ // Checksum only the contents up to the last rewind offset. This matches if
+ // content was merely appended without updating the checksum, so we can rewind
+ // to that point. NOTE(review): a rewind offset outside the file makes this
+ // return an error from Create() rather than resetting the log -- confirm.
+ ICING_ASSIGN_OR_RETURN(calculated_log_checksum,
+ ComputeChecksum(filesystem, file_path, Crc32(),
+ /*start=*/kHeaderReservedBytes,
+ /*end=*/header->GetRewindOffset()));
+ if (header->GetLogChecksum() == calculated_log_checksum.Get()) {
+ // The content up to the rewind offset is intact: keep it and drop only
+ // what was written after it.
+ last_known_good = header->GetRewindOffset();
+ data_loss = DataLoss::PARTIAL;
+ } else {
+ // Otherwise nothing can be verified: truncate the entire log and reset
+ // the checksum to the empty-log state.
+ header->SetLogChecksum(0);
+ data_loss = DataLoss::COMPLETE;
+ }
+
+ if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Error truncating file: ", file_path));
+ }
+
+ ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
+ << last_known_good;
+ }
+
+ CreateResult create_result = {
+ std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
+ new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
+ std::move(header))),
+ data_loss};
+
+ return create_result;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
+    const Filesystem* filesystem, const std::string& file_path,
+    Crc32 initial_crc, int64_t start, int64_t end) {
+  auto mmapped_file = MemoryMappedFile(*filesystem, file_path,
+                                       MemoryMappedFile::Strategy::READ_ONLY);
+  Crc32 new_crc(initial_crc.Get());
+
+  if (start < 0) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Starting checksum offset of file '%s' must be greater than 0, was "
+        "%lld",
+        file_path.c_str(), static_cast<long long>(start)));
+  }
+
+  if (end < start) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Ending checksum offset of file '%s' must be greater than start "
+        "'%lld', was '%lld'",
+        file_path.c_str(), static_cast<long long>(start),
+        static_cast<long long>(end)));
+  }
+
+  // A failed size query must surface as an IO error rather than being used as
+  // the bound for the range check below.
+  int64_t file_size = filesystem->GetFileSize(file_path.c_str());
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Bad file size '", file_path, "'"));
+  }
+  if (end > file_size) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Ending checksum offset of file '%s' must be within "
+        "file size of %lld, was %lld",
+        file_path.c_str(), static_cast<long long>(file_size),
+        static_cast<long long>(end)));
+  }
+
+  switch (GetArchitecture()) {
+    case Architecture::BIT_64: {
+      // Map the whole range at once. On 64-bit devices mmap/munmap calls need
+      // the mmap write semaphore, which blocks mmap/munmap/mprotect and all
+      // page faults while they run, so chunked remapping can be harmful.
+      // Mapping only makes the file faultable; it isn't loaded into memory.
+      // b/185822878.
+      ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
+      new_crc.Append(std::string_view(mmapped_file.region(), end - start));
+      break;
+    }
+    case Architecture::BIT_32:
+    case Architecture::UNKNOWN: {
+      // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
+      // much memory at once. If we're unknown, then also chunk it because
+      // we're not sure what the device can handle.
+      //
+      // Offsets are tracked as int64_t: a plain int would truncate `start`
+      // and overflow once the log grows past 2GiB.
+      for (int64_t offset = start; offset < end; offset += kMmapChunkSize) {
+        // Don't read past `end`; the final chunk may be shorter.
+        int64_t chunk_size = kMmapChunkSize;
+        if (end - offset < chunk_size) chunk_size = end - offset;
+
+        ICING_RETURN_IF_ERROR(mmapped_file.Remap(offset, chunk_size));
+        new_crc.Append(std::string_view(mmapped_file.region(), chunk_size));
+      }
+      break;
+    }
+  }
+
+  return new_crc;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
+  int64_t proto_size = proto.ByteSizeLong();
+  int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
+
+  if (proto_size > header_->GetMaxProtoSize()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "proto_size, %lld, was too large to write. Max is %d",
+        static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
+  }
+
+  // At this point, we've guaranteed that proto_size is under kMaxProtoSize
+  // (see ::Create), so we can safely store it in an int.
+  int final_size = 0;
+
+  std::string proto_str;
+  google::protobuf::io::StringOutputStream proto_stream(&proto_str);
+
+  if (header_->GetCompressFlag()) {
+    google::protobuf::io::GzipOutputStream::Options options;
+    options.format = google::protobuf::io::GzipOutputStream::ZLIB;
+    options.compression_level = kDeflateCompressionLevel;
+
+    google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
+                                                              options);
+
+    bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
+                   compressing_stream.Close();
+
+    if (!success) {
+      return absl_ports::InternalError("Error compressing proto.");
+    }
+
+    final_size = proto_str.size();
+
+    // In case the compressed proto is larger than the original proto, we also
+    // can't write it.
+    if (final_size > header_->GetMaxProtoSize()) {
+      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+          "Compressed proto size, %d, was greater than "
+          "max_proto_size, %d",
+          final_size, header_->GetMaxProtoSize()));
+    }
+  } else {
+    // Serialize the uncompressed proto into proto_str. A failure must not be
+    // ignored, otherwise incomplete data could be written to the log.
+    if (!proto.SerializeToZeroCopyStream(&proto_stream)) {
+      return absl_ports::InternalError("Error serializing proto.");
+    }
+    final_size = proto_str.size();
+  }
+
+  // 1st byte for magic, next 3 bytes for proto size.
+  const int32_t host_order_metadata = (kProtoMagic << 24) | final_size;
+
+  // Actually write metadata, has to be done after we know the possibly
+  // compressed proto size
+  ICING_RETURN_IF_ERROR(
+      WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
+
+  // Write the serialized proto
+  if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write proto to: ", file_path_));
+  }
+
+  return current_position;
+}
+
+// Reads back the proto whose entry (metadata + serialized bytes) starts at
+// `file_offset`.
+//
+// Returns:
+//   The parsed proto on success
+//   OUT_OF_RANGE if file_offset is at or past the end of the file
+//   NOT_FOUND if the proto bytes are all zero, i.e. the proto was erased
+//   INTERNAL_ERROR if the metadata is invalid, the recorded proto size runs
+//     past the end of the file, or the stored bytes cannot be parsed
+//   Any error from remapping the file
+template <typename ProtoT>
+libtextclassifier3::StatusOr<ProtoT>
+PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
+  int64_t file_size = filesystem_->GetFileSize(fd_.get());
+  if (file_offset >= file_size) {
+    // file_size points to the next byte to write at, so subtract one to get
+    // the inclusive, actual size of file.
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+                                      "out of range of the file size, %lld",
+                                      static_cast<long long>(file_offset),
+                                      static_cast<long long>(file_size - 1)));
+  }
+
+  // Only set up the mapping once the offset is known to be inside the file.
+  MemoryMappedFile mmapped_file(*filesystem_, file_path_,
+                                MemoryMappedFile::Strategy::READ_ONLY);
+
+  // Read out the metadata
+  ICING_ASSIGN_OR_RETURN(
+      int32_t metadata,
+      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+  // Copy out however many bytes it says the proto is
+  int stored_size = GetProtoSize(metadata);
+  int64_t proto_offset = file_offset + static_cast<int64_t>(sizeof(metadata));
+
+  // The size comes from on-disk data; never map past the end of the file on
+  // the word of a possibly corrupted metadata entry.
+  if (proto_offset + stored_size > file_size) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Proto at offset %lld has size %d, which goes past the file size, "
+        "%lld",
+        static_cast<long long>(file_offset), stored_size,
+        static_cast<long long>(file_size)));
+  }
+
+  ICING_RETURN_IF_ERROR(mmapped_file.Remap(proto_offset, stored_size));
+
+  // EraseProto() zeroes the proto bytes, so an all-zero region means the
+  // proto is gone.
+  if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+    return absl_ports::NotFoundError("The proto data has been erased.");
+  }
+
+  // The mapping is read-only, so hand out the const view of it.
+  google::protobuf::io::ArrayInputStream proto_stream(mmapped_file.region(),
+                                                      stored_size);
+
+  // Deserialize proto
+  ProtoT proto;
+  bool parsed = false;
+  if (header_->GetCompressFlag()) {
+    google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
+    parsed = proto.ParseFromZeroCopyStream(&decompress_stream);
+  } else {
+    parsed = proto.ParseFromZeroCopyStream(&proto_stream);
+  }
+
+  // Report unparsable bytes instead of returning a partially filled proto.
+  if (!parsed) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Failed to parse proto at offset %lld",
+        static_cast<long long>(file_offset)));
+  }
+
+  return proto;
+}
+
+// Overwrites the serialized bytes of the proto whose entry starts at
+// `file_offset` with zeros. The 4-byte metadata in front of it is left
+// intact. If the erased bytes lie before the header's rewind offset, the log
+// checksum in the header is updated and the header is rewritten.
+//
+// Returns:
+//   OK on success
+//   OUT_OF_RANGE if file_offset is at or past the end of the file
+//   INTERNAL_ERROR if the metadata is invalid, the recorded proto size runs
+//     past the end of the file, or the header cannot be rewritten
+//   Any error from remapping the file or updating the checksum
+template <typename ProtoT>
+libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
+    int64_t file_offset) {
+  int64_t file_size = filesystem_->GetFileSize(fd_.get());
+  if (file_offset >= file_size) {
+    // file_size points to the next byte to write at, so subtract one to get
+    // the inclusive, actual size of file.
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "Trying to erase data at a location, %lld, "
+        "out of range of the file size, %lld",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size - 1)));
+  }
+
+  MemoryMappedFile mmapped_file(
+      *filesystem_, file_path_,
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+
+  // Read out the metadata
+  ICING_ASSIGN_OR_RETURN(
+      int32_t metadata,
+      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+  int stored_size = GetProtoSize(metadata);
+  int64_t proto_offset = file_offset + static_cast<int64_t>(sizeof(metadata));
+
+  // The size comes from on-disk data; never map (and later zero) a region
+  // that extends past the end of the file.
+  if (proto_offset + stored_size > file_size) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Proto at offset %lld has size %d, which goes past the file size, "
+        "%lld",
+        static_cast<long long>(file_offset), stored_size,
+        static_cast<long long>(file_size)));
+  }
+
+  ICING_RETURN_IF_ERROR(mmapped_file.Remap(proto_offset, stored_size));
+
+  // We need to update the crc checksum if the erased area is before the
+  // rewind position.
+  if (proto_offset < header_->GetRewindOffset()) {
+    // We need to calculate [original string xor 0s].
+    // The xored string is the same as the original string because 0 xor 0 =
+    // 0, 1 xor 0 = 1.
+    const std::string_view xored_str(mmapped_file.region(),
+                                     mmapped_file.region_size());
+
+    // Positions passed to the checksum are relative to the first byte after
+    // the reserved header space.
+    Crc32 crc(header_->GetLogChecksum());
+    ICING_ASSIGN_OR_RETURN(
+        uint32_t new_crc,
+        crc.UpdateWithXor(xored_str,
+                          /*full_data_size=*/header_->GetRewindOffset() -
+                              kHeaderReservedBytes,
+                          /*position=*/proto_offset - kHeaderReservedBytes));
+
+    header_->SetLogChecksum(new_crc);
+    header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+
+    if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+                             sizeof(Header))) {
+      return absl_ports::InternalError(
+          absl_ports::StrCat("Failed to update header to: ", file_path_));
+    }
+  }
+
+  // Zero out the proto bytes through the writable mapping.
+  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+  return libtextclassifier3::Status::OK;
+}
+
+// Reports the disk usage of the log's backing file as given by the
+// filesystem.
+//
+// Returns:
+//   The disk usage on success
+//   INTERNAL_ERROR if the filesystem could not determine it
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
+  const int64_t disk_usage = filesystem_->GetDiskUsage(file_path_.c_str());
+  if (disk_usage != Filesystem::kBadFileSize) {
+    return disk_usage;
+  }
+  return absl_ports::InternalError("Failed to get disk usage of proto log");
+}
+
+// Reports how many bytes of the backing file lie after the reserved header
+// space, i.e. the file size minus kHeaderReservedBytes.
+//
+// Returns:
+//   The size on success
+//   INTERNAL_ERROR if the file size could not be determined
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
+  const int64_t whole_file_size =
+      filesystem_->GetFileSize(file_path_.c_str());
+  if (whole_file_size != Filesystem::kBadFileSize) {
+    return whole_file_size - kHeaderReservedBytes;
+  }
+  return absl_ports::InternalError(
+      "Failed to get file size of elments in the proto log");
+}
+
+// Builds an iterator over the log stored at `file_path`. The iterator starts
+// out at kInvalidOffset; the first Advance() moves it to `initial_offset`.
+// The file size is sampled once, here.
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
+    const Filesystem& filesystem, const std::string& file_path,
+    int64_t initial_offset)
+    : mmapped_file_(filesystem, file_path,
+                    MemoryMappedFile::Strategy::READ_ONLY),
+      initial_offset_(initial_offset),
+      current_offset_(kInvalidOffset),
+      file_size_(filesystem.GetFileSize(file_path.c_str())) {
+  const bool size_lookup_failed = (file_size_ == Filesystem::kBadFileSize);
+  if (size_lookup_failed) {
+    // Treat the file as empty so that every Advance() call fails.
+    file_size_ = 0;
+  }
+}
+
+// Moves the iterator to the next entry. The first call positions it at the
+// initial offset; later calls read the metadata of the current entry and skip
+// over that metadata plus the proto size it records.
+//
+// Returns:
+//   OK if the new offset is inside the file
+//   OUT_OF_RANGE if the new offset is at or past the end of the file
+//   Any error from reading the current entry's metadata
+template <typename ProtoT>
+libtextclassifier3::Status
+PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
+  if (current_offset_ != kInvalidOffset) {
+    // Already positioned on an entry: hop over it.
+    ICING_ASSIGN_OR_RETURN(
+        int32_t entry_metadata,
+        ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
+    current_offset_ += sizeof(entry_metadata) + GetProtoSize(entry_metadata);
+  } else {
+    // Very first call: start from the configured offset.
+    current_offset_ = initial_offset_;
+  }
+
+  if (current_offset_ >= file_size_) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "The next proto offset, %lld, is out of file range [0, %lld)",
+        static_cast<long long>(current_offset_),
+        static_cast<long long>(file_size_)));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Returns the iterator's current offset as last set by Advance(). Before the
+// first Advance() call this is kInvalidOffset.
+template <typename ProtoT>
+int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
+ return current_offset_;
+}
+
+// Creates an iterator over this log's file whose first Advance() lands on
+// kHeaderReservedBytes, i.e. right after the reserved header space.
+template <typename ProtoT>
+typename PortableFileBackedProtoLog<ProtoT>::Iterator
+PortableFileBackedProtoLog<ProtoT>::GetIterator() {
+ return Iterator(*filesystem_, file_path_,
+ /*initial_offset=*/kHeaderReservedBytes);
+}
+
+// Reads the 4-byte metadata word stored at `file_offset`, converts it from
+// its on-disk byte order to host order and verifies its magic byte.
+//
+// Returns:
+//   The metadata in host byte order on success
+//   OUT_OF_RANGE if file_offset is at or past file_size
+//   INTERNAL_ERROR if the metadata does not end before file_size, or the
+//     magic byte does not match kProtoMagic
+//   Any error from remapping the file
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int32_t>
+PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
+    MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
+  // The entry has to begin inside the file.
+  if (file_offset >= file_size) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "offset, %lld, is out of file range [0, %lld)",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size)));
+  }
+
+  // The metadata word has to end strictly before the end of the file.
+  int32_t on_disk_metadata;
+  const int metadata_bytes = sizeof(on_disk_metadata);
+  if (file_offset + metadata_bytes >= file_size) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Wrong metadata offset %lld, metadata doesn't fit in "
+        "with file range [0, %lld)",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size)));
+  }
+
+  // Map just the metadata word and copy it out of the mapping.
+  ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_bytes));
+  memcpy(&on_disk_metadata, mmapped_file->region(), metadata_bytes);
+
+  // On disk the word is in portable byte order; switch to host order.
+  const int32_t host_order_metadata = gntohl(on_disk_metadata);
+
+  // A valid entry carries kProtoMagic in its magic byte.
+  const uint8_t found_magic = GetProtoMagic(host_order_metadata);
+  if (found_magic == kProtoMagic) {
+    return host_order_metadata;
+  }
+  return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+      "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
+      found_magic));
+}
+
+// Converts `host_order_metadata` to the portable on-disk byte order and
+// writes those 4 bytes to `fd` through `filesystem`.
+//
+// Returns:
+//   OK on success
+//   INTERNAL_ERROR if the write fails
+template <typename ProtoT>
+libtextclassifier3::Status
+PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
+    const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
+  // The on-disk format is byte-order independent, so convert first.
+  const int32_t on_disk_metadata = ghtonl(host_order_metadata);
+
+  if (filesystem->Write(fd, &on_disk_metadata, sizeof(on_disk_metadata))) {
+    return libtextclassifier3::Status::OK;
+  }
+  return absl_ports::InternalError("Failed to write proto metadata.");
+}
+
+// Brings the header in line with the current file contents: recomputes the
+// log checksum, moves the rewind offset to the current file size, rewrites
+// the header and syncs the file.
+//
+// Returns:
+//   OK on success, or if the file size already equals the rewind offset
+//   INTERNAL_ERROR if the file size cannot be determined, or the header
+//     cannot be written or synced
+//   Any error from computing the checksum
+template <typename ProtoT>
+libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
+  int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+  if (file_size == Filesystem::kBadFileSize) {
+    // Never record the failure sentinel as the rewind offset.
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to get file size of: ", file_path_));
+  }
+
+  if (file_size == header_->GetRewindOffset()) {
+    // No new protos appended, don't need to update the checksum.
+    return libtextclassifier3::Status::OK;
+  }
+
+  int64_t new_content_size = file_size - header_->GetRewindOffset();
+  Crc32 crc;
+  if (new_content_size < 0) {
+    // File shrunk, recalculate the entire checksum.
+    ICING_ASSIGN_OR_RETURN(
+        crc,
+        ComputeChecksum(filesystem_, file_path_, Crc32(),
+                        /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+  } else {
+    // Append new changes to the existing checksum.
+    ICING_ASSIGN_OR_RETURN(
+        crc, ComputeChecksum(filesystem_, file_path_,
+                             Crc32(header_->GetLogChecksum()),
+                             header_->GetRewindOffset(), file_size));
+  }
+
+  header_->SetLogChecksum(crc.Get());
+  header_->SetRewindOffset(file_size);
+  // The header checksum covers the fields just updated, so compute it last.
+  header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+
+  if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+                           sizeof(Header)) ||
+      !filesystem_->DataSync(fd_.get())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to update header to: ", file_path_));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Computes a fresh checksum over everything in the file after the reserved
+// header space, from kHeaderReservedBytes up to the current file size.
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() {
+  const int64_t current_file_size =
+      filesystem_->GetFileSize(file_path_.c_str());
+  return PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
+      filesystem_, file_path_, Crc32(), /*start=*/kHeaderReservedBytes,
+      /*end=*/current_file_size);
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
new file mode 100644
index 0000000..b1dfe12
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -0,0 +1,211 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <random>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+
+// go/microbenchmarks
+//
+// To build and run on a local machine:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:portable-file-backed-proto-log_benchmark
+//
+// $ blaze-bin/icing/file/portable-file-backed-proto-log_benchmark
+// --benchmarks=all
+//
+//
+// To build and run on an Android device (must be connected and rooted):
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:portable-file-backed-proto-log_benchmark
+//
+// $ adb root
+//
+// $ adb push
+// blaze-bin/icing/file/portable-file-backed-proto-log_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/portable-file-backed-proto-log_benchmark
+// --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Benchmarks WriteProto() on a compressed log. state.range(0) is the length
+// of the random string property stored in the document that is written on
+// every iteration.
+static void BM_Write(benchmark::State& state) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  const Filesystem filesystem;
+  int payload_length = state.range(0);
+  const std::string log_path = IcingStringUtil::StringPrintf(
+      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", payload_length, ".log");
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Start from a clean slate in case an earlier run left the file behind.
+  filesystem.DeleteFile(log_path.c_str());
+
+  auto proto_log =
+      ProtoLog::Create(&filesystem, log_path,
+                       ProtoLog::Options(compress, max_proto_size))
+          .ValueOrDie()
+          .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // Pad the document with a random alphanumeric string of the requested size.
+  std::default_random_engine random;
+  const std::string payload =
+      RandomString(kAlNumAlphabet, payload_length, &random);
+
+  auto property = document.add_properties();
+  property->set_name("string property");
+  property->add_string_values(payload);
+
+  for (auto _ : state) {
+    testing::DoNotOptimize(proto_log->WriteProto(document));
+  }
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          payload_length);
+
+  // Remove the log so the benchmark leaves nothing behind.
+  filesystem.DeleteFile(log_path.c_str());
+}
+BENCHMARK(BM_Write)
+    ->Arg(1)
+    ->Arg(32)
+    ->Arg(512)
+    ->Arg(1024)
+    ->Arg(4 * 1024)
+    ->Arg(8 * 1024)
+    ->Arg(16 * 1024)
+    ->Arg(32 * 1024)
+    ->Arg(256 * 1024)
+    ->Arg(2 * 1024 * 1024)
+    ->Arg(8 * 1024 * 1024)
+    ->Arg(15 * 1024 * 1024);  // The largest payload is 15MiB because the max
+                              // proto size is 16MiB and the other document
+                              // properties need some room as well.
+
+// Benchmarks ReadProto() on a compressed log. One document carrying a random
+// string property of state.range(0) characters is written up front and then
+// read back on every iteration.
+static void BM_Read(benchmark::State& state) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  const Filesystem filesystem;
+  int payload_length = state.range(0);
+  const std::string log_path = IcingStringUtil::StringPrintf(
+      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", payload_length, ".log");
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Start from a clean slate in case an earlier run left the file behind.
+  filesystem.DeleteFile(log_path.c_str());
+
+  auto proto_log =
+      ProtoLog::Create(&filesystem, log_path,
+                       ProtoLog::Options(compress, max_proto_size))
+          .ValueOrDie()
+          .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // Pad the document with a random alphanumeric string of the requested size.
+  std::default_random_engine random;
+  const std::string payload =
+      RandomString(kAlNumAlphabet, payload_length, &random);
+
+  auto property = document.add_properties();
+  property->set_name("string property");
+  property->add_string_values(payload);
+
+  // The single entry that every iteration reads back.
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t entry_offset,
+                             proto_log->WriteProto(document));
+
+  for (auto _ : state) {
+    testing::DoNotOptimize(proto_log->ReadProto(entry_offset));
+  }
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          payload_length);
+
+  // Remove the log so the benchmark leaves nothing behind.
+  filesystem.DeleteFile(log_path.c_str());
+}
+BENCHMARK(BM_Read)
+    ->Arg(1)
+    ->Arg(32)
+    ->Arg(512)
+    ->Arg(1024)
+    ->Arg(4 * 1024)
+    ->Arg(8 * 1024)
+    ->Arg(16 * 1024)
+    ->Arg(32 * 1024)
+    ->Arg(256 * 1024)
+    ->Arg(2 * 1024 * 1024)
+    ->Arg(8 * 1024 * 1024)
+    ->Arg(15 * 1024 * 1024);  // The largest payload is 15MiB because the max
+                              // proto size is 16MiB and the other document
+                              // properties need some room as well.
+
+// Benchmarks ComputeChecksum() over a compressed log that holds
+// state.range(0) documents, each carrying a 1KiB random string property.
+static void BM_ComputeChecksum(benchmark::State& state) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  const Filesystem filesystem;
+  const std::string log_path = GetTestTempDir() + "/proto.log";
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Start from a clean slate in case an earlier run left the file behind.
+  filesystem.DeleteFile(log_path.c_str());
+
+  auto proto_log =
+      ProtoLog::Create(&filesystem, log_path,
+                       ProtoLog::Options(compress, max_proto_size))
+          .ValueOrDie()
+          .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // Every document carries a 1KiB random string.
+  int payload_length = 1024;
+  std::default_random_engine random;
+  const std::string payload =
+      RandomString(kAlNumAlphabet, payload_length, &random);
+
+  auto property = document.add_properties();
+  property->set_name("string property");
+  property->add_string_values(payload);
+
+  // Fill the log with the requested number of copies of the document.
+  int document_count = state.range(0);
+  for (int written = 0; written < document_count; ++written) {
+    ICING_ASSERT_OK(proto_log->WriteProto(document));
+  }
+
+  for (auto _ : state) {
+    testing::DoNotOptimize(proto_log->ComputeChecksum());
+  }
+
+  // Remove the log so the benchmark leaves nothing behind.
+  filesystem.DeleteFile(log_path.c_str());
+}
+BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
new file mode 100644
index 0000000..dfb67aa
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -0,0 +1,727 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/portable-file-backed-proto-log.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::A;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::Not;
+using ::testing::NotNull;
+using ::testing::Pair;
+using ::testing::Return;
+
+// Test fixture that gives every test a fresh log path under the test temp
+// directory, plus the default options used to create logs.
+class PortableFileBackedProtoLogTest : public ::testing::Test {
+ protected:
+ // Adds a user-defined default constructor because a const member variable
+ // may make the compiler implicitly delete the default constructor.
+ // https://stackoverflow.com/a/47368753
+ PortableFileBackedProtoLogTest() {}
+
+ // Picks the log path and deletes any file left there by an earlier test.
+ void SetUp() override {
+ file_path_ = GetTestTempDir() + "/proto_log";
+ filesystem_.DeleteFile(file_path_.c_str());
+ }
+
+ // Deletes the log file written by the test.
+ void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
+
+ // Real filesystem used by the tests and passed to the logs under test.
+ const Filesystem filesystem_;
+ // Path of the log file; assigned in SetUp().
+ std::string file_path_;
+ // Default compression option passed to Options.
+ bool compress_ = true;
+ // Default max proto size passed to Options.
+ int64_t max_proto_size_ = 256 * 1024; // 256 KiB
+};
+
+// Create() rejects a zero max proto size, succeeds with valid options, and
+// refuses to reopen an existing log with a different compression setting.
+TEST_F(PortableFileBackedProtoLogTest, Initialize) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  // A max_proto_size of 0 is not acceptable; it has to be greater than 0.
+  int zero_max_proto_size = 0;
+  ASSERT_THAT(
+      ProtoLog::Create(&filesystem_, file_path_,
+                       ProtoLog::Options(compress_, zero_max_proto_size)),
+      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  // Valid options give us a log without any data loss.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      ProtoLog::CreateResult create_result,
+      ProtoLog::Create(&filesystem_, file_path_,
+                       ProtoLog::Options(compress_, max_proto_size_)));
+  EXPECT_THAT(create_result.proto_log, NotNull());
+  EXPECT_FALSE(create_result.has_data_loss());
+
+  // The same file can't be reopened with the compression flag flipped.
+  ASSERT_THAT(
+      ProtoLog::Create(&filesystem_, file_path_,
+                       ProtoLog::Options(!compress_, max_proto_size_)),
+      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// A freshly created log occupies exactly the reserved header space.
+TEST_F(PortableFileBackedProtoLogTest, ReservedSpaceForHeader) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      ProtoLog::CreateResult create_result,
+      ProtoLog::Create(&filesystem_, file_path_,
+                       ProtoLog::Options(compress_, max_proto_size_)));
+
+  // Nothing has been written yet, so the file holds only the bytes reserved
+  // for the header.
+  ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), kHeaderReservedBytes);
+}
+
+// WriteProto() rejects a proto that exceeds the log's max proto size.
+TEST_F(PortableFileBackedProtoLogTest, WriteProtoTooLarge) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  // A one byte limit is too small for the document written below.
+  int max_proto_size = 1;
+  ICING_ASSERT_OK_AND_ASSIGN(
+      ProtoLog::CreateResult create_result,
+      ProtoLog::Create(&filesystem_, file_path_,
+                       ProtoLog::Options(compress_, max_proto_size)));
+  auto proto_log = std::move(create_result.proto_log);
+  ASSERT_FALSE(create_result.has_data_loss());
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // The document is bigger than the configured max proto size.
+  ASSERT_THAT(proto_log->WriteProto(document),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// ReadProto() reports INTERNAL when an entry's metadata no longer carries the
+// expected kProtoMagic.
+TEST_F(PortableFileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      PortableFileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                             max_proto_size_)));
+  auto proto_log = std::move(create_result.proto_log);
+  ASSERT_FALSE(create_result.has_data_loss());
+
+  // Write a proto
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
+                             proto_log->WriteProto(document));
+
+  // The 4 bytes of metadata that just doesn't have the same kProtoMagic
+  // specified in portable-file-backed-proto-log.h
+  uint32_t wrong_magic = 0x7E000000;
+
+  // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of
+  // a proto entry. The path-based PWrite is used so that no file descriptor
+  // is left open by the test, and the write itself is verified so the read
+  // below really sees the overwritten metadata.
+  ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), file_offset, &wrong_magic,
+                                 sizeof(wrong_magic)));
+
+  ASSERT_THAT(proto_log->ReadProto(file_offset),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+// Round-trips a small and a near-max-size proto through an uncompressed log,
+// then reopens the log and checks that new writes land after the old ones.
+TEST_F(PortableFileBackedProtoLogTest, ReadWriteUncompressedProto) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  int last_offset;
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(
+            &filesystem_, file_path_,
+            ProtoLog::Options(/*compress_in=*/false, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // First entry: a small document that must read back unchanged.
+    DocumentProto document1 =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+    ICING_ASSERT_OK_AND_ASSIGN(int document1_offset,
+                               proto_log->WriteProto(document1));
+    ASSERT_THAT(proto_log->ReadProto(document1_offset),
+                IsOkAndHolds(EqualsProto(document1)));
+
+    // Second entry: close to the max size, leaving some room for the rest of
+    // the proto properties.
+    std::string long_str(max_proto_size_ - 1024, 'a');
+    DocumentProto document2 = DocumentBuilder()
+                                  .SetKey("namespace2", "uri2")
+                                  .AddStringProperty("long_str", long_str)
+                                  .Build();
+    ICING_ASSERT_OK_AND_ASSIGN(int document2_offset,
+                               proto_log->WriteProto(document2));
+    last_offset = document2_offset;
+
+    // Entries are laid out in write order.
+    ASSERT_GT(document2_offset, document1_offset);
+    ASSERT_THAT(proto_log->ReadProto(document2_offset),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+  }
+
+  {
+    // A second log instance on the same path must keep writing to the same
+    // underlying file.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(
+            &filesystem_, file_path_,
+            ProtoLog::Options(/*compress_in=*/false, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // A third entry lands after everything written by the first instance.
+    DocumentProto document3 =
+        DocumentBuilder().SetKey("namespace3", "uri3").Build();
+    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+                IsOkAndHolds(Gt(last_offset)));
+  }
+}
+
+// Round-trips a small and a large proto through a compressed log, then
+// reopens the log and checks that new writes land after the old ones.
+TEST_F(PortableFileBackedProtoLogTest, ReadWriteCompressedProto) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  int last_offset;
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(
+            &filesystem_, file_path_,
+            ProtoLog::Options(/*compress_in=*/true, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // First entry: a small document that must read back unchanged.
+    DocumentProto document1 =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+    ICING_ASSERT_OK_AND_ASSIGN(int document1_offset,
+                               proto_log->WriteProto(document1));
+    ASSERT_THAT(proto_log->ReadProto(document1_offset),
+                IsOkAndHolds(EqualsProto(document1)));
+
+    // Second entry: close to the max size, leaving some room for the rest of
+    // the proto properties.
+    std::string long_str(max_proto_size_ - 1024, 'a');
+    DocumentProto document2 = DocumentBuilder()
+                                  .SetKey("namespace2", "uri2")
+                                  .AddStringProperty("long_str", long_str)
+                                  .Build();
+    ICING_ASSERT_OK_AND_ASSIGN(int document2_offset,
+                               proto_log->WriteProto(document2));
+    last_offset = document2_offset;
+
+    // Entries are laid out in write order.
+    ASSERT_GT(document2_offset, document1_offset);
+    ASSERT_THAT(proto_log->ReadProto(document2_offset),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+  }
+
+  {
+    // A second log instance on the same path must keep writing to the same
+    // underlying file.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(
+            &filesystem_, file_path_,
+            ProtoLog::Options(/*compress_in=*/true, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    ASSERT_FALSE(create_result.has_data_loss());
+
+    // A third entry lands after everything written by the first instance.
+    DocumentProto document3 =
+        DocumentBuilder().SetKey("namespace3", "uri3").Build();
+    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+                IsOkAndHolds(Gt(last_offset)));
+  }
+}
+
+// Overwriting header bytes behind the stored checksum makes the next Create()
+// fail with an invalid header checksum error.
+TEST_F(PortableFileBackedProtoLogTest, CorruptHeader) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  {
+    // Create the log, then let it go out of scope again.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.has_data_loss());
+  }
+
+  // Scribble over the header right after the kMagic and the header_checksum.
+  int corrupt_value = 24;
+  int offset_after_checksum = 8;
+  filesystem_.PWrite(file_path_.c_str(), offset_after_checksum, &corrupt_value,
+                     sizeof(corrupt_value));
+
+  {
+    // Opening the same log again has to notice the mismatch.
+    ASSERT_THAT(
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)),
+        StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+                 HasSubstr("Invalid header checksum")));
+  }
+}
+
+// Overwriting the magic at the start of the header makes the next Create()
+// fail with an invalid header kMagic error.
+TEST_F(PortableFileBackedProtoLogTest, DifferentMagic) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.has_data_loss());
+
+    // Replace the magic stored at the very beginning of the header while the
+    // log instance is still alive.
+    int invalid_magic = -1;
+    filesystem_.PWrite(file_path_.c_str(), /*offset=*/0, &invalid_magic,
+                       sizeof(invalid_magic));
+  }
+
+  {
+    // Opening the same log again has to reject the header.
+    ASSERT_THAT(
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)),
+        StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+                 HasSubstr("Invalid header kMagic")));
+  }
+}
+
+// Changing persisted log content behind the log's back is detected on the
+// next Create(): the log recovers, but reports complete data loss and is
+// truncated back to just the reserved header space.
+TEST_F(PortableFileBackedProtoLogTest, CorruptContent) {
+  using ProtoLog = PortableFileBackedProtoLog<DocumentProto>;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.has_data_loss());
+
+    DocumentProto document =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write a document and persist it so that the checksum covers it.
+    ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
+                               proto_log->WriteProto(document));
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+    // "Corrupt" the persisted entry by overwriting it with different bytes.
+    document.set_uri("invalid");
+    std::string corrupt_bytes = document.SerializeAsString();
+    filesystem_.PWrite(file_path_.c_str(), document_offset,
+                       corrupt_bytes.data(), corrupt_bytes.size());
+  }
+
+  {
+    // Recovery succeeds, at the price of losing the data.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        ProtoLog::CreateResult create_result,
+        ProtoLog::Create(&filesystem_, file_path_,
+                         ProtoLog::Options(compress_, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_TRUE(create_result.has_data_loss());
+    ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+
+    // The rewind position can't help when the corruption is inside the
+    // persisted region, so everything in the log is gone.
+    ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
+              kHeaderReservedBytes);
+  }
+}
+
+// Verifies that bytes appended after the last good state (simulating a crash
+// mid-write) are reported as PARTIAL data loss and truncated away on the next
+// Create(), while both previously written protos remain readable.
+TEST_F(PortableFileBackedProtoLogTest, PersistToDisk) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace2", "uri2").Build();
+ // Proto offsets and the file size are kept as int64_t, matching the other
+ // tests in this file, so nothing is narrowed on assignment.
+ int64_t document1_offset = 0;
+ int64_t document2_offset = 0;
+ int64_t log_size = 0;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // Write, but don't explicitly persist the second proto
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ // Remember the size of the log while it only holds the two good protos.
+ log_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_GT(log_size, 0);
+ }
+
+ {
+ // The header rewind position and checksum aren't updated in this "system
+ // crash" scenario.
+
+ std::string bad_proto =
+ "some incomplete proto that we didn't finish writing before the "
+ "system crashed";
+ filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
+ bad_proto.size());
+
+ // Double check that we actually wrote something to the underlying file
+ ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
+ }
+
+ {
+ // We can recover, but we have data loss
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_TRUE(create_result.has_data_loss());
+ ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
+
+ // Check that everything was persisted across instances
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ // We correctly rewound to the last good state.
+ ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, Iterator) {  // Empty, populated and bad-filesystem iteration.
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ {
+ // Empty log: the very first Advance() must report OUT_OF_RANGE.
+ auto iterator = proto_log->GetIterator();
+ ASSERT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+
+ {
+ // Two protos written: the iterator must yield their offsets in write order.
+ ICING_ASSERT_OK(proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->WriteProto(document2));
+ auto iterator = proto_log->GetIterator();
+ // 1st proto
+ ICING_ASSERT_OK(iterator.Advance());
+ ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+ IsOkAndHolds(EqualsProto(document1)));
+ // 2nd proto
+ ICING_ASSERT_OK(iterator.Advance());
+ ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+ IsOkAndHolds(EqualsProto(document2)));
+ // Nothing left: advancing past the last proto must report OUT_OF_RANGE.
+ ASSERT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+
+ {
+ // GetFileSize() is mocked to return kBadFileSize: Advance() must fail.
+ MockFilesystem mock_filesystem;
+ ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
+ .WillByDefault(Return(Filesystem::kBadFileSize));
+ PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
+ mock_filesystem, file_path_, /*initial_offset=*/0);
+ ASSERT_THAT(bad_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ComputeChecksum) {  // Checksum stability and sensitivity.
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+ Crc32 checksum;  // Captured from the first instance, compared in the second.
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ ICING_EXPECT_OK(proto_log->WriteProto(document));
+
+ ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
+
+ // Calling it twice with no changes in between must yield the same checksum
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // A new instance over the same file must compute the same checksum
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+ // PersistToDisk shouldn't affect the checksum value
+ ICING_EXPECT_OK(proto_log->PersistToDisk());
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+ // Appending another proto must lead to a different checksum
+ ICING_EXPECT_OK(proto_log->WriteProto(document));
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldSetZero) {  // Erased proto bytes read back as 0.
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes a single proto and erases it again
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // Maps the file read-only to check that the erased area is set to 0.
+ int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+ MemoryMappedFile mmapped_file(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY);
+
+ // document1_offset + sizeof(int) is the start byte of the proto, assuming
+ // sizeof(int) == 4 matches the 4-byte metadata (1 magic + 3 size bytes).
+ mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
+ for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
+ ASSERT_THAT(mmapped_file.region()[i], Eq(0));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {  // Only the erased proto is lost.
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes 2 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Erases the first proto
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // Reading the erased first proto must now fail with NOT_FOUND.
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // The second proto was not erased and must still be readable.
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace", "uri3").Build();
+ DocumentProto document4 =
+ DocumentBuilder().SetKey("namespace", "uri4").Build();
+
+ int64_t document2_offset;  // Set by the first log instance, erased by the second.
+ int64_t document3_offset;  // Set by the first log instance, erased by the third.
+
+ {
+ // Erase data after the rewind position. This won't update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes 3 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+ proto_log->WriteProto(document3));
+
+ // Erases the 1st proto, checksum won't be updated immediately because the
+ // rewind position is 0.
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2175574628))));
+ } // New checksum is updated in destructor.
+
+ {
+ // Erase data before the rewind position. This will update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Erases the 2nd proto that is now before the rewind position. Checksum
+ // is updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(790877774))));
+ }
+
+ {
+ // Append data and erase data before the rewind position. This will update
+ // the checksum twice: in EraseProto() and destructor.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Append a new document which is after the rewind position.
+ ICING_ASSERT_OK(proto_log->WriteProto(document4));
+
+ // Erases the 3rd proto that is now before the rewind position. Checksum
+ // is updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2344803210))));
+ } // Checksum is updated with the newly appended document.
+
+ {
+ // Re-creating without data loss means the stored checksum still matches.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index a281f22..c1de0f0 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -97,8 +97,11 @@
constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE =
+ StringIndexingConfig_TokenizerType_Code_NONE;
constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN;
// For mocking purpose, we allow tests to provide a custom Filesystem.
class TestIcingSearchEngine : public IcingSearchEngine {
@@ -5726,6 +5729,88 @@
}
}
+TEST_F(IcingSearchEngineTest,
+ DocumentWithNoIndexedContentDoesntCauseRestoreIndex) {
+ // 1. Create an index with a single document in it that has no indexed
+ // content.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Set a schema for a single type whose only property is a required string
+ // configured with MATCH_NONE / TOKENIZER_NONE.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("unindexedField")
+ .SetDataTypeString(MATCH_NONE, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // Add a document that only fills that unindexed property.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("unindexedField",
+ "Don't you dare search over this!")
+ .Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ // 2. Re-initialize: expect no data loss and no recovery or index restoration.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ }
+}
+
+TEST_F(IcingSearchEngineTest,
+ DocumentWithNoValidIndexedContentDoesntCauseRestoreIndex) {
+ // 1. Create an index with a single document in it that has no valid indexed
+ // tokens in its content.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Set the schema returned by CreateMessageSchema().
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Add a document that contains no valid indexed content - just punctuation.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "?...!")
+ .Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ // 2. Re-initialize: expect no data loss and no recovery or index restoration.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ }
+}
+
TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) {
DocumentProto document = DocumentBuilder()
.SetKey("icing", "fake_type/0")
diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc
index 2a5a0d9..887e6e4 100644
--- a/icing/index/hit/hit.cc
+++ b/icing/index/hit/hit.cc
@@ -67,9 +67,10 @@
&temp_value);
bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value);
bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency,
- kHasTermFrequency, 1, &temp_value);
- bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, 1, &temp_value);
- bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, 1, &temp_value);
+ kHasTermFrequency, /*len=*/1, &temp_value);
+ bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, /*len=*/1, &temp_value);
+ bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection,
+ /*len=*/1, &temp_value);
value_ = temp_value;
}
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 09dda41..6d8632f 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -64,6 +64,7 @@
"DocumentId %d must be greater than last added document_id %d",
document_id, index_->last_added_document_id()));
}
+ index_->set_last_added_document_id(document_id);
uint32_t num_tokens = 0;
libtextclassifier3::Status overall_status;
for (const TokenizedSection& section : tokenized_document.sections()) {
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index fc14800..8a6a9f5 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -261,7 +261,23 @@
document));
EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
IsOk());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+TEST_F(IndexProcessorTest, NoValidContent) {  // Punctuation-only property value.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "?...!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));  // Id must still be recorded.
+}
TEST_F(IndexProcessorTest, OneDoc) {
diff --git a/icing/index/index.h b/icing/index/index.h
index b7021ca..eab5be8 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -127,6 +127,16 @@
return main_index_->last_added_document_id();
}
+ // Sets the lite index's last added document id to document_id, unless the
+ // lite index already holds a valid id greater than document_id (then no-op).
+ void set_last_added_document_id(DocumentId document_id) {
+ DocumentId lite_document_id = lite_index_->last_added_document_id();
+ if (lite_document_id == kInvalidDocumentId ||
+ document_id >= lite_document_id) {
+ lite_index_->set_last_added_document_id(document_id);
+ }
+ }
+
// Returns debug information for the index in out.
// verbosity <= 0, simplest debug information - just the lexicons and lite
// index.
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index de4edf8..16593ef 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -153,8 +153,6 @@
index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
}
TEST_F(IndexTest, EmptyIndexAfterMerge) {
@@ -172,8 +170,6 @@
index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
}
TEST_F(IndexTest, AdvancePastEnd) {
@@ -238,8 +234,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) {
@@ -256,8 +250,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, SingleHitMultiTermIndex) {
@@ -273,8 +265,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) {
@@ -292,8 +282,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, NoHitMultiTermIndex) {
@@ -308,7 +296,6 @@
index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) {
@@ -325,7 +312,6 @@
index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, MultiHitMultiTermIndex) {
@@ -352,7 +338,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
}
TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) {
@@ -381,7 +366,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
}
TEST_F(IndexTest, MultiHitSectionRestrict) {
@@ -402,8 +386,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) {
@@ -426,8 +408,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, SingleHitDedupeIndex) {
@@ -449,8 +429,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, PrefixHit) {
@@ -465,8 +443,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, PrefixHitAfterMerge) {
@@ -483,8 +459,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, MultiPrefixHit) {
@@ -506,8 +480,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, MultiPrefixHitAfterMerge) {
@@ -531,8 +503,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, NoExactHitInPrefixQuery) {
@@ -552,7 +522,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId1, std::vector<SectionId>{kSectionId3})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) {
@@ -574,7 +543,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId1, std::vector<SectionId>{kSectionId3})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, PrefixHitDedupe) {
@@ -590,7 +558,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, PrefixHitDedupeAfterMerge) {
@@ -608,7 +575,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, PrefixToString) {
@@ -705,9 +671,11 @@
std::default_random_engine random;
std::vector<std::string> query_terms;
+ std::string prefix = "prefix";
for (int i = 0; i < 2600; ++i) {
constexpr int kTokenSize = 5;
- query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random));
+ query_terms.push_back(prefix +
+ RandomString(kAlNumAlphabet, kTokenSize, &random));
}
DocumentId document_id = 0;
@@ -716,7 +684,7 @@
while (status.ok()) {
for (int i = 0; i < 100; ++i) {
Index::Editor edit =
- index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY,
+ index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
size_t idx = uniform(random);
status = edit.BufferTerm(query_terms.at(idx).c_str());
@@ -733,11 +701,14 @@
// Adding more hits should fail.
Index::Editor edit =
- index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY,
+ index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
- EXPECT_THAT(edit.BufferTerm("baz"), IsOk());
+ std::string term = prefix + "foo";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "bar";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "baz";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
@@ -745,12 +716,17 @@
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
index_->GetIterator(query_terms.at(i).c_str(), kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ TermMatchType::PREFIX));
// Each query term should contain at least one hit - there may have been
// other hits for this term that were added.
EXPECT_THAT(itr->Advance(), IsOk());
}
- EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> last_itr,
+ index_->GetIterator(prefix.c_str(), kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(last_itr->Advance(), IsOk());
+ EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1));
}
TEST_F(IndexTest, FullIndexMerge) {
@@ -761,9 +737,11 @@
std::default_random_engine random;
std::vector<std::string> query_terms;
+ std::string prefix = "prefix";
for (int i = 0; i < 2600; ++i) {
constexpr int kTokenSize = 5;
- query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random));
+ query_terms.push_back(prefix +
+ RandomString(kAlNumAlphabet, kTokenSize, &random));
}
DocumentId document_id = 0;
@@ -772,7 +750,7 @@
while (status.ok()) {
for (int i = 0; i < 100; ++i) {
Index::Editor edit =
- index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY,
+ index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
size_t idx = uniform(random);
status = edit.BufferTerm(query_terms.at(idx).c_str());
@@ -791,30 +769,45 @@
// Adding more hits should fail.
Index::Editor edit =
- index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY,
+ index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
- EXPECT_THAT(edit.BufferTerm("baz"), IsOk());
+ std::string term = prefix + "foo";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "bar";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "baz";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> last_itr,
+ index_->GetIterator(prefix.c_str(), kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(last_itr->Advance(), IsOk());
+ EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1));
// After merging with the main index. Adding more hits should succeed now.
ICING_ASSERT_OK(index_->Merge());
- edit =
- index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, 0);
- EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
- EXPECT_THAT(edit.BufferTerm("baz"), IsOk());
+ edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, 0);
+ // Assign `term` before every BufferTerm() call so that "foo", "bar" and
+ // "baz" (each carrying the shared prefix) are all buffered for this edit.
+ term = prefix + "foo";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "bar";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "baz";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("bar", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator(prefix + "bar", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
// We know that "bar" should have at least one hit because we just added it!
EXPECT_THAT(itr->Advance(), IsOk());
EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(document_id + 1));
- EXPECT_THAT(index_->last_added_document_id(), Eq(document_id + 1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ last_itr, index_->GetIterator(prefix.c_str(), kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(last_itr->Advance(), IsOk());
+ EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id + 1));
}
TEST_F(IndexTest, IndexCreateIOFailure) {
@@ -883,8 +876,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, IndexPersistenceAfterMerge) {
@@ -912,8 +903,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, InvalidHitBufferSize) {
@@ -1280,8 +1269,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
}
TEST_F(IndexTest, PrefixResultsFromLiteAndMain) {
@@ -1314,8 +1301,6 @@
EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
}
TEST_F(IndexTest, GetDebugInfo) {
@@ -1422,8 +1407,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId3})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
}
TEST_F(IndexTest, BackfillingNewTermsSucceeds) {
@@ -1478,8 +1461,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId3));
}
TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) {
@@ -1527,8 +1508,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) {
@@ -1544,6 +1523,7 @@
TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId0);
ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id()));
// Clipping to invalid should have no effect.
ICING_ASSERT_OK_AND_ASSIGN(
@@ -1565,6 +1545,7 @@
/*namespace_id=*/0);
ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId1);
// Clipping to invalid should still have no effect even if both indices have
// hits.
@@ -1576,8 +1557,6 @@
ElementsAre(
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) {
@@ -1586,6 +1565,7 @@
TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId0);
ICING_ASSERT_OK(index_->Merge());
@@ -1594,6 +1574,7 @@
/*namespace_id=*/0);
ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId1);
EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk());
@@ -1604,8 +1585,6 @@
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, TruncateToThrowsOutBothIndices) {
@@ -1614,10 +1593,12 @@
TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId0);
edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
ASSERT_THAT(edit.BufferTerm("foul"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId1);
ICING_ASSERT_OK(index_->Merge());
@@ -1626,6 +1607,7 @@
/*namespace_id=*/0);
ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk());
@@ -1634,8 +1616,6 @@
std::unique_ptr<DocHitInfoIterator> itr,
index_->GetIterator("f", kSectionIdMaskAll, TermMatchType::PREFIX));
EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
}
TEST_F(IndexTest, IndexStorageInfoProto) {
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index 69138e1..fb23934 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -310,8 +310,6 @@
return absl_ports::ResourceExhaustedError("Hit buffer is full!");
}
- header_->set_last_added_docid(hit.document_id());
-
TermIdHitPair term_id_hit_pair(term_id, hit);
uint32_t cur_size = header_->cur_size();
TermIdHitPair::Value* valp =
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index 90c6fbc..b134aba 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -225,6 +225,9 @@
DocumentId last_added_document_id() const {
return header_->last_added_docid();
}
+ // Overwrites the last-added document id stored in the index header with
+ // `document_id`.
+ // NOTE(review): this setter is declared const even though it writes through
+ // header_ — confirm the const qualifier is intentional.
+ void set_last_added_document_id(DocumentId document_id) const {
+ header_->set_last_added_docid(document_id);
+ }
const IcingDynamicTrie& lexicon() const { return lexicon_; }
diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc
index 58eb8bf..9b75db6 100644
--- a/icing/jni/jni-cache.cc
+++ b/icing/jni/jni-cache.cc
@@ -14,6 +14,8 @@
#include "icing/jni/jni-cache.h"
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+
#include "icing/text_classifier/lib3/utils/java/jni-base.h"
#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
#include "icing/absl_ports/canonical_errors.h"
@@ -214,3 +216,5 @@
} // namespace lib
} // namespace icing
+
+#endif // ICING_REVERSE_JNI_SEGMENTATION
diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h
index a5f16c7..3faaed6 100644
--- a/icing/jni/jni-cache.h
+++ b/icing/jni/jni-cache.h
@@ -15,6 +15,16 @@
#ifndef ICING_JNI_JNI_CACHE_H_
#define ICING_JNI_JNI_CACHE_H_
+#ifndef ICING_REVERSE_JNI_SEGMENTATION
+namespace icing {
+namespace lib {
+
+// Empty stand-in for JniCache, compiled only when
+// ICING_REVERSE_JNI_SEGMENTATION is not defined (described by the author as
+// non-Android builds). Presumably it exists so that code which merely names
+// the JniCache type still compiles without <jni.h>.
+class JniCache {};
+
+} // namespace lib
+} // namespace icing
+#else // ICING_REVERSE_JNI_SEGMENTATION
+
#include <jni.h>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -75,4 +85,6 @@
} // namespace lib
} // namespace icing
+#endif // !ICING_REVERSE_JNI_SEGMENTATION
+
#endif // ICING_JNI_JNI_CACHE_H_
diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/jni/reverse-jni-break-iterator.cc
deleted file mode 100644
index 1a8a799..0000000
--- a/icing/jni/reverse-jni-break-iterator.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/jni/reverse-jni-break-iterator.h"
-
-#include <jni.h>
-#include <math.h>
-
-#include <cassert>
-#include <cctype>
-#include <map>
-
-#include "icing/jni/jni-cache.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/text_classifier/lib3/utils/java/jni-base.h"
-#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-// Chosen based on results in go/reverse-jni-benchmarks
-static constexpr int kBatchSize = 100;
-} // namespace
-
-// -----------------------------------------------------------------------------
-// Implementations that call out to JVM. Behold the beauty.
-// -----------------------------------------------------------------------------
-libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
-ReverseJniBreakIterator::Create(const JniCache* jni_cache,
- std::string_view text,
- std::string_view locale) {
- if (jni_cache == nullptr) {
- return absl_ports::InvalidArgumentError(
- "Create must be called with a valid JniCache pointer!");
- }
-
- ICING_ASSIGN_OR_RETURN(
- libtextclassifier3::ScopedLocalRef<jstring> java_text,
- jni_cache->ConvertToJavaString(text.data(), text.length()));
- if (java_text.get() == nullptr) {
- return absl_ports::AbortedError("Failed to create Java String from input.");
- }
-
- ICING_ASSIGN_OR_RETURN(
- libtextclassifier3::ScopedLocalRef<jstring> java_locale_string,
- jni_cache->ConvertToJavaString(locale.data(), locale.length()));
- if (java_locale_string.get() == nullptr) {
- return absl_ports::AbortedError(
- "Failed to create Java String from locale.");
- }
-
- JNIEnv* jenv = jni_cache->GetEnv();
- ICING_ASSIGN_OR_RETURN(
- libtextclassifier3::ScopedLocalRef<jobject> java_locale,
- libtextclassifier3::JniHelper::NewObject(
- jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor,
- java_locale_string.get()));
- if (java_locale.get() == nullptr) {
- return absl_ports::AbortedError(
- "Failed to create Java Locale from locale.");
- }
-
- ICING_ASSIGN_OR_RETURN(
- libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher,
- libtextclassifier3::JniHelper::NewObject(
- jenv, jni_cache->breakiterator_class.get(),
- jni_cache->breakiterator_constructor, java_locale.get()));
- libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher =
- libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv,
- jni_cache->jvm);
- if (iterator_batcher.get() == nullptr) {
- return absl_ports::AbortedError(
- "Failed to create Java BreakIteratorBatcher.");
- }
-
- ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod(
- jenv, iterator_batcher.get(), jni_cache->breakiterator_settext,
- java_text.get()));
- return std::unique_ptr<ReverseJniBreakIterator>(
- new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher)));
-}
-
-ReverseJniBreakIterator::ReverseJniBreakIterator(
- const JniCache* jni_cache,
- libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher)
- : jni_cache_(jni_cache),
- iterator_batcher_(std::move(iterator_batcher)),
- is_done_(false),
- is_almost_done_(false) {}
-
-int ReverseJniBreakIterator::Next() {
- if (is_done_) {
- return ReverseJniBreakIterator::kDone;
- }
- if (break_indices_cache_.empty()) {
- if (FetchNextBatch() == ReverseJniBreakIterator::kDone) {
- // Either there were no more results or an error occurred. Either way,
- // mark ourselves as done and return.
- is_done_ = true;
- return ReverseJniBreakIterator::kDone;
- }
- is_almost_done_ = break_indices_cache_.size() < kBatchSize;
- }
- int break_index = break_indices_cache_.front();
- break_indices_cache_.pop();
- is_done_ = is_almost_done_ && break_indices_cache_.empty();
- return break_index;
-}
-
-int ReverseJniBreakIterator::First() {
- const int first_index = jni_cache_->GetEnv()->CallIntMethod(
- iterator_batcher_.get(), jni_cache_->breakiterator_first);
- if (jni_cache_->ExceptionCheckAndClear()) {
- return ReverseJniBreakIterator::kDone;
- }
- ClearCache();
- return first_index;
-}
-
-int ReverseJniBreakIterator::Preceding(int offset) {
- const int preceding_index = jni_cache_->GetEnv()->CallIntMethod(
- iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset);
- if (jni_cache_->ExceptionCheckAndClear()) {
- return ReverseJniBreakIterator::kDone;
- }
- ClearCache();
- return preceding_index;
-}
-
-int ReverseJniBreakIterator::Following(int offset) {
- const int following_index = jni_cache_->GetEnv()->CallIntMethod(
- iterator_batcher_.get(), jni_cache_->breakiterator_following, offset);
- if (jni_cache_->ExceptionCheckAndClear()) {
- return ReverseJniBreakIterator::kDone;
- }
- ClearCache();
- return following_index;
-}
-
-int ReverseJniBreakIterator::FetchNextBatch() {
- ICING_ASSIGN_OR_RETURN(
- libtextclassifier3::ScopedLocalRef<jintArray> break_indices,
- libtextclassifier3::JniHelper::CallObjectMethod<jintArray>(
- jni_cache_->GetEnv(), iterator_batcher_.get(),
- jni_cache_->breakiterator_next, kBatchSize),
- ReverseJniBreakIterator::kDone);
- if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) {
- return ReverseJniBreakIterator::kDone;
- }
- jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get());
- if (num_indices == 0) {
- return ReverseJniBreakIterator::kDone;
- }
- jint* break_indices_arr =
- static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical(
- break_indices.get(), nullptr));
- for (int i = 0; i < num_indices; ++i) {
- break_indices_cache_.push(break_indices_arr[i]);
- }
- jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(),
- break_indices_arr,
- /*mode=*/0);
- return num_indices;
-}
-
-void ReverseJniBreakIterator::ClearCache() {
- break_indices_cache_ = std::queue<int>();
- is_done_ = false;
- is_almost_done_ = false;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/jni/reverse-jni-break-iterator.h
deleted file mode 100644
index c1f05f4..0000000
--- a/icing/jni/reverse-jni-break-iterator.h
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
-#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
-
-#include <jni.h>
-
-#include <queue>
-#include <string>
-
-#include "icing/jni/jni-cache.h"
-#include "icing/text_classifier/lib3/utils/java/jni-base.h"
-
-namespace icing {
-namespace lib {
-
-// A class that handles the cross-JNI interactions with BreakIteratorBatcher and
-// hides the batching element to provide an interface akin to
-// java.text.BreakIterator.
-//
-// Example:
-// std::string text = "我每天走路去上班。";
-// ASSERT_THAT(text, SizeIs(27));
-// std::unique_ptr<ReverseJniBreakIterator> itr =
-// ReverseJniBreakIterator::Create(jni_cache, text, locale);
-// std::vector<int> nexts;
-// int next = itr->Next();
-// while (next != ReverseJniBreakIterator::kDone) {
-// nexts.push_back(next);
-// next = itr->Next();
-// }
-// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8));
-class ReverseJniBreakIterator {
- public:
- static constexpr int kDone = -1;
-
- // Creates a ReverseJniBreakiterator with the given text and locale.
- //
- // Returns:
- // A ReverseJniBreakIterator on success
- // INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer
- // INTERNAL if unable to create any of the required Java objects
- static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
- Create(const JniCache* jni_cache, std::string_view text,
- std::string_view locale);
-
- // Returns the UTF-16 boundary following the current boundary. If the current
- // boundary is the last text boundary, it returns
- // ReverseJniBreakIterator::kDONE.
- //
- // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8
- // boundary. Callers interested in the UTF-8 boundary are required to maintain
- // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries.
- int Next();
-
- // Returns the first UTF-16 boundary. The iterator's current position is set
- // to the first text boundary and any cached data is cleared.
- int First();
-
- // Returns the position of the first UTF-16 boundary preceding the UTF-16
- // offset. If there is no boundary preceding the specified offset, then
- // ReverseJniBreakIterator::kDone is returned.
- //
- // The iterator's current position is set to the segment whose boundary was
- // returned and any cached data is cleared.
- int Preceding(int offset);
-
- // Returns the position of the first UTF-16 boundary following the UTF-16
- // offset. If there is no boundary following the specified offset, then
- // ReverseJniBreakIterator::kDone is returned.
- //
- // The iterator's current position is set to the segment whose boundary
- // was returned and any cached data is cleared.
- int Following(int offset);
-
- private:
- ReverseJniBreakIterator(
- const JniCache* jni_cache,
- libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher);
-
- // Fetches the results of up to kBatchSize next calls and stores them in
- // break_indices_cache_. Returns the number of results or kDone if no more
- // results could be fetched.
- int FetchNextBatch();
-
- // Empties the cache and sets is_done_ and is_almost_done_ to false.
- void ClearCache();
-
- // Keeps track of references to Java classes and methods. Does NOT own.
- const JniCache* jni_cache_;
-
- // The reference to the actual instance of BreakIteratorBatcher that
- // this class interacts with.
- libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_;
-
- // The cache holding the most recent batch of return values from
- // BreakIteratorBatcher#next.
- std::queue<int> break_indices_cache_;
-
- bool is_done_;
-
- // The last batch was incomplete (< kBatchSize results were returned). The
- // next call to BreakIteratorBatcher#next is guaranteed to return an
- // empty array. Once the results from the last batch are evicted from
- // break_indices_cache, ReverseJniBreakIterator will transition to is_done_.
- bool is_almost_done_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc
index fe89f47..a4734b4 100644
--- a/icing/scoring/scorer.cc
+++ b/icing/scoring/scorer.cc
@@ -89,6 +89,7 @@
if (!query_it) {
return default_score_;
}
+
return static_cast<double>(
bm25f_calculator_->ComputeScore(query_it, hit_info, default_score_));
}
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
index 22d548a..8b89514 100644
--- a/icing/scoring/scorer_test.cc
+++ b/icing/scoring/scorer_test.cc
@@ -95,6 +95,10 @@
const FakeClock& fake_clock2() { return fake_clock2_; }
+ // Sets fake_clock1_'s system time to `new_time` (milliseconds), letting a
+ // test move the clock forward, e.g. past a document's TTL.
+ void SetFakeClock1Time(int64_t new_time) {
+ fake_clock1_.SetSystemTimeMilliseconds(new_time);
+ }
+
private:
const std::string test_dir_;
const std::string doc_store_dir_;
@@ -123,7 +127,7 @@
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(ScorerTest, ShouldGetDefaultScore) {
+TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentDoesntExist) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
@@ -135,6 +139,66 @@
EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
}
+// Verifies that a DOCUMENT_SCORE scorer returns the document's own score
+// while the document exists and the caller-provided default score once the
+// document has been deleted.
+TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsDeleted) {
+ // Creates a test document with an explicit score of 42.
+ DocumentProto test_document = DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetScore(42)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // The default score (10) differs from the document score (42) so the two
+ // outcomes below can be told apart.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/10, document_store()));
+
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ // While the document exists, its own score is returned.
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42));
+
+ // Delete the document and check that the caller-provided default score is
+ // returned.
+ EXPECT_THAT(document_store()->Delete(document_id), IsOk());
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
+}
+
+// Verifies that a DOCUMENT_SCORE scorer returns the document's own score
+// before the document's TTL elapses and the caller-provided default score
+// after the fake clock has been moved past creation time + TTL.
+TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsExpired) {
+ // Creates a test document with an explicit score of 42, created "now"
+ // according to fake_clock1 and given a TTL of 100 ms.
+ int64_t creation_time = fake_clock1().GetSystemTimeMilliseconds();
+ int64_t ttl = 100;
+ DocumentProto test_document = DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetScore(42)
+ .SetCreationTimestampMs(creation_time)
+ .SetTtlMs(ttl)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // The default score (10) differs from the document score (42) so the two
+ // outcomes below can be told apart.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ /*default_score=*/10, document_store()));
+
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ // The document's score is returned since the document hasn't expired yet.
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42));
+
+ // Expire the document by moving the fake clock 10 ms past
+ // creation_time + ttl, then check that the caller-provided default score is
+ // returned.
+ SetFakeClock1Time(creation_time + ttl + 10);
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
+}
+
TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) {
// Creates a test document with the default document score 0
DocumentProto test_document =
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index d79c861..5f478fa 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -1068,6 +1068,11 @@
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
+ // Report NOT_FOUND for ids that DoesDocumentExist rejects before consulting
+ // score_cache_. The message names score data (not usage scores) so the error
+ // identifies the lookup that actually failed.
+ if (!DoesDocumentExist(document_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Can't get score data, document id '%d' doesn't exist", document_id));
+ }
+
auto score_data_or = score_cache_->GetCopy(document_id);
if (!score_data_or.ok()) {
ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
@@ -1131,6 +1136,10 @@
+// Returns the usage scores stored for `document_id`.
+// Returns NOT_FOUND when DoesDocumentExist(document_id) is false; otherwise
+// the result of usage_store_->GetUsageScores(document_id) is returned as-is.
libtextclassifier3::StatusOr<UsageStore::UsageScores>
DocumentStore::GetUsageScores(DocumentId document_id) const {
+ if (!DoesDocumentExist(document_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Can't get usage scores, document id '%d' doesn't exist", document_id));
+ }
return usage_store_->GetUsageScores(document_id);
}
@@ -1139,6 +1148,17 @@
ICING_ASSIGN_OR_RETURN(DocumentId document_id,
GetDocumentId(usage_report.document_namespace(),
usage_report.document_uri()));
+ // We can use the internal version here because we got our document_id from
+ // our internal data structures. The GetDocumentId call above would already
+ // have returned an error if the namespace and/or uri were incorrect.
+ if (!InternalDoesDocumentExist(document_id)) {
+ // Document was probably deleted or expired.
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Couldn't report usage on a nonexistent document: (namespace: '",
+ usage_report.document_namespace(), "', uri: '",
+ usage_report.document_uri(), "')"));
+ }
+
return usage_store_->AddUsageReport(usage_report, document_id);
}
@@ -1587,6 +1607,7 @@
// Copy over usage scores.
ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
usage_store_->GetUsageScores(document_id));
+
DocumentId new_document_id = new_document_id_or.ValueOrDie();
ICING_RETURN_IF_ERROR(
new_doc_store->SetUsageScores(new_document_id, usage_scores));
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index a8d87c8..9e1b3ec 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -256,16 +256,9 @@
// Returns the DocumentAssociatedScoreData of the document specified by the
// DocumentId.
//
- // NOTE: This does not check if the document exists and will return the
- // DocumentFilterData of the document even if it has been deleted. Users
- // should check DoesDocumentExist(document_id) if they only want existing
- // documents' DocumentFilterData.
- //
// Returns:
// DocumentAssociatedScoreData on success
- // OUT_OF_RANGE if document_id is negative or exceeds previously seen
- // DocumentIds
- // NOT_FOUND if no score data is found
+ // NOT_FOUND if the document or the score data is not found
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const;
@@ -302,8 +295,8 @@
//
// Returns:
// UsageScores on success
+ // NOT_FOUND if document_id no longer exists.
// INVALID_ARGUMENT if document_id is invalid
- // INTERNAL_ERROR on I/O errors
libtextclassifier3::StatusOr<UsageStore::UsageScores> GetUsageScores(
DocumentId document_id) const;
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index ebc5ec3..b37c6de 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -1639,7 +1639,7 @@
/*length_in_tokens=*/7)));
}
-TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) {
+TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1648,7 +1648,7 @@
std::move(create_result.document_store);
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(/*document_id=*/0),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(DocumentStoreTest, DeleteClearsFilterCache) {
@@ -1699,7 +1699,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteShouldClearUsageScores) {
+TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) {
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1724,10 +1724,63 @@
// Delete the document.
ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
- // The scores should be cleared.
- expected_scores.usage_type1_count = 0;
+ // Can't report or get usage scores on the deleted document
+ ASSERT_THAT(
+ doc_store->ReportUsage(usage_report_type1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("Couldn't report usage on a nonexistent document")));
+
+ ASSERT_THAT(doc_store->GetUsageScores(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("Can't get usage scores")));
+}
+
+// Verifies that usage can be reported and read back while a document is
+// within its TTL, and that both ReportUsage and GetUsageScores return
+// NOT_FOUND once the fake clock has moved past creation time + TTL.
+TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ // Created at t=10 ms with a TTL of 100 ms, i.e. creation time + TTL = 110 ms.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(10)
+ .SetTtlMs(100)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+
+ // Some arbitrary time before the document's creation time (10) + ttl (100)
+ fake_clock_.SetSystemTimeMilliseconds(109);
+
+ // Report usage with type 1.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1));
+
+ // The single report above should be reflected in the type-1 count.
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_count = 1;
ASSERT_THAT(doc_store->GetUsageScores(document_id),
IsOkAndHolds(expected_scores));
+
+ // Some arbitrary time past the document's creation time (10) + ttl (100)
+ fake_clock_.SetSystemTimeMilliseconds(200);
+
+ // Can't report or get usage scores on the expired document
+ ASSERT_THAT(
+ doc_store->ReportUsage(usage_report_type1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("Couldn't report usage on a nonexistent document")));
+
+ ASSERT_THAT(doc_store->GetUsageScores(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("Can't get usage scores")));
}
TEST_F(DocumentStoreTest,
diff --git a/icing/testing/jni-test-helpers.h b/icing/testing/jni-test-helpers.h
index adc469a..67a98c3 100644
--- a/icing/testing/jni-test-helpers.h
+++ b/icing/testing/jni-test-helpers.h
@@ -15,6 +15,8 @@
#ifndef ICING_TESTING_JNI_TEST_HELPERS_H_
#define ICING_TESTING_JNI_TEST_HELPERS_H_
+#include <memory>
+
#include "icing/jni/jni-cache.h"
#ifdef ICING_REVERSE_JNI_SEGMENTATION
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index e60c168..cae3eee 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,11 +18,7 @@
#include <memory>
#include <string_view>
-#ifdef __ANDROID__
#include "icing/jni/jni-cache.h"
-#else // __ANDROID__
-class JniCache; // forward declaration to let non-Android builds work.
-#endif // __ANDROID__
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
similarity index 85%
rename from icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
rename to icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
index 8392363..5f5202c 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
@@ -21,12 +21,12 @@
JNIEnv* g_jenv = nullptr;
extern "C" JNIEXPORT jboolean JNICALL
-Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain(
- JNIEnv* env, jclass ignored) {
+Java_icing_jni_ReverseJniLanguageSegmenterJniTest_testsMain(JNIEnv* env,
+ jclass ignored) {
g_jenv = env;
std::vector<char*> my_argv;
- char arg[] = "reverse-jni-language-segmenter-test-lib";
+ char arg[] = "jni-test-lib";
my_argv.push_back(arg);
int argc = 1;
char** argv = &(my_argv[0]);
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
deleted file mode 100644
index 64b68ec..0000000
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
-#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
-
-#include <jni.h>
-
-#include "icing/jni/jni-cache.h"
-#include "gtest/gtest.h"
-
-extern JNIEnv* g_jenv;
-
-namespace icing {
-namespace lib {
-
-namespace test_internal {
-
-class ReverseJniLanguageSegmenterTest
- : public testing::TestWithParam<const char*> {
- protected:
- ReverseJniLanguageSegmenterTest()
- : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {}
-
- static std::string GetLocale() { return GetParam(); }
-
- std::unique_ptr<JniCache> jni_cache_;
-};
-
-} // namespace test_internal
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
similarity index 99%
rename from icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
rename to icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 2c268ff..72c3180 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -12,17 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h"
+#include <jni.h>
#include <memory>
#include <string_view>
+#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "unicode/uloc.h"
@@ -120,6 +122,14 @@
return terms;
}
+class ReverseJniLanguageSegmenterTest
+ : public testing::TestWithParam<const char*> {
+ protected:
+ static std::string GetLocale() { return GetParam(); }
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
+};
+
} // namespace
TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) {
diff --git a/icing/tools/document-store-dump.cc b/icing/tools/document-store-dump.cc
deleted file mode 100644
index 45c9bf5..0000000
--- a/icing/tools/document-store-dump.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tools/document-store-dump.h"
-
-#include <cinttypes>
-
-#include "icing/absl_ports/str_cat.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-void AppendDocumentProto(DocId document_id, const Document& doc,
- std::string* output) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- "Document {\n document_id: %d\n corpus_id: %d\n uri: "
- "'%s'\n score: %d\n created_timestamp_ms: %" PRIu64 "\n",
- static_cast<int>(document_id), doc.corpus_id(),
- doc.uri().c_str(), static_cast<int>(doc.score()),
- static_cast<int64_t>(doc.created_timestamp_ms())));
- for (const auto& section : doc.sections()) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " section {\n id: %d\n indexed_length: "
- "%d\n content: '%s'\n snippet: '%s'\n",
- static_cast<int>(section.id()),
- static_cast<int>(section.indexed_length()),
- section.content().c_str(), section.snippet().c_str()));
- for (int64_t extracted_number : section.extracted_numbers()) {
- absl_ports::StrAppend(output, IcingStringUtil::StringPrintf(
- " extracted_numbers: %" PRId64 "\n",
- extracted_number));
- }
- for (const std::string& annotation_token : section.annotation_tokens()) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(" annotation_tokens: '%s'\n",
- annotation_token.c_str()));
- }
- std::string indexed = (section.config().indexed()) ? "true" : "false";
- std::string index_prefixes =
- (section.config().index_prefixes()) ? "true" : "false";
- absl_ports::StrAppend(
- output,
- IcingStringUtil::StringPrintf(
- " config {\n name: '%s'\n indexed: %s\n "
- "tokenizer: %d\n weight: %d\n index_prefixes: %s\n "
- "subsection_separator: '%s'\n",
- section.config().name().c_str(), indexed.c_str(),
- section.config().tokenizer(),
- static_cast<int>(section.config().weight()), index_prefixes.c_str(),
- section.config().subsection_separator().c_str()));
- for (const auto& variant_generator :
- section.config().variant_generators()) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " variant_generators: %d\n", variant_generator));
- }
- absl_ports::StrAppend(
- output,
- IcingStringUtil::StringPrintf(
- " common_term_legacy_hit_score: %d\n "
- "rfc822_host_name_term_legacy_hit_score: %d\n "
- "semantic_property: '%s'\n universal_section_id: %d\n "
- "omnibox_section_type: %d\n st_section_type: %d\n }\n }\n",
- section.config().common_term_legacy_hit_score(),
- section.config().rfc822_host_name_term_legacy_hit_score(),
- section.config().semantic_property().c_str(),
- section.config().universal_section_id(),
- section.config().omnibox_section_type(),
- section.config().st_section_type()));
- }
- for (const auto& language : doc.languages()) {
- std::string used_classifier =
- (language.used_classifier()) ? "true" : "false";
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " languages {\n language: %d\n score: %d\n "
- "used_classifier: %s\n }\n",
- language.language(), static_cast<int>(language.score()),
- used_classifier.c_str()));
- }
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " ANNOTATIONS PRINTING NOT IMPLEMENTED YET IN ICING-TOOL\n"));
-}
-
-} // namespace
-
-std::string GetDocumentStoreDump(const DocumentStore& document_store) {
- std::string output;
- for (DocId document_id = 0; document_id < document_store.num_documents();
- document_id++) {
- Document doc;
- if (!document_store.ReadDocument(document_id, &doc)) {
- ICING_LOG(FATAL) << "Failed to read document";
- }
-
- AppendDocumentProto(document_id, doc, &output);
- }
- return output;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tools/document-store-dump.h b/icing/tools/document-store-dump.h
deleted file mode 100644
index 023b301..0000000
--- a/icing/tools/document-store-dump.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
-#define ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
-
-#include <string>
-
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
-
-namespace icing {
-namespace lib {
-
-// Utility function for dumping the complete document store content.
-// This provides a human-readable representation of the document store, mainly
-// provided for easier understandability for developers.
-// The output of this class should only be available on cmdline-tool-level
-// (with root access), or unit tests. In other words it should not be possible
-// to trigger this on a release key device, for data protection reasons.
-std::string GetDocumentStoreDump(const DocumentStore& document_store);
-
-} // namespace lib
-} // namespace icing
-#endif // ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
diff --git a/icing/tools/icing-tool.cc b/icing/tools/icing-tool.cc
deleted file mode 100644
index 72a11e9..0000000
--- a/icing/tools/icing-tool.cc
+++ /dev/null
@@ -1,306 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Copyright 2012 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-//
-// A tool to debug the native index.
-
-#include <getopt.h>
-#include <unistd.h>
-
-#include <string>
-
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/core/string-util.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/doc-property-filter.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/dynamic-trie.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/filesystem.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/mobstore.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/native-index-impl.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/tools/document-store-dump.h"
-#include "icing/util/logging.h"
-
-using std::vector;
-using ::wireless_android_play_playlog::icing::IndexRestorationStats;
-
-namespace icing {
-namespace lib {
-
-// 256KB for debugging.
-const size_t kMaxDocumentSizeForDebugging = 1u << 18;
-// Dump dynamic trie stats and contents.
-void ProcessDynamicTrie(const char* filename) {
- Filesystem filesystem;
- DynamicTrie trie(filename, DynamicTrie::RuntimeOptions(), &filesystem);
- if (!trie.Init()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Opening trie %s failed",
- filename);
- return;
- }
-
- std::string out;
- trie.GetDebugInfo(true, &out);
- printf("Stats:\n%s", out.c_str());
-
- std::ostringstream contents;
- vector<std::string> keys;
- trie.DumpTrie(&contents, &keys);
- printf("Contents:\n%s", contents.str().c_str());
-}
-
-NativeIndexImpl* MakeIndex(const char* root_dir) {
- NativeConfig native_config;
- native_config.set_max_document_size(kMaxDocumentSizeForDebugging);
- FlashIndexOptions flash_index_options(
- NativeIndexImpl::GetNativeIndexDir(root_dir));
- NativeIndexImpl* ni =
- new NativeIndexImpl(root_dir, native_config, flash_index_options);
- InitStatus init_status;
- if (!ni->Init(&init_status)) {
- ICING_LOG(FATAL) << "Failed to initialize legacy native index impl";
- }
-
- IndexRestorationStats unused;
- ni->RestoreIndex(IndexRequestSpec::default_instance(), &unused);
- return ni;
-}
-
-void RunQuery(NativeIndexImpl* ni, const std::string& query, int start,
- int num_results) {
- // Pull out corpusids and uris.
- QueryRequestSpec spec;
- spec.set_no_corpus_filter(true);
- spec.set_want_uris(true);
- spec.set_scoring_verbosity_level(1);
- spec.set_prefix_match(true);
-
- QueryResponse response;
- ni->ExecuteQuery(query, spec, 10000, start, num_results, &response);
-
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Query [%s] num results %u", query.c_str(), response.num_results());
-
- for (int i = 0, uri_offset = 0; i < response.num_results(); i++) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "%d: (cid=%u) uri %.*s", i, response.corpus_ids(i),
- response.uri_lengths(i), response.uri_buffer().data() + uri_offset);
- uri_offset += response.uri_lengths(i);
- }
-}
-
-void RunSuggest(NativeIndexImpl* ni, const std::string& prefix,
- int num_results) {
- SuggestionResponse results;
- ni->Suggest(prefix, num_results, vector<CorpusId>(), &results);
-
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Query [%s] num results %zu", prefix.c_str(),
- static_cast<size_t>(results.suggestions_size()));
-
- for (size_t i = 0; i < results.suggestions_size(); i++) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Sugg: [%s] display text [%s]", results.suggestions(i).query().c_str(),
- results.suggestions(i).display_text().c_str());
- }
-}
-
-int IcingTool(int argc, char** argv) {
- auto file_storage = CreatePosixFileStorage();
- enum Options {
- OPT_FILENAME,
- OPT_OP,
- OPT_QUERY,
- NUM_OPT,
- };
- static const option kOptions[NUM_OPT + 1] = {
- {"filename", 1, nullptr, 0},
- {"op", 1, nullptr, 0},
- {"query", 1, nullptr, 0},
- {nullptr, 0, nullptr, 0},
- };
- const char* opt_values[NUM_OPT];
- memset(opt_values, 0, sizeof(opt_values));
-
- while (true) {
- int opt_idx = -1;
- int ret = getopt_long(argc, argv, "", kOptions, &opt_idx);
- if (ret != 0) break;
-
- if (opt_idx >= 0 && opt_idx < NUM_OPT) {
- opt_values[opt_idx] = optarg;
- }
- }
-
- if (!opt_values[OPT_OP]) {
- ICING_LOG(ERROR) << "No op specified";
- return -1;
- }
-
- if (!opt_values[OPT_FILENAME]) {
- ICING_LOG(ERROR) << "No filename specified";
- return -1;
- }
- if (!strncmp(
- opt_values[OPT_FILENAME],
- "/data/data/com.google.android.gms/files/AppDataSearch",
- strlen("/data/data/com.google.android.gms/files/AppDataSearch"))) {
- ICING_LOG(ERROR)
- << "Should not read directly from the file in gmscore - "
- "icing-tool also commits writes as side-effects which corrupts "
- "the index on concurrent modification";
- return -1;
- }
-
- const char* op = opt_values[OPT_OP];
- DocumentStore::Options options(file_storage.get(),
- kMaxDocumentSizeForDebugging);
- if (!strcmp(op, "dyntrie")) {
- std::string full_file_path =
- absl_ports::StrCat(opt_values[OPT_FILENAME], "/idx.lexicon");
- ProcessDynamicTrie(full_file_path.c_str());
- } else if (!strcmp(op, "verify")) {
- std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
- ni->CheckVerify();
- } else if (!strcmp(op, "query")) {
- if (opt_values[OPT_QUERY] == nullptr) {
- ICING_LOG(FATAL) << "Opt value is null";
- }
-
- std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
- RunQuery(ni.get(), opt_values[OPT_QUERY], 0, 100);
- } else if (!strcmp(op, "suggest")) {
- if (opt_values[OPT_QUERY] == nullptr) {
- ICING_LOG(FATAL) << "Opt value is null";
- }
-
- std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
- RunSuggest(ni.get(), opt_values[OPT_QUERY], 100);
- } else if (!strcmp(op, "dump-all-docs")) {
- DocumentStore ds(opt_values[OPT_FILENAME], options);
- if (!ds.Init()) {
- ICING_LOG(FATAL) << "Legacy document store failed to initialize";
- }
-
- printf(
- "------ Document Store Dump Start ------\n"
- "%s\n"
- "------ Document Store Dump End ------\n",
- GetDocumentStoreDump(ds).c_str());
- } else if (!strcmp(op, "dump-uris")) {
- CorpusId corpus_id = kInvalidCorpusId;
- if (opt_values[OPT_QUERY]) {
- // Query is corpus id.
- corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT
- }
- DocumentStore ds(opt_values[OPT_FILENAME], options);
- if (!ds.Init()) {
- ICING_LOG(FATAL) << "Legacy document store failed to initialize";
- }
-
- DocPropertyFilter dpf;
- ds.AddDeletedTagFilter(&dpf);
-
- // Dump with format "<corpusid> <uri> <tagname>*".
- int filtered = 0;
- vector<std::string> tagnames;
- for (DocId document_id = 0; document_id < ds.num_documents();
- document_id++) {
- Document doc;
- if (!ds.ReadDocument(document_id, &doc)) {
- ICING_LOG(FATAL) << "Failed to read document.";
- }
-
- if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
- filtered++;
- continue;
- }
- if (dpf.Match(0, document_id)) {
- filtered++;
- continue;
- }
-
- tagnames.clear();
- ds.GetAllSetUserTagNames(document_id, &tagnames);
-
- printf("%d %s %s\n", doc.corpus_id(), doc.uri().c_str(),
- StringUtil::JoinStrings("/", tagnames).c_str());
- }
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Processed %u filtered %d", ds.num_documents(), filtered);
- } else if (!strcmp(op, "dump-docs")) {
- std::string out_filename = opt_values[OPT_FILENAME];
- out_filename.append("/docs-dump");
- CorpusId corpus_id = kInvalidCorpusId;
- if (opt_values[OPT_QUERY]) {
- // Query is corpus id.
- corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT
- out_filename.push_back('.');
- out_filename.append(opt_values[OPT_QUERY]);
- }
- DocumentStore ds(opt_values[OPT_FILENAME], options);
- if (!ds.Init()) {
- ICING_LOG(FATAL) << "Legacy document store failed to initialize";
- }
-
- DocPropertyFilter dpf;
- ds.AddDeletedTagFilter(&dpf);
-
- // Dump with format (<32-bit length><serialized content>)*.
- FILE* fp = fopen(out_filename.c_str(), "w");
- int filtered = 0;
- for (DocId document_id = 0; document_id < ds.num_documents();
- document_id++) {
- Document doc;
- if (!ds.ReadDocument(document_id, &doc)) {
- ICING_LOG(FATAL) << "Failed to read document.";
- }
-
- if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
- filtered++;
- continue;
- }
- if (dpf.Match(0, document_id)) {
- filtered++;
- continue;
- }
-
- std::string serialized = doc.SerializeAsString();
- uint32_t length = serialized.size();
- if (fwrite(&length, 1, sizeof(length), fp) != sizeof(length)) {
- ICING_LOG(FATAL) << "Failed to write length information to file";
- }
-
- if (fwrite(serialized.data(), 1, serialized.size(), fp) !=
- serialized.size()) {
- ICING_LOG(FATAL) << "Failed to write document to file";
- }
- }
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Processed %u filtered %d", ds.num_documents(), filtered);
- fclose(fp);
- } else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unknown op %s", op);
- return -1;
- }
-
- return 0;
-}
-
-} // namespace lib
-} // namespace icing
-
-int main(int argc, char** argv) { return icing::lib::IcingTool(argc, argv); }
diff --git a/icing/util/bit-util.h b/icing/util/bit-util.h
index e2bb817..7ca20b4 100644
--- a/icing/util/bit-util.h
+++ b/icing/util/bit-util.h
@@ -24,19 +24,18 @@
// Manipulating bit fields.
//
-// x value containing the bit field(s)
-// offset offset of bit field in x
-// len len of bit field in x
+// value value containing the bit field(s)
+// lsb_offset offset of bit field in value, starting from the least significant
+// bit. For example, the '1' in '0100' has an lsb_offset of 2
+// len len of bit field in value
//
// REQUIREMENTS
//
-// - x an unsigned integer <= 64 bits
-// - offset + len <= sizeof(x) * 8
+// - value is an unsigned integer <= 64 bits
+// - lsb_offset + len <= sizeof(value) * 8
//
// There is no error checking so you will get garbage if you don't
// ensure the above.
-//
-// To set a value, use BITFIELD_CLEAR then BITFIELD_OR.
// Shifting by more than the word length is undefined (on ARM it has the
// intended effect, but on Intel it shifts by % word length), so check the
@@ -44,20 +43,65 @@
inline uint64_t BitfieldMask(uint32_t len) {
return ((len == 0) ? 0U : ((~uint64_t{0}) >> (64 - (len))));
}
-inline uint64_t BitfieldGet(uint64_t mask, uint32_t lsb_offset, uint32_t len) {
- return ((mask) >> (lsb_offset)) & BitfieldMask(len);
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint8_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
}
-inline void BitfieldSet(uint32_t value, uint32_t lsb_offset, uint32_t len,
- uint32_t* mask) {
- // We conservatively mask val at len so x won't be corrupted if val >=
- // 1 << len.
- *mask |= (uint64_t{value} & BitfieldMask(len)) << (lsb_offset);
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint16_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
}
-inline void BitfieldSet(uint64_t value, uint32_t lsb_offset, uint32_t len,
- uint64_t* mask) {
- // We conservatively mask val at len so x won't be corrupted if val >=
- // 1 << len.
- *mask |= (value & BitfieldMask(len)) << (lsb_offset);
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint32_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint64_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline uint64_t BitfieldGet(uint64_t value, uint32_t lsb_offset, uint32_t len) {
+ return ((value) >> (lsb_offset)) & BitfieldMask(len);
+}
+
+inline void BitfieldSet(uint8_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint8_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
+}
+
+inline void BitfieldSet(uint16_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint16_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
+}
+
+inline void BitfieldSet(uint32_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint32_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
+}
+
+inline void BitfieldSet(uint64_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint64_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
}
} // namespace bit_util
diff --git a/icing/util/bit-util_test.cc b/icing/util/bit-util_test.cc
new file mode 100644
index 0000000..3b86a21
--- /dev/null
+++ b/icing/util/bit-util_test.cc
@@ -0,0 +1,145 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/bit-util.h"
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(BitUtilTest, BitfieldMask) {
+ // Check that we can handle up to uint8_t's
+ EXPECT_THAT(bit_util::BitfieldMask(/*len=*/0), Eq(0b0));
+ EXPECT_THAT(bit_util::BitfieldMask(/*len=*/1), Eq(0b01));
+
+ // Check that we can handle masks wider than a uint8_t (16 bits)
+ EXPECT_THAT(bit_util::BitfieldMask(/*len=*/16), Eq(0b01111111111111111));
+
+ // Check that we can handle up to uint64_t's
+ EXPECT_THAT(
+ bit_util::BitfieldMask(/*len=*/63),
+ Eq(0b0111111111111111111111111111111111111111111111111111111111111111));
+}
+
+TEST(BitUtilTest, BitfieldClear) {
+ // Check that we can handle up to uint8_t's
+ uint8_t value_8 = 0b0;
+ bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0));
+
+ value_8 = 0b01;
+ bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b00));
+
+ value_8 = 0b011;
+ bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b001));
+
+ value_8 = 0b011;
+ bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b000));
+
+ value_8 = 0b0110;
+ bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0000));
+
+ // Check that we can handle up to uint32_t's
+ uint32_t value_32 = 0b010000000000000000000000;
+ bit_util::BitfieldClear(/*lsb_offset=*/22, /*len=*/1, &value_32);
+ EXPECT_THAT(value_32, Eq(0b0));
+
+ // Check that we can handle up to uint64_t's
+ uint64_t value_64 = 0b0100000000000000000000000000000000000;
+ bit_util::BitfieldClear(/*lsb_offset=*/35, /*len=*/1, &value_64);
+ EXPECT_THAT(value_64, Eq(0b0));
+}
+
+TEST(BitUtilTest, BitfieldGet) {
+ // Get something in the uint8_t range
+ EXPECT_THAT(bit_util::BitfieldGet(0b0, /*lsb_offset=*/0, /*len=*/1), Eq(0b0));
+ EXPECT_THAT(bit_util::BitfieldGet(0b01, /*lsb_offset=*/0, /*len=*/1),
+ Eq(0b01));
+ EXPECT_THAT(bit_util::BitfieldGet(0b010, /*lsb_offset=*/1, /*len=*/1),
+ Eq(0b01));
+ EXPECT_THAT(bit_util::BitfieldGet(0b001, /*lsb_offset=*/1, /*len=*/1),
+ Eq(0b0));
+ EXPECT_THAT(bit_util::BitfieldGet(0b011, /*lsb_offset=*/0, /*len=*/2),
+ Eq(0b011));
+ EXPECT_THAT(bit_util::BitfieldGet(0b0110, /*lsb_offset=*/1, /*len=*/2),
+ Eq(0b011));
+ EXPECT_THAT(bit_util::BitfieldGet(0b0101, /*lsb_offset=*/0, /*len=*/3),
+ Eq(0b0101));
+
+ // Get something in the uint32_t range
+ EXPECT_THAT(
+ bit_util::BitfieldGet(0b01000000000000, /*lsb_offset=*/12, /*len=*/1),
+ Eq(0b01));
+
+ // Get something in the uint64_t range
+ EXPECT_THAT(bit_util::BitfieldGet(0b010000000000000000000000000000000000,
+ /*lsb_offset=*/34, /*len=*/1),
+ Eq(0b01));
+}
+
+TEST(BitUtilTest, BitfieldSet) {
+ // Set something in the uint8_t range
+ uint8_t value_8 = 0b0;
+ bit_util::BitfieldSet(0b0, /*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0));
+
+ value_8 = 0b01;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b01));
+
+ value_8 = 0b00;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b01));
+
+ value_8 = 0b00;
+ bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b011));
+
+ value_8 = 0b01;
+ bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b011));
+
+ value_8 = 0b01;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/1, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b011));
+
+ value_8 = 0b0001;
+ bit_util::BitfieldSet(0b011, /*lsb_offset=*/1, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0111));
+
+ // Set something in the uint32_t range
+ uint32_t value_32 = 0b0;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/16, /*len=*/1, &value_32);
+ EXPECT_THAT(value_32, Eq(0b010000000000000000));
+
+ // Set something in the uint64_t range
+ uint64_t value_64 = 0b0;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/34, /*len=*/1, &value_64);
+ EXPECT_THAT(value_64, Eq(0b010000000000000000000000000000000000));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/java/Android.bp b/java/Android.bp
index 7daeb0a..ef417ba 100644
--- a/java/Android.bp
+++ b/java/Android.bp
@@ -25,9 +25,12 @@
name: "libicing-java",
srcs: ["src/**/*.java"],
static_libs: [
- "androidx.annotation_annotation",
"icing-java-proto-lite",
"libprotobuf-java-lite",
],
+ libs: [
+ "androidx.annotation_annotation",
+ ],
+ sdk_version: "current",
apex_available: ["com.android.appsearch"],
}
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 6f5faa0..4069810 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=373174102)
+set(synced_AOSP_CL_number=375495869)