Port over Icing c++ code from upstream
Change-Id: Ia3981fed7e0e70589efc027d4123f306cdfbe990
diff --git a/METADATA b/METADATA
index 2af1aa1..d350608 100644
--- a/METADATA
+++ b/METADATA
@@ -12,6 +12,6 @@
type: PIPER
value: "http://google3/third_party/icing/"
}
- last_upgrade_date { year: 2019 month: 11 day: 25 }
+ last_upgrade_date { year: 2019 month: 12 day: 20 }
license_type: NOTICE
}
diff --git a/icing/absl_ports/annotate.cc b/icing/absl_ports/annotate.cc
new file mode 100644
index 0000000..f73c432
--- /dev/null
+++ b/icing/absl_ports/annotate.cc
@@ -0,0 +1,43 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/absl_ports/annotate.h"
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+namespace {
+constexpr std::string_view kErrorSeparator = ";";
+} // namespace
+
+libtextclassifier3::Status Annotate(const libtextclassifier3::Status& s,
+ std::string_view msg) {
+ if (s.ok() || msg.empty()) {
+ return s;
+ }
+
+ std::string new_msg =
+ (!s.error_message().empty())
+ ? absl_ports::StrCat(s.error_message(), kErrorSeparator, msg)
+ : std::string(msg);
+ return libtextclassifier3::Status(s.CanonicalCode(), new_msg);
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
diff --git a/icing/absl_ports/annotate.h b/icing/absl_ports/annotate.h
new file mode 100644
index 0000000..81adce0
--- /dev/null
+++ b/icing/absl_ports/annotate.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_ANNOTATE_H_
+#define ICING_ABSL_PORTS_ANNOTATE_H_
+
+#include <string_view>
+
+#include "utils/base/status.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+// Returns a Status that is identical to `s` except that the error_message()
+// has been augmented by adding `msg` to the end of the original error message.
+//
+// Annotate should be used to add higher-level information to a Status. E.g.,
+//
+// libtextclassifier3::Status s = file::GetContents(...);
+// if (!s.ok()) {
+// return Annotate(s, "loading blacklist");
+// }
+//
+// Annotate() adds the appropriate separators, so callers should not include a
+// separator in `msg`. The exact formatting is subject to change, so you should
+// not depend on it in your tests.
+//
+// OK status values have no error message and therefore if `s` is OK, the result
+// is unchanged.
+libtextclassifier3::Status Annotate(const libtextclassifier3::Status& s,
+ std::string_view msg);
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_ABSL_PORTS_ANNOTATE_H_
diff --git a/icing/absl_ports/canonical_errors.cc b/icing/absl_ports/canonical_errors.cc
new file mode 100644
index 0000000..03b2c61
--- /dev/null
+++ b/icing/absl_ports/canonical_errors.cc
@@ -0,0 +1,176 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/absl_ports/canonical_errors.h"
+
+#include "utils/base/status.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+libtextclassifier3::Status CancelledError(std::string_view error_message) {
+ return libtextclassifier3::Status(libtextclassifier3::StatusCode::CANCELLED,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status UnknownError(std::string_view error_message) {
+ return libtextclassifier3::Status(libtextclassifier3::StatusCode::UNKNOWN,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status InvalidArgumentError(
+ std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status DeadlineExceededError(
+ std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::DEADLINE_EXCEEDED,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status NotFoundError(std::string_view error_message) {
+ return libtextclassifier3::Status(libtextclassifier3::StatusCode::NOT_FOUND,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status AlreadyExistsError(std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::ALREADY_EXISTS,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status PermissionDeniedError(
+ std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::PERMISSION_DENIED,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status ResourceExhaustedError(
+ std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status FailedPreconditionError(
+ std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status AbortedError(std::string_view error_message) {
+ return libtextclassifier3::Status(libtextclassifier3::StatusCode::ABORTED,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status OutOfRangeError(std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::OUT_OF_RANGE, std::string(error_message));
+}
+
+libtextclassifier3::Status UnimplementedError(std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::UNIMPLEMENTED,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status InternalError(std::string_view error_message) {
+ return libtextclassifier3::Status(libtextclassifier3::StatusCode::INTERNAL,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status UnavailableError(std::string_view error_message) {
+ return libtextclassifier3::Status(libtextclassifier3::StatusCode::UNAVAILABLE,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status DataLossError(std::string_view error_message) {
+ return libtextclassifier3::Status(libtextclassifier3::StatusCode::DATA_LOSS,
+ std::string(error_message));
+}
+
+libtextclassifier3::Status UnauthenticatedError(
+ std::string_view error_message) {
+ return libtextclassifier3::Status(
+ libtextclassifier3::StatusCode::UNAUTHENTICATED,
+ std::string(error_message));
+}
+
+bool IsCancelled(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::CANCELLED;
+}
+bool IsUnknown(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::UNKNOWN;
+}
+bool IsInvalidArgument(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::INVALID_ARGUMENT;
+}
+bool IsDeadlineExceeded(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::DEADLINE_EXCEEDED;
+}
+bool IsNotFound(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::NOT_FOUND;
+}
+bool IsAlreadyExists(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::ALREADY_EXISTS;
+}
+bool IsPermissionDenied(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::PERMISSION_DENIED;
+}
+bool IsResourceExhausted(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED;
+}
+bool IsFailedPrecondition(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::FAILED_PRECONDITION;
+}
+bool IsAborted(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::ABORTED;
+}
+bool IsOutOfRange(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::OUT_OF_RANGE;
+}
+bool IsUnimplemented(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::UNIMPLEMENTED;
+}
+bool IsInternal(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::INTERNAL;
+}
+bool IsUnavailable(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::UNAVAILABLE;
+}
+bool IsDataLoss(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() == libtextclassifier3::StatusCode::DATA_LOSS;
+}
+bool IsUnauthenticated(const libtextclassifier3::Status& status) {
+ return status.CanonicalCode() ==
+ libtextclassifier3::StatusCode::UNAUTHENTICATED;
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
diff --git a/icing/absl_ports/canonical_errors.h b/icing/absl_ports/canonical_errors.h
new file mode 100644
index 0000000..c2d7784
--- /dev/null
+++ b/icing/absl_ports/canonical_errors.h
@@ -0,0 +1,68 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_CANONICAL_ERRORS_H_
+#define ICING_ABSL_PORTS_CANONICAL_ERRORS_H_
+
+#include <string_view>
+
+#include "utils/base/status.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+libtextclassifier3::Status CancelledError(std::string_view error_message);
+libtextclassifier3::Status UnknownError(std::string_view error_message);
+libtextclassifier3::Status InvalidArgumentError(std::string_view error_message);
+libtextclassifier3::Status DeadlineExceededError(
+ std::string_view error_message);
+libtextclassifier3::Status NotFoundError(std::string_view error_message);
+libtextclassifier3::Status AlreadyExistsError(std::string_view error_message);
+libtextclassifier3::Status PermissionDeniedError(
+ std::string_view error_message);
+libtextclassifier3::Status ResourceExhaustedError(
+ std::string_view error_message);
+libtextclassifier3::Status FailedPreconditionError(
+ std::string_view error_message);
+libtextclassifier3::Status AbortedError(std::string_view error_message);
+libtextclassifier3::Status OutOfRangeError(std::string_view error_message);
+libtextclassifier3::Status UnimplementedError(std::string_view error_message);
+libtextclassifier3::Status InternalError(std::string_view error_message);
+libtextclassifier3::Status UnavailableError(std::string_view error_message);
+libtextclassifier3::Status DataLossError(std::string_view error_message);
+libtextclassifier3::Status UnauthenticatedError(std::string_view error_message);
+
+bool IsCancelled(const libtextclassifier3::Status& status);
+bool IsUnknown(const libtextclassifier3::Status& status);
+bool IsInvalidArgument(const libtextclassifier3::Status& status);
+bool IsDeadlineExceeded(const libtextclassifier3::Status& status);
+bool IsNotFound(const libtextclassifier3::Status& status);
+bool IsAlreadyExists(const libtextclassifier3::Status& status);
+bool IsPermissionDenied(const libtextclassifier3::Status& status);
+bool IsResourceExhausted(const libtextclassifier3::Status& status);
+bool IsFailedPrecondition(const libtextclassifier3::Status& status);
+bool IsAborted(const libtextclassifier3::Status& status);
+bool IsOutOfRange(const libtextclassifier3::Status& status);
+bool IsUnimplemented(const libtextclassifier3::Status& status);
+bool IsInternal(const libtextclassifier3::Status& status);
+bool IsUnavailable(const libtextclassifier3::Status& status);
+bool IsDataLoss(const libtextclassifier3::Status& status);
+bool IsUnauthenticated(const libtextclassifier3::Status& status);
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_ABSL_PORTS_CANONICAL_ERRORS_H_
diff --git a/icing/absl_ports/mutex.h b/icing/absl_ports/mutex.h
new file mode 100644
index 0000000..c49b1e1
--- /dev/null
+++ b/icing/absl_ports/mutex.h
@@ -0,0 +1,73 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_MUTEX_H_
+#define ICING_ABSL_PORTS_MUTEX_H_
+
+#include <mutex> // NOLINT
+#include <shared_mutex> // NOLINT
+
+#include "icing/absl_ports/thread_annotations.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+// Simple wrapper around std::shared_mutex with annotations to allow thread
+// annotation checks.
+class LOCKABLE shared_mutex {
+ public:
+ void lock() EXCLUSIVE_LOCK_FUNCTION() { m_.lock(); }
+ bool try_lock() EXCLUSIVE_TRYLOCK_FUNCTION(true) { return m_.try_lock(); }
+ void unlock() UNLOCK_FUNCTION() { m_.unlock(); }
+
+ void lock_shared() SHARED_LOCK_FUNCTION() { m_.lock_shared(); }
+ bool try_lock_shared() SHARED_TRYLOCK_FUNCTION(true) {
+ return m_.try_lock_shared();
+ }
+ void unlock_shared() UNLOCK_FUNCTION() { m_.unlock_shared(); }
+
+ private:
+ std::shared_mutex m_;
+};
+
+// Simple wrapper around std::unique_lock with annotations to allow thread
+// annotation checks.
+class SCOPED_LOCKABLE unique_lock {
+ public:
+ explicit unique_lock(shared_mutex* mu) EXCLUSIVE_LOCK_FUNCTION(mu)
+ : lock_(*mu) {}
+ ~unique_lock() UNLOCK_FUNCTION() = default;
+
+ private:
+ std::unique_lock<shared_mutex> lock_;
+};
+
+// Simple wrapper around std::shared_lock with annotations to allow thread
+// annotation checks.
+class SCOPED_LOCKABLE shared_lock {
+ public:
+ explicit shared_lock(shared_mutex* mu) SHARED_LOCK_FUNCTION(mu)
+ : lock_(*mu) {}
+ ~shared_lock() UNLOCK_FUNCTION() = default;
+
+ private:
+ std::shared_lock<shared_mutex> lock_;
+};
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_ABSL_PORTS_MUTEX_H_
diff --git a/icing/absl_ports/status_imports.h b/icing/absl_ports/status_imports.h
new file mode 100644
index 0000000..fe4b6d9
--- /dev/null
+++ b/icing/absl_ports/status_imports.h
@@ -0,0 +1,32 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_STATUS_IMPORTS_H_
+#define ICING_ABSL_PORTS_STATUS_IMPORTS_H_
+
+#include "utils/base/status.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+// TODO(b/144458732) Delete this file once visibility on TC3 Status has been
+// granted to the sample app.
+using Status = libtextclassifier3::Status;
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_ABSL_PORTS_STATUS_IMPORTS_H_
diff --git a/icing/absl_ports/status_macros.h b/icing/absl_ports/status_macros.h
new file mode 100644
index 0000000..44cffdd
--- /dev/null
+++ b/icing/absl_ports/status_macros.h
@@ -0,0 +1,117 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_STATUS_MACROS_H_
+#define ICING_ABSL_PORTS_STATUS_MACROS_H_
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+// TODO(b/144458732): Move the fixes included in this file over to TC Status and
+// remove this file.
+class StatusAdapter {
+ public:
+ explicit StatusAdapter(const libtextclassifier3::Status& s) : s_(s) {}
+ explicit StatusAdapter(libtextclassifier3::Status&& s) : s_(std::move(s)) {}
+ template <typename T>
+ explicit StatusAdapter(const libtextclassifier3::StatusOr<T>& s)
+ : s_(s.status()) {}
+ template <typename T>
+ explicit StatusAdapter(libtextclassifier3::StatusOr<T>&& s)
+ : s_(std::move(s).status()) {}
+
+ bool ok() const { return s_.ok(); }
+ explicit operator bool() const { return ok(); }
+
+ const libtextclassifier3::Status& status() const& { return s_; }
+ libtextclassifier3::Status status() && { return std::move(s_); }
+
+ private:
+ libtextclassifier3::Status s_;
+};
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+// Evaluates an expression that produces a `libtextclassifier3::Status`. If the
+// status is not ok, returns it from the current function.
+//
+// For example:
+// libtextclassifier3::Status MultiStepFunction() {
+// ICING_RETURN_IF_ERROR(Function(args...));
+// ICING_RETURN_IF_ERROR(foo.Method(args...));
+// return libtextclassifier3::Status();
+// }
+#define ICING_RETURN_IF_ERROR(expr) ICING_RETURN_IF_ERROR_IMPL(expr)
+#define ICING_RETURN_IF_ERROR_IMPL(expr) \
+ ICING_STATUS_MACROS_IMPL_ELSE_BLOCKER_ \
+ if (::icing::lib::absl_ports::StatusAdapter adapter{expr}) { \
+ } else /* NOLINT */ \
+ return std::move(adapter).status()
+
+// The GNU compiler emits a warning for code like:
+//
+// if (foo)
+// if (bar) { } else baz;
+//
+// because it thinks you might want the else to bind to the first if. This
+// leads to problems with code like:
+//
+// if (do_expr) ICING_RETURN_IF_ERROR(expr);
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#define ICING_STATUS_MACROS_IMPL_ELSE_BLOCKER_ \
+ switch (0) \
+ case 0: \
+ default: // NOLINT
+
+#define ICING_STATUS_MACROS_CONCAT_NAME(x, y) \
+ ICING_STATUS_MACROS_CONCAT_IMPL(x, y)
+#define ICING_STATUS_MACROS_CONCAT_IMPL(x, y) x##y
+
+// Macros that help consume libtextclassifier3::StatusOr<...> return values and
+// propagate errors. TC_STRIP These macros are inspired by the nice practice
+// from Google3:
+// https://g3doc.corp.google.com/devtools/library_club/g3doc/totw/121.md?cl=head
+// TC_END_STRIP
+#define ICING_ASSIGN_OR_RETURN(lhs, rexpr) \
+ ICING_ASSIGN_OR_RETURN_IMPL( \
+ ICING_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, \
+ rexpr)
+
+#define ICING_ASSIGN_OR_RETURN_IMPL(statusor, lhs, rexpr) \
+ auto statusor = (rexpr); \
+ if (!statusor.ok()) { \
+ return statusor.status(); \
+ } \
+ lhs = std::move(statusor.ValueOrDie())
+
+#define ICING_ASSIGN_OR_RETURN_VAL(lhs, rexpr, val) \
+ ICING_ASSIGN_OR_RETURN_VAL_IMPL( \
+ ICING_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, \
+ rexpr, val)
+
+#define ICING_ASSIGN_OR_RETURN_VAL_IMPL(statusor, lhs, rexpr, val) \
+ auto statusor = (rexpr); \
+ if (!statusor.ok()) { \
+ return val; \
+ } \
+ lhs = std::move(statusor.ValueOrDie())
+
+#endif // ICING_ABSL_PORTS_STATUS_MACROS_H_
diff --git a/icing/absl_ports/str_cat.cc b/icing/absl_ports/str_cat.cc
new file mode 100644
index 0000000..2cf020d
--- /dev/null
+++ b/icing/absl_ports/str_cat.cc
@@ -0,0 +1,190 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+char* Append(char* out, std::string_view s) {
+ if (!s.empty()) {
+ memcpy(out, s.data(), s.length());
+ out += s.length();
+ }
+ return out;
+}
+
+std::string StrCat(std::string_view a, std::string_view b) {
+ std::string::size_type result_size = a.length() + b.length();
+ // Create result with enough room to fit all operands.
+ std::string result;
+ // __resize_default_init is provided by libc++ >= 8.0 and allows us to
+ // allocate room for the content we're about to copy while avoiding the
+ // unnecessary zero-initialization that the normal std::string::resize will
+ // perform.
+ //
+ // The current absl implementation copies a null char to the character at
+ // previous_size after the call to resize_default_init due to implementation
+ // differences between libstdc++ and libc++. That behavior is NOT copied over
+ // here because the following lines are just about to overwrite that character
+ // anyways.
+ result.__resize_default_init(result_size);
+
+ char* out = &result[0];
+ out = Append(out, a);
+ out = Append(out, b);
+ return result;
+}
+
+std::string StrCat(std::string_view a, std::string_view b, std::string_view c) {
+ std::string::size_type result_size = a.length() + b.length() + c.length();
+ // Create result with enough room to fit all operands.
+ std::string result;
+ // __resize_default_init is provided by libc++ >= 8.0 and allows us to
+ // allocate room for the content we're about to copy while avoiding the
+ // unnecessary zero-initialization that the normal std::string::resize will
+ // perform.
+ //
+ // The current absl implementation copies a null char to the character at
+ // previous_size after the call to resize_default_init due to implementation
+ // differences between libstdc++ and libc++. That behavior is NOT copied over
+ // here because the following lines are just about to overwrite that character
+ // anyways.
+ result.__resize_default_init(result_size);
+
+ char* out = &result[0];
+ out = Append(out, a);
+ out = Append(out, b);
+ out = Append(out, c);
+ return result;
+}
+
+std::string StrCat(std::string_view a, std::string_view b, std::string_view c,
+ std::string_view d) {
+ std::string::size_type result_size =
+ a.length() + b.length() + c.length() + d.length();
+ // Create result with enough room to fit all operands.
+ std::string result;
+ // __resize_default_init is provided by libc++ >= 8.0 and allows us to
+ // allocate room for the content we're about to copy while avoiding the
+ // unnecessary zero-initialization that the normal std::string::resize will
+ // perform.
+ //
+ // The current absl implementation copies a null char to the character at
+ // previous_size after the call to resize_default_init due to implementation
+ // differences between libstdc++ and libc++. That behavior is NOT copied over
+ // here because the following lines are just about to overwrite that character
+ // anyways.
+ result.__resize_default_init(result_size);
+
+ char* out = &result[0];
+ out = Append(out, a);
+ out = Append(out, b);
+ out = Append(out, c);
+ out = Append(out, d);
+ return result;
+}
+
+std::string StrCatPieces(std::vector<std::string_view> pieces) {
+ std::string::size_type result_size = 0;
+ for (std::string_view s : pieces) {
+ result_size += s.length();
+ }
+ // Create result with enough room to fit all operands.
+ std::string result;
+ // __resize_default_init is provided by libc++ >= 8.0 and allows us to
+ // allocate room for the content we're about to copy while avoiding the
+ // unnecessary zero-initialization that the normal std::string::resize will
+ // perform.
+ //
+ // The current absl implementation copies a null char to the character at
+ // previous_size after the call to resize_default_init due to implementation
+ // differences between libstdc++ and libc++. That behavior is NOT copied over
+ // here because the following lines are just about to overwrite that character
+ // anyways.
+ result.__resize_default_init(result_size);
+
+ char* out = &result[0];
+ for (std::string_view s : pieces) {
+ out = Append(out, s);
+ }
+ return result;
+}
+
+void StrAppend(std::string* dest, std::string_view a) {
+ std::string::size_type old_size = dest->size();
+ std::string::size_type new_size = old_size + a.length();
+ dest->__resize_default_init(new_size);
+
+ char* out = &(*dest)[old_size];
+ out = Append(out, a);
+}
+
+void StrAppend(std::string* dest, std::string_view a, std::string_view b) {
+ std::string::size_type old_size = dest->size();
+ std::string::size_type new_size = old_size + a.length() + b.length();
+ dest->__resize_default_init(new_size);
+
+ char* out = &(*dest)[old_size];
+ out = Append(out, a);
+ out = Append(out, b);
+}
+
+void StrAppend(std::string* dest, std::string_view a, std::string_view b,
+ std::string_view c) {
+ std::string::size_type old_size = dest->size();
+ std::string::size_type new_size =
+ old_size + a.length() + b.length() + c.length();
+ dest->__resize_default_init(new_size);
+
+ char* out = &(*dest)[old_size];
+ out = Append(out, a);
+ out = Append(out, b);
+ out = Append(out, c);
+}
+
+void StrAppend(std::string* dest, std::string_view a, std::string_view b,
+ std::string_view c, std::string_view d) {
+ std::string::size_type old_size = dest->size();
+ std::string::size_type new_size =
+ old_size + a.length() + b.length() + c.length() + d.length();
+ dest->__resize_default_init(new_size);
+
+ char* out = &(*dest)[old_size];
+ out = Append(out, a);
+ out = Append(out, b);
+ out = Append(out, c);
+ out = Append(out, d);
+}
+
+void StrAppendPieces(std::string* dest, std::vector<std::string_view> pieces) {
+ std::string::size_type old_size = dest->size();
+ std::string::size_type result_size = old_size;
+ for (std::string_view s : pieces) {
+ result_size += s.length();
+ }
+  // Resize the destination string (not a temporary) so the writes below
+  // stay in bounds; mirrors the other StrAppend overloads above.
+  dest->__resize_default_init(result_size);
+
+ char* out = &(*dest)[old_size];
+ for (std::string_view s : pieces) {
+ out = Append(out, s);
+ }
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
diff --git a/icing/absl_ports/str_cat.h b/icing/absl_ports/str_cat.h
new file mode 100644
index 0000000..b2dd63d
--- /dev/null
+++ b/icing/absl_ports/str_cat.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_STR_CAT_H_
+#define ICING_ABSL_PORTS_STR_CAT_H_
+
+#include <cstdarg>
+#include <cstring>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+// Appends the content of s to the char buffer starting at out and returns the
+// address of the first character after the content copied from s.
+// REQUIRES: out is large enough to hold all content from s.
+char* Append(char* out, std::string_view s);
+
+// A port of absl::StrCat.
+//
+// Merges given strings or numbers, using no delimiter(s), returning the merged
+// result as a string.
+//
+// Unlike absl::StrCat, this version only accepts string_views. For converting
+// numerics to strings, use StringPrintf.
+//
+// Separate implementations for 2-4 arguments are provided separately from the
+// variadic definition, just like absl does. This is a minor optimization to
+// avoid constructing a vector and copying all string_view params.
+std::string StrCat(std::string_view a, std::string_view b);
+std::string StrCat(std::string_view a, std::string_view b, std::string_view c);
+std::string StrCat(std::string_view a, std::string_view b, std::string_view c,
+ std::string_view d);
+
+std::string StrCatPieces(std::vector<std::string_view> pieces);
+
+template <typename... AV>
+std::string StrCat(const AV&... args) {
+ return StrCatPieces({static_cast<const std::string_view&>(args)...});
+}
+
+// A port of absl::StrAppend.
+//
+// Appends a string or set of strings to an existing string, in a similar
+// fashion to `StrCat()`.
+//
+// Unlike absl::StrAppend, this version only accepts string_views. For
+// converting numerics to strings, use StringPrintf.
+void StrAppend(std::string* dest, std::string_view a);
+void StrAppend(std::string* dest, std::string_view a, std::string_view b);
+void StrAppend(std::string* dest, std::string_view a, std::string_view b,
+ std::string_view c);
+void StrAppend(std::string* dest, std::string_view a, std::string_view b,
+ std::string_view c, std::string_view d);
+
+void StrAppendPieces(std::string* dest, std::vector<std::string_view> pieces);
+
+template <typename... AV>
+void StrAppend(std::string* dest, const AV&... args) {
+ StrAppendPieces(dest, {static_cast<const std::string_view&>(args)...});
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_ABSL_PORTS_STR_CAT_H_
diff --git a/icing/absl_ports/str_join.h b/icing/absl_ports/str_join.h
new file mode 100644
index 0000000..7c8936a
--- /dev/null
+++ b/icing/absl_ports/str_join.h
@@ -0,0 +1,111 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_STR_JOIN_H_
+#define ICING_ABSL_PORTS_STR_JOIN_H_
+
+#include <string>
+#include <string_view>
+
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+class DefaultFormatter {
+ public:
+ template <typename T>
+ std::string operator()(const T& element) {
+ return std::string(element);
+ }
+};
+
+class NumberFormatter {
+ public:
+ template <typename T>
+ std::string operator()(const T& number) {
+ return std::to_string(number);
+ }
+};
+
+// A port of absl::StrJoin.
+//
+// Joins a range of elements and returns the result as a std::string.
+// `StrJoin()` takes a range and a separator string to insert between the
+// joined elements.
+//
+// A Formatter may be supplied to convert the Iterator's elements to a
+// std::string.
+template <typename Iterator, typename Formatter>
+std::string StrJoin(Iterator first, Iterator last, std::string_view sep,
+ Formatter&& formatter) {
+ std::string::size_type result_size = 0;
+ bool add_separator_before_element = false;
+ for (Iterator current = first; current != last; ++current) {
+ if (add_separator_before_element) {
+ result_size += sep.length();
+ }
+
+ std::string formatted = formatter(*current);
+ result_size += formatted.length();
+
+ add_separator_before_element = true;
+ }
+ // Create result with enough room to fit all operands.
+ std::string result;
+ // __resize_default_init is provided by libc++ >= 8.0 and allows us to
+ // allocate room for the content we're about to copy while avoiding the
+ // unnecessary zero-initialization that the normal std::string::resize will
+ // perform.
+ //
+ // The current absl implementation copies a null char to the character at
+ // previous_size after the call to resize_default_init due to implementation
+ // differences between libstdc++ and libc++. That behavior is NOT copied over
+ // here because the following lines are just about to overwrite that character
+ // anyways.
+ result.__resize_default_init(result_size);
+
+ add_separator_before_element = false;
+ for (char* out = &result[0]; first != last; ++first) {
+ if (add_separator_before_element) {
+ out = Append(out, sep);
+ }
+
+ std::string formatted = formatter(*first);
+ out = Append(out, formatted);
+
+ add_separator_before_element = true;
+ }
+
+ return result;
+}
+
+template <typename Container, typename Formatter>
+std::string StrJoin(const Container& container, std::string_view sep,
+ Formatter&& formatter) {
+ return absl_ports::StrJoin(std::begin(container), std::end(container), sep,
+ formatter);
+}
+
+template <typename Container>
+std::string StrJoin(const Container& container, std::string_view sep) {
+ return absl_ports::StrJoin(container, sep, DefaultFormatter());
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_ABSL_PORTS_STR_JOIN_H_
diff --git a/icing/absl_ports/thread_annotations.h b/icing/absl_ports/thread_annotations.h
new file mode 100644
index 0000000..f5de7b7
--- /dev/null
+++ b/icing/absl_ports/thread_annotations.h
@@ -0,0 +1,208 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is a port of absl::thread_annotations.
+//
+// This header file contains macro definitions for thread safety annotations
+// that allow developers to document the locking policies of multi-threaded
+// code. The annotations can also help program analysis tools to identify
+// potential thread safety issues.
+//
+// These annotations are implemented using compiler attributes. Using the macros
+// defined here instead of raw attributes allow for portability and future
+// compatibility.
+//
+// When referring to mutexes in the arguments of the attributes, you should
+// use variable names or more complex expressions (e.g. my_object->mutex_)
+// that evaluate to a concrete mutex object whenever possible. If the mutex
+// you want to refer to is not in scope, you may use a member pointer
+// (e.g. &MyClass::mutex_) to refer to a mutex in some (unknown) object.
+
+#ifndef ICING_ABSL_PORTS_THREAD_ANNOTATIONS_H_
+#define ICING_ABSL_PORTS_THREAD_ANNOTATIONS_H_
+
+#if defined(__clang__)
+#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#else
+#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
+#endif // defined(__clang__)
+
+// GUARDED_BY()
+//
+// Documents if a shared field or global variable needs to be protected by a
+// mutex. GUARDED_BY() allows the user to specify a particular mutex that
+// should be held when accessing the annotated variable.
+//
+// Although this annotation (and PT_GUARDED_BY, below) cannot be applied to
+// local variables, a local variable and its associated mutex can often be
+// combined into a small class or struct, thereby allowing the annotation.
+//
+// Example:
+//
+// class Foo {
+// Mutex mu_;
+// int p1_ GUARDED_BY(mu_);
+// ...
+// };
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+
+// PT_GUARDED_BY()
+//
+// Documents if the memory location pointed to by a pointer should be guarded
+// by a mutex when dereferencing the pointer.
+//
+// Example:
+// class Foo {
+// Mutex mu_;
+// int *p1_ PT_GUARDED_BY(mu_);
+// ...
+// };
+//
+// Note that a pointer variable to a shared memory location could itself be a
+// shared variable.
+//
+// Example:
+//
+// // `q_`, guarded by `mu1_`, points to a shared memory location that is
+// // guarded by `mu2_`:
+// int *q_ GUARDED_BY(mu1_) PT_GUARDED_BY(mu2_);
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+
+// ACQUIRED_AFTER() / ACQUIRED_BEFORE()
+//
+// Documents the acquisition order between locks that can be held
+// simultaneously by a thread. For any two locks that need to be annotated
+// to establish an acquisition order, only one of them needs the annotation.
+// (i.e. You don't have to annotate both locks with both ACQUIRED_AFTER
+// and ACQUIRED_BEFORE.)
+//
+// As with GUARDED_BY, this is only applicable to mutexes that are shared
+// fields or global variables.
+//
+// Example:
+//
+// Mutex m1_;
+// Mutex m2_ ACQUIRED_AFTER(m1_);
+#define ACQUIRED_AFTER(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+
+#define ACQUIRED_BEFORE(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+
+// EXCLUSIVE_LOCKS_REQUIRED() / SHARED_LOCKS_REQUIRED()
+//
+// Documents a function that expects a mutex to be held prior to entry.
+// The mutex is expected to be held both on entry to, and exit from, the
+// function.
+//
+// An exclusive lock allows read-write access to the guarded data member(s), and
+// only one thread can acquire a lock exclusively at any one time. A shared lock
+// allows read-only access, and any number of threads can acquire a shared lock
+// concurrently.
+//
+// Generally, non-const methods should be annotated with
+// EXCLUSIVE_LOCKS_REQUIRED, while const methods should be annotated with
+// SHARED_LOCKS_REQUIRED.
+//
+// Example:
+//
+// Mutex mu1, mu2;
+// int a GUARDED_BY(mu1);
+// int b GUARDED_BY(mu2);
+//
+// void foo() EXCLUSIVE_LOCKS_REQUIRED(mu1, mu2) { ... }
+// void bar() const SHARED_LOCKS_REQUIRED(mu1, mu2) { ... }
+#define EXCLUSIVE_LOCKS_REQUIRED(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(exclusive_locks_required(__VA_ARGS__))
+
+#define SHARED_LOCKS_REQUIRED(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(shared_locks_required(__VA_ARGS__))
+
+// LOCKS_EXCLUDED()
+//
+// Documents the locks acquired in the body of the function. These locks
+// cannot be held when calling this function.
+#define LOCKS_EXCLUDED(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+
+// LOCK_RETURNED()
+//
+// Documents a function that returns a mutex without acquiring it. For example,
+// a public getter method that returns a pointer to a private mutex should
+// be annotated with LOCK_RETURNED.
+#define LOCK_RETURNED(x) \
+ THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+
+// LOCKABLE
+//
+// Documents if a class/type is a lockable type.
+#define LOCKABLE \
+ THREAD_ANNOTATION_ATTRIBUTE__(lockable)
+
+// SCOPED_LOCKABLE
+//
+// Documents if a class does RAII locking.
+// The constructor should use `LOCK_FUNCTION()` to specify the mutex that is
+// acquired, and the destructor should use `UNLOCK_FUNCTION()` with no
+// arguments; the analysis will assume that the destructor unlocks whatever the
+// constructor locked.
+#define SCOPED_LOCKABLE \
+ THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+
+// EXCLUSIVE_LOCK_FUNCTION()
+//
+// Documents functions that acquire a lock in the body of a function, and do
+// not release it.
+#define EXCLUSIVE_LOCK_FUNCTION(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock_function(__VA_ARGS__))
+
+// SHARED_LOCK_FUNCTION()
+//
+// Documents functions that acquire a shared (reader) lock in the body of a
+// function, and do not release it.
+#define SHARED_LOCK_FUNCTION(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(shared_lock_function(__VA_ARGS__))
+
+// UNLOCK_FUNCTION()
+//
+// Documents functions that expect a lock to be held on entry to the function,
+// and release it in the body of the function.
+#define UNLOCK_FUNCTION(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(unlock_function(__VA_ARGS__))
+
+// EXCLUSIVE_TRYLOCK_FUNCTION() / SHARED_TRYLOCK_FUNCTION()
+//
+// Documents functions that try to acquire a lock, and return success or failure
+// (or a non-boolean value that can be interpreted as a boolean).
+// The first argument should be `true` for functions that return `true` on
+// success, or `false` for functions that return `false` on success. The second
+// argument specifies the mutex that is locked on success. If unspecified, this
+// mutex is assumed to be `this`.
+#define EXCLUSIVE_TRYLOCK_FUNCTION(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock_function(__VA_ARGS__))
+
+#define SHARED_TRYLOCK_FUNCTION(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock_function(__VA_ARGS__))
+
+// ASSERT_EXCLUSIVE_LOCK() / ASSERT_SHARED_LOCK()
+//
+// Documents functions that dynamically check to see if a lock is held, and fail
+// if it is not held.
+#define ASSERT_EXCLUSIVE_LOCK(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(assert_exclusive_lock(__VA_ARGS__))
+
+#define ASSERT_SHARED_LOCK(...) \
+ THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_lock(__VA_ARGS__))
+
+#endif // ICING_ABSL_PORTS_THREAD_ANNOTATIONS_H_
diff --git a/icing/document-builder.h b/icing/document-builder.h
new file mode 100644
index 0000000..2bbe590
--- /dev/null
+++ b/icing/document-builder.h
@@ -0,0 +1,302 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_DOCUMENT_BUILDER_H_
+#define ICING_DOCUMENT_BUILDER_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "icing/proto/document.pb.h"
+
+namespace icing {
+namespace lib {
+
+class DocumentBuilder {
+ public:
+ DocumentBuilder() = default;
+ explicit DocumentBuilder(DocumentProto document)
+ : document_(std::move(document)) {}
+
+ DocumentBuilder& SetNamespace(std::string name_space) {
+ document_.set_namespace_(std::move(name_space));
+ return *this;
+ }
+
+ DocumentBuilder& SetUri(std::string uri) {
+ document_.set_uri(std::move(uri));
+ return *this;
+ }
+
+ DocumentBuilder& SetKey(std::string name_space, std::string uri) {
+ return SetNamespace(std::move(name_space)).SetUri(std::move(uri));
+ }
+
+ DocumentBuilder& SetSchema(std::string schema) {
+ document_.set_schema(std::move(schema));
+ return *this;
+ }
+
+ DocumentBuilder& SetCreationTimestampSecs(uint64_t creation_timestamp_secs) {
+ document_.set_creation_timestamp_secs(creation_timestamp_secs);
+ return *this;
+ }
+
+ DocumentBuilder& SetScore(int32_t score) {
+ document_.set_score(score);
+ return *this;
+ }
+
+ DocumentBuilder& SetTtlSecs(uint64_t ttl_secs) {
+ document_.set_ttl_secs(ttl_secs);
+ return *this;
+ }
+
+ DocumentBuilder& ClearProperties() {
+ document_.clear_properties();
+ return *this;
+ }
+
+ DocumentBuilder& ClearCustomProperties() {
+ document_.clear_custom_properties();
+ return *this;
+ }
+
+ // Takes a property name and any number of string values.
+ template <typename... V>
+ DocumentBuilder& AddStringProperty(std::string property_name,
+ V... string_values) {
+ return AddStringProperty(std::move(property_name), {string_values...});
+ }
+
+ // Takes a custom property name and any number of string values.
+ template <typename... V>
+ DocumentBuilder& AddCustomStringProperty(std::string property_name,
+ V... string_values) {
+ return AddCustomStringProperty(std::move(property_name),
+ {string_values...});
+ }
+
+ // Takes a property name and any number of int64_t values.
+ template <typename... V>
+ DocumentBuilder& AddInt64Property(std::string property_name,
+ V... int64_values) {
+ return AddInt64Property(std::move(property_name), {int64_values...});
+ }
+
+ // Takes a custom property name and any number of int64_t values.
+ template <typename... V>
+ DocumentBuilder& AddCustomInt64Property(std::string property_name,
+ V... int64_values) {
+ return AddCustomInt64Property(std::move(property_name), {int64_values...});
+ }
+
+ // Takes a property name and any number of double values.
+ template <typename... V>
+ DocumentBuilder& AddDoubleProperty(std::string property_name,
+ V... double_values) {
+ return AddDoubleProperty(std::move(property_name), {double_values...});
+ }
+
+ // Takes a custom property name and any number of double values.
+ template <typename... V>
+ DocumentBuilder& AddCustomDoubleProperty(std::string property_name,
+ V... double_values) {
+ return AddCustomDoubleProperty(std::move(property_name),
+ {double_values...});
+ }
+
+ // Takes a property name and any number of boolean values.
+ template <typename... V>
+ DocumentBuilder& AddBooleanProperty(std::string property_name,
+ V... boolean_values) {
+ return AddBooleanProperty(std::move(property_name), {boolean_values...});
+ }
+
+ // Takes a custom property name and any number of boolean values.
+ template <typename... V>
+ DocumentBuilder& AddCustomBooleanProperty(std::string property_name,
+ V... boolean_values) {
+ return AddCustomBooleanProperty(std::move(property_name),
+ {boolean_values...});
+ }
+
+ // Takes a property name and any number of bytes values.
+ template <typename... V>
+ DocumentBuilder& AddBytesProperty(std::string property_name,
+ V... bytes_values) {
+ return AddBytesProperty(std::move(property_name), {bytes_values...});
+ }
+
+ // Takes a custom property name and any number of bytes values.
+ template <typename... V>
+ DocumentBuilder& AddCustomBytesProperty(std::string property_name,
+ V... bytes_values) {
+ return AddCustomBytesProperty(std::move(property_name), {bytes_values...});
+ }
+
+ // Takes a property name and any number of document values.
+ template <typename... V>
+ DocumentBuilder& AddDocumentProperty(std::string property_name,
+ V&&... document_values) {
+ return AddDocumentProperty(std::move(property_name), {document_values...});
+ }
+
+ // Takes a custom property name and any number of document values.
+ template <typename... V>
+ DocumentBuilder& AddCustomDocumentProperty(std::string property_name,
+ V&&... document_values) {
+ return AddCustomDocumentProperty(std::move(property_name),
+ {document_values...});
+ }
+
+ DocumentProto Build() const { return document_; }
+
+ private:
+ DocumentProto document_;
+
+ DocumentBuilder& AddStringProperty(
+ std::string property_name,
+ std::initializer_list<std::string_view> string_values) {
+ auto property = document_.add_properties();
+ property->set_name(std::move(property_name));
+ for (std::string_view string_value : string_values) {
+ property->mutable_string_values()->Add(std::string(string_value));
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddCustomStringProperty(
+ std::string property_name,
+ std::initializer_list<std::string_view> string_values) {
+ auto custom_property = document_.add_custom_properties();
+ custom_property->set_name(std::move(property_name));
+ for (std::string_view string_value : string_values) {
+ custom_property->mutable_string_values()->Add(std::string(string_value));
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddInt64Property(
+ std::string property_name, std::initializer_list<int64_t> int64_values) {
+ auto property = document_.add_properties();
+ property->set_name(std::move(property_name));
+ for (int64_t int64_value : int64_values) {
+ property->mutable_int64_values()->Add(int64_value);
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddCustomInt64Property(
+ std::string property_name, std::initializer_list<int64_t> int64_values) {
+ auto custom_property = document_.add_custom_properties();
+ custom_property->set_name(std::move(property_name));
+ for (int64_t int64_value : int64_values) {
+ custom_property->mutable_int64_values()->Add(int64_value);
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddDoubleProperty(
+ std::string property_name, std::initializer_list<double> double_values) {
+ auto property = document_.add_properties();
+ property->set_name(std::move(property_name));
+ for (double double_value : double_values) {
+ property->mutable_double_values()->Add(double_value);
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddCustomDoubleProperty(
+ std::string property_name, std::initializer_list<double> double_values) {
+ auto custom_property = document_.add_custom_properties();
+ custom_property->set_name(std::move(property_name));
+ for (double double_value : double_values) {
+ custom_property->mutable_double_values()->Add(double_value);
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddBooleanProperty(
+ std::string property_name, std::initializer_list<bool> boolean_values) {
+ auto property = document_.add_properties();
+ property->set_name(std::move(property_name));
+ for (bool boolean_value : boolean_values) {
+ property->mutable_boolean_values()->Add(boolean_value);
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddCustomBooleanProperty(
+ std::string property_name, std::initializer_list<bool> boolean_values) {
+ auto custom_property = document_.add_custom_properties();
+ custom_property->set_name(std::move(property_name));
+ for (bool boolean_value : boolean_values) {
+ custom_property->mutable_boolean_values()->Add(boolean_value);
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddBytesProperty(
+ std::string property_name,
+ std::initializer_list<std::string> bytes_values) {
+ auto property = document_.add_properties();
+ property->set_name(std::move(property_name));
+ for (const std::string& bytes_value : bytes_values) {
+ property->mutable_bytes_values()->Add(std::string(bytes_value));
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddCustomBytesProperty(
+ std::string property_name,
+ std::initializer_list<std::string> bytes_values) {
+ auto custom_property = document_.add_custom_properties();
+ custom_property->set_name(std::move(property_name));
+ for (const std::string& bytes_value : bytes_values) {
+ custom_property->mutable_bytes_values()->Add(std::string(bytes_value));
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddDocumentProperty(
+ std::string property_name,
+ std::initializer_list<DocumentProto> document_values) {
+ auto property = document_.add_properties();
+ property->set_name(std::move(property_name));
+ for (DocumentProto document_value : document_values) {
+ property->mutable_document_values()->Add(std::move(document_value));
+ }
+ return *this;
+ }
+
+ DocumentBuilder& AddCustomDocumentProperty(
+ std::string property_name,
+ std::initializer_list<DocumentProto> document_values) {
+ auto custom_property = document_.add_custom_properties();
+ custom_property->set_name(std::move(property_name));
+ for (DocumentProto document_value : document_values) {
+ custom_property->mutable_document_values()->Add(
+ std::move(document_value));
+ }
+ return *this;
+ }
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_DOCUMENT_BUILDER_H_
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
new file mode 100644
index 0000000..0eb9474
--- /dev/null
+++ b/icing/file/file-backed-bitmap.cc
@@ -0,0 +1,328 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/file-backed-bitmap.h"
+
+#include <cstdint>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+#include "icing/util/math-util.h"
+
+namespace icing {
+namespace lib {
+
+int FileBackedBitmap::GetBlockCapacity(int num_blocks) {
+ // The first block has a lower capacity due to the Header.
+ const int capacity_bytes = kBlockByteSize * num_blocks - kHeaderByteSize;
+ return capacity_bytes * 8;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<FileBackedBitmap>>
+FileBackedBitmap::Create(const Filesystem* filesystem,
+ std::string_view file_path,
+ MemoryMappedFile::Strategy mmap_strategy) {
+ if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
+ return absl_ports::UnimplementedError(
+ "FileBackedBitmap currently doesn't support READ_WRITE_MANUAL_SYNC "
+ "mmap strategy.");
+ }
+
+ auto bitmap = std::unique_ptr<FileBackedBitmap>(
+ new FileBackedBitmap(filesystem, file_path, mmap_strategy));
+
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // that can support error logging.
+ libtextclassifier3::Status status = bitmap->Initialize();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message();
+ return status;
+ }
+ return bitmap;
+}
+
+FileBackedBitmap::FileBackedBitmap(const Filesystem* filesystem,
+ std::string_view file_path,
+ MemoryMappedFile::Strategy mmap_strategy)
+ : filesystem_(filesystem),
+ file_path_(file_path),
+ mmapper_(new MemoryMappedFile(*filesystem, file_path, mmap_strategy)) {}
+
+FileBackedBitmap::~FileBackedBitmap() {
+ // Only update if we have auto_sync setup, otherwise the checksum will be
+ // updated when the client calls PersistToDisk
+ if (mmapper_->strategy() ==
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
+ // Any valid, initialized file should at least have 1 block.
+ if (mmapper_->region_size() >= kBlockByteSize &&
+ header().version == kCurrentVersion &&
+ header().state == Header::ChecksumState::kStale) {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to persist bitmap to disk while destructing "
+ << file_path_;
+ }
+ }
+ }
+}
+
+const FileBackedBitmap::Header& FileBackedBitmap::header() const {
+ return reinterpret_cast<const Header&>(*mmapper_->region());
+}
+
+FileBackedBitmap::Header* FileBackedBitmap::mutable_header() {
+ return reinterpret_cast<Header*>(mmapper_->mutable_region());
+}
+
+libtextclassifier3::Status FileBackedBitmap::FileBackedBitmap::Initialize() {
+ ICING_VLOG(1) << "Initialize bitmap file: " << file_path_;
+
+ const bool is_new_bitmap = !filesystem_->FileExists(file_path_.c_str());
+
+ int64_t file_size = 0;
+ if (is_new_bitmap) {
+ file_size = kBlockByteSize;
+ if (!filesystem_->Grow(file_path_.c_str(), file_size)) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Unable to create a minimal bitmap; "
+ "filename: %s; target size: %lld",
+ file_path_.c_str(), static_cast<long long>(file_size)));
+ }
+
+ ICING_VLOG(1) << "Creating new bitmap in file: " << file_path_
+ << " of size: " << file_size;
+ } else {
+ file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "File corrupted; filename: %s; size: %lld.", file_path_.c_str(),
+ static_cast<long long>(file_size)));
+ }
+
+ ICING_VLOG(1) << "Loading bitmap from file: " << file_path_
+ << " of size: " << file_size;
+ }
+
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // that can support error logging.
+ libtextclassifier3::Status status = mmapper_->Remap(0, file_size);
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message();
+ return status;
+ }
+
+ if (is_new_bitmap) {
+ mutable_header()->version = kCurrentVersion;
+ mutable_header()->state = Header::ChecksumState::kStale;
+ mutable_header()->checksum = 0;
+
+ return mmapper_->PersistToDisk();
+ }
+
+ if (header().state == Header::ChecksumState::kStale) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "File corrupted, has partially flushed data; filename: ", file_path_));
+ }
+
+ if (header().checksum != ComputeChecksum()) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "File corrupted, checksum doesn't match; filename: ", file_path_));
+ }
+
+ if (header().version != kCurrentVersion) {
+ return UpgradeToCurrentVersion();
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status FileBackedBitmap::UpgradeToCurrentVersion() {
+ // Currently, only 1 format is supported.
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "File corrupted, mismatched version; filename: %s; %d vs %d.",
+ file_path_.c_str(), header().version, kCurrentVersion));
+}
+
+libtextclassifier3::Status FileBackedBitmap::SetWord(int word_index,
+ Word word) {
+ if (word_index >= NumBits() / kNumWordBits) {
+ ICING_LOG(ERROR) << "word_index: " << word_index
+ << ", number of words: " << NumBits() / kNumWordBits;
+ return absl_ports::InternalError("Trying to access invalid memory");
+ }
+
+ Word* bitmap_data =
+ reinterpret_cast<Word*>(mmapper_->mutable_region() + kHeaderByteSize);
+
+ bitmap_data[word_index] = word;
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<FileBackedBitmap::Word> FileBackedBitmap::GetWord(
+ int word_index) const {
+ if (word_index >= NumBits() / kNumWordBits) {
+ ICING_LOG(ERROR) << "word_index: " << word_index
+ << ", number of words: " << NumBits() / kNumWordBits;
+ return absl_ports::InternalError("Trying to access invalid memory");
+ }
+
+ const Word* bitmap_data = reinterpret_cast<const Word*>(
+ mmapper_->mutable_region() + kHeaderByteSize);
+ return bitmap_data[word_index];
+}
+
+int FileBackedBitmap::NumBits() const {
+ return (mmapper_->region_size() - kHeaderByteSize) * 8;
+}
+
+libtextclassifier3::Status FileBackedBitmap::Set(int bit_index,
+ bool bit_value) {
+ if (bit_index >= NumBits()) {
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // that can support error logging.
+ libtextclassifier3::Status status = GrowTo(bit_index);
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message();
+ return status;
+ }
+
+ if (!bit_value) {
+ // All newly added bits are set to false.
+ return libtextclassifier3::Status::OK;
+ }
+ }
+
+ // Figure out which word needs to be modified.
+ const int word_index = bit_index / kNumWordBits;
+ const int word_mask = 1u << (bit_index % kNumWordBits);
+
+ ICING_ASSIGN_OR_RETURN(Word old_word, GetWord(word_index));
+ Word new_word = bit_value ? (old_word | word_mask) : old_word & ~word_mask;
+ if (new_word != old_word) {
+ ICING_RETURN_IF_ERROR(SetWord(word_index, new_word));
+ mutable_header()->state = Header::ChecksumState::kStale;
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<bool> FileBackedBitmap::Get(int bit_index) const {
+ if (bit_index >= NumBits()) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Bitmap file %s is of size %d and can't read bit_index %d.",
+ file_path_.c_str(), NumBits(), bit_index));
+ }
+
+ const Word word_index = bit_index / kNumWordBits;
+ const Word word_mask = 1u << (bit_index % kNumWordBits);
+
+ ICING_ASSIGN_OR_RETURN(Word word, GetWord(word_index));
+ return word & word_mask;
+}
+
+size_t FileBackedBitmap::FileSizeForBits(int num_bits) {
+ const int word_index = num_bits / kNumWordBits;
+ size_t new_file_size = kHeaderByteSize + (word_index + 1) * sizeof(Word);
+ return math_util::RoundUpTo(new_file_size,
+ static_cast<size_t>(kBlockByteSize));
+}
+
+libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
+ if (new_num_bits > kMaxNumBits) {
+ return absl_ports::ResourceExhaustedError(IcingStringUtil::StringPrintf(
+ "Bitmap file %s has a max-capacity of %d bits and cannot fit %d bits",
+ file_path_.c_str(), kMaxNumBits, new_num_bits));
+ }
+
+ const size_t new_file_size = FileSizeForBits(new_num_bits);
+ if (!filesystem_->Grow(file_path_.c_str(), new_file_size)) {
+ return absl_ports::InternalError(
+ IcingStringUtil::StringPrintf("Growing file %s to new size %zd failed",
+ file_path_.c_str(), new_file_size));
+ }
+
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // that can support error logging.
+ libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message();
+ return status;
+ }
+
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Grew file %s to new size %zd", file_path_.c_str(), new_file_size);
+ mutable_header()->state = Header::ChecksumState::kStale;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) {
+ if (new_num_bits > NumBits()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ const size_t new_file_size = FileSizeForBits(new_num_bits);
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // that can support error logging.
+ libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message();
+ return status;
+ }
+ if (!filesystem_->Truncate(file_path_.c_str(), new_file_size)) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Truncating file %s to new size %zd failed", file_path_.c_str(),
+ new_file_size));
+ }
+
+ const int word_index = new_num_bits / kNumWordBits;
+ // Mask to only keep bits <= new_num_bits and clear everything else.
+ const Word word_mask = (1u << (new_num_bits % kNumWordBits)) - 1;
+
+ ICING_ASSIGN_OR_RETURN(Word old_word, GetWord(word_index));
+ Word new_word = old_word & word_mask;
+ ICING_RETURN_IF_ERROR(SetWord(word_index, new_word));
+
+ // TODO(cassiewang) It might be worth replacing this with memset().
+ const int num_words = NumBits() / kNumWordBits;
+ for (int i = word_index + 1; i < num_words; ++i) {
+ ICING_RETURN_IF_ERROR(SetWord(i, 0));
+ }
+
+ mutable_header()->state = Header::ChecksumState::kStale;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status FileBackedBitmap::PersistToDisk() {
+ mutable_header()->checksum = ComputeChecksum();
+ mutable_header()->state = Header::ChecksumState::kFresh;
+ return mmapper_->PersistToDisk();
+}
+
+uint32_t FileBackedBitmap::ComputeChecksum() const {
+ std::string_view bitmap_bytes(mmapper_->region() + kHeaderByteSize,
+ NumBits() / 8);
+ return Crc32().Append(bitmap_bytes);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/file-backed-bitmap.h b/icing/file/file-backed-bitmap.h
new file mode 100644
index 0000000..54d9245
--- /dev/null
+++ b/icing/file/file-backed-bitmap.h
@@ -0,0 +1,219 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A file-backed bitmap with fast & efficient reads/writes of bits.
+// The bitmap will automatically grow in size as more bits are added, with a
+// max-capacity of 2M bits.
+//
+// Note on Performance:
+// This class internally uses mmap() without a readahead buffer. This keeps the
+// memory-usage low while also having low (amortized) read/write latency.
+// However, some reads/writes will pay the cost of page-faults.
+// In order to keep memory-mapping efficient, the bitmap always grows in
+// 4KiB sized blocks so that it is aligned with system page-size.
+//
+// This class doesn't aggressively flush/sync changes to disk and relies on the
+// system to buffer and flush changes in the background. This greatly reduces
+// disk-churn and performance of writes. However, an unexpected crash or an
+// abrupt reboot of the system could lead to data-loss. This can be mitigated
+// by manually calling PersistToDisk() when needed.
+//
+// Usage:
+// auto bitmap = RETURN_OR_ASSIGN(FileBackedBitmap::Create(...));
+//
+// bitmap.Set(100, false);
+// bitmap.Set(10, true);
+//
+// bitmap.Get(0); // Default value of 'false'.
+// bitmap.Get(10);
+//
+// bitmap.PersistToDisk(); // Optional. Immediately syncs all changes to disk.
+// bitmap.reset();
+
+#ifndef ICING_FILE_FILE_BACKED_BITMAP_H_
+#define ICING_FILE_FILE_BACKED_BITMAP_H_
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+
+namespace icing {
+namespace lib {
+
+class FileBackedBitmap {
+ public:
+  // Growth of FileBackedBitmap is in blocks of a fixed size. This helper
+  // returns the number of bits that can be fitted in the specified number of
+  // blocks.
+  //
+  // NOTE: This is meant for tests and clients shouldn't care about this.
+  static int GetBlockCapacity(int num_blocks);
+
+  // Returns an initialized instance of the bitmap that can immediately handle
+  // read/write operations.
+  //
+  // file_path : Specifies the file to persist the bitmap to; must be a path
+  //             within a directory that already exists. If the file itself
+  //             doesn't exist, a new bitmap will be created.
+  //
+  // mmap_strategy : Mmap strategy for the underlying file, see
+  //                 MemoryMappedFile::Strategy for more details.
+  //
+  // Returns an error if the file was corrupted or if any IO error was
+  // encountered. An error here implies that the old data has been lost and
+  // the file has to be deleted and re-initialized again.
+  static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedBitmap>> Create(
+      const Filesystem* filesystem, std::string_view file_path,
+      MemoryMappedFile::Strategy mmap_strategy);
+
+  // If the bitmap was created with
+  // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, then changes will be
+  // synced by the system and the checksum will be updated.
+  ~FileBackedBitmap();
+
+  // Set the bit at the specified position. The bitmap is automatically resized
+  // to at least fit 'bit_index' bits. bit_index should not be larger than
+  // 2M, which is the max-capacity of FileBackedBitmap.
+  //
+  // Returns RESOURCE_EXHAUSTED if past max-capacity, else any IO error hit.
+  //
+  // NOTE: While changes take place immediately, they may not be fully persisted
+  // to disk till PersistToDisk() is called.
+  //
+  // NOTE: The Bitmap grows in blocks of 4KiB. So, setting a specific bit
+  // beyond current capacity can lead to pre-allocating up to ~32K extra bits.
+  libtextclassifier3::Status Set(int bit_index, bool bit_value);
+
+  // Get the bit at the specified index. Unset bits default to 'false'.
+  //
+  // Returns OUT_OF_RANGE error if bit_index >= NumBits().
+  libtextclassifier3::StatusOr<bool> Get(int bit_index) const;
+
+  // Count of bits currently being stored in this bitmap.
+  //
+  // NOTE: BitMap growth happens in blocks of 4KiB. So, the smallest bitmap will
+  // automatically have ~32K bits pre-allocated. Subsequently, future
+  // growths/truncations of the bitmap will change NumBits() in multiples of
+  // 32K.
+  int NumBits() const;
+
+  // Truncates the size of the bitmap to 'new_num_bits'. Any data beyond this
+  // will be lost.
+  libtextclassifier3::Status TruncateTo(int new_num_bits);
+
+  // Syncs all the changes made to the bitmap to disk and updates the checksum.
+  //
+  // Returns any encountered IO error.
+  //
+  // NOTE: Neither Set() nor the ~FileBackedBitmap() guarantee syncing all
+  // changes to disk. This method should be explicitly called to protect the
+  // data from an abrupt system reboot.
+  libtextclassifier3::Status PersistToDisk();
+
+ private:
+  // Limit the max-size of the bitmap. Someone wanting to store more bits will
+  // likely benefit from a custom solution.
+  static constexpr int kMaxNumBits = 2 * 1024 * 1024;
+
+  // Growth of FileBackedBitmap will be in blocks of this size. This size
+  // should align with the page-size of the system so that mmapping can be
+  // most efficient.
+  static constexpr int kBlockByteSize = 4 * 1024;
+
+  // Version of the file-format used by the class. Every time the format is
+  // modified in a backwards-incompatible way, this needs to be incremented.
+  static constexpr int32_t kCurrentVersion = 1;
+
+  struct Header {
+    // Version of the file-format used by this class. This allows us to change
+    // the format and upgrade old data to the new format without losing it.
+    int32_t version;
+
+    // Checksum of the entire file when it was last persisted to disk.
+    // This is used on init to make sure that the file has not been corrupted.
+    //
+    // NOTE: The checksum is not expected to match when ChecksumState=kStale.
+    uint32_t checksum;
+
+    // As an optimization, FileBackedBitmap delays recomputation of the checksum
+    // even when some bits in the Bitmap are modified. While this improves
+    // performance, it increases the risk of losing data due to a crash.
+    // ChecksumState tracks if the changes to the bitmap have been fully
+    // reflected in the checksum stored above.
+    //
+    // NOTE: We use int32_t to store a bool info here to keep the Header
+    // aligned.
+    enum ChecksumState : int32_t { kFresh, kStale };
+    ChecksumState state;
+  };
+
+  // The size of the backing file to store the specified number of bits. This
+  // size is aligned to the page-size of the system so that it can be
+  // efficiently memory mapped.
+  static size_t FileSizeForBits(int num_bits);
+
+  static constexpr int kHeaderByteSize = sizeof(Header);  // Header prefix size.
+
+  // Helpers to read/modify the header of the bitmap file.
+  const Header& header() const;
+  Header* mutable_header();
+
+  // Use FileBackedBitmap::Create() to instantiate.
+  FileBackedBitmap(const Filesystem* filesystem, std::string_view file_path,
+                   MemoryMappedFile::Strategy mmap_strategy);
+
+  // Verify the contents of the bitmap and get ready for read/write operations.
+  //
+  // Returns an error if the file was corrupted or if any IO error was
+  // encountered. An error here implies that the old data has been lost and
+  // the file has to be deleted and re-initialized again.
+  libtextclassifier3::Status Initialize();
+
+  // Makes sure that the data on disk is upgraded to match the file-format
+  // represented by kCurrentVersion.
+  libtextclassifier3::Status UpgradeToCurrentVersion();
+
+  // Grows the size of the bitmap to match 'new_num_bits'. Any newly added bit
+  // will default to 'false'.
+  //
+  // The upper-bound for new_num_bits is kMaxNumBits. Requests to further
+  // increase the size will fail (NOTE(review): tests observe RESOURCE_EXHAUSTED).
+  libtextclassifier3::Status GrowTo(int new_num_bits);
+
+  using Word = uint32_t;
+  static constexpr int kNumWordBits = sizeof(Word) * 8;
+
+  // Helpers to perform 32bit read/write operations on the raw bitmap data.
+  // This makes it easy to use 32bit bitwise operations to modify the bitmap.
+  libtextclassifier3::StatusOr<Word> GetWord(int word_index) const;
+  libtextclassifier3::Status SetWord(int word_index, Word word);
+
+  // CRC32 based checksum of all the bits stored in the bitmap. This checksum
+  // only uses the data and not the contents of the header.
+  uint32_t ComputeChecksum() const;
+
+  const Filesystem* const filesystem_;
+  const std::string file_path_;
+  std::unique_ptr<MemoryMappedFile> mmapper_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_FILE_BACKED_BITMAP_H_
diff --git a/icing/file/file-backed-bitmap_test.cc b/icing/file/file-backed-bitmap_test.cc
new file mode 100644
index 0000000..9bfec65
--- /dev/null
+++ b/icing/file/file-backed-bitmap_test.cc
@@ -0,0 +1,395 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/file-backed-bitmap.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::_;
+using ::testing::HasSubstr;
+
+class FileBackedBitmapTest : public testing::Test {
+ protected:
+  static int GetBlockCapacity(int num_blocks) {  // Proxy to private helper.
+    return FileBackedBitmap::GetBlockCapacity(num_blocks);
+  }
+
+  void SetUp() override { file_path_ = GetTestTempDir() + "/bitmap"; }
+
+  void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
+
+  std::string file_path_;
+  Filesystem filesystem_;
+  MemoryMappedFile::Strategy mmap_strategy_ =  // All tests use auto-sync mmap.
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC;
+};
+
+// TODO(cassiewang) Add tests for other corruption scenarios where the
+// file has an invalid checksum or is dirty on init.
+TEST_F(FileBackedBitmapTest, InvalidFile) {
+  ASSERT_THAT(FileBackedBitmap::Create(&filesystem_, "", mmap_strategy_)
+                  .status()
+                  .error_message(),
+              HasSubstr("Unable to create"));  // Empty path can't be created.
+
+  std::string bad_data = "Bad bitmap file content";  // Not a valid header.
+  filesystem_.Write(file_path_.c_str(), bad_data.data(), bad_data.size());
+
+  ASSERT_THAT(FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_)
+                  .status()
+                  .error_message(),
+              HasSubstr("corrupted"));  // Garbage content fails init checks.
+}
+
+TEST_F(FileBackedBitmapTest, CreateNewBitMap) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  // Even a new bitmap will have 1 block with pre-allocated bits.
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(0), IsOkAndHolds(false));  // Unset bits read false.
+  ICING_EXPECT_OK(bitmap->PersistToDisk());
+}
+
+TEST_F(FileBackedBitmapTest, CanReadAfterWrite) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+
+  // Can Grow bitmap to store both true and false.
+  ICING_EXPECT_OK(bitmap->Set(100000, true));
+  ICING_EXPECT_OK(bitmap->Set(200000, false));
+
+  EXPECT_THAT(bitmap->Get(0), IsOkAndHolds(false));  // Untouched bit is false.
+  EXPECT_THAT(bitmap->Get(100000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(200000), IsOkAndHolds(false));
+
+  // Can write new data without growing the bitmap.
+  ICING_EXPECT_OK(bitmap->Set(50, true));
+  EXPECT_THAT(bitmap->Get(50), IsOkAndHolds(true));
+
+  // Can modify the value of a previously written bit.
+  ICING_EXPECT_OK(bitmap->Set(100000, false));
+  ICING_EXPECT_OK(bitmap->Set(200000, true));
+  EXPECT_THAT(bitmap->Get(100000), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(200000), IsOkAndHolds(true));
+}
+
+// Make sure that the growth of the bitmap is in multiples of 4KiB blocks.
+// This is required to keep our memory mapping efficient.
+TEST_F(FileBackedBitmapTest, BitMapGrowsInMultipleOfBlocks) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));  // Starts with 1 block.
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(1) - 1), IsOkAndHolds(false));
+
+  // 100K bits need four 4KiB sized blocks.
+  ICING_EXPECT_OK(bitmap->Set(100000, true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(4));
+
+  // 200K bits need seven 4KiB sized blocks.
+  ICING_EXPECT_OK(bitmap->Set(200000, false));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(7));
+
+  // Reusing pre-allocated bits doesn't require any growth.
+  ICING_EXPECT_OK(bitmap->Set(0, false));
+  ICING_EXPECT_OK(bitmap->Set(1000, false));
+  ICING_EXPECT_OK(bitmap->Set(GetBlockCapacity(7) - 1, false));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(7));
+}
+
+TEST_F(FileBackedBitmapTest, CanPersistBitmapToDiskRegularly) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+
+  // Can read older data after PersistToDisk.
+  ICING_EXPECT_OK(bitmap->Set(100, true));
+  ICING_EXPECT_OK(bitmap->PersistToDisk());
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(50), IsOkAndHolds(false));
+
+  // Can grow the bitmap to write new data after PersistToDisk.
+  ICING_EXPECT_OK(bitmap->Set(50000, false));  // Forces growth to 2 blocks.
+  ICING_EXPECT_OK(bitmap->PersistToDisk());
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(2));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(50000), IsOkAndHolds(false));
+
+  // Can write new data without growing the bitmap after PersistToDisk.
+  ICING_EXPECT_OK(bitmap->Set(50, true));
+  ICING_EXPECT_OK(bitmap->PersistToDisk());
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(2));
+  EXPECT_THAT(bitmap->Get(50), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(50000), IsOkAndHolds(false));
+}
+
+TEST_F(FileBackedBitmapTest, BitmapUsableAcrossMultipleInstances) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  ICING_EXPECT_OK(bitmap->Set(100, true));
+
+  // Persist all data and reset the bitmap instance.
+  ICING_EXPECT_OK(bitmap->PersistToDisk());
+  bitmap.reset();  // Simulates a restart; data must come back from the file.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(0), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));  // Survived the reload.
+
+  // Reset the bitmap instance without explicitly persisting data.
+  // Even here, the system should flush the data, unless the device reboots.
+  bitmap.reset();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(0), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+
+  // We can continue to read/write bits on an existing data.
+  ICING_EXPECT_OK(bitmap->Set(200, false));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(200), IsOkAndHolds(false));
+}
+
+TEST_F(FileBackedBitmapTest, HandleOutOfRangeReads) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  // The bitmap initially has one block's worth of bits pre-allocated.
+  EXPECT_THAT(bitmap->Get(0), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(1) - 1), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(1)),  // First index past the end.
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(2)),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  // Expand bitmap to use 2 blocks.
+  ICING_EXPECT_OK(bitmap->Set(GetBlockCapacity(1) + 100, true));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(1) + 1), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(2) - 1), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(2) + 1),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(5)),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  ICING_EXPECT_OK(bitmap->Set(100000, true));  // Grows capacity again.
+  EXPECT_THAT(bitmap->Get(100000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(GetBlockCapacity(5)),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedBitmapTest, TruncateBitmap) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+
+  // NOTE: Set uses zero-based index. So, Set(99) sets the 100th bit.
+  ICING_EXPECT_OK(bitmap->Set(99, true));
+  ICING_EXPECT_OK(bitmap->Set(100, true));
+  ICING_EXPECT_OK(bitmap->Set(101, true));
+
+  // No bits are set at index >= 200. So, nothing observable changes.
+  ICING_EXPECT_OK(bitmap->TruncateTo(200));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(99), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(101), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(199), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(200), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(201), IsOkAndHolds(false));
+
+  // The bitmap doesn't have any set bits beyond the requested size of 102.
+  // So, nothing observable changes either.
+  ICING_EXPECT_OK(bitmap->TruncateTo(102));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(99), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(101), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(102), IsOkAndHolds(false));
+
+  // TruncateTo(100) should clear bits 100 and 101, keeping only bit 99.
+  ICING_EXPECT_OK(bitmap->TruncateTo(100));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(99), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(101), IsOkAndHolds(false));
+}
+
+TEST_F(FileBackedBitmapTest, TruncateBitmapAcrossMultipleBlocks) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  ICING_EXPECT_OK(bitmap->Set(100, true));
+  ICING_EXPECT_OK(bitmap->Set(1000, true));
+  ICING_EXPECT_OK(bitmap->Set(100000, true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(4));
+
+  // Requested size exceeds current capacity. So, nothing to truncate.
+  ICING_EXPECT_OK(bitmap->TruncateTo(200000));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(4));
+
+  // No set bits at index >= 100001. So, no data is lost.
+  ICING_EXPECT_OK(bitmap->TruncateTo(100001));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(4));
+
+  // Truncate to 50K should chop off block#3 and block#4.
+  // It should also clear bits beyond 50K.
+  ICING_EXPECT_OK(bitmap->Set(49999, true));
+  ICING_EXPECT_OK(bitmap->Set(50000, true));
+  ICING_EXPECT_OK(bitmap->Set(50001, true));
+  ICING_EXPECT_OK(bitmap->TruncateTo(50000));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(2));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(49999), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(50000), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(50001), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(100000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  // Truncate to 2K should chop off the 2nd block.
+  ICING_EXPECT_OK(bitmap->TruncateTo(2000));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(50000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  EXPECT_THAT(bitmap->Get(100000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  // Truncate to 500 should not chop any blocks off, but clears later bits.
+  ICING_EXPECT_OK(bitmap->TruncateTo(500));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(50000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  EXPECT_THAT(bitmap->Get(100000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedBitmapTest, TruncateBitmapAcrossInstances) {
+  // Instance#1
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  ICING_EXPECT_OK(bitmap->Set(100, true));
+  ICING_EXPECT_OK(bitmap->Set(1000, true));
+  ICING_EXPECT_OK(bitmap->Set(100000, true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(4));
+
+  // Instance#2
+  bitmap.reset();  // Truncation works via a freshly re-created instance.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  ICING_EXPECT_OK(bitmap->TruncateTo(50000));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(2));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(49999), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(50000), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(100000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  // Instance#3
+  bitmap.reset();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  ICING_EXPECT_OK(bitmap->TruncateTo(500));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(100), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(500), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(100000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+// Make sure that a bitmap can both grow and be truncated many times.
+TEST_F(FileBackedBitmapTest, TruncateAndGrowBitmap) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+
+  // Grow#1
+  ICING_EXPECT_OK(bitmap->Set(1000, true));
+  ICING_EXPECT_OK(bitmap->Set(100000, true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(4));
+
+  // Truncate#1
+  ICING_EXPECT_OK(bitmap->TruncateTo(50000));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(2));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  // Grow#2: the truncated bit must stay cleared after re-growth.
+  ICING_EXPECT_OK(bitmap->Set(200000, true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(7));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(true));
+  EXPECT_THAT(bitmap->Get(100000), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(200000), IsOkAndHolds(true));
+
+  // Truncate#2
+  ICING_EXPECT_OK(bitmap->TruncateTo(1000));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+  EXPECT_THAT(bitmap->Get(1000), IsOkAndHolds(false));
+  EXPECT_THAT(bitmap->Get(100000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  EXPECT_THAT(bitmap->Get(200000),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedBitmapTest, BitMapCantGrowTooBigInSize) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedBitmap> bitmap,
+      FileBackedBitmap::Create(&filesystem_, file_path_, mmap_strategy_));
+  // Set a bit that is within the 2M bit limit.
+  ICING_EXPECT_OK(bitmap->Set(1000, true));
+
+  // Go beyond the 2M bit limit.
+  EXPECT_THAT(bitmap->Set(3 * 1024 * 1024, true),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+  // Subsequent calls work fine; the failed Set didn't grow the bitmap.
+  ICING_EXPECT_OK(bitmap->Set(2000, true));
+  EXPECT_THAT(bitmap->NumBits(), GetBlockCapacity(1));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
new file mode 100644
index 0000000..d17757f
--- /dev/null
+++ b/icing/file/file-backed-proto-log.h
@@ -0,0 +1,867 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// File-backed log of protos with append-only writes and position based reads.
+//
+// There should only be one instance of a FileBackedProtoLog of the same file at
+// a time; using multiple instances at the same time may lead to undefined
+// behavior.
+//
+// The entire checksum is computed on initialization to verify the contents are
+// valid. On failure, the log will be truncated to the last verified state when
+// PersistToDisk() was called. If the log cannot successfully restore the last
+// state due to disk corruption or some other inconsistency, then the entire log
+// will be lost.
+//
+// Each proto written to the file will have a metadata written just before it.
+// The metadata consists of
+// {
+//    1 byte of kProtoMagic;
+// 3 bytes of the proto size
+// n bytes of the proto itself
+// }
+//
+// Example usage:
+// ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+// FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_,
+// options));
+// auto proto_log = create_result.proto_log;
+//
+// Document document;
+// document.set_namespace("com.google.android.example");
+// document.set_uri("www.google.com");
+//
+// int64_t document_offset = proto_log->WriteProto(document));
+// Document same_document = proto_log->ReadProto(document_offset));
+// proto_log->PersistToDisk();
+//
+// TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
+// migration method.
+
+#ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
+#define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include <google/protobuf/io/gzip_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/zlib.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+// Append-only, file-backed log of serialized ProtoT messages. Owns the
+// underlying file descriptor; see the file comment above for the on-disk
+// format and usage examples.
+template <typename ProtoT>
+class FileBackedProtoLog {
+ public:
+  struct Options {
+    // Whether to compress each proto before writing to the proto log.
+    bool compress;
+
+    // Byte-size limit for each proto written to the store. This does not
+    // include the bytes needed for the metadata of each proto.
+    //
+    // NOTE: Currently, we only support protos up to 16MiB. We store the proto
+    // size in 3 bytes within the metadata.
+    //
+    // NOTE: This limit is only enforced for future writes. If the store
+    // previously had a higher limit, then reading older entries could return
+    // larger protos.
+    //
+    // NOTE: The max_proto_size is the upper limit for input protos into the
+    // ProtoLog. Even if the proto is larger than max_proto_size, but compresses
+    // to a smaller size, ProtoLog will not accept it. Protos that result in a
+    // compressed size larger than max_proto_size are also not accepted.
+    const int32_t max_proto_size;
+
+    // Must specify values for options.
+    Options() = delete;
+    explicit Options(bool compress_in,
+                     const int32_t max_proto_size_in = kMaxProtoSize)
+        : compress(compress_in), max_proto_size(max_proto_size_in) {}
+  };
+
+  // Header stored at the beginning of the file before the rest of the log
+  // contents. Stores metadata on the log.
+  //
+  // TODO(b/139375388): Migrate the Header struct to a proto. This makes
+  // migrations easier since we don't need to worry about different size padding
+  // (which would affect the checksum) and different endians.
+  struct Header {
+    static constexpr int32_t kMagic = 0xf4c6f67a;
+
+    // Holds the magic as a quick sanity check against file corruption.
+    int32_t magic = kMagic;
+
+    // Whether to compress the protos before writing to the log.
+    bool compress = true;
+
+    // The maximum proto size that can be written to the log.
+    int32_t max_proto_size = 0;
+
+    // Checksum of the log elements, doesn't include the header fields.
+    uint32_t log_checksum = 0;
+
+    // Last known good offset at which the log and its checksum were updated.
+    // If we crash between writing to the log and updating the checksum, we can
+    // try to rewind the log to this offset and verify the checksum is still
+    // valid instead of throwing away the entire log.
+    int64_t rewind_offset = sizeof(Header);
+
+    // Must be at the end. Contains the crc checksum of the preceding fields.
+    uint32_t header_checksum = 0;
+
+    // Computes the crc over every field before header_checksum, relying on
+    // header_checksum being the last member (see comment above).
+    uint32_t CalculateHeaderChecksum() const {
+      Crc32 crc;
+      std::string_view header_str(reinterpret_cast<const char*>(this),
+                                  offsetof(Header, header_checksum));
+      crc.Append(header_str);
+      return crc.Get();
+    }
+  };
+
+  struct CreateResult {
+    // A successfully initialized log.
+    std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
+
+    // Whether there was some data loss while initializing from a previous
+    // state. This can happen if the file is corrupted or some previously added
+    // data was unpersisted. This may be used to signal that any derived data
+    // off of the proto log may need to be regenerated.
+    bool data_loss;
+  };
+
+  // Factory method to create, initialize, and return a FileBackedProtoLog. Will
+  // create the file if it doesn't exist.
+  //
+  // If on re-initialization the log detects disk corruption or some previously
+  // added data was unpersisted, the log will rewind to the last-good state. The
+  // log saves these checkpointed "good" states when PersistToDisk() is called
+  // or the log is safely destructed. If the log rewinds successfully to the
+  // last-good state, then the returned CreateResult.data_loss indicates
+  // there was some data loss so that any derived data may know that it
+  // needs to be updated. If the log re-initializes successfully without any
+  // data loss, the boolean will be false.
+  //
+  // Params:
+  //   filesystem: Handles system level calls
+  //   file_path: Path of the underlying file. Directory of the file should
+  //   already exist
+  //   options: Configuration options for the proto log
+  //
+  // Returns:
+  //   FileBackedProtoLog::CreateResult on success
+  //   INVALID_ARGUMENT on an invalid option
+  //   INTERNAL_ERROR on IO error
+  static libtextclassifier3::StatusOr<CreateResult> Create(
+      const Filesystem* filesystem, const std::string& file_path,
+      const Options& options);
+
+  // Not copyable
+  FileBackedProtoLog(const FileBackedProtoLog&) = delete;
+  FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
+
+  // This will update the checksum of the log as well.
+  ~FileBackedProtoLog();
+
+  // Writes the serialized proto to the underlying file. Writes are applied
+  // directly to the underlying file. Users do not need to sync the file after
+  // writing.
+  //
+  // Returns:
+  //   Offset of the newly appended proto in file on success
+  //   INVALID_ARGUMENT if proto is too large, as decided by
+  //   Options.max_proto_size
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
+
+  // Reads out a proto located at file_offset from the file.
+  //
+  // Returns:
+  //   A proto on success
+  //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+
+  // Calculates and returns the disk usage in bytes.
+  //
+  // Returns:
+  //   Disk usage on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+  // An iterator helping to find offsets of all the protos in file.
+  // Example usage:
+  //
+  // while (iterator.Advance().ok()) {
+  //   int64_t offset = iterator.GetOffset();
+  //   // Do something
+  // }
+  class Iterator {
+   public:
+    Iterator(const Filesystem& filesystem, const std::string& file_path,
+             int64_t initial_offset);
+
+    // Advances to the position of next proto.
+    //
+    // Returns:
+    //   OK on success
+    //   OUT_OF_RANGE_ERROR if it reaches the end
+    //   INTERNAL_ERROR on IO error
+    libtextclassifier3::Status Advance();
+
+    // Returns the file offset of current proto.
+    int64_t GetOffset();
+
+   private:
+    static constexpr int64_t kInvalidOffset = -1;
+    // Used to read proto metadata
+    MemoryMappedFile mmapped_file_;
+    // Offset of first proto
+    int64_t initial_offset_;
+    int64_t current_offset_;
+    int64_t file_size_;
+  };
+
+  // Returns an iterator of current proto log. The caller needs to keep the
+  // proto log unchanged while using the iterator, otherwise unexpected
+  // behaviors could happen.
+  Iterator GetIterator();
+
+  // Persists all changes since initialization or the last call to
+  // PersistToDisk(). Any changes that aren't persisted may be lost if the
+  // system fails to close safely.
+  //
+  // Example use case:
+  //
+  //    Document document;
+  //    document.set_namespace("com.google.android.example");
+  //    document.set_uri("www.google.com");
+  //
+  //    {
+  //      ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+  //          FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
+  //          options));
+  //      auto proto_log = std::move(create_result.proto_log);
+  //
+  //      int64_t document_offset = proto_log->WriteProto(document));
+  //
+  //      // We lose the document here since it wasn't persisted.
+  //      // *SYSTEM CRASH*
+  //    }
+  //
+  //    {
+  //      // Can still successfully create after a crash since the log can
+  //      // rewind/truncate to recover into a previously good state
+  //      ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+  //          FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
+  //          options));
+  //      auto proto_log = std::move(create_result.proto_log);
+  //
+  //      // Lost the proto since we didn't PersistToDisk before the crash
+  //      proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
+  //
+  //      int64_t document_offset = proto_log->WriteProto(document));
+  //
+  //      // Persisted this time, so we should be ok.
+  //      ICING_ASSERT_OK(proto_log->PersistToDisk());
+  //    }
+  //
+  //    {
+  //      ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+  //          FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
+  //          options));
+  //      auto proto_log = std::move(create_result.proto_log);
+  //
+  //      // SUCCESS
+  //      Document same_document = proto_log->ReadProto(document_offset));
+  //    }
+  //
+  // NOTE: Since all protos are already written to the file directly, this
+  // just updates the checksum and rewind position. Without these updates,
+  // future initializations will truncate the file and discard unpersisted
+  // changes.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status PersistToDisk();
+
+  // Calculates the checksum of the log contents. Excludes the header content.
+  //
+  // Returns:
+  //   Crc of the log content
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ private:
+  // Object can only be instantiated via the ::Create factory.
+  FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
+                     std::unique_ptr<Header> header);
+
+  // Initializes a new proto log.
+  //
+  // Returns:
+  //   std::unique_ptr<FileBackedProtoLog> that can be used immediately
+  //   INTERNAL_ERROR on IO error
+  static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
+      const Filesystem* filesystem, const std::string& file_path,
+      const Options& options);
+
+  // Verifies that the existing proto log is in a good state. If not in a good
+  // state, then the proto log may be truncated to the last good state and
+  // content will be lost.
+  //
+  // Returns:
+  //   std::unique_ptr<FileBackedProtoLog> that can be used immediately
+  //   INTERNAL_ERROR on IO error or internal inconsistencies in the file
+  //   INVALID_ARGUMENT_ERROR if options aren't consistent with previous
+  //   instances
+  static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
+      const Filesystem* filesystem, const std::string& file_path,
+      const Options& options, int64_t file_size);
+
+  // Takes an initial checksum and updates it with the content between `start`
+  // and `end` offsets in the file.
+  //
+  // Returns:
+  //   Crc of the content between `start`, inclusive, and `end`, exclusive.
+  //   INTERNAL_ERROR on IO error
+  //   INVALID_ARGUMENT_ERROR if start and end aren't within the file size
+  static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
+      const Filesystem* filesystem, const std::string& file_path,
+      Crc32 initial_crc, int64_t start, int64_t end);
+
+  // Magic number added in front of every proto. Used when reading out protos
+  // as a first check for corruption in each entry in the file. Even if there is
+  // a corruption, the best we can do is roll back to our last recovery point
+  // and throw away un-flushed data. We can discard/reuse this byte if needed so
+  // that we have 4 bytes to store the size of protos, and increase the size of
+  // protos we support.
+  static constexpr uint8_t kProtoMagic = 0x5C;
+
+  // Our internal max for protos.
+  //
+  // WARNING: Changing this to a larger number may invalidate our assumption
+  // that the proto size can safely be stored in the last 3 bytes of the proto
+  // header.
+  static constexpr int kMaxProtoSize = (1 << 24) - 1;  // 16MiB
+  static_assert(kMaxProtoSize <= 0x00FFFFFF,
+                "kMaxProtoSize doesn't fit in 3 bytes");
+
+  // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
+  static constexpr int kDeflateCompressionLevel = 3;
+
+  // Chunks of the file to mmap at a time, so we don't mmap the entire file.
+  static constexpr int kMmapChunkSize = 4 * 1024;
+
+  // Append-mode fd of the underlying log file; owned by this instance.
+  ScopedFd fd_;
+  const Filesystem* const filesystem_;
+  const std::string file_path_;
+
+  // Reads out the metadata of a proto located at file_offset from the file.
+  //
+  // Returns:
+  //   Proto's metadata on success
+  //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
+  //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
+  static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
+      MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+  // In-memory copy of the file header; written back in PersistToDisk().
+  std::unique_ptr<Header> header_;
+};
+
+// Out-of-class definition of kProtoMagic; required for ODR-use of a static
+// constexpr member before C++17 (redundant but harmless afterwards).
+template <typename ProtoT>
+constexpr uint8_t FileBackedProtoLog<ProtoT>::kProtoMagic;
+
+// Private constructor; only reachable via the ::Create factory, which has
+// already validated options and initialized/verified the header.
+template <typename ProtoT>
+FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
+                                               const std::string& file_path,
+                                               std::unique_ptr<Header> header)
+    : filesystem_(filesystem),
+      file_path_(file_path),
+      header_(std::move(header)) {
+  // NOTE(review): the result of OpenForAppend is not checked here; an invalid
+  // fd_ would presumably surface as an IO error on the first Write — confirm.
+  fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
+}
+
+// Best-effort checkpoint on destruction: updates the checksum and rewind
+// offset so the next Create() doesn't treat the log tail as data loss.
+// Failure is only logged since destructors can't propagate errors.
+template <typename ProtoT>
+FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() {
+  if (!PersistToDisk().ok()) {
+    ICING_LOG(WARNING)
+        << "Error persisting to disk during destruction of FileBackedProtoLog: "
+        << file_path_;
+  }
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
+FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
+                                   const std::string& file_path,
+                                   const Options& options) {
+  // Validate options before touching the filesystem.
+  if (options.max_proto_size <= 0) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "options.max_proto_size must be greater than 0, was %d",
+        options.max_proto_size));
+  }
+
+  // Since we store the proto_size in 3 bytes, we can only support protos of up
+  // to 16MiB.
+  if (options.max_proto_size > kMaxProtoSize) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "options.max_proto_size must be under 16MiB, was %d",
+        options.max_proto_size));
+  }
+
+  if (!filesystem->FileExists(file_path.c_str())) {
+    return InitializeNewFile(filesystem, file_path, options);
+  }
+
+  int64_t file_size = filesystem->GetFileSize(file_path.c_str());
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Bad file size '", file_path, "'"));
+  }
+
+  // An existing but empty file has no header to validate, so treat it the
+  // same as a brand new file.
+  if (file_size == 0) {
+    return InitializeNewFile(filesystem, file_path, options);
+  }
+
+  return InitializeExistingFile(filesystem, file_path, options, file_size);
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
+FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
+                                              const std::string& file_path,
+                                              const Options& options) {
+  // Create the header. Defaults leave log_checksum = 0 (empty log) and
+  // rewind_offset = sizeof(Header).
+  std::unique_ptr<Header> header = std::make_unique<Header>();
+  header->compress = options.compress;
+  header->max_proto_size = options.max_proto_size;
+  header->header_checksum = header->CalculateHeaderChecksum();
+
+  if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write header for file: ", file_path));
+  }
+
+  // A fresh file has no prior content, so data_loss is trivially false.
+  CreateResult create_result = {
+      std::unique_ptr<FileBackedProtoLog<ProtoT>>(
+          new FileBackedProtoLog<ProtoT>(filesystem, file_path,
+                                         std::move(header))),
+      /*data_loss=*/false};
+
+  return create_result;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
+FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
+                                                   const std::string& file_path,
+                                                   const Options& options,
+                                                   int64_t file_size) {
+  // NOTE(review): signed int64_t compared against unsigned sizeof(); safe as
+  // long as callers never pass a negative file_size (Create() filters
+  // kBadFileSize) — confirm.
+  if (file_size < sizeof(Header)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("File header too short for: ", file_path));
+  }
+
+  std::unique_ptr<Header> header = std::make_unique<Header>();
+  if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
+                         /*offset=*/0)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to read header for file: ", file_path));
+  }
+
+  // Make sure the header is still valid before we use any of its values. This
+  // is covered by the header_checksum check below, but this is a quick check
+  // that can save us from an extra crc computation.
+  if (header->magic != Header::kMagic) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
+  }
+
+  if (header->header_checksum != header->CalculateHeaderChecksum()) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header checksum for: ", file_path));
+  }
+
+  if (header->compress != options.compress) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Inconsistent compress option, expected %d, actual %d",
+        header->compress, options.compress));
+  }
+
+  if (header->max_proto_size > options.max_proto_size) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Max proto size cannot be smaller than previous "
+        "instantiations, previous size %d, wanted size %d",
+        header->max_proto_size, options.max_proto_size));
+  }
+  // Record the (possibly larger) requested limit so future writes honor it.
+  header->max_proto_size = options.max_proto_size;
+
+  bool data_loss = false;
+  ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
+                         ComputeChecksum(filesystem, file_path, Crc32(),
+                                         sizeof(Header), file_size));
+  // Double check that the log checksum is the same as the one that was
+  // persisted last time. If not, we start recovery logic.
+  if (header->log_checksum != calculated_log_checksum.Get()) {
+    // Need to rewind the proto log since the checksums don't match
+    data_loss = true;
+    // Worst case, we have to rewind the entire log back to just the header
+    int64_t last_known_good = sizeof(Header);
+
+    // Calculate the checksum of the log contents just up to the last rewind
+    // offset point. This will be valid if we just appended contents to the log
+    // without updating the checksum, and we can rewind back to this point
+    // safely.
+    ICING_ASSIGN_OR_RETURN(
+        calculated_log_checksum,
+        ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header),
+                        header->rewind_offset));
+    if (header->log_checksum == calculated_log_checksum.Get()) {
+      // Check if it matches our last rewind state. If so, this becomes our last
+      // good state and we can safely truncate and recover from here.
+      last_known_good = header->rewind_offset;
+    } else {
+      // Otherwise, we're going to truncate the entire log and this resets the
+      // checksum to an empty log state.
+      header->log_checksum = 0;
+    }
+
+    if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
+      return absl_ports::InternalError(
+          absl_ports::StrCat("Error truncating file: ", file_path));
+    }
+
+    ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
+                    << last_known_good;
+  }
+
+  CreateResult create_result = {
+      std::unique_ptr<FileBackedProtoLog<ProtoT>>(
+          new FileBackedProtoLog<ProtoT>(filesystem, file_path,
+                                         std::move(header))),
+      data_loss};
+
+  return create_result;
+}
+
+// Checksums the file content in [start, end), folding it into initial_crc.
+// Content is mmapped and consumed in fixed-size chunks so we never map the
+// whole file at once. See the in-class declaration for the full contract.
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
+    const Filesystem* filesystem, const std::string& file_path,
+    Crc32 initial_crc, int64_t start, int64_t end) {
+  auto mmapped_file = MemoryMappedFile(*filesystem, file_path,
+                                       MemoryMappedFile::Strategy::READ_ONLY);
+  Crc32 new_crc(initial_crc.Get());
+
+  if (start < 0) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Starting checksum offset of file '%s' must be greater than or equal "
+        "to 0, was %lld",
+        file_path.c_str(), static_cast<long long>(start)));
+  }
+
+  int64_t file_size = filesystem->GetFileSize(file_path.c_str());
+  if (end > file_size) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Ending checksum offset of file '%s' must be within "
+        "file size of %lld, was %lld",
+        file_path.c_str(), static_cast<long long>(file_size),
+        static_cast<long long>(end)));
+  }
+
+  // The loop index must be int64_t: `start`/`end` are 64-bit file offsets,
+  // and a plain `int` index would truncate/overflow for offsets past 2GiB.
+  for (int64_t i = start; i < end; i += kMmapChunkSize) {
+    // Don't read past `end`; the last chunk may be short. The chunk size
+    // always fits in an int since it is at most kMmapChunkSize.
+    int next_chunk_size = kMmapChunkSize;
+    if ((i + kMmapChunkSize) >= end) {
+      next_chunk_size = end - i;
+    }
+
+    ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
+
+    auto mmap_str = std::string_view(mmapped_file.region(), next_chunk_size);
+    new_crc.Append(mmap_str);
+  }
+
+  return new_crc;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto(
+    const ProtoT& proto) {
+  int64_t proto_size = proto.ByteSizeLong();
+  int32_t metadata;
+  int metadata_size = sizeof(metadata);
+  // Capture the append position up front; this is the offset returned to the
+  // caller for later ReadProto() calls.
+  int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
+
+  if (proto_size > header_->max_proto_size) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "proto_size, %lld, was too large to write. Max is %d",
+        static_cast<long long>(proto_size), header_->max_proto_size));
+  }
+
+  // At this point, we've guaranteed that proto_size is under kMaxProtoSize (see
+  // ::Create), so we can safely store it in an int.
+  int final_size = 0;
+
+  std::string proto_str;
+  google3_proto_compat::io::StringOutputStream proto_stream(&proto_str);
+
+  if (header_->compress) {
+    google3_proto_compat::io::GzipOutputStream::Options options;
+    options.format = google3_proto_compat::io::GzipOutputStream::ZLIB;
+    options.compression_level = kDeflateCompressionLevel;
+
+    google3_proto_compat::io::GzipOutputStream compressing_stream(&proto_stream,
+                                                                  options);
+
+    // Close() is required to flush the compressor's remaining buffered bytes
+    // into proto_str.
+    bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
+                   compressing_stream.Close();
+
+    if (!success) {
+      return absl_ports::InternalError("Error compressing proto.");
+    }
+
+    final_size = proto_str.size();
+
+    // In case the compressed proto is larger than the original proto, we also
+    // can't write it.
+    if (final_size > header_->max_proto_size) {
+      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+          "Compressed proto size, %d, was greater than "
+          "max_proto_size, %d",
+          final_size, header_->max_proto_size));
+    }
+  } else {
+    // Serialize the proto directly into the write buffer at an offset of the
+    // metadata.
+    proto.SerializeToZeroCopyStream(&proto_stream);
+    final_size = proto_str.size();
+  }
+
+  // 1st byte for magic, next 3 bytes for proto size.
+  metadata = (kProtoMagic << 24) | final_size;
+
+  // Actually write metadata, has to be done after we know the possibly
+  // compressed proto size
+  if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write proto metadata to: ", file_path_));
+  }
+
+  // Write the serialized proto
+  if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write proto to: ", file_path_));
+  }
+
+  return current_position;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
+    int64_t file_offset) const {
+  int64_t file_size = filesystem_->GetFileSize(fd_.get());
+  MemoryMappedFile mmapped_file(*filesystem_, file_path_,
+                                MemoryMappedFile::Strategy::READ_ONLY);
+  if (file_offset >= file_size) {
+    // file_size points to the next byte to write at, so subtract one to get the
+    // inclusive, actual size of file.
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+                                      "out of range of the file size, %lld",
+                                      static_cast<long long>(file_offset),
+                                      static_cast<long long>(file_size - 1)));
+  }
+
+  // Read out the metadata
+  ICING_ASSIGN_OR_RETURN(
+      int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+  // Copy out however many bytes it says the proto is; the size lives in the
+  // low 3 bytes of the metadata word (the top byte is kProtoMagic).
+  int stored_size = metadata & 0x00FFFFFF;
+
+  ICING_RETURN_IF_ERROR(
+      mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+  google3_proto_compat::io::ArrayInputStream proto_stream(
+      mmapped_file.mutable_region(), stored_size);
+
+  // Deserialize proto
+  // NOTE(review): the ParseFromZeroCopyStream return value is ignored, so a
+  // corrupt payload would yield a default/partial proto instead of an error —
+  // confirm this is intended.
+  ProtoT proto;
+  if (header_->compress) {
+    google3_proto_compat::io::GzipInputStream decompress_stream(&proto_stream);
+    proto.ParseFromZeroCopyStream(&decompress_stream);
+  } else {
+    proto.ParseFromZeroCopyStream(&proto_stream);
+  }
+
+  return proto;
+}
+
+// Reports the log's on-disk footprint in bytes, delegating to the filesystem.
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
+    const {
+  int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
+  if (size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError("Failed to get disk usage of proto log");
+  }
+  return size;
+}
+
+template <typename ProtoT>
+FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
+                                               const std::string& file_path,
+                                               int64_t initial_offset)
+    : mmapped_file_(filesystem, file_path,
+                    MemoryMappedFile::Strategy::READ_ONLY),
+      initial_offset_(initial_offset),
+      current_offset_(kInvalidOffset),
+      file_size_(filesystem.GetFileSize(file_path.c_str())) {
+  if (file_size_ == Filesystem::kBadFileSize) {
+    // Fails all Advance() calls: any offset is >= a zero file size, so the
+    // iterator behaves as if the log were empty.
+    file_size_ = 0;
+  }
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
+  if (current_offset_ == kInvalidOffset) {
+    // First Advance() call
+    current_offset_ = initial_offset_;
+  } else {
+    // Jumps to the next proto position by skipping over the current entry:
+    // 4 bytes of metadata plus the proto size encoded in its low 3 bytes.
+    ICING_ASSIGN_OR_RETURN(
+        int metadata,
+        ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
+    int proto_size = metadata & 0x00FFFFFF;
+    current_offset_ += sizeof(metadata) + proto_size;
+  }
+
+  if (current_offset_ < file_size_) {
+    return libtextclassifier3::Status::OK;
+  } else {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "The next proto offset, %lld, is out of file range [0, %lld)",
+        static_cast<long long>(current_offset_),
+        static_cast<long long>(file_size_)));
+  }
+}
+
+// Returns the offset of the proto the iterator currently points at, or
+// kInvalidOffset (-1) if Advance() has never been called.
+template <typename ProtoT>
+int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
+  return current_offset_;
+}
+
+// Starts iteration at the first proto, which sits immediately after the
+// fixed-size file header.
+template <typename ProtoT>
+typename FileBackedProtoLog<ProtoT>::Iterator
+FileBackedProtoLog<ProtoT>::GetIterator() {
+  return Iterator(*filesystem_, file_path_, /*initial_offset=*/sizeof(Header));
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
+    MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
+  // Checks file_offset
+  if (file_offset >= file_size) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "offset, %lld, is out of file range [0, %lld)",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size)));
+  }
+  int metadata;
+  int metadata_size = sizeof(metadata);
+  // NOTE(review): `>=` also rejects metadata that ends exactly at EOF, i.e. a
+  // zero-length serialized proto as the last entry — confirm empty protos are
+  // not expected in the log.
+  if (file_offset + metadata_size >= file_size) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Wrong metadata offset %lld, metadata doesn't fit in "
+        "with file range [0, %lld)",
+        static_cast<long long>(file_offset),
+        static_cast<long long>(file_size)));
+  }
+  // Reads metadata
+  ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
+  memcpy(&metadata, mmapped_file->region(), metadata_size);
+  // Checks magic number stored in the top byte of the metadata word.
+  uint8_t stored_k_proto_magic = metadata >> 24;
+  if (stored_k_proto_magic != kProtoMagic) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
+        stored_k_proto_magic));
+  }
+  return metadata;
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
+  int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+  if (file_size == header_->rewind_offset) {
+    // No changes made, don't need to update the checksum.
+    return libtextclassifier3::Status::OK;
+  }
+
+  int64_t new_content_size = file_size - header_->rewind_offset;
+  Crc32 crc;
+  if (new_content_size < 0) {
+    // File shrunk, recalculate the entire checksum.
+    ICING_ASSIGN_OR_RETURN(
+        crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
+                             file_size));
+  } else {
+    // Append new changes to the existing checksum. Seeding the crc with the
+    // stored log_checksum lets us checksum only the bytes added since the
+    // last checkpoint instead of rescanning the whole log.
+    ICING_ASSIGN_OR_RETURN(
+        crc,
+        ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum),
+                        header_->rewind_offset, file_size));
+  }
+
+  // Checkpoint: the current end of file becomes the new rewind point.
+  header_->log_checksum = crc.Get();
+  header_->rewind_offset = file_size;
+  header_->header_checksum = header_->CalculateHeaderChecksum();
+
+  // Overwrite the header in place at the start of the file.
+  if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+                           sizeof(Header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to update header to: ", file_path_));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Convenience overload: checksums the entire log content (everything after
+// the fixed-size header) starting from an empty crc.
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+FileBackedProtoLog<ProtoT>::ComputeChecksum() {
+  return FileBackedProtoLog<ProtoT>::ComputeChecksum(
+      filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header),
+      /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_FILE_BACKED_PROTO_LOG_H_
diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc
new file mode 100644
index 0000000..26e0fb0
--- /dev/null
+++ b/icing/file/file-backed-proto-log_benchmark.cc
@@ -0,0 +1,169 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <random>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/document-builder.h"
+#include "icing/file/file-backed-proto-log.h"
+#include "icing/file/filesystem.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+
+// go/microbenchmarks
+//
+// To build and run on a local machine:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:file-backed-proto-log_benchmark
+//
+// $ blaze-bin/icing/file/file-backed-proto-log_benchmark
+// --benchmarks=all
+//
+//
+// To build and run on an Android device (must be connected and rooted):
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:file-backed-proto-log_benchmark
+//
+// $ adb root
+//
+// $ adb push
+// blaze-bin/icing/file/file-backed-proto-log_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/file-backed-proto-log-benchmark
+// --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Benchmarks appending a DocumentProto holding one random string property of
+// state.range(0) bytes to a compressed proto log. Bytes processed is reported
+// in terms of the string payload only.
+static void BM_Write(benchmark::State& state) {
+  const Filesystem filesystem;
+  int string_length = state.range(0);
+  const std::string file_path = IcingStringUtil::StringPrintf(
+      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Make sure it doesn't already exist.
+  filesystem.DeleteFile(file_path.c_str());
+
+  auto proto_log =
+      FileBackedProtoLog<DocumentProto>::Create(
+          &filesystem, file_path,
+          FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
+          .ValueOrDie()
+          .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  std::default_random_engine random;
+  const std::string rand_str =
+      RandomString(kAlNumAlphabet, string_length, &random);
+
+  auto document_properties = document.add_properties();
+  document_properties->set_name("string property");
+  document_properties->add_string_values(rand_str);
+
+  for (auto _ : state) {
+    // DoNotOptimize keeps the compiler from discarding the returned StatusOr.
+    testing::DoNotOptimize(proto_log->WriteProto(document));
+  }
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          string_length);
+
+  // Cleanup after ourselves
+  filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Write)
+    ->Arg(1)
+    ->Arg(32)
+    ->Arg(512)
+    ->Arg(1024)
+    ->Arg(4 * 1024)
+    ->Arg(8 * 1024)
+    ->Arg(16 * 1024)
+    ->Arg(32 * 1024)
+    ->Arg(256 * 1024)
+    ->Arg(2 * 1024 * 1024)
+    ->Arg(8 * 1024 * 1024)
+    ->Arg(15 * 1024 * 1024);  // We do 15MiB here since our max proto size is
+                              // 16MiB, and we need some extra space for the
+                              // rest of the document properties
+
+// Benchmarks repeatedly reading back a single previously-written proto at a
+// fixed offset from a compressed proto log. Mirrors BM_Write's setup so the
+// two are directly comparable.
+static void BM_Read(benchmark::State& state) {
+  const Filesystem filesystem;
+  int string_length = state.range(0);
+  const std::string file_path = IcingStringUtil::StringPrintf(
+      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+  int max_proto_size = (1 << 24) - 1;  // 16 MiB
+  bool compress = true;
+
+  // Make sure it doesn't already exist.
+  filesystem.DeleteFile(file_path.c_str());
+
+  auto proto_log =
+      FileBackedProtoLog<DocumentProto>::Create(
+          &filesystem, file_path,
+          FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
+          .ValueOrDie()
+          .proto_log;
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  std::default_random_engine random;
+  const std::string rand_str =
+      RandomString(kAlNumAlphabet, string_length, &random);
+
+  auto document_properties = document.add_properties();
+  document_properties->set_name("string property");
+  document_properties->add_string_values(rand_str);
+
+  // Write the one proto the benchmark loop will read back.
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
+                             proto_log->WriteProto(document));
+
+  for (auto _ : state) {
+    // DoNotOptimize keeps the compiler from discarding the returned StatusOr.
+    testing::DoNotOptimize(proto_log->ReadProto(write_offset));
+  }
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          string_length);
+
+  // Cleanup after ourselves
+  filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Read)
+    ->Arg(1)
+    ->Arg(32)
+    ->Arg(512)
+    ->Arg(1024)
+    ->Arg(4 * 1024)
+    ->Arg(8 * 1024)
+    ->Arg(16 * 1024)
+    ->Arg(32 * 1024)
+    ->Arg(256 * 1024)
+    ->Arg(2 * 1024 * 1024)
+    ->Arg(8 * 1024 * 1024)
+    ->Arg(15 * 1024 * 1024);  // We do 15MiB here since our max proto size is
+                              // 16MiB, and we need some extra space for the
+                              // rest of the document properties
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
new file mode 100644
index 0000000..3a9060d
--- /dev/null
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -0,0 +1,519 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/file-backed-proto-log.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::A;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::Not;
+using ::testing::NotNull;
+using ::testing::Pair;
+using ::testing::Return;
+
+// Fixture giving each test a fresh log file path in the temp dir and removing
+// the file afterwards.
+class FileBackedProtoLogTest : public ::testing::Test {
+ protected:
+  // Adds a user-defined default construct because a const member variable may
+  // make the compiler accidentally delete the default constructor.
+  // https://stackoverflow.com/a/47368753
+  FileBackedProtoLogTest() {}
+
+  void SetUp() override { file_path_ = GetTestTempDir() + "/proto_log"; }
+
+  void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
+
+  const Filesystem filesystem_;
+  std::string file_path_;
+  // Default log options shared by most tests below.
+  bool compress_ = true;
+  int64_t max_proto_size_ = 256 * 1024;  // 256 KiB
+};
+
+// Create() validates options: rejects a non-positive max_proto_size, succeeds
+// with valid options, and refuses to reopen an existing file with different
+// options.
+TEST_F(FileBackedProtoLogTest, Initialize) {
+  // max_proto_size must be greater than 0
+  int invalid_max_proto_size = 0;
+  ASSERT_THAT(FileBackedProtoLog<DocumentProto>::Create(
+                  &filesystem_, file_path_,
+                  FileBackedProtoLog<DocumentProto>::Options(
+                      compress_, invalid_max_proto_size)),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      FileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                     max_proto_size_)));
+  EXPECT_THAT(create_result.proto_log, NotNull());
+  EXPECT_FALSE(create_result.data_loss);
+
+  // Can't recreate the same file with different options.
+  ASSERT_THAT(FileBackedProtoLog<DocumentProto>::Create(
+                  &filesystem_, file_path_,
+                  FileBackedProtoLog<DocumentProto>::Options(!compress_,
+                                                             max_proto_size_)),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// A proto whose serialized size exceeds the configured max_proto_size must be
+// rejected by WriteProto.
+TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) {
+  int max_proto_size = 1;
+  ICING_ASSERT_OK_AND_ASSIGN(
+      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      FileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                     max_proto_size)));
+  auto proto_log = std::move(create_result.proto_log);
+  EXPECT_FALSE(create_result.data_loss);
+
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // Proto is too large for the max_proto_size_in
+  ASSERT_THAT(proto_log->WriteProto(document),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Overwriting a proto entry's magic number on disk must make ReadProto fail
+// with INTERNAL (corruption detected).
+TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      FileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                     max_proto_size_)));
+  auto proto_log = std::move(create_result.proto_log);
+  EXPECT_FALSE(create_result.data_loss);
+
+  // Write a proto
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
+                             proto_log->WriteProto(document));
+
+  // The 4 bytes of metadata that just doesn't have the same kProtoMagic
+  // specified in file-backed-proto-log.h
+  uint32_t wrong_magic = 0x7E000000;
+
+  // Sanity check that we opened the file correctly
+  int fd = filesystem_.OpenForWrite(file_path_.c_str());
+  ASSERT_GT(fd, 0);
+
+  // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of
+  // a proto entry.
+  filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic));
+
+  ASSERT_THAT(proto_log->ReadProto(file_offset),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+// End-to-end write/read round trip with compression disabled, including
+// reopening the log and appending across instances.
+TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
+  int last_offset;
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/false, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    // Write the first proto
+    DocumentProto document1 =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+                               proto_log->WriteProto(document1));
+
+    int document1_offset = written_position;
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document1)));
+
+    // Write a second proto that's close to the max size. Leave some room for
+    // the rest of the proto properties.
+    std::string long_str(max_proto_size_ - 1024, 'a');
+    DocumentProto document2 = DocumentBuilder()
+                                  .SetKey("namespace2", "uri2")
+                                  .AddStringProperty("long_str", long_str)
+                                  .Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(written_position,
+                               proto_log->WriteProto(document2));
+
+    int document2_offset = written_position;
+    last_offset = written_position;
+    // Writes are appends, so the second proto must land after the first.
+    ASSERT_GT(document2_offset, document1_offset);
+
+    // Check the second proto
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+  }
+
+  {
+    // Make a new proto_log with the same file_path, and make sure we
+    // can still write to the same underlying file.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/false, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    // Write a third proto
+    DocumentProto document3 =
+        DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+    // New writes must land after everything persisted by the first instance.
+    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+                IsOkAndHolds(Gt(last_offset)));
+  }
+}
+
+// Same round-trip coverage as ReadWriteUncompressedProto, but with
+// compression enabled.
+TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
+  int last_offset;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/true, max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    // Write the first proto
+    DocumentProto document1 =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+                               proto_log->WriteProto(document1));
+
+    int document1_offset = written_position;
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document1)));
+
+    // Write a second proto that's close to the max size. Leave some room for
+    // the rest of the proto properties.
+    std::string long_str(max_proto_size_ - 1024, 'a');
+    DocumentProto document2 = DocumentBuilder()
+                                  .SetKey("namespace2", "uri2")
+                                  .AddStringProperty("long_str", long_str)
+                                  .Build();
+
+    ICING_ASSERT_OK_AND_ASSIGN(written_position,
+                               proto_log->WriteProto(document2));
+
+    int document2_offset = written_position;
+    last_offset = written_position;
+    // Writes are appends, so the second proto must land after the first.
+    ASSERT_GT(document2_offset, document1_offset);
+
+    // Check the second proto
+    ASSERT_THAT(proto_log->ReadProto(written_position),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+  }
+
+  {
+    // Make a new proto_log with the same file_path, and make sure we
+    // can still write to the same underlying file.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(
+                /*compress_in=*/true, max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    // Write a third proto
+    DocumentProto document3 =
+        DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+    // New writes must land after everything persisted by the first instance.
+    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+                IsOkAndHolds(Gt(last_offset)));
+  }
+}
+
+// A corrupted header field (negative rewind offset) must cause
+// re-initialization to fail with INTERNAL rather than silently recovering.
+TEST_F(FileBackedProtoLogTest, CorruptHeader) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                       max_proto_size_)));
+    auto recreated_proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    // Overwrite just the rewind_offset field within the on-disk Header.
+    int corrupt_offset =
+        offsetof(FileBackedProtoLog<DocumentProto>::Header, rewind_offset);
+    // We should never rewind to a negative offset.
+    int invalid_rewind_offset = -1;
+    filesystem_.PWrite(file_path_.c_str(), corrupt_offset,
+                       &invalid_rewind_offset, sizeof(invalid_rewind_offset));
+  }
+
+  {
+    // Reinitialize the same proto_log
+    ASSERT_THAT(FileBackedProtoLog<DocumentProto>::Create(
+                    &filesystem_, file_path_,
+                    FileBackedProtoLog<DocumentProto>::Options(
+                        compress_, max_proto_size_)),
+                StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+  }
+}
+
+// Corruption within the already-persisted region is detected on reopen; the
+// log recovers by truncating back to just the header and reports data loss.
+TEST_F(FileBackedProtoLogTest, CorruptContent) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                       max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    DocumentProto document =
+        DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write and persist a document.
+    ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
+                               proto_log->WriteProto(document));
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+    // "Corrupt" the content written in the log.
+    document.set_uri("invalid");
+    std::string serialized_document = document.SerializeAsString();
+    filesystem_.PWrite(file_path_.c_str(), document_offset,
+                       serialized_document.data(), serialized_document.size());
+  }
+
+  {
+    // We can recover, but we have data loss.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                       max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_TRUE(create_result.data_loss);
+
+    // Lost everything in the log since the rewind position doesn't help if
+    // there's been data corruption within the persisted region
+    ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
+              sizeof(FileBackedProtoLog<DocumentProto>::Header));
+  }
+}
+
+// After a simulated crash (garbage appended past the persisted region), the
+// log rewinds to the last PersistToDisk point, keeping all earlier protos.
+TEST_F(FileBackedProtoLogTest, PersistToDisk) {
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace1", "uri1").Build();
+  DocumentProto document2 =
+      DocumentBuilder().SetKey("namespace2", "uri2").Build();
+  int document1_offset, document2_offset;
+  int log_size;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                       max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    // Write and persist the first proto
+    ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
+                               proto_log->WriteProto(document1));
+    ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+    // Write, but don't explicitly persist the second proto
+    ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+                               proto_log->WriteProto(document2));
+
+    // Check that what we read is what we wrote
+    ASSERT_THAT(proto_log->ReadProto(document1_offset),
+                IsOkAndHolds(EqualsProto(document1)));
+    ASSERT_THAT(proto_log->ReadProto(document2_offset),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    log_size = filesystem_.GetFileSize(file_path_.c_str());
+    ASSERT_GT(log_size, 0);
+  }
+
+  {
+    // The header rewind position and checksum aren't updated in this "system
+    // crash" scenario.
+
+    std::string bad_proto =
+        "some incomplete proto that we didn't finish writing before the system "
+        "crashed";
+    filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
+                       bad_proto.size());
+
+    // Double check that we actually wrote something to the underlying file
+    ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
+  }
+
+  {
+    // We can recover, but we have data loss
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                       max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    ASSERT_TRUE(create_result.data_loss);
+
+    // Check that everything was persisted across instances
+    ASSERT_THAT(proto_log->ReadProto(document1_offset),
+                IsOkAndHolds(EqualsProto(document1)));
+    ASSERT_THAT(proto_log->ReadProto(document2_offset),
+                IsOkAndHolds(EqualsProto(document2)));
+
+    // We correctly rewound to the last good state.
+    ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
+  }
+}
+
+// Covers the iterator: empty log, iterating over written entries in order,
+// and graceful failure when the underlying filesystem misbehaves.
+TEST_F(FileBackedProtoLogTest, Iterator) {
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace", "uri1").Build();
+  DocumentProto document2 =
+      DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+      FileBackedProtoLog<DocumentProto>::Create(
+          &filesystem_, file_path_,
+          FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                     max_proto_size_)));
+  auto proto_log = std::move(create_result.proto_log);
+  EXPECT_FALSE(create_result.data_loss);
+
+  {
+    // Empty iterator
+    auto iterator = proto_log->GetIterator();
+    ASSERT_THAT(iterator.Advance(),
+                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  }
+
+  {
+    // Iterates through some documents
+    ICING_ASSERT_OK(proto_log->WriteProto(document1));
+    ICING_ASSERT_OK(proto_log->WriteProto(document2));
+    auto iterator = proto_log->GetIterator();
+    // 1st proto
+    ICING_ASSERT_OK(iterator.Advance());
+    ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+                IsOkAndHolds(EqualsProto(document1)));
+    // 2nd proto
+    ICING_ASSERT_OK(iterator.Advance());
+    ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+                IsOkAndHolds(EqualsProto(document2)));
+    // Tries to advance
+    ASSERT_THAT(iterator.Advance(),
+                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  }
+
+  {
+    // Iterator with bad filesystem
+    MockFilesystem mock_filesystem;
+    ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
+        .WillByDefault(Return(Filesystem::kBadFileSize));
+    FileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
+        mock_filesystem, file_path_, /*initial_offset=*/0);
+    ASSERT_THAT(bad_iterator.Advance(),
+                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  }
+}
+
+// ComputeChecksum is deterministic, stable across instances and across
+// PersistToDisk, and changes when the log content changes.
+TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+  Crc32 checksum;
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                       max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    ICING_EXPECT_OK(proto_log->WriteProto(document));
+
+    ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
+
+    // Calling it twice with no changes should get us the same checksum
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+  }
+
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+        FileBackedProtoLog<DocumentProto>::Create(
+            &filesystem_, file_path_,
+            FileBackedProtoLog<DocumentProto>::Options(compress_,
+                                                       max_proto_size_)));
+    auto proto_log = std::move(create_result.proto_log);
+    EXPECT_FALSE(create_result.data_loss);
+
+    // Checksum should be consistent across instances
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+    // PersistToDisk shouldn't affect the checksum value
+    ICING_EXPECT_OK(proto_log->PersistToDisk());
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+    // Check that modifying the log leads to a different checksum
+    ICING_EXPECT_OK(proto_log->WriteProto(document));
+    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
+  }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/file-backed-proto.h b/icing/file/file-backed-proto.h
new file mode 100644
index 0000000..1dc19ca
--- /dev/null
+++ b/icing/file/file-backed-proto.h
@@ -0,0 +1,251 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A simple file-backed proto with an in-memory cache.
+// WARNING: Only use this for small protos. Files storing larger protos can
+// benefit from more sophisticated strategies like chunked reads/writes,
+// using mmap and ideally, not even using protos.
+//
+// TODO(b/133793579) Consider exposing a checksum mismatch to callers.
+
+#ifndef ICING_FILE_FILE_BACKED_PROTO_H_
+#define ICING_FILE_FILE_BACKED_PROTO_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/thread_annotations.h"
+#include "icing/file/filesystem.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+// This class is go/thread-compatible
+template <typename ProtoT>
+class FileBackedProto {
+ public:
+  // Header stored at the beginning of the file before the proto.
+  struct Header {
+    static constexpr int32_t kMagic = 0x726f746f;
+
+    // Holds the magic as a quick sanity check against file corruption.
+    int32_t magic;
+
+    // Checksum of the serialized proto, for a more thorough check against file
+    // corruption.
+    uint32_t proto_checksum;
+  };
+
+  // Uses the specified file to read older versions of the proto and store
+  // newer versions of the proto.
+  //
+  // file_path : Must be a path within a directory that already exists.
+  FileBackedProto(const Filesystem& filesystem, std::string_view file_path);
+
+  // Returns a reference to the proto read from the file. It
+  // internally caches the read proto so that future calls are fast.
+  //
+  // NOTE: The caller does NOT get ownership of the object returned and
+  // the returned object is only valid till a new version of the proto is
+  // written to the file.
+  //
+  // Returns NOT_FOUND if the file was empty or never written to.
+  // Returns INTERNAL_ERROR if an IO error or a corruption was encountered.
+  libtextclassifier3::StatusOr<const ProtoT*> Read() const
+      LOCKS_EXCLUDED(mutex_);
+
+  // Writes the new version of the proto provided through to disk.
+  // Successful Write() invalidates any previously read version of the proto.
+  //
+  // Returns INTERNAL_ERROR if any IO error is encountered and will NOT
+  // invalidate any previously read versions of the proto.
+  //
+  // TODO(cassiewang) The implementation today loses old data if Write() fails.
+  // We should write to a tmp file first and rename the file to fix this.
+  // TODO(samzheng) Change to Write(ProtoT&& proto)
+  libtextclassifier3::Status Write(std::unique_ptr<ProtoT> proto)
+      LOCKS_EXCLUDED(mutex_);
+
+  // Disallow copy and assign.
+  FileBackedProto(const FileBackedProto&) = delete;
+  FileBackedProto& operator=(const FileBackedProto&) = delete;
+
+ private:
+  // Upper bound of file-size that is supported.
+  static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024;  // 1 MiB.
+
+  // Used to provide reader and writer locks
+  mutable absl_ports::shared_mutex mutex_;
+
+  const Filesystem* const filesystem_;
+  const std::string file_path_;
+
+  // Latest version of the proto read from, or written to, the file.
+  // mutable so Read() (a const method) can populate the cache.
+  mutable std::unique_ptr<ProtoT> cached_proto_ GUARDED_BY(mutex_);
+};
+
+// Out-of-line definition for ODR-uses of the static constexpr member
+// (required prior to C++17's implicitly-inline static constexpr data members).
+template <typename ProtoT>
+constexpr int32_t FileBackedProto<ProtoT>::kMaxFileSize;
+
+// Stores the filesystem pointer and file path; performs no file IO here —
+// all reads/writes are deferred to Read()/Write().
+template <typename ProtoT>
+FileBackedProto<ProtoT>::FileBackedProto(const Filesystem& filesystem,
+                                         const std::string_view file_path)
+    : filesystem_(&filesystem), file_path_(file_path) {}
+
+// Reads, validates (magic + checksum) and parses the proto from disk, caching
+// the result. Takes the lock exclusively because a cache miss mutates
+// cached_proto_.
+template <typename ProtoT>
+libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read()
+    const {
+  ICING_VLOG(1) << "Reading proto from file: " << file_path_;
+
+  absl_ports::unique_lock l(&mutex_);
+
+  // Return cached proto if we've already read from disk.
+  if (cached_proto_ != nullptr) {
+    ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_;
+    return cached_proto_.get();
+  }
+
+  int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+  if (file_size == Filesystem::kBadFileSize || file_size == 0) {
+    return absl_ports::NotFoundError(
+        absl_ports::StrCat("Missing file: ", file_path_));
+  }
+
+  // Guard against reading an unexpectedly huge file into memory.
+  if (file_size > kMaxFileSize) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "File larger than expected, couldn't read: ", file_path_));
+  }
+
+  ScopedFd fd(filesystem_->OpenForRead(file_path_.c_str()));
+  if (!fd.is_valid()) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Unable to open file for read: ", file_path_));
+  }
+
+  ICING_VLOG(1) << "Loading proto from file: " << file_path_
+                << " of size: " << file_size;
+
+  Header header;
+  if (!filesystem_->PRead(fd.get(), &header, sizeof(Header),
+                          /*offset=*/0)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Unable to read header of: ", file_path_));
+  }
+
+  // Quick sanity check before the (more expensive) checksum below.
+  if (header.magic != Header::kMagic) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header kMagic for: ", file_path_));
+  }
+
+  int proto_size = file_size - sizeof(Header);
+  auto buffer = std::make_unique<uint8_t[]>(proto_size);
+  if (!filesystem_->PRead(fd.get(), buffer.get(), proto_size,
+                          /*offset=*/sizeof(Header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("File read failed: ", file_path_));
+  }
+
+  // Verify the serialized bytes against the checksum stored in the header.
+  std::string_view buffer_str(reinterpret_cast<const char*>(buffer.get()),
+                              proto_size);
+  Crc32 crc;
+  crc.Append(buffer_str);
+  if (header.proto_checksum != crc.Get()) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Checksum of file does not match: ", file_path_));
+  }
+
+  auto proto = std::make_unique<ProtoT>();
+  if (!proto->ParseFromArray(buffer.get(), proto_size)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Proto parse failed. File corrupted: ", file_path_));
+  }
+
+  ICING_VLOG(1) << "Successfully read proto from file: " << file_path_;
+  cached_proto_ = std::move(proto);
+  return cached_proto_.get();
+}
+
+// Serializes the new proto, replaces the file contents (header + proto), and
+// syncs to disk. On any failure the cached proto is left untouched, so
+// previously-returned Read() pointers stay valid.
+template <typename ProtoT>
+libtextclassifier3::Status FileBackedProto<ProtoT>::Write(
+    std::unique_ptr<ProtoT> new_proto) {
+  ICING_VLOG(1) << "Writing proto to file: " << file_path_;
+
+  absl_ports::unique_lock l(&mutex_);
+
+  const std::string new_proto_str = new_proto->SerializeAsString();
+  if (new_proto_str.size() >= kMaxFileSize) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "New proto too large. size: %d; limit: %d.",
+        static_cast<int>(new_proto_str.size()), kMaxFileSize));
+  }
+
+  // Skip the disk write entirely when the serialized content is unchanged.
+  if (cached_proto_ != nullptr &&
+      cached_proto_->SerializeAsString() == new_proto_str) {
+    ICING_VLOG(1) << "Skip writing proto to file as contents are identical: "
+                  << file_path_;
+    return libtextclassifier3::Status::OK;
+  }
+
+  ScopedFd fd(filesystem_->OpenForWrite(file_path_.c_str()));
+  if (!fd.is_valid()) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Unable to open file for write: ", file_path_));
+  }
+
+  // Discard the old contents so the file holds exactly header + new proto.
+  if (!filesystem_->Truncate(fd.get(), 0)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to truncate file: ", file_path_));
+  }
+
+  Header header;
+  header.magic = Header::kMagic;
+
+  Crc32 crc;
+  crc.Append(new_proto_str);
+  header.proto_checksum = crc.Get();
+  if (!filesystem_->Write(fd.get(), &header, sizeof(Header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write header to file: ", file_path_));
+  }
+
+  if (!filesystem_->Write(fd.get(), new_proto_str.data(),
+                          new_proto_str.size())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write proto to file: ", file_path_));
+  }
+
+  if (!filesystem_->DataSync(fd.get())) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Failed to sync file; filename: %s; content_size: %d ",
+        file_path_.c_str(), static_cast<int>(new_proto_str.size())));
+  }
+
+  ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_;
+  // Only update the cache once everything is durably on disk.
+  cached_proto_ = std::move(new_proto);
+  return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_FILE_BACKED_PROTO_H_
diff --git a/icing/file/file-backed-proto_test.cc b/icing/file/file-backed-proto_test.cc
new file mode 100644
index 0000000..7f994fb
--- /dev/null
+++ b/icing/file/file-backed-proto_test.cc
@@ -0,0 +1,148 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/file-backed-proto.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Not;
+using ::testing::Pointee;
+
+namespace icing {
+namespace lib {
+namespace {
+
+// Test fixture that points each test at a fresh proto file path under the
+// test temp dir and deletes the file on teardown so tests don't leak state
+// into each other.
+class FileBackedProtoTest : public ::testing::Test {
+ protected:
+  void SetUp() override { filename_ = GetTestTempDir() + "/schema.pb"; }
+
+  void TearDown() override { filesystem_.DeleteFile(filename_.c_str()); }
+
+  Filesystem filesystem_;
+  std::string filename_;
+};
+
+// Writes a proto and verifies Read() returns equal contents, including on
+// repeated reads from the same instance.
+TEST_F(FileBackedProtoTest, SimpleReadWriteTest) {
+  DocumentProto document =
+      DocumentBuilder().SetKey("namespace", "google.com").Build();
+
+  FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
+  ICING_ASSERT_OK(file_proto.Write(absl::make_unique<DocumentProto>(document)));
+  EXPECT_THAT(file_proto.Read(), IsOkAndHolds(Pointee(EqualsProto(document))));
+  // Multiple reads work.
+  EXPECT_THAT(file_proto.Read(), IsOkAndHolds(Pointee(EqualsProto(document))));
+  EXPECT_THAT(file_proto.Read(), IsOkAndHolds(Pointee(EqualsProto(document))));
+}
+
+// Verifies that contents written by one FileBackedProto instance are visible
+// to a second instance backed by the same file, i.e. the data actually hits
+// disk rather than living only in the in-memory cache.
+TEST_F(FileBackedProtoTest, DataPersistsAcrossMultipleInstancesTest) {
+  DocumentProto document =
+      DocumentBuilder().SetKey("namespace", "google.com").Build();
+
+  {
+    FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
+    EXPECT_THAT(file_proto.Read(), Not(IsOk()));  // Nothing to read.
+
+    ICING_ASSERT_OK(
+        file_proto.Write(absl::make_unique<DocumentProto>(document)));
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(document))));
+  }
+  {
+    // Different instance of FileBackedProto.
+    FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(document))));
+  }
+}
+
+// Each Write() fully replaces the previous contents, and the latest value
+// survives re-opening the file with a new instance.
+TEST_F(FileBackedProtoTest, MultipleUpdatesToProtoTest) {
+  DocumentProto google_proto =
+      DocumentBuilder().SetKey("namespace", "google.com").Build();
+  DocumentProto youtube_proto =
+      DocumentBuilder().SetKey("namespace", "youtube.com").Build();
+  DocumentProto waze_proto =
+      DocumentBuilder().SetKey("namespace", "waze.com").Build();
+
+  {
+    FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
+    ICING_ASSERT_OK(
+        file_proto.Write(absl::make_unique<DocumentProto>(google_proto)));
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(google_proto))));
+
+    ICING_ASSERT_OK(
+        file_proto.Write(absl::make_unique<DocumentProto>(youtube_proto)));
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(youtube_proto))));
+  }
+  {
+    // Different instance of FileBackedProto; it should see the last write and
+    // accept further updates.
+    FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(youtube_proto))));
+
+    ICING_ASSERT_OK(
+        file_proto.Write(absl::make_unique<DocumentProto>(waze_proto)));
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(waze_proto))));
+
+    ICING_ASSERT_OK(
+        file_proto.Write(absl::make_unique<DocumentProto>(google_proto)));
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(google_proto))));
+  }
+}
+
+// An empty file path cannot be opened, so both Read() and Write() must
+// surface an error rather than crash.
+TEST_F(FileBackedProtoTest, InvalidFilenameTest) {
+  DocumentProto document =
+      DocumentBuilder().SetKey("namespace", "google.com").Build();
+
+  FileBackedProto<DocumentProto> file_proto(filesystem_, "");
+  EXPECT_THAT(file_proto.Read(), Not(IsOk()));
+  EXPECT_THAT(file_proto.Write(absl::make_unique<DocumentProto>(document)),
+              Not(IsOk()));
+}
+
+// Overwrites the proto payload on disk (just past the header) without
+// updating the stored checksum; a fresh instance's Read() must then fail the
+// integrity check.
+TEST_F(FileBackedProtoTest, FileCorruptionTest) {
+  DocumentProto document =
+      DocumentBuilder().SetKey("namespace", "google.com").Build();
+
+  {
+    FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
+    ICING_ASSERT_OK(
+        file_proto.Write(absl::make_unique<DocumentProto>(document)));
+    EXPECT_THAT(file_proto.Read(),
+                IsOkAndHolds(Pointee(EqualsProto(document))));
+  }
+
+  // Serialize a different document and splice its bytes over the stored
+  // payload, leaving the header (and its proto_checksum) untouched.
+  document.set_uri("g00gle.com");
+  std::string document_str = document.SerializeAsString();
+  filesystem_.PWrite(filename_.c_str(),
+                     /*offset=*/sizeof(FileBackedProto<DocumentProto>::Header),
+                     document_str.data(), document_str.size());
+
+  FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
+  EXPECT_THAT(file_proto.Read(), Not(IsOk()));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
new file mode 100644
index 0000000..dc8a675
--- /dev/null
+++ b/icing/file/file-backed-vector.h
@@ -0,0 +1,708 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A file-backed vector that can store fixed-width elements. It provides
+// built-in support for checksums to verify data integrity and an in-memory
+// cache for fast read/writes.
+//
+// If the file is corrupted/in an invalid state, all contents are lost, i.e.
+// there is no clear recovery path other than recreating/repopulating the
+// contents.
+//
+// Note on Performance:
+// The class keeps the vector in a mmapped area. This allows users to specify
+// which MemoryMappedFile::Strategy they wish to use with this class. The vector
+// will implicitly grow when the user tries to access an element beyond its
+// current size. Growing happens in 16KiB chunks, up to a maximum size of 1MiB.
+//
+// Note on Checksumming:
+// Checksumming happens lazily. We do tail checksums to avoid recalculating the
+// checksum of the entire file on each modfification. A full checksum will be
+// computed/verified at creation time, when persisting to disk, or whenever the
+// user manually calls ComputeChecksum(). A separate header checksum is kept for
+// a quick integrity check.
+//
+//
+// Usage:
+// RETURN_OR_ASSIGN(auto vector, FileBackedVector<char>::Create(...));
+//
+// ICING_RETURN_IF_ERROR(vector->Set(0, 'a'));
+// ICING_RETURN_IF_ERROR(vector->Set(1, 'b'));
+// ICING_RETURN_IF_ERROR(vector->Set(2, 'c'));
+//
+// vector->num_elements(); // Returns 3
+//
+// vector->At(2); // Returns 'c'
+//
+// vector->TruncateTo(1);
+// vector->num_elements(); // Returns 1
+// vector->At(0); // Returns 'a'
+//
+// vector->ComputeChecksum(); // Force a checksum update and gets the checksum
+//
+// vector->PersistToDisk(); // Persist contents to disk.
+
+#ifndef ICING_FILE_FILE_BACKED_VECTOR_H_
+#define ICING_FILE_FILE_BACKED_VECTOR_H_
+
+#include <stdint.h>
+#include <sys/mman.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+#include "icing/util/math-util.h"
+
+namespace icing {
+namespace lib {
+
+template <typename T>
+class FileBackedVector {
+ public:
+  // Header stored at the beginning of the file before the rest of the vector
+  // elements. Stores metadata on the vector.
+  struct Header {
+    // Static assert constants.
+    static constexpr int32_t kHeaderSize = 24;
+    static constexpr int32_t kHeaderChecksumOffset = 16;
+
+    static constexpr int32_t kMagic = 0x8bbbe237;
+
+    // Holds the magic as quick sanity check against file corruption
+    int32_t magic;
+
+    // Byte size of each element in the vector
+    int32_t element_size;
+
+    // Number of elements currently in the vector
+    int32_t num_elements;
+
+    // Checksum of the vector elements, doesn't include the header fields.
+    //
+    // TODO(cassiewang): Add a checksum state that can track if the checksum is
+    // fresh or stale. This lets us short circuit checksum computations if we
+    // know the checksum is fresh.
+    uint32_t vector_checksum;
+
+    // Must be below all actual header content fields and above the padding
+    // field. Contains the crc checksum of the preceding fields.
+    uint32_t header_checksum;
+
+    // This field has no actual meaning here but is just used as padding for
+    // the struct so the size of the struct can be a multiple of 8. Doing this
+    // makes the address right after the header a multiple of 8 and prevents a
+    // ubsan misalign-pointer-use error (go/ubsan).
+    //
+    // NOTE: please remove this when adding new fields and re-assert that the
+    // size is multiple of 8.
+    int32_t padding_for_ptr_alignment;
+
+    // Computes the crc of the fields preceding header_checksum (magic,
+    // element_size, num_elements, vector_checksum); the checksum and padding
+    // fields are deliberately excluded.
+    uint32_t CalculateHeaderChecksum() const {
+      // Sanity check that the memory layout matches the disk layout.
+      static_assert(std::is_standard_layout<FileBackedVector::Header>::value,
+                    "");
+      static_assert(sizeof(FileBackedVector::Header) == kHeaderSize, "");
+      static_assert(
+          sizeof(FileBackedVector::Header) % sizeof(void*) == 0,
+          "Header has insufficient padding for void* pointer alignment");
+      static_assert(offsetof(FileBackedVector::Header, header_checksum) ==
+                        kHeaderChecksumOffset,
+                    "");
+
+      Crc32 crc;
+      std::string_view header_str(
+          reinterpret_cast<const char*>(this),
+          offsetof(FileBackedVector::Header, header_checksum));
+      crc.Append(header_str);
+      return crc.Get();
+    }
+  };
+
+  // Creates a new FileBackedVector to read/write content to.
+  //
+  // filesystem: Object to make system level calls
+  // file_path : Specifies the file to persist the vector to; must be a path
+  //             within a directory that already exists.
+  // mmap_strategy : Strategy/optimizations to access the content in the
+  //                 vector, see MemoryMappedFile::Strategy for more details
+  //
+  // Returns:
+  //   UNIMPLEMENTED_ERROR if mmap_strategy is READ_WRITE_MANUAL_SYNC
+  //   INTERNAL_ERROR on I/O error or if an existing file fails validation
+  static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
+  Create(const Filesystem& filesystem, const std::string& file_path,
+         MemoryMappedFile::Strategy mmap_strategy);
+
+  // Deletes the FileBackedVector
+  //
+  // filesystem: Object to make system level calls
+  // file_path : Specifies the file the vector is persisted to.
+  static libtextclassifier3::Status Delete(const Filesystem& filesystem,
+                                           const std::string& file_path);
+
+  // Not copyable
+  FileBackedVector(const FileBackedVector&) = delete;
+  FileBackedVector& operator=(const FileBackedVector&) = delete;
+
+  // If the vector was created with
+  // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, then changes will be
+  // synced by the system and the checksum will be updated.
+  ~FileBackedVector();
+
+  // Accesses the element at idx.
+  //
+  // Returns:
+  //   OUT_OF_RANGE_ERROR if idx < 0 or >= num_elements()
+  libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const;
+
+  // Writes the value at idx.
+  //
+  // Returns:
+  //   OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size
+  libtextclassifier3::Status Set(int32_t idx, const T& value);
+
+  // Resizes to first len elements. The crc is not updated on truncation.
+  //
+  // Returns:
+  //   OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
+  libtextclassifier3::Status TruncateTo(int32_t len);
+
+  // Flushes content to underlying file.
+  libtextclassifier3::Status PersistToDisk();
+
+  // Calculates and returns the disk usage in bytes.
+  //
+  // Returns:
+  //   Disk usage on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+  // Accessors.
+  const T* array() const {
+    return reinterpret_cast<const T*>(mmapped_file_->region());
+  }
+
+  // NOTE(review): this is a const member function handing out a mutable
+  // pointer, and writes made through it within [0, num_elements()) are not
+  // recorded in the change tracking that Set() maintains for partial crc
+  // updates — presumably callers are expected to go through Set(); verify.
+  T* mutable_array() const {
+    return reinterpret_cast<T*>(mmapped_file_->mutable_region());
+  }
+
+  int32_t num_elements() const { return header_->num_elements; }
+
+  // Updates checksum of the vector contents and returns it.
+  //
+  // Returns:
+  //   INTERNAL_ERROR if the vector's internal state is inconsistent
+  libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ private:
+  // We track partial updates to the array for crc updating. This
+  // requires extra memory to keep track of original buffers but
+  // allows for much faster crc re-computation. This is the frac limit
+  // of byte len after which we will discard recorded changes and
+  // recompute the entire crc instead.
+  static constexpr int32_t kPartialCrcLimitDiv = 8;  // limit is 1/8th
+
+  // Grow file by at least this many elements if array is growable.
+  static constexpr int64_t kGrowElements = 1u << 14;  // 16K
+
+  // Max number of elements that can be held by the vector.
+  static constexpr int64_t kMaxNumElements = 1u << 20;  // 1M
+
+  // Can only be created through the factory ::Create function
+  FileBackedVector(const Filesystem& filesystem, const std::string& file_path,
+                   std::unique_ptr<Header> header,
+                   std::unique_ptr<MemoryMappedFile> mmapped_file);
+
+  // Initialize a new FileBackedVector, and create the file.
+  static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
+  InitializeNewFile(const Filesystem& filesystem, const std::string& file_path,
+                    ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy);
+
+  // Initialize a FileBackedVector from an existing file.
+  static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
+  InitializeExistingFile(const Filesystem& filesystem,
+                         const std::string& file_path, ScopedFd fd,
+                         MemoryMappedFile::Strategy mmap_strategy);
+
+  // Grows the underlying file to hold at least num_elements
+  //
+  // Returns:
+  //   OUT_OF_RANGE_ERROR if we can't grow to the specified size
+  libtextclassifier3::Status GrowIfNecessary(int32_t num_elements);
+
+  // Cached constructor params.
+  const Filesystem* const filesystem_;
+  const std::string file_path_;
+  std::unique_ptr<Header> header_;
+  std::unique_ptr<MemoryMappedFile> mmapped_file_;
+
+  // Offset before which all the elements have been included in the calculation
+  // of crc at the time it was calculated.
+  int32_t changes_end_ = 0;
+
+  // Offset of changes that have happened since the last crc update between [0,
+  // changes_end_).
+  std::vector<int32_t> changes_;
+
+  // Buffer of the original elements that have been changed since the last crc
+  // update. Will be cleared if the size grows too big.
+  std::string saved_original_buffer_;
+
+  // Keep track of all pages we touched so we can write them back to
+  // disk.
+  //
+  // NOTE(review): nothing in this file reads or writes dirty_pages_ —
+  // presumably reserved for future use; confirm before relying on it.
+  std::vector<bool> dirty_pages_;
+};
+
+// Out-of-line definitions for the static constexpr data members. These are
+// needed when the members are ODR-used under pre-C++17 semantics (static
+// constexpr members are only implicitly inline since C++17); they are
+// harmless redundancy otherwise.
+template <typename T>
+constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv;
+
+template <typename T>
+constexpr int64_t FileBackedVector<T>::kGrowElements;
+
+template <typename T>
+constexpr int64_t FileBackedVector<T>::kMaxNumElements;
+
+template <typename T>
+libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
+FileBackedVector<T>::Create(const Filesystem& filesystem,
+                            const std::string& file_path,
+                            MemoryMappedFile::Strategy mmap_strategy) {
+  // Growing the vector means extending the file, unmapping, and re-mmapping
+  // over the larger file. With MAP_PRIVATE semantics, unmapping discards any
+  // contents that were never manually persisted, so READ_WRITE_MANUAL_SYNC is
+  // inherently unsafe here until growth is disallowed for it or users are
+  // made aware of the hazard.
+  if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
+    return absl_ports::UnimplementedError(
+        "FileBackedVector currently doesn't support READ_WRITE_MANUAL_SYNC "
+        "mmap strategy.");
+  }
+
+  ScopedFd fd(filesystem.OpenForWrite(file_path.c_str()));
+  if (!fd.is_valid()) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to open ", file_path));
+  }
+
+  int64_t file_size = filesystem.GetFileSize(file_path.c_str());
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Bad file size for file ", file_path));
+  }
+
+  // An empty file means we're creating the vector from scratch; anything else
+  // must be validated and loaded as an existing vector.
+  if (file_size == 0) {
+    return InitializeNewFile(filesystem, file_path, std::move(fd),
+                             mmap_strategy);
+  }
+  return InitializeExistingFile(filesystem, file_path, std::move(fd),
+                                mmap_strategy);
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
+FileBackedVector<T>::InitializeNewFile(
+    const Filesystem& filesystem, const std::string& file_path, ScopedFd fd,
+    MemoryMappedFile::Strategy mmap_strategy) {
+  // Build a fresh header. make_unique value-initializes the struct, so
+  // num_elements and vector_checksum start at zero.
+  auto new_header = std::make_unique<Header>();
+  new_header->magic = FileBackedVector<T>::Header::kMagic;
+  new_header->element_size = sizeof(T);
+  new_header->header_checksum = new_header->CalculateHeaderChecksum();
+
+  // We use Write() here, instead of writing through the mmapped region
+  // created below, so we can gracefully handle errors that occur when the
+  // disk is full. See b/77309668 for details.
+  if (!filesystem.PWrite(fd.get(), /*offset=*/0, new_header.get(),
+                         sizeof(Header))) {
+    return absl_ports::InternalError("Failed to write header");
+  }
+
+  // Constructor of MemoryMappedFile doesn't actually call mmap(), mmap()
+  // happens on MemoryMappedFile::Remap(). So having a potentially unflushed fd
+  // at this point shouldn't run into issues with a mmap of the same file. But
+  // we'll close the fd just in case.
+  fd.reset();
+  auto mmapped_file =
+      std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
+
+  return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
+      filesystem, file_path, std::move(new_header), std::move(mmapped_file)));
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
+FileBackedVector<T>::InitializeExistingFile(
+    const Filesystem& filesystem, const std::string& file_path,
+    const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy) {
+  int64_t file_size = filesystem.GetFileSize(file_path.c_str());
+  // Cast avoids a signed/unsigned comparison; file_size is non-negative here
+  // because Create() already rejected kBadFileSize.
+  if (file_size < static_cast<int64_t>(sizeof(FileBackedVector<T>::Header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("File header too short for ", file_path));
+  }
+
+  auto header = std::make_unique<Header>();
+  if (!filesystem.PRead(fd.get(), header.get(), sizeof(Header),
+                        /*offset=*/0)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to read header of ", file_path));
+  }
+
+  // Make sure the header is still valid before we use any of its values. This
+  // should technically be included in the header_checksum check below, but
+  // this is a quick/fast check that can save us from an extra crc computation.
+  //
+  // NOTE: this must compare the magic value read from the file
+  // (header->magic) against the expected constant. The previous code compared
+  // header->kMagic — i.e. the static constant against itself — which always
+  // passed and never actually validated the file's magic.
+  if (header->magic != FileBackedVector<T>::Header::kMagic) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header kMagic for ", file_path));
+  }
+
+  // Mmap the content of the vector, excluding the header so its easier to
+  // access elements from the mmapped region
+  auto mmapped_file =
+      std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
+  ICING_RETURN_IF_ERROR(
+      mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header)));
+
+  // Check header
+  if (header->header_checksum != header->CalculateHeaderChecksum()) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header crc for ", file_path));
+  }
+
+  if (header->element_size != sizeof(T)) {
+    return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+        "Inconsistent element size, expected %zd, actual %d", sizeof(T),
+        header->element_size));
+  }
+
+  // Check vector contents: the crc over the first num_elements elements of
+  // the mmapped region must match the checksum stored in the header.
+  Crc32 vector_checksum;
+  std::string_view vector_contents(
+      reinterpret_cast<const char*>(mmapped_file->region()),
+      header->num_elements * sizeof(T));
+  vector_checksum.Append(vector_contents);
+
+  if (vector_checksum.Get() != header->vector_checksum) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid vector contents for ", file_path));
+  }
+
+  return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
+      filesystem, file_path, std::move(header), std::move(mmapped_file)));
+}
+
+template <typename T>
+libtextclassifier3::Status FileBackedVector<T>::Delete(
+    const Filesystem& filesystem, const std::string& file_path) {
+  // Removes the backing file entirely; a subsequent Create() starts fresh.
+  if (filesystem.DeleteFile(file_path.c_str())) {
+    return libtextclassifier3::Status::OK;
+  }
+  return absl_ports::InternalError(
+      absl_ports::StrCat("Failed to delete file: ", file_path));
+}
+
+template <typename T>
+FileBackedVector<T>::FileBackedVector(
+    const Filesystem& filesystem, const std::string& file_path,
+    std::unique_ptr<Header> header,
+    std::unique_ptr<MemoryMappedFile> mmapped_file)
+    : filesystem_(&filesystem),
+      file_path_(file_path),
+      header_(std::move(header)),
+      mmapped_file_(std::move(mmapped_file)),
+      // All current elements are assumed covered by the checksum stored in
+      // the header, so change tracking starts at the current end. (header_ is
+      // declared before changes_end_, so it is initialized first and safe to
+      // dereference here.)
+      changes_end_(header_->num_elements) {}
+
+template <typename T>
+FileBackedVector<T>::~FileBackedVector() {
+  // Best-effort flush for auto-sync mmaps so the stored checksum matches the
+  // contents. Destructors can't propagate errors, so failures are only logged.
+  if (mmapped_file_->strategy() !=
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
+    return;
+  }
+  if (!PersistToDisk().ok()) {
+    ICING_LOG(WARNING) << "Failed to persist vector to disk while destructing "
+                       << file_path_;
+  }
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
+    int32_t idx) const {
+  // Bounds-check against the logical element count, not the mmapped capacity.
+  if (idx < 0) {
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+  }
+  if (idx >= header_->num_elements) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "Index, %d, was greater than vector size, %d", idx,
+        header_->num_elements));
+  }
+
+  // Pointer into the mmapped region; valid until the vector grows or is
+  // destroyed.
+  return array() + idx;
+}
+
+template <typename T>
+libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
+                                                    const T& value) {
+  if (idx < 0) {
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+  }
+
+  // Byte offset of this element within the mmapped element region (the header
+  // is not part of the region).
+  int32_t start_byte = idx * sizeof(T);
+
+  ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1));
+
+  // Extend the logical size if this write lands past the previous end. Note
+  // this happens before the equality early-out below, so writing a
+  // default-valued element to a fresh slot still grows the vector.
+  if (idx + 1 > header_->num_elements) {
+    header_->num_elements = idx + 1;
+  }
+
+  if (mutable_array()[idx] == value) {
+    // No need to update
+    return libtextclassifier3::Status::OK;
+  }
+
+  // Cache original value to update crcs. Only writes below changes_end_ need
+  // tracking; anything at or past it is covered by the tail pass in
+  // ComputeChecksum().
+  if (idx < changes_end_) {
+    // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
+    // revert to full CRC. Zeroing vector_checksum and changes_end_ forces
+    // ComputeChecksum() to recompute over the entire contents.
+    if ((saved_original_buffer_.size() + sizeof(T)) *
+            FileBackedVector<T>::kPartialCrcLimitDiv >
+        changes_end_ * sizeof(T)) {
+      ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
+      changes_.clear();
+      saved_original_buffer_.clear();
+      changes_end_ = 0;
+      header_->vector_checksum = 0;
+    } else {
+      // Record the index and a snapshot of the pre-write bytes; the two stay
+      // in lockstep (one sizeof(T) chunk per recorded index).
+      changes_.push_back(idx);
+      saved_original_buffer_.append(
+          reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
+          sizeof(T));
+    }
+  }
+
+  mutable_array()[idx] = value;
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
+    int32_t num_elements) {
+  // NOTE: sizeof(T) can never be 0 for a complete type in standard C++; this
+  // guard is purely defensive.
+  if (sizeof(T) == 0) {
+    // Growing is a no-op
+    return libtextclassifier3::Status::OK;
+  }
+
+  // TODO(cassiewang): Benchmark to see if having ABSL_PREDICT_TRUE is impactful
+  if (ABSL_PREDICT_TRUE(num_elements <= header_->num_elements)) {
+    return libtextclassifier3::Status::OK;
+  }
+
+  if (num_elements > FileBackedVector<T>::kMaxNumElements) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "%d exceeds maximum number of elements allowed, %lld", num_elements,
+        static_cast<long long>(FileBackedVector<T>::kMaxNumElements)));
+  }
+
+  int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str());
+  int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T);
+
+  if (least_file_size_needed <= current_file_size) {
+    // Our underlying file can hold the target num_elements cause we've grown
+    // before
+    return libtextclassifier3::Status::OK;
+  }
+
+  // Otherwise, we need to grow. Grow to kGrowElements boundary so we don't
+  // pay the extend/remap cost on every single element.
+  least_file_size_needed = math_util::RoundUpTo(
+      least_file_size_needed,
+      int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)});
+  if (!filesystem_->Grow(file_path_.c_str(), least_file_size_needed)) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Couldn't grow file ", file_path_));
+  }
+
+  // Remap so the mmapped region covers the enlarged element area (the header
+  // stays excluded from the mapping).
+  ICING_RETURN_IF_ERROR(mmapped_file_->Remap(
+      sizeof(Header), least_file_size_needed - sizeof(Header)));
+
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::Status FileBackedVector<T>::TruncateTo(
+    int32_t new_num_elements) {
+  // Only lengths in [0, num_elements) are accepted; truncating to the current
+  // size is rejected, matching the documented contract.
+  if (new_num_elements < 0) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "Truncated length %d must be >= 0", new_num_elements));
+  }
+  if (new_num_elements >= header_->num_elements) {
+    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+        "Truncated length %d must be less than the current size %d",
+        new_num_elements, header_->num_elements));
+  }
+
+  // Shrinking is a metadata-only change; the stored crc is intentionally left
+  // stale until the next ComputeChecksum()/PersistToDisk().
+  header_->num_elements = new_num_elements;
+  return libtextclassifier3::Status::OK;
+}
+
+// Recomputes the vector checksum incrementally: for each tracked change, the
+// saved pre-write bytes are xor'ed with the current bytes and folded into the
+// crc via Crc32::UpdateWithXor; the region grown since the last computation
+// (changes_end_ .. num_elements) is appended directly. Falls back to nothing
+// special when tracking was cleared (changes_end_ == 0 means the whole
+// contents are treated as "grown" and crc'd from scratch).
+template <typename T>
+libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
+  // First apply the modified area. Keep a bitmap of already updated
+  // regions so we don't double-update.
+  std::vector<bool> updated(changes_end_);
+  uint32_t cur_offset = 0;
+  Crc32 cur_crc(header_->vector_checksum);
+  int num_partial_crcs = 0;
+  int num_truncated = 0;
+  int num_overlapped = 0;
+  int num_duplicate = 0;
+  for (size_t i = 0; i < changes_.size(); i++) {
+    const int32_t change_offset = changes_[i];
+    if (change_offset > changes_end_) {
+      return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+          "Failed to update crc, change offset %d, changes_end_ %d",
+          change_offset, changes_end_));
+    }
+
+    // Skip truncated tracked changes.
+    //
+    // NOTE(review): this `continue` skips the `cur_offset += sizeof(T)`
+    // advance at the bottom of the loop even though Set() appended a
+    // sizeof(T) snapshot for this change, which appears to misalign
+    // saved_original_buffer_ for all later changes. Verify against upstream
+    // before relying on checksum correctness after TruncateTo().
+    if (change_offset >= header_->num_elements) {
+      ++num_truncated;
+      continue;
+    }
+
+    // Turn change buffer into change^original.
+    const char* buffer_end = &saved_original_buffer_[cur_offset + sizeof(T)];
+    const char* cur_array =
+        reinterpret_cast<const char*>(array()) + change_offset * sizeof(T);
+    // Now xor in. SSE acceleration please?
+    for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
+         cur++, cur_array++) {
+      *cur ^= *cur_array;
+    }
+
+    // Skip over already updated bytes by setting update to 0.
+    bool new_update = false;
+    bool overlap = false;
+    uint32_t cur_element = change_offset;
+    for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
+         cur_element++, cur += sizeof(T)) {
+      if (updated[cur_element]) {
+        // A zero xor chunk leaves the crc unchanged for this element.
+        memset(cur, 0, sizeof(T));
+        overlap = true;
+      } else {
+        updated[cur_element] = true;
+        new_update = true;
+      }
+    }
+
+    // Apply update to crc.
+    if (new_update) {
+      // Explicitly create the string_view with length
+      std::string_view xored_str(buffer_end - sizeof(T), sizeof(T));
+      if (!cur_crc
+               .UpdateWithXor(xored_str, changes_end_ * sizeof(T),
+                              change_offset * sizeof(T))
+               .ok()) {
+        return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+            "Failed to update crc, change offset %d, change "
+            "length %zd changes_end_ %d",
+            change_offset, xored_str.length(), changes_end_));
+      }
+      num_partial_crcs++;
+      if (overlap) {
+        num_overlapped++;
+      }
+    } else {
+      num_duplicate++;
+    }
+    cur_offset += sizeof(T);
+  }
+  if (!changes_.empty()) {
+    ICING_VLOG(2) << IcingStringUtil::StringPrintf(
+        "Array update partial crcs %d truncated %d overlapped %d duplicate %d",
+        num_partial_crcs, num_truncated, num_overlapped, num_duplicate);
+  }
+
+  // Now update with grown area.
+  if (changes_end_ < header_->num_elements) {
+    // Explicitly create the string_view with length
+    std::string_view update_str(
+        reinterpret_cast<const char*>(array()) + changes_end_ * sizeof(T),
+        (header_->num_elements - changes_end_) * sizeof(T));
+    cur_crc.Append(update_str);
+    ICING_VLOG(2) << IcingStringUtil::StringPrintf(
+        "Array update tail crc offset %d -> %d", changes_end_,
+        header_->num_elements);
+  }
+
+  // Clear, now that we've applied changes.
+  changes_.clear();
+  saved_original_buffer_.clear();
+  changes_end_ = header_->num_elements;
+
+  // Commit new crc.
+  header_->vector_checksum = cur_crc.Get();
+  return cur_crc;
+}
+
+template <typename T>
+libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() {
+  // Bring the vector checksum up to date, then rewrite the header so the
+  // on-disk metadata matches the on-disk contents.
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  header_->vector_checksum = checksum.Get();
+  header_->header_checksum = header_->CalculateHeaderChecksum();
+
+  if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(),
+                           sizeof(Header))) {
+    return absl_ports::InternalError("Failed to sync header");
+  }
+
+  if (mmapped_file_->strategy() ==
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC) {
+    // Changes should have been applied to the underlying file, but call msync()
+    // as an extra safety step to ensure they are written out.
+    ICING_RETURN_IF_ERROR(mmapped_file_->PersistToDisk());
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetDiskUsage()
+    const {
+  // Delegates to the filesystem; kBadFileSize is the only failure signal.
+  const int64_t usage = filesystem_->GetDiskUsage(file_path_.c_str());
+  if (usage == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        "Failed to get disk usage of file-backed vector");
+  }
+  return usage;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_FILE_BACKED_VECTOR_H_
diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc
new file mode 100644
index 0000000..7561b57
--- /dev/null
+++ b/icing/file/file-backed-vector_test.cc
@@ -0,0 +1,429 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/file-backed-vector.h"
+
+#include <errno.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+
+using ::testing::Eq;
+using ::testing::Pointee;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Fixture that gives every test a fresh, truncated backing file under the
+// test temp dir and deletes it afterwards.
+class FileBackedVectorTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    file_path_ = GetTestTempDir() + "/test.array";
+    fd_ = filesystem_.OpenForWrite(file_path_.c_str());
+    ASSERT_NE(-1, fd_);
+    ASSERT_TRUE(filesystem_.Truncate(fd_, 0));
+  }
+
+  void TearDown() override {
+    close(fd_);
+    filesystem_.DeleteFile(file_path_.c_str());
+  }
+
+  // Helper method to loop over some data and insert into the vector at some idx
+  // NOTE(review): `int i` vs `data.length()` (size_t) is a signed/unsigned
+  // compare; harmless for these small test strings.
+  template <typename T>
+  void Insert(FileBackedVector<T>* vector, int32_t idx, std::string data) {
+    for (int i = 0; i < data.length(); ++i) {
+      ICING_ASSERT_OK(vector->Set(idx + i, data.at(i)));
+    }
+  }
+
+  // Helper method to retrieve data from the beginning of the vector
+  template <typename T>
+  std::string_view Get(FileBackedVector<T>* vector, int32_t expected_len) {
+    return Get(vector, 0, expected_len);
+  }
+
+  // Returns a view directly into the vector's backing array; only valid while
+  // `vector` is alive and its contents unmodified.
+  template <typename T>
+  std::string_view Get(FileBackedVector<T>* vector, int32_t idx,
+                       int32_t expected_len) {
+    return std::string_view(vector->array() + idx, expected_len);
+  }
+
+  Filesystem filesystem_;
+  std::string file_path_;
+  int fd_;
+};
+
+// Creating on a brand-new file succeeds, and the same file can be re-opened.
+TEST_F(FileBackedVectorTest, Create) {
+  {
+    // Create a vector for a new file
+    ICING_ASSERT_OK_AND_ASSIGN(
+        auto vector, FileBackedVector<char>::Create(
+                         filesystem_, file_path_,
+                         MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+  }
+
+  {
+    // We can create it again based on the same file.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        auto vector, FileBackedVector<char>::Create(
+                         filesystem_, file_path_,
+                         MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+  }
+}
+
+// End-to-end persistence: write data, verify the checksum, corrupt the stored
+// checksum to force INTERNAL on reopen, restore it, then truncate.
+TEST_F(FileBackedVectorTest, SimpleShared) {
+  // Create a vector and add some data.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<char>> vector,
+      FileBackedVector<char>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+  std::string expected = "abcde";
+  Insert(vector.get(), 0, expected);
+  EXPECT_EQ(expected.length(), vector->num_elements());
+  EXPECT_EQ(expected, Get(vector.get(), expected.length()));
+
+  // Golden CRC of "abcde"; also reused below to repair the header.
+  uint32_t good_crc_value = 1134899064U;
+  const Crc32 good_crc(good_crc_value);
+  // Explicit call to update the crc does update the value
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
+
+  // PersistToDisk does nothing bad.
+  ICING_EXPECT_OK(vector->PersistToDisk());
+
+  // Close out the old vector to ensure everything persists properly before we
+  // reassign it
+  vector.reset();
+
+  // Write a bad crc, this would be a mismatch compared to the computed crc of
+  // the contents on reinitialization.
+  uint32_t bad_crc_value = 123;
+  filesystem_.PWrite(file_path_.data(),
+                     offsetof(FileBackedVector<char>::Header, vector_checksum),
+                     &bad_crc_value, sizeof(bad_crc_value));
+
+  ASSERT_THAT(FileBackedVector<char>::Create(
+                  filesystem_, file_path_,
+                  MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+
+  // Get it back into an ok state
+  filesystem_.PWrite(file_path_.data(),
+                     offsetof(FileBackedVector<char>::Header, vector_checksum),
+                     &good_crc_value, sizeof(good_crc_value));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      vector, FileBackedVector<char>::Create(
+                  filesystem_, file_path_,
+                  MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  EXPECT_EQ(expected, Get(vector.get(), expected.length()));
+
+  // Close out the old vector to ensure everything persists properly before we
+  // reassign it
+  vector.reset();
+
+  // Can reinitialize it safely
+  ICING_ASSERT_OK_AND_ASSIGN(
+      vector, FileBackedVector<char>::Create(
+                  filesystem_, file_path_,
+                  MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  // Truncate the content
+  ICING_EXPECT_OK(vector->TruncateTo(0));
+
+  // We don't automatically update the crc when we truncate.
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
+  EXPECT_EQ(0u, vector->num_elements());
+}
+
+// Element access: valid indices return pointers to the stored values;
+// out-of-range indices (high or negative) return OUT_OF_RANGE.
+TEST_F(FileBackedVectorTest, Get) {
+  // Create a vector and add some data.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<char>> vector,
+      FileBackedVector<char>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+  std::string expected = "abc";
+  Insert(vector.get(), 0, expected);
+  EXPECT_EQ(expected.length(), vector->num_elements());
+
+  EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+  EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b'))));
+  EXPECT_THAT(vector->Get(2), IsOkAndHolds(Pointee(Eq('c'))));
+
+  // Out of bounds error
+  EXPECT_THAT(vector->Get(3),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  EXPECT_THAT(vector->Get(-1),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+// The incrementally-maintained checksum must always equal a from-scratch CRC
+// of the full contents, even when updates never overlap each other.
+// NOTE(review): `uint32_t i < num_elements` mixes unsigned/int; fine here
+// since num_elements is a small positive constant.
+TEST_F(FileBackedVectorTest, IncrementalCrc_NonOverlappingChanges) {
+  int num_elements = 1000;
+  int incremental_size = 3;
+  // Create an array with some data.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<char>> vector,
+      FileBackedVector<char>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  Insert(vector.get(), 0, std::string(num_elements, 'a'));
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+  // Non-overlapping changes to the array, with increasing intervals
+  // between updating the checksum. Validate by mapping another array on top.
+  uint32_t next_update = 2;
+  for (uint32_t i = 0; i < num_elements; i += incremental_size) {
+    Insert(vector.get(), i, std::string(incremental_size, 'b'));
+
+    if (i >= next_update) {
+      ICING_ASSERT_OK_AND_ASSIGN(Crc32 incremental_crc,
+                                 vector->ComputeChecksum());
+      ICING_LOG(INFO) << "Now crc @" << incremental_crc.Get();
+
+      Crc32 full_crc;
+      std::string_view reconstructed_view =
+          std::string_view(vector->array(), vector->num_elements());
+      full_crc.Append(reconstructed_view);
+
+      ASSERT_EQ(incremental_crc, full_crc);
+      next_update *= 2;
+    }
+  }
+
+  for (uint32_t i = 0; i < num_elements; ++i) {
+    EXPECT_THAT(vector->Get(i), IsOkAndHolds(Pointee(Eq('b'))));
+  }
+}
+
+// Same invariant as above, but successive updates overlap previous ones.
+TEST_F(FileBackedVectorTest, IncrementalCrc_OverlappingChanges) {
+  int num_elements = 1000;
+  int incremental_size = 3;
+  // Create an array with some data.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<char>> vector,
+      FileBackedVector<char>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  Insert(vector.get(), 0, std::string(num_elements, 'a'));
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+  // Overlapping changes to the array, with increasing intervals
+  // between updating the checksum. Validate by mapping another array on top.
+  uint32_t next_update = 2;
+  for (uint32_t i = 0; i < num_elements; i++) {
+    Insert(vector.get(), i, std::string(incremental_size, 'b'));
+
+    if (i >= next_update) {
+      ICING_ASSERT_OK_AND_ASSIGN(Crc32 incremental_crc,
+                                 vector->ComputeChecksum());
+      ICING_LOG(INFO) << "Now crc @" << incremental_crc.Get();
+
+      Crc32 full_crc;
+      std::string_view reconstructed_view =
+          std::string_view(vector->array(), vector->num_elements());
+      full_crc.Append(reconstructed_view);
+
+      ASSERT_EQ(incremental_crc, full_crc);
+      next_update *= 2;
+    }
+  }
+  for (uint32_t i = 0; i < num_elements; ++i) {
+    EXPECT_THAT(vector->Get(i), IsOkAndHolds(Pointee(Eq('b'))));
+  }
+}
+
+// Writes near the maximum capacity succeed and persist; writes past it (or at
+// negative indices) fail with OUT_OF_RANGE.
+TEST_F(FileBackedVectorTest, Grow) {
+  // This is the same value as FileBackedVector::kMaxNumElts
+  constexpr int32_t kMaxNumElts = 1U << 20;
+
+  ASSERT_TRUE(filesystem_.Truncate(fd_, 0));
+
+  // Create an array and add some data.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<char>> vector,
+      FileBackedVector<char>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+  EXPECT_THAT(vector->Set(kMaxNumElts + 11, 'a'),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+  EXPECT_THAT(vector->Set(-1, 'a'),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  uint32_t start = kMaxNumElts - 13;
+  Insert(vector.get(), start, "abcde");
+
+  // Crc works?
+  const Crc32 good_crc(1134899064U);
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
+
+  // PersistToDisk does nothing bad, and ensures the content is still there
+  // after we recreate the vector
+  ICING_EXPECT_OK(vector->PersistToDisk());
+
+  // Close out the old vector to ensure everything persists properly before we
+  // reassign it
+  vector.reset();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      vector, FileBackedVector<char>::Create(
+                  filesystem_, file_path_,
+                  MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  std::string expected = "abcde";
+  EXPECT_EQ(expected, Get(vector.get(), start, expected.length()));
+}
+
+// The backing file grows in kGrowElements-sized chunks rather than per
+// element, and keeps its size across reinitialization.
+TEST_F(FileBackedVectorTest, GrowsInChunks) {
+  // This is the same value as FileBackedVector::kGrowElements
+  constexpr int32_t kGrowElements = 1U << 14;  // 16K
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<int>> vector,
+      FileBackedVector<int>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  // Our initial file size should just be the size of the header
+  EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+              Eq(sizeof(FileBackedVector<char>::Header)));
+
+  // Once we add something though, we'll grow to kGrowElements big
+  Insert(vector.get(), 0, "a");
+  EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+              Eq(kGrowElements * sizeof(int)));
+
+  // Should still be the same size, don't need to grow underlying file
+  Insert(vector.get(), 1, "b");
+  EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+              Eq(kGrowElements * sizeof(int)));
+
+  // Now we grow by a kGrowElements chunk, so the underlying file is 2
+  // kGrowElements big
+  Insert(vector.get(), 2, std::string(kGrowElements, 'c'));
+  EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+              Eq(kGrowElements * 2 * sizeof(int)));
+
+  // Destroy/persist the contents.
+  vector.reset();
+
+  // Reinitialize
+  ICING_ASSERT_OK_AND_ASSIGN(
+      vector, FileBackedVector<int>::Create(
+                  filesystem_, file_path_,
+                  MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  // Should be the same file size as before
+  EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+              Eq(kGrowElements * 2 * sizeof(int)));
+}
+
+// Delete() removes the backing file (idempotent when nothing exists) and the
+// path can be reused afterwards.
+TEST_F(FileBackedVectorTest, Delete) {
+  // Can delete even if there's nothing there
+  ICING_EXPECT_OK(FileBackedVector<int64_t>::Delete(filesystem_, file_path_));
+
+  // Create a vector and add some data.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<char>> vector,
+      FileBackedVector<char>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+  std::string expected = "abcde";
+  Insert(vector.get(), 0, expected);
+  ASSERT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(1134899064U)));
+  ASSERT_EQ(expected.length(), vector->num_elements());
+
+  // Close out the old vector to ensure everything persists properly before we
+  // delete the underlying files
+  vector.reset();
+
+  ICING_EXPECT_OK(FileBackedVector<int64_t>::Delete(filesystem_, file_path_));
+
+  EXPECT_FALSE(filesystem_.FileExists(file_path_.data()));
+
+  // Can successfully create again.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      vector, FileBackedVector<char>::Create(
+                  filesystem_, file_path_,
+                  MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+}
+
+// TruncateTo() shrinks num_elements without recomputing the checksum, and
+// rejects out-of-range targets.
+TEST_F(FileBackedVectorTest, TruncateTo) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<char>> vector,
+      FileBackedVector<char>::Create(
+          filesystem_, file_path_,
+          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+  Insert(vector.get(), 0, "A");
+  Insert(vector.get(), 1, "Z");
+
+  EXPECT_EQ(2, vector->num_elements());
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(1658635950)));
+
+  // Modify 1 element, out of 2 total elements. 1/2 changes exceeds the partial
+  // crc limit, so our next checksum call will recompute the entire vector's
+  // checksum.
+  Insert(vector.get(), 1, "J");
+  // We'll ignore everything after the 1st element, so the full vector's
+  // checksum will only include "J".
+  ICING_EXPECT_OK(vector->TruncateTo(1));
+  EXPECT_EQ(1, vector->num_elements());
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(31158534)));
+
+  // Truncating doesn't cause the checksum to be updated.
+  ICING_EXPECT_OK(vector->TruncateTo(0));
+  EXPECT_EQ(0, vector->num_elements());
+  EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(31158534)));
+
+  // Can't truncate past end.
+  EXPECT_THAT(vector->TruncateTo(100),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+  // Must be greater than or equal to 0
+  EXPECT_THAT(vector->TruncateTo(-1),
+              StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc
new file mode 100644
index 0000000..5367e87
--- /dev/null
+++ b/icing/file/filesystem.cc
@@ -0,0 +1,678 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/filesystem.h"
+
+#include <dirent.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <pthread.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <unordered_set>
+
+#include "icing/absl_ports/str_cat.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+
+using std::vector;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// The size of the block for st_blksize returned by stat() and as a
+// consequence also the granularity of GetDiskUsage(). It seems that there is
+// no appropriate constant for this. See http://linux.die.net/man/2/stat
+constexpr int kStatBlockSize = 512;
+
+// Logs information about open file descriptors.
+//
+// This function uses getrlimit() to find the maximum number of file
+// descriptors, then calls readlink("/proc/self/fd/N") for each possible file
+// descriptor number to get a description of the open file from procfs.
+//
+// We don't use readdir() to list the contents of /proc/self/fd (which would be
+// the more obvious approach) because that would require a free file descriptor
+// to open the directory, while we call this function when all file descriptors
+// are in use.
+//
+// Called from LogOpenError() when open() fails with EMFILE, i.e. when file
+// descriptors are exhausted, so everything here avoids allocating new fds.
+void LogOpenFileDescriptors() {
+  // Determine the limit on file descriptor numbers. RLIMIT_NOFILE should return
+  // the maximum file descriptor + 1, which is 1024 on Android by default. We
+  // restrict the limit to 4096 so we don't take too much time if the value
+  // turns out to be much higher for some reason.
+  constexpr int kMaxFileDescriptorsToStat = 4096;
+  struct rlimit rlim = {0, 0};
+  if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "getrlimit() failed (errno=%d)", errno);
+    return;
+  }
+  int fd_lim = rlim.rlim_cur;
+  if (fd_lim > kMaxFileDescriptorsToStat) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Maximum number of file descriptors (%d) too large.", fd_lim);
+    fd_lim = kMaxFileDescriptorsToStat;
+  }
+  ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+      "Listing up to %d file descriptors.", fd_lim);
+
+  // Verify that /proc/self/fd is a directory. If not, procfs is not mounted or
+  // inaccessible for some other reason. In that case, there's no point trying
+  // to read from it.
+  struct stat statbuf;
+  if (stat("/proc/self/fd", &statbuf) != 0 || !S_ISDIR(statbuf.st_mode)) {
+    ICING_LOG(ERROR) << "/proc/self/fd not available. Giving up.";
+    return;
+  }
+
+  // Now read each link individually.
+  char path[1024];
+  char target[1024];
+  for (int fd = 0; fd < fd_lim; ++fd) {
+    snprintf(path, arraysize(path), "/proc/self/fd/%d", fd);
+    ssize_t len = readlink(path, target, arraysize(target));
+    if (len >= 0) {
+      // Zero-terminate the buffer, because readlink() won't.
+      target[len < arraysize(target) ? len : arraysize(target) - 1] = '\0';
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd,
+                                                        target);
+    } else if (errno != ENOENT) {
+      // ENOENT just means the fd number is unused; anything else is abnormal.
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)",
+                                                        fd, errno);
+    }
+  }
+  ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+      "File descriptor list complete.");
+}
+
+// Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum).
+//
+// If errnum == EMFILE (too many open files), then it also logs a list of open
+// file descriptors (see LogOpenFileDescriptors() above).
+void LogOpenError(const char* desc1, const char* file_name, const char* desc2,
+                  int errnum) {
+  ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+      "%s%s%s%s", desc1, file_name, desc2, strerror(errnum));
+  if (errnum == EMFILE) {
+    // fd exhaustion: dump the open-fd table to help diagnose the leak.
+    LogOpenFileDescriptors();
+  }
+}
+
+// Recursive implementation of ListDirectory. Prefix is used to prepend the
+// directory name during recursion.
+// We cannot use scandir due to a bug in old platform versions. See b/7339844.
+//
+// Returns false if the directory cannot be opened or a recursive listing
+// fails; entries accumulated so far are left in `entries`.
+bool ListDirectoryInternal(const char* dir_name,
+                           const std::unordered_set<std::string>& exclude,
+                           bool recursive, const char* prefix,
+                           std::vector<std::string>* entries) {
+  DIR* dir = opendir(dir_name);
+  if (!dir) {
+    LogOpenError("Unable to open directory ", dir_name, ": ", errno);
+    return false;
+  }
+
+  bool success = true;
+  dirent* p;
+  // readdir's implementation seems to be thread safe.
+  while ((p = readdir(dir)) != nullptr) {
+    std::string file_name(p->d_name);
+    if (file_name == "." || file_name == ".." ||
+        exclude.find(file_name) != exclude.end()) {
+      continue;
+    }
+    std::string relative_path = absl_ports::StrCat(prefix, p->d_name);
+    entries->push_back(relative_path);
+    // Recurse down directories, if requested.
+    if (recursive && (p->d_type == DT_DIR)) {
+      std::string sub_dir_name = absl_ports::StrCat(dir_name, "/", p->d_name);
+      std::string relative_path_with_slash =
+          absl_ports::StrCat(relative_path, "/");
+      if (!ListDirectoryInternal(sub_dir_name.c_str(), exclude, recursive,
+                                 relative_path_with_slash.c_str(), entries)) {
+        // Bug fix: break instead of returning directly so `dir` is still
+        // closed below. Returning here leaked the DIR handle (and its fd),
+        // which is especially harmful on the EMFILE paths this code serves.
+        success = false;
+        break;
+      }
+    }
+  }
+  if (closedir(dir) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Error closing %s: %s", dir_name, strerror(errno));
+  }
+  return success;
+}
+
+} // namespace
+
+// Closes the owned descriptor on destruction, if one is held.
+ScopedFd::~ScopedFd() {
+  if (fd_ >= 0) {
+    close(fd_);
+  }
+}
+
+// Replaces the owned descriptor, closing the previously held one (if any).
+void ScopedFd::reset(int fd) {
+  if (fd_ >= 0) {
+    close(fd_);
+  }
+  fd_ = fd;
+}
+
+// Out-of-line definition for the class-scope constant; required when the
+// constant is odr-used (pre-C++17 semantics).
+const int64_t Filesystem::kBadFileSize;
+
+// Removes a single file. A missing file (ENOENT) counts as success.
+bool Filesystem::DeleteFile(const char* file_name) const {
+  ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name);
+  int ret = unlink(file_name);
+  if (ret != 0 && errno != ENOENT) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Deleting file %s failed: %s", file_name, strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+// Removes an empty directory. A missing directory (ENOENT) counts as success.
+bool Filesystem::DeleteDirectory(const char* dir_name) const {
+  int ret = rmdir(dir_name);
+  if (ret != 0 && errno != ENOENT) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Deleting directory %s failed: %s", dir_name, strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+// Recursively removes a directory tree. Keeps going on per-entry failures
+// (deleting as much as possible) and only removes the top directory when
+// everything beneath it was deleted.
+bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const {
+  // Ensure the dir_name really is a directory and exists.
+  struct stat st;
+  if (stat(dir_name, &st) < 0) {
+    if (errno == ENOENT) {
+      return true;  // If directory didn't exist, this was successful.
+    }
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Stat %s failed: %s", dir_name, strerror(errno));
+    return false;
+  }
+  vector<std::string> entries;
+  if (!ListDirectory(dir_name, &entries)) {
+    return false;
+  }
+
+  bool success = true;
+  for (vector<std::string>::iterator i = entries.begin(); i != entries.end();
+       ++i) {
+    std::string filename = std::string(dir_name) + '/' + *i;
+    if (stat(filename.c_str(), &st) < 0) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Stat %s failed: %s", filename.c_str(), strerror(errno));
+      success = false;
+    } else if (S_ISDIR(st.st_mode)) {
+      success = DeleteDirectoryRecursively(filename.c_str()) && success;
+    } else {
+      success = DeleteFile(filename.c_str()) && success;
+    }
+  }
+
+  if (success) {
+    success = DeleteDirectory(dir_name);
+  }
+
+  return success;
+}
+
+// True only if file_name exists AND is a regular file (not a directory,
+// socket, etc.). Unexpected stat() errors are logged; ENOENT is silent.
+bool Filesystem::FileExists(const char* file_name) const {
+  bool exists = false;
+  struct stat st;
+  if (stat(file_name, &st) == 0) {
+    exists = S_ISREG(st.st_mode) != 0;
+  } else {
+    if (errno != ENOENT) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Unable to stat file %s: %s", file_name, strerror(errno));
+    }
+    exists = false;
+  }
+  return exists;
+}
+
+// True only if dir_name exists AND is a directory.
+bool Filesystem::DirectoryExists(const char* dir_name) const {
+  bool exists = false;
+  struct stat st;
+  if (stat(dir_name, &st) == 0) {
+    exists = S_ISDIR(st.st_mode) != 0;
+  } else {
+    if (errno != ENOENT) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Unable to stat directory %s: %s", dir_name, strerror(errno));
+    }
+    exists = false;
+  }
+  return exists;
+}
+
+// Returns the index of the first character after the last '/', or 0 when
+// there is no slash (the whole string is the basename).
+int Filesystem::GetBasenameIndex(const char* file_name) const {
+  // Find final slash.
+  const char* last_slash = strrchr(file_name, '/');
+  if (!last_slash) {
+    // file_name is just basename.
+    return 0;
+  }
+
+  // Skip slash.
+  return last_slash + 1 - file_name;
+}
+
+// Returns the final path component (everything after the last '/').
+std::string Filesystem::GetBasename(const char* file_name) const {
+  size_t len = strlen(file_name);
+  int idx = GetBasenameIndex(file_name);
+  return std::string(file_name + idx, len - idx);
+}
+
+// Returns the path up to (not including) the last '/'; empty string when
+// file_name has no directory part.
+std::string Filesystem::GetDirname(const char* file_name) const {
+  int idx = GetBasenameIndex(file_name);
+  // Remove the trailing slash
+  if (idx > 0) {
+    idx -= 1;
+  }
+  return std::string(file_name, idx);
+}
+
+// Non-recursive listing with no exclusions; clears `entries` first.
+bool Filesystem::ListDirectory(const char* dir_name,
+                               vector<std::string>* entries) const {
+  entries->clear();
+  return ListDirectory(dir_name, /*exclude=*/{}, /*recursive=*/false, entries);
+}
+
+// Full-featured listing; note this overload does NOT clear `entries`, it
+// appends (the recursive helper relies on that).
+bool Filesystem::ListDirectory(const char* dir_name,
+                               const std::unordered_set<std::string>& exclude,
+                               bool recursive,
+                               std::vector<std::string>* entries) const {
+  return ListDirectoryInternal(dir_name, exclude, recursive, /*prefix=*/"",
+                               entries);
+}
+
+// Expands a shell-style glob whose pattern is confined to the basename part
+// (e.g. "/dir/prefix*"). A missing directory yields success with an empty
+// match list.
+bool Filesystem::GetMatchingFiles(const char* glob,
+                                  vector<std::string>* matches) const {
+  matches->clear();
+
+  // Split dirname/basename.
+  int basename_idx = GetBasenameIndex(glob);
+  if (basename_idx == 0) {
+    // We need a directory.
+    ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+        "Expected directory, no matching files for: %s", glob);
+    return true;
+  }
+  const char* basename_glob = glob + basename_idx;
+  std::string dirname(glob, basename_idx);
+  vector<std::string> entries;
+  if (!ListDirectory(dirname.c_str(), &entries) && errno != ENOENT) {
+    return false;
+  }
+
+  for (vector<std::string>::iterator i = entries.begin(); i != entries.end();
+       ++i) {
+    // The filename needs to match glob following last_slash.
+    if (!fnmatch(basename_glob, i->c_str(), FNM_PATHNAME)) {
+      // Add it to the list.
+      matches->push_back(dirname + *i);
+    }
+  }
+  return true;
+}
+
+// Opens (creating if needed, mode 0600) for read/write. Returns -1 on
+// failure after logging; cursor is at the start of the file.
+int Filesystem::OpenForWrite(const char* file_name) const {
+  int fd = open(file_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+  if (fd < 0) {
+    LogOpenError("Opening file ", file_name, " for write failed: ", errno);
+  }
+  return fd;
+}
+
+// Like OpenForWrite but positions the cursor at the end of the file.
+int Filesystem::OpenForAppend(const char* file_name) const {
+  // Don't use the O_APPEND flag because, although it opens for
+  // append, it doesn't set the file cursor to at the end until
+  // first write occurs. This can be confusing if you expect
+  // the file position at the end. Instead, explicitly
+  // seek to end after opening.
+  int fd = open(file_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+  if (fd < 0) {
+    LogOpenError("Opening file ", file_name, " for write failed: ", errno);
+  } else {
+    lseek(fd, 0, SEEK_END);
+  }
+  return fd;
+}
+
+// Opens read-only; does not create the file. Returns -1 on failure.
+int Filesystem::OpenForRead(const char* file_name) const {
+  int fd = open(file_name, O_RDONLY);
+  if (fd < 0) {
+    LogOpenError("Opening file ", file_name, " for read failed: ", errno);
+  }
+  return fd;
+}
+
+// Logical file size of an open descriptor, or kBadFileSize on stat failure.
+int64_t Filesystem::GetFileSize(int fd) const {
+  struct stat st;
+  if (fstat(fd, &st) < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
+                                                      strerror(errno));
+    return kBadFileSize;
+  }
+  return st.st_size;
+}
+
+// Logical file size by path; a missing file is expected enough to log at
+// VLOG(1) rather than WARNING.
+int64_t Filesystem::GetFileSize(const char* filename) const {
+  struct stat st;
+  if (stat(filename, &st) < 0) {
+    if (errno == ENOENT) {
+      ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+          "Unable to stat file %s: %s", filename, strerror(errno));
+    } else {
+      ICING_LOG(WARNING) << IcingStringUtil::StringPrintf(
+          "Unable to stat file %s: %s", filename, strerror(errno));
+    }
+    return kBadFileSize;
+  }
+  return st.st_size;
+}
+
+// Resizes via ftruncate() and moves the cursor to the new end.
+bool Filesystem::Truncate(int fd, int64_t new_size) const {
+  if (ftruncate(fd, new_size) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Unable to truncate file: %s", strerror(errno));
+    return false;
+  }
+  lseek(fd, new_size, SEEK_SET);
+  return true;
+}
+
+// Path-based convenience wrapper; creates the file if it does not exist.
+bool Filesystem::Truncate(const char* filename, int64_t new_size) const {
+  int fd = OpenForAppend(filename);
+  if (fd == -1) {
+    return false;
+  }
+  bool success = Truncate(fd, new_size);
+  close(fd);
+  return success;
+}
+
+// Resizes via ftruncate() without moving the cursor. Despite the name, this
+// sets the size to new_size exactly (it can also shrink).
+bool Filesystem::Grow(int fd, int64_t new_size) const {
+  if (ftruncate(fd, new_size) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s",
+                                                      strerror(errno));
+    return false;
+  }
+
+  return true;
+}
+
+// Path-based convenience wrapper; creates the file if it does not exist.
+bool Filesystem::Grow(const char* filename, int64_t new_size) const {
+  int fd = OpenForAppend(filename);
+  if (fd == -1) {
+    return false;
+  }
+
+  bool grew = Grow(fd, new_size);
+  close(fd);
+  return grew;
+}
+
+// Writes all of `data` at the current cursor, looping in chunks of at most
+// 64 KiB and retrying short writes until everything is written or an error
+// occurs.
+bool Filesystem::Write(int fd, const void* data, size_t data_size) const {
+  size_t write_len = data_size;
+  do {
+    // Don't try to write too much at once.
+    size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
+    ssize_t wrote = write(fd, data, chunk_size);
+    if (wrote < 0) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
+                                                        strerror(errno));
+      return false;
+    }
+    data = static_cast<const uint8_t*>(data) + wrote;
+    write_len -= wrote;
+  } while (write_len > 0);
+  return true;
+}
+
+// Path-based wrapper; writes from the start of the file (OpenForWrite leaves
+// the cursor at offset 0).
+bool Filesystem::Write(const char* filename, const void* data,
+                       size_t data_size) const {
+  int fd = OpenForWrite(filename);
+  if (fd == -1) {
+    return false;
+  }
+
+  bool success = Write(fd, data, data_size);
+  close(fd);
+  return success;
+}
+
+// Positioned write: like Write() but at an explicit offset, leaving the file
+// cursor untouched. Loops until all bytes are written.
+bool Filesystem::PWrite(int fd, off_t offset, const void* data,
+                        size_t data_size) const {
+  size_t write_len = data_size;
+  do {
+    // Don't try to write too much at once.
+    size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
+    ssize_t wrote = pwrite(fd, data, chunk_size, offset);
+    if (wrote < 0) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
+                                                        strerror(errno));
+      return false;
+    }
+    data = static_cast<const uint8_t*>(data) + wrote;
+    write_len -= wrote;
+    offset += wrote;
+  } while (write_len > 0);
+  return true;
+}
+
+bool Filesystem::PWrite(const char* filename, off_t offset, const void* data,
+                        size_t data_size) const {
+  int fd = OpenForWrite(filename);
+  if (fd == -1) {
+    return false;
+  }
+
+  bool success = PWrite(fd, offset, data, data_size);
+  close(fd);
+  return success;
+}
+
+// NOTE(review): unlike the write paths, Read()/PRead() issue a single
+// syscall and do not retry short reads — a successful return does not
+// guarantee buf_size bytes were read. Confirm callers tolerate this.
+bool Filesystem::Read(int fd, void* buf, size_t buf_size) const {
+  ssize_t read_status = read(fd, buf, buf_size);
+  if (read_status < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
+                                                      strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+bool Filesystem::Read(const char* filename, void* buf, size_t buf_size) const {
+  int fd = OpenForRead(filename);
+  if (fd == -1) {
+    return false;
+  }
+
+  bool success = Read(fd, buf, buf_size);
+  close(fd);
+  return success;
+}
+
+// Positioned read; same short-read caveat as Read() above.
+bool Filesystem::PRead(int fd, void* buf, size_t buf_size, off_t offset) const {
+  ssize_t read_status = pread(fd, buf, buf_size, offset);
+  if (read_status < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
+                                                      strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+bool Filesystem::PRead(const char* filename, void* buf, size_t buf_size,
+                       off_t offset) const {
+  int fd = OpenForRead(filename);
+  if (fd == -1) {
+    return false;
+  }
+
+  bool success = PRead(fd, buf, buf_size, offset);
+  close(fd);
+  return success;
+}
+
+// Flushes file data to disk; uses fsync() on Apple platforms where
+// fdatasync() is unavailable.
+bool Filesystem::DataSync(int fd) const {
+#ifdef __APPLE__  // iOS has no fdatasync(), only fsync()
+  int result = fsync(fd);
+#else
+  int result = fdatasync(fd);
+#endif
+
+  if (result < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s",
+                                                      strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+bool Filesystem::RenameFile(const char* old_name, const char* new_name) const {
+  if (rename(old_name, new_name) < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Unable to rename file %s to %s: %s", old_name, new_name,
+        strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+// Swaps two paths via a "<one>.tmp" intermediate. NOTE(review): the three
+// renames are not atomic as a whole — a crash mid-swap can leave the tmp
+// file behind; callers should tolerate that.
+bool Filesystem::SwapFiles(const char* one, const char* two) const {
+  std::string tmp_name = absl_ports::StrCat(one, ".tmp");
+  const char* tmp_cstr = tmp_name.c_str();
+
+  // Blow away a tmp file if it already exists
+  if (FileExists(tmp_cstr) && !DeleteFile(tmp_cstr)) {
+    return false;
+  }
+  if (DirectoryExists(tmp_cstr) && !DeleteDirectoryRecursively(tmp_cstr)) {
+    return false;
+  }
+
+  // Perform the swap
+  if (!RenameFile(one, tmp_cstr)) {
+    return false;
+  }
+  if (!RenameFile(two, one)) {
+    return false;
+  }
+  if (!RenameFile(tmp_cstr, two)) {
+    return false;
+  }
+
+  return true;
+}
+
+// Creates a single directory (mode 0700); succeeds if it already exists.
+bool Filesystem::CreateDirectory(const char* dir_name) const {
+  bool success = DirectoryExists(dir_name);
+  if (!success) {
+    if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) {
+      success = true;
+    } else {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Creating directory %s failed: %s", dir_name, strerror(errno));
+    }
+  }
+  return success;
+}
+
+// mkdir -p equivalent: recursively creates parents first.
+bool Filesystem::CreateDirectoryRecursively(const char* dir_name) const {
+  if ((strlen(dir_name) == 0) || DirectoryExists(dir_name)) {
+    return true;
+  }
+  std::string path_before = GetDirname(dir_name);
+  if (!CreateDirectoryRecursively(path_before.c_str())) {
+    return false;
+  }
+  return CreateDirectory(dir_name);
+}
+
+// Block-based disk usage of an open descriptor (st_blocks * 512), which can
+// differ from the logical size for sparse or partially-allocated files.
+int64_t Filesystem::GetDiskUsage(int fd) const {
+  struct stat st;
+  if (fstat(fd, &st) < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
+                                                      strerror(errno));
+    return kBadFileSize;
+  }
+  return st.st_blocks * kStatBlockSize;
+}
+
+// Block-based disk usage of a single path (no directory recursion).
+int64_t Filesystem::GetFileDiskUsage(const char* path) const {
+  struct stat st;
+  if (stat(path, &st) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
+                                                      path, strerror(errno));
+    return kBadFileSize;
+  }
+  return st.st_blocks * kStatBlockSize;
+}
+
+// Block-based disk usage of `path`; for directories, recursively adds the
+// usage of each entry. Entries that fail to stat are skipped (best effort),
+// but a failure to list a directory returns kBadFileSize.
+int64_t Filesystem::GetDiskUsage(const char* path) const {
+  struct stat st;
+  if (stat(path, &st) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
+                                                      path, strerror(errno));
+    return kBadFileSize;
+  }
+  int64_t result = st.st_blocks * kStatBlockSize;
+  if (S_ISDIR(st.st_mode)) {
+    vector<std::string> list;
+    if (!ListDirectory(path, &list)) {
+      return kBadFileSize;
+    }
+    for (vector<std::string>::iterator i = list.begin(); i != list.end(); ++i) {
+      std::string sub_path = std::string(path) + '/' + *i;
+      // Bug fix: GetDiskUsage() returns int64_t and kBadFileSize is int64_t;
+      // storing the result in uint64_t made the comparison below mixed-sign.
+      int64_t sub_usage = GetDiskUsage(sub_path.c_str());
+      if (sub_usage != kBadFileSize) {
+        result += sub_usage;
+      }  // Else just ignore the failing entry.
+    }
+  }
+  return result;
+}
+
+// Returns the current cursor offset of fd, or a negative value on lseek
+// failure.
+int64_t Filesystem::GetCurrentPosition(int fd) const {
+  return lseek(fd, 0, SEEK_CUR);
+}
+
+// Moves the cursor to an absolute offset; returns the resulting offset or a
+// negative value on failure.
+int64_t Filesystem::SetPosition(int fd, int offset) const {
+  return lseek(fd, offset, SEEK_SET);
+}
+
+// Adds `size` to *to_increment, with kBadFileSize acting as a sticky
+// "invalid" marker: once either operand is invalid, the sum stays invalid.
+void Filesystem::IncrementByOrSetInvalid(int64_t size, int64_t* to_increment) {
+  if (*to_increment == kBadFileSize) {
+    return;
+  }
+  if (size == kBadFileSize) {
+    *to_increment = kBadFileSize;
+    return;
+  }
+  *to_increment += size;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
new file mode 100644
index 0000000..b85f3a0
--- /dev/null
+++ b/icing/file/filesystem.h
@@ -0,0 +1,237 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Methods for interacting with the filesystem.
+
+#ifndef ICING_FILE_FILESYSTEM_H_
+#define ICING_FILE_FILESYSTEM_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace icing {
+namespace lib {
+
+// RAII wrapper for a POSIX file descriptor: closes fd when it goes out of
+// scope, if fd >= 0. Move-only.
+class ScopedFd {
+ public:
+  explicit ScopedFd(int fd = -1) : fd_(fd) {}
+  ScopedFd(const ScopedFd&) = delete;
+  // Moves are noexcept so containers holding ScopedFd can relocate elements
+  // by move instead of falling back to (deleted) copies.
+  ScopedFd(ScopedFd&& other) noexcept : ScopedFd() { *this = std::move(other); }
+
+  ScopedFd& operator=(const ScopedFd&) = delete;
+  ScopedFd& operator=(ScopedFd&& other) noexcept {
+    // Swap so any fd we previously owned is closed when `other` is destroyed.
+    std::swap(fd_, other.fd_);
+    return *this;
+  }
+  ~ScopedFd();
+
+  bool is_valid() const { return fd_ >= 0; }
+  int operator*() const { return fd_; }
+  int get() const { return fd_; }
+  // Closes the currently owned fd (if valid) and takes ownership of `fd`.
+  void reset(int fd = -1);
+
+ private:
+  int fd_;
+};
+
+// Deleter for std::unique_ptr that fclose()s a C stdio stream; nullptr-safe.
+struct FILEDeleter {
+  void operator()(FILE* fp) const {
+    if (fp) {
+      fclose(fp);
+    }
+  }
+};
+// Owning handle for a FILE*, closed automatically on destruction.
+typedef std::unique_ptr<FILE, FILEDeleter> ScopedFILE;
+
+// Class containing file operation methods.
+// LINT.IfChange
+class Filesystem {
+ public:
+  // Sentinel value returned by the size- and disk-usage-related methods on
+  // failure.
+  static const int64_t kBadFileSize = std::numeric_limits<int64_t>::max();
+
+  constexpr Filesystem() = default;
+  virtual ~Filesystem() = default;
+
+  // Deletes a file, returns true on success or if the file did
+  // not yet exist.
+  virtual bool DeleteFile(const char* file_name) const;
+
+  // Deletes a directory, returns true on success or if the directory did
+  // not yet exist.
+  virtual bool DeleteDirectory(const char* dir_name) const;
+
+  // Deletes a directory, including any contents, and returns true on
+  // success or if the directory did not yet exist.
+  virtual bool DeleteDirectoryRecursively(const char* dir_name) const;
+
+  // Returns true if a file exists. False if the file doesn't exist.
+  // If there is an error getting stat on the file, it logs the error and
+  // asserts.
+  virtual bool FileExists(const char* file_name) const;
+
+  // Returns true if a directory exists. False if the directory doesn't exist.
+  // If there is an error getting stat on the file, it logs the error and
+  // asserts.
+  virtual bool DirectoryExists(const char* dir_name) const;
+
+  // Return index to start of basename in file_name. Anything before
+  // basename is the dirname (including the final slash).
+  virtual int GetBasenameIndex(const char* file_name) const;
+
+  // Return a string containing the basename.
+  virtual std::string GetBasename(const char* file_name) const;
+
+  // Return a string containing the dirname.
+  virtual std::string GetDirname(const char* file_name) const;
+
+  // Gets the names of the entries of a given directory. Does not include "."
+  // and "..". Returns false on error.
+  virtual bool ListDirectory(const char* dir_name,
+                             std::vector<std::string>* entries) const;
+
+  // Adds the names of the entries of a given directory -- recursively if
+  // specified, and excluding files/directories named in exclude -- to entries.
+  // Regardless of exclude, does not include "." and "..". Excluded files are
+  // excluded at every level. Returns false on error.
+  //
+  // Example use case: list all files & directories in fooDir/, recursively,
+  // excluding anything named "tmp" or "cache" (presumed directories) and the
+  // files within them.
+  virtual bool ListDirectory(const char* dir_name,
+                             const std::unordered_set<std::string>& exclude,
+                             bool recursive,
+                             std::vector<std::string>* entries) const;
+
+  // Use glob to return matched files into "matches". Returns false if
+  // glob had an error.
+  //
+  // Cannot match multiple directories so everything up the last slash
+  // must be literal.
+  virtual bool GetMatchingFiles(const char* glob,
+                                std::vector<std::string>* matches) const;
+
+  // Opens the file for read/write. Creates if not existing. Returns
+  // -1 on fail or an open file descriptor on success.
+  virtual int OpenForWrite(const char* file_name) const;
+
+  // Opens the file for read/write, and positions the file at the
+  // end for appending. Creates if not existing. Returns -1 on fail
+  // or an open file descriptor on success.
+  virtual int OpenForAppend(const char* file_name) const;
+
+  // Opens a file for read only. Fails if file doesn't exist. Returns
+  // file descriptor or -1 on fail.
+  virtual int OpenForRead(const char* file_name) const;
+
+  // Gets the size of a file, given an open file descriptor.
+  // Returns kBadFileSize on error.
+  virtual int64_t GetFileSize(int fd) const;
+
+  // Gets the size of a file, given a filename.
+  virtual int64_t GetFileSize(const char* filename) const;
+
+  // Truncates the file to the requested size. Seeks to the
+  // end position of the file after truncate. Returns false
+  // if fails.
+  virtual bool Truncate(int fd, int64_t new_size) const;
+
+  // Truncates the file to the requested size.
+  // Returns false if fails.
+  virtual bool Truncate(const char* filename, int64_t new_size) const;
+
+  // Grows the file to the requested size. Does not change the
+  // position pointer.
+  virtual bool Grow(int fd, int64_t new_size) const;
+  virtual bool Grow(const char* filename, int64_t new_size) const;
+
+  // Writes to a file. Returns true if all the data was successfully
+  // written. Handles interrupted writes.
+  virtual bool Write(int fd, const void* data, size_t data_size) const;
+  virtual bool Write(const char* filename, const void* data,
+                     size_t data_size) const;
+
+  // Positional variant of Write(): writes data_size bytes at the given file
+  // offset. Returns true if all the data was successfully written.
+  virtual bool PWrite(int fd, off_t offset, const void* data,
+                      size_t data_size) const;
+  virtual bool PWrite(const char* filename, off_t offset, const void* data,
+                      size_t data_size) const;
+
+  // Reads from a file. Returns true if data was successfully read out. If the
+  // file is seekable, read starts at the file offset, and the file offset is
+  // incremented by number of bytes read.
+  // PRead reads at the explicit offset and does not depend on the current
+  // file position.
+  virtual bool Read(int fd, void* buf, size_t buf_size) const;
+  virtual bool Read(const char* filename, void* buf, size_t buf_size) const;
+  virtual bool PRead(int fd, void* buf, size_t buf_size, off_t offset) const;
+  virtual bool PRead(const char* filename, void* buf, size_t buf_size,
+                     off_t offset) const;
+
+  // Syncs the file to disk (fdatasync). Returns true on success.
+  virtual bool DataSync(int fd) const;
+
+  // Renames a file. A file with new_name must not already exist.
+  virtual bool RenameFile(const char* old_name, const char* new_name) const;
+
+  // Renames two files or directories so their names are swapped.
+  // Both names must already exist.
+  virtual bool SwapFiles(const char* one, const char* two) const;
+
+  // Creates a directory if it does not yet exist.
+  virtual bool CreateDirectory(const char* dir_name) const;
+
+  // Creates a directory if it does not yet exist, building the entire path
+  // if it does not yet exist.
+  virtual bool CreateDirectoryRecursively(const char* dir_name) const;
+
+  // Compute the disk usage of the given file. Similarly to the
+  // 'du' command, it attempts to estimate the actual disk usage, so for
+  // sparse files it may return less than their length.
+  // Returns kBadFileSize on error.
+  virtual int64_t GetDiskUsage(int fd) const;
+
+  // Compute the disk usage of the given file or directory. Similarly to the
+  // 'du' command, it attempts to estimate the actual disk usage, so for
+  // sparse files it may return less than their length. Returns kBadFileSize
+  // on error. Does not recurse on directories.
+  virtual int64_t GetFileDiskUsage(const char* path) const;
+
+  // Compute the disk usage of the given file or directory. Similarly to the
+  // 'du' command, it attempts to estimate the actual disk usage, so for
+  // sparse files it may return less than their length. Returns kBadFileSize
+  // on error. Recurses on directories.
+  virtual int64_t GetDiskUsage(const char* path) const;
+
+  // Returns the current position in the given file. Returns -1 and sets errno
+  // on failure.
+  virtual int64_t GetCurrentPosition(int fd) const;
+
+  // Sets the file position to `offset` bytes from the start of the file.
+  // Returns the resulting offset, or -1 and sets errno on failure.
+  virtual int64_t SetPosition(int fd, int offset) const;
+
+  // Increments to_increment by size if size is valid, or sets to_increment
+  // to kBadFileSize if either size or to_increment is kBadFileSize.
+  static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment);
+};
+// LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h)
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_FILESYSTEM_H_
diff --git a/icing/file/filesystem_test.cc b/icing/file/filesystem_test.cc
new file mode 100644
index 0000000..b5b8b6c
--- /dev/null
+++ b/icing/file/filesystem_test.cc
@@ -0,0 +1,452 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Test for Filesystem class and utils.
+
+#include "icing/file/filesystem.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/tmp-directory.h"
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+using std::sort;
+using std::vector;
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Gt;
+using ::testing::Le;
+using ::testing::Ne;
+using ::testing::UnorderedElementsAre;
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Create some test files in the specified directory. "test data" plus the
+// relative path name + "\n" is written to each.
+void CreateTestFiles(const vector<std::string>& file_names,
+                     const std::string& append_dir) {
+  Filesystem filesystem;
+  for (const std::string& one_file_name : file_names) {
+    // Write the filename to the file
+    std::string one_file_path = append_dir + "/" + one_file_name;
+    int fd = filesystem.OpenForWrite(one_file_path.c_str());
+    ASSERT_THAT(fd, Gt(0));
+    std::string test_data = "test data " + one_file_name + "\n";
+    // Use the string's own length instead of strlen(c_str()), which would
+    // silently truncate at an embedded NUL byte.
+    EXPECT_TRUE(filesystem.Write(fd, test_data.c_str(), test_data.length()));
+    EXPECT_THAT(close(fd), Eq(0));
+  }
+}
+}  // namespace
+
+// Indicates if the file system supports Sparse Files.
+// 'Sparse files' are essentially pre-allocated files of big sizes which do not
+// yet use any blocks. A few tests validate that disk-usage is accounted
+// correctly in those cases as zero.
+// However, on HFS+ file system sparse files are not supported.
+// The new AFS supports sparse files, but as of 2017-09 all simulators in prod
+// are running on MacOS using HFS+.
+bool FileSystemSupportsSparseFiles() {
+// TargetConditionals.h defines TARGET_IPHONE_SIMULATOR as 0 or 1 on every
+// Apple platform, so it must be value-tested with #if; a bare #ifdef would
+// also be true on real devices where the macro is defined as 0.
+#if defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR
+  return false;
+#else
+  return true;
+#endif
+}
+
+// Test fixture: creates a fresh scratch directory for every test case and
+// removes it (recursively) afterwards.
+class FilesystemTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    temp_dir_ = GetTestTempDir() + "/icing_filesystem";
+    Filesystem filesystem;
+    ASSERT_TRUE(filesystem.CreateDirectoryRecursively(temp_dir_.c_str()));
+  }
+
+  void TearDown() override {
+    Filesystem filesystem;
+    EXPECT_TRUE(filesystem.DeleteDirectoryRecursively(temp_dir_.c_str()));
+  }
+
+  // Write junk data of given size to the given file descriptor
+  void WriteJunk(int fd, size_t size) {
+    const int kBufLen = 1024;
+    int buf[kBufLen];
+    for (int i = 0; i < kBufLen; ++i) {
+      buf[i] = i;
+    }
+    const int kBufSize = kBufLen * sizeof(int);
+
+    // Write whole kBufSize chunks first, then any remainder.
+    Filesystem filesystem;
+    for (size_t i = 0; i < size / kBufSize; ++i) {
+      EXPECT_TRUE(filesystem.Write(fd, buf, kBufSize));
+    }
+    if (size % kBufSize) {
+      EXPECT_TRUE(filesystem.Write(fd, buf, size % kBufSize));
+    }
+  }
+
+  // Scratch directory path, unique to this test binary's temp dir.
+  std::string temp_dir_;
+};
+
+// Walks GetBasename/GetDirname up the path "/foo/bar/README.txt", component
+// by component, until the dirname is exhausted (empty).
+TEST_F(FilesystemTest, Names) {
+  const std::string filename("/foo/bar/README.txt");
+  Filesystem filesystem;
+
+  std::string basename = filesystem.GetBasename(filename.c_str());
+  EXPECT_THAT(basename, Eq("README.txt"));
+
+  std::string dirname = filesystem.GetDirname(filename.c_str());
+  EXPECT_THAT(dirname, Eq("/foo/bar"));
+
+  basename = filesystem.GetBasename(dirname.c_str());
+  EXPECT_THAT(basename, Eq("bar"));
+
+  dirname = filesystem.GetDirname(dirname.c_str());
+  EXPECT_THAT(dirname, Eq("/foo"));
+
+  basename = filesystem.GetBasename(dirname.c_str());
+  EXPECT_THAT(basename, Eq("foo"));
+
+  dirname = filesystem.GetDirname(dirname.c_str());
+  EXPECT_THAT(dirname, Eq(""));
+}
+
+// Checks basename/dirname behavior for a bare one-letter filename.
+TEST_F(FilesystemTest, OneLetter) {
+  Filesystem filesystem;
+
+  // The basename of a bare filename (no slash) is the filename itself.
+  // (The original test mistakenly called GetDirname here too.)
+  const std::string basename = filesystem.GetBasename("a");
+  EXPECT_THAT(basename, Eq("a"));
+
+  // The dirname of a bare filename is empty.
+  const std::string dirname = filesystem.GetDirname("a");
+  EXPECT_THAT(dirname, Eq(""));
+}
+
+// Exercises create/exists/delete for plain and recursive directory
+// operations, including failure cases (creating under a missing parent,
+// non-recursively deleting a non-empty directory).
+TEST_F(FilesystemTest, Directory) {
+  Filesystem filesystem;
+
+  const std::string foo_str = temp_dir_ + "/foo";
+  const std::string bar_str = foo_str + "/bar";
+  const char* foo_dir = foo_str.c_str();
+  const char* bar_dir = bar_str.c_str();
+
+  EXPECT_TRUE(filesystem.CreateDirectory(foo_dir));
+  EXPECT_TRUE(filesystem.DirectoryExists(foo_dir));
+  EXPECT_TRUE(filesystem.DeleteDirectory(foo_dir));
+  EXPECT_FALSE(filesystem.DirectoryExists(foo_dir));
+
+  // Non-recursive create must fail when the parent is missing.
+  EXPECT_FALSE(filesystem.CreateDirectory(bar_dir));
+  EXPECT_FALSE(filesystem.DirectoryExists(foo_dir));
+  EXPECT_FALSE(filesystem.DirectoryExists(bar_dir));
+  EXPECT_TRUE(filesystem.CreateDirectoryRecursively(bar_dir));
+  EXPECT_TRUE(filesystem.DirectoryExists(foo_dir));
+  EXPECT_TRUE(filesystem.DirectoryExists(bar_dir));
+
+  // Non-recursive delete must fail on a non-empty directory.
+  EXPECT_FALSE(filesystem.DeleteDirectory(foo_dir));
+  EXPECT_TRUE(filesystem.DeleteDirectoryRecursively(foo_dir));
+  EXPECT_FALSE(filesystem.DirectoryExists(foo_dir));
+  EXPECT_FALSE(filesystem.DirectoryExists(bar_dir));
+
+  // Deleting a non-existing directory returns true.
+  EXPECT_TRUE(filesystem.DeleteDirectory(foo_dir));
+  EXPECT_TRUE(filesystem.DeleteDirectoryRecursively(foo_dir));
+}
+
+// Verifies that DataSync() succeeds on a freshly created, open file.
+TEST_F(FilesystemTest, FSync) {
+  Filesystem filesystem;
+  const std::string foo_file = temp_dir_ + "/foo_file";
+  // ScopedFd closes the descriptor when it goes out of scope.
+  ScopedFd fd(filesystem.OpenForWrite(foo_file.c_str()));
+  ASSERT_TRUE(fd.is_valid());
+  EXPECT_TRUE(filesystem.DataSync(fd.get()));
+}
+
+// Writes 10000 bytes, then truncates to half and to zero, checking
+// GetFileSize after each step.
+TEST_F(FilesystemTest, Truncate) {
+  Filesystem filesystem;
+  const std::string foo_file = temp_dir_ + "/foo_file";
+  const char* filename = foo_file.c_str();
+  int fd = filesystem.OpenForWrite(filename);
+  ASSERT_THAT(fd, Ne(-1));
+  char data[10000] = {0};  // Zero-init to satisfy msan.
+  EXPECT_TRUE(filesystem.Write(fd, data, sizeof(data)));
+  close(fd);
+  EXPECT_THAT(filesystem.GetFileSize(filename), Eq(sizeof(data)));
+  EXPECT_TRUE(filesystem.Truncate(filename, sizeof(data) / 2));
+  EXPECT_THAT(filesystem.GetFileSize(filename), Eq(sizeof(data) / 2));
+  EXPECT_TRUE(filesystem.Truncate(filename, 0));
+  EXPECT_THAT(filesystem.GetFileSize(filename), Eq(0u));
+}
+
+// Checks GetMatchingFiles() against a missing directory, an empty directory,
+// and a directory where only some files match the glob.
+TEST_F(FilesystemTest, GetMatchingFiles) {
+  Filesystem filesystem;
+  const std::string foo_dir = temp_dir_ + "/foo";
+  const std::string glob = foo_dir + "/p_*_q";
+  vector<std::string> matches;
+
+  // Non existing directory
+  EXPECT_TRUE(filesystem.GetMatchingFiles(glob.c_str(), &matches));
+  EXPECT_THAT(matches.size(), Eq(0u));
+
+  // Existing directory
+  matches.clear();
+  ASSERT_TRUE(filesystem.CreateDirectoryRecursively(foo_dir.c_str()));
+  EXPECT_TRUE(filesystem.GetMatchingFiles(glob.c_str(), &matches));
+  EXPECT_THAT(matches.size(), Eq(0u));
+
+  // With some files. Range-for replaces ABSL_ARRAYSIZE, which was used
+  // without including any absl header in this file.
+  matches.clear();
+  const char* files[] = {"p_1_q", "p_2_q", "p_3", "4_q"};
+  for (const char* file : files) {
+    // ScopedFd closes each file as soon as it is created.
+    ScopedFd fd(filesystem.OpenForWrite((foo_dir + "/" + file).c_str()));
+  }
+  const vector<std::string> expected = {foo_dir + "/p_1_q",
+                                        foo_dir + "/p_2_q"};
+  EXPECT_TRUE(filesystem.GetMatchingFiles(glob.c_str(), &matches));
+  sort(matches.begin(), matches.end());
+  EXPECT_THAT(matches, Eq(expected));
+}
+
+// Checks the three cases of IncrementByOrSetInvalid: normal addition,
+// invalid increment poisons the accumulator, and an already-invalid
+// accumulator stays invalid.
+TEST_F(FilesystemTest, IncrementByOrSetInvalid) {
+  int64_t to_increment = 1;
+  Filesystem::IncrementByOrSetInvalid(2, &to_increment);
+  EXPECT_THAT(to_increment, Eq(3));
+
+  Filesystem::IncrementByOrSetInvalid(Filesystem::kBadFileSize, &to_increment);
+  EXPECT_THAT(to_increment, Eq(Filesystem::kBadFileSize));
+
+  to_increment = Filesystem::kBadFileSize;
+  Filesystem::IncrementByOrSetInvalid(2, &to_increment);
+  EXPECT_THAT(to_increment, Eq(Filesystem::kBadFileSize));
+}
+
+// Creates a large sparse file, writes a small amount of real data into it,
+// and checks that fd-based GetDiskUsage() reports at least the real data but
+// (on sparse-capable filesystems) no more than the sparse extent.
+TEST_F(FilesystemTest, GetDiskUsage) {
+  Filesystem filesystem;
+  const std::string foo_dir = temp_dir_ + "/foo";
+
+  const int64_t kCluster = 4096;  // at least the anticipated fs cluster
+
+  ASSERT_TRUE(filesystem.CreateDirectoryRecursively(foo_dir.c_str()));
+
+  // Grow a sparse file, and then append to it.
+  const std::string filename = foo_dir + "/myfile";
+  // Size to expand the sparse file to.
+  const int64_t kExpandedSize = 100 * kCluster - 5;
+  // Actual data to write to the file.
+  const int64_t kJunkSize = 5 * kCluster - 10;
+
+  EXPECT_TRUE(filesystem.Truncate(filename.c_str(), kExpandedSize));
+  ScopedFd fd(filesystem.OpenForWrite(filename.c_str()));
+  WriteJunk(*fd, kJunkSize);
+
+  int64_t size = filesystem.GetDiskUsage(*fd);
+  EXPECT_THAT(size, Ge(kJunkSize));
+  if (FileSystemSupportsSparseFiles()) {
+    EXPECT_THAT(size, Le(kExpandedSize));
+  }
+}
+
+// Checks the recursive path-based GetDiskUsage() against a growing directory
+// tree: missing path, empty dirs, regular files, sparse files, and sparse
+// files with real data. Bounds are expressed in clusters since exact block
+// accounting is filesystem-dependent.
+TEST_F(FilesystemTest, GetDiskUsagePath) {
+  Filesystem filesystem;
+  const std::string foo_dir = temp_dir_ + "/foo";
+
+  const int64_t kCluster = 4096;  // at least the anticipated fs cluster
+
+  // Non-existing
+  {
+    EXPECT_THAT(filesystem.GetDiskUsage(foo_dir.c_str()),
+                Eq(Filesystem::kBadFileSize));
+  }
+
+  // A single directory
+  {
+    ASSERT_TRUE(filesystem.CreateDirectoryRecursively(foo_dir.c_str()));
+    int64_t size = filesystem.GetDiskUsage(foo_dir.c_str());
+    EXPECT_THAT(size, Ge(0 * kCluster));
+    EXPECT_THAT(size, Le(1 * kCluster));
+  }
+
+  // Nested directories
+  const std::string bar_dir = foo_dir + "/bar";
+  {
+    ASSERT_TRUE(filesystem.CreateDirectoryRecursively(bar_dir.c_str()));
+    int64_t size = filesystem.GetDiskUsage(bar_dir.c_str());
+    EXPECT_THAT(size, Ge(0 * kCluster));
+    EXPECT_THAT(size, Le(2 * kCluster));
+  }
+
+  // Two regular files
+  const std::string reg1 = bar_dir + "/f1";
+  const std::string reg2 = bar_dir + "/f2";
+  {
+    // Inner scope closes the fds before usage is measured.
+    {
+      ScopedFd f1(filesystem.OpenForWrite(reg1.c_str()));
+      ScopedFd f2(filesystem.OpenForWrite(reg2.c_str()));
+      WriteJunk(*f1, 5 * kCluster - 10);
+      WriteJunk(*f2, 8 * kCluster - 10);
+    }
+    int64_t size = filesystem.GetDiskUsage(foo_dir.c_str());
+    EXPECT_THAT(size, Ge(13 * kCluster));
+    EXPECT_THAT(size, Le(15 * kCluster));
+  }
+
+  // Two sparse files
+  const std::string sparse1 = foo_dir + "/s1";
+  const std::string sparse2 = foo_dir + "/s2";
+  {
+    EXPECT_TRUE(filesystem.Truncate(sparse1.c_str(), 100 * kCluster - 5));
+    EXPECT_TRUE(filesystem.Truncate(sparse2.c_str(), 200 * kCluster - 123));
+    int64_t size = filesystem.GetDiskUsage(foo_dir.c_str());
+    EXPECT_THAT(size, Ge(13 * kCluster));
+    if (FileSystemSupportsSparseFiles()) {
+      EXPECT_THAT(size, Le(17 * kCluster));
+    } else {
+      EXPECT_THAT(size, Le(313 * kCluster));
+    }
+  }
+
+  // Some junk in the sparse files
+  {
+    {
+      ScopedFd f1(filesystem.OpenForWrite(sparse1.c_str()));
+      ScopedFd f2(filesystem.OpenForWrite(sparse2.c_str()));
+      WriteJunk(*f1, 5 * kCluster - 10);
+      WriteJunk(*f2, 8 * kCluster - 10);
+    }
+    int64_t size = filesystem.GetDiskUsage(foo_dir.c_str());
+    EXPECT_THAT(size, Ge(26 * kCluster));
+    if (FileSystemSupportsSparseFiles()) {
+      EXPECT_THAT(size, Le(30 * kCluster));
+    } else {
+      EXPECT_THAT(size, Le(313 * kCluster));
+    }
+  }
+}
+
+// TODO(b/112435354): Add test case for original (non-recursive) ListDirectory()
+
+// Tests ListDirectory() with recursive dir search, with no exclusions
+// (simple test).
+TEST_F(FilesystemTest, ListDirectoryRecursiveSimple) {
+  Filesystem filesystem;
+  const std::string append_dir = temp_dir_ + "/append_test";
+  const std::string dir1_name = "dir1";
+  const std::string dir1_path = append_dir + "/" + dir1_name;
+  vector<std::string> some_files = {"file1", "file2", dir1_name + "/file3"};
+
+  // Make sure there is no pre-existing test-dir structure
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(append_dir.c_str()));
+
+  // Setup a test-dir structure
+  ASSERT_TRUE(filesystem.CreateDirectoryRecursively(
+      dir1_path.c_str()));  // deepest path for test
+  CreateTestFiles(some_files, append_dir);
+
+  // Call the ListDirectory API with recursive dir-search, no exclusions.
+  vector<std::string> result;
+  EXPECT_TRUE(filesystem.ListDirectory(append_dir.c_str(), /*exclude=*/{},
+                                       /*recursive=*/true, &result));
+
+  // Verify that all files are returned, and no extras. Note that the
+  // subdirectory entry ("dir1") itself is included alongside its contents.
+  EXPECT_THAT(result, UnorderedElementsAre(some_files[0], some_files[1],
+                                           dir1_name, some_files[2]));
+
+  // Clean up
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(append_dir.c_str()));
+}
+
+// Tests ListDirectory() with recursive dir search, with exclusions.
+// This test is similar in structure to ListDirectory_recursive_simple, but with
+// exclusions.
+// Recursive ListDirectory() with an exclusion: excluding "dir1" must drop
+// both the directory entry and everything beneath it.
+TEST_F(FilesystemTest, ListDirectoryRecursiveExclude) {
+  Filesystem filesystem;
+  const std::string append_dir = temp_dir_ + "/append_test";
+  const std::string dir1_name = "dir1";
+  const std::string dir1_path = append_dir + "/" + dir1_name;
+  vector<std::string> some_files = {"file1", "file2", dir1_name + "/file3"};
+
+  // Make sure there is no pre-existing test-dir structure
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(append_dir.c_str()));
+
+  // Setup a test-dir structure
+  ASSERT_TRUE(filesystem.CreateDirectoryRecursively(
+      dir1_path.c_str()));  // deepest path for test
+  CreateTestFiles(some_files, append_dir);
+
+  // Call the ListDirectory API with recursive dir-search, but exclude dir1.
+  // (The exclusion set is passed inline; the previous unused local
+  // `exclude` variable has been removed.)
+  vector<std::string> result;
+  bool success = filesystem.ListDirectory(append_dir.c_str(),
+                                          /*exclude=*/{dir1_name.c_str()},
+                                          /*recursive=*/true, &result);
+
+  // Verify that all files are returned, and no extras.
+  EXPECT_TRUE(success);
+  EXPECT_THAT(result, UnorderedElementsAre(some_files[0], some_files[1]));
+
+  // Clean up
+  ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(append_dir.c_str()));
+}
+
+// Covers Read/PRead by fd and by filename: fd-based Read follows and
+// advances the file position, while PRead is position-independent.
+TEST_F(FilesystemTest, ReadWrite) {
+  Filesystem filesystem;
+  const std::string foo_file = temp_dir_ + "/foo_file";
+  int fd = filesystem.OpenForWrite(foo_file.c_str());
+  // Assert the open succeeded instead of silently writing to fd -1.
+  ASSERT_THAT(fd, Ne(-1));
+  const std::string data = "hello world";
+  EXPECT_TRUE(filesystem.Write(fd, data.c_str(), data.length()));
+
+  std::string hello;
+  hello.resize(strlen("hello"));
+  EXPECT_TRUE(filesystem.Read(foo_file.c_str(), &hello[0], strlen("hello")));
+  EXPECT_THAT(hello, Eq("hello"));
+
+  // Read starts from wherever file offset is at the moment.
+  filesystem.SetPosition(fd, 0);
+  hello.clear();
+  hello.resize(strlen("hello"));
+  EXPECT_TRUE(filesystem.Read(fd, &hello[0], strlen("hello")));
+  EXPECT_THAT(hello, Eq("hello"));
+
+  // Shouldn't need to move file offset anymore since file offset gets updated
+  // after the read.
+  std::string world;
+  world.resize(strlen(" world"));
+  EXPECT_TRUE(filesystem.Read(fd, &world[0], strlen(" world")));
+  EXPECT_THAT(world, Eq(" world"));
+
+  // PRead should not be dependent on the file offset
+  world.clear();
+  world.resize(strlen(" world"));
+  EXPECT_TRUE(
+      filesystem.PRead(fd, &world[0], strlen(" world"), strlen("hello")));
+  EXPECT_THAT(world, Eq(" world"));
+
+  hello.clear();
+  hello.resize(strlen("hello"));
+  EXPECT_TRUE(
+      filesystem.PRead(foo_file.c_str(), &hello[0], strlen("hello"), 0));
+  EXPECT_THAT(hello, Eq("hello"));
+
+  // Close the descriptor opened above; it was previously leaked.
+  EXPECT_THAT(close(fd), Eq(0));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/memory-mapped-file.cc b/icing/file/memory-mapped-file.cc
new file mode 100644
index 0000000..ebd419b
--- /dev/null
+++ b/icing/file/memory-mapped-file.cc
@@ -0,0 +1,171 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// TODO(cassiewang) Add unit-tests to this class.
+
+#include "icing/file/memory-mapped-file.h"
+
+#include <sys/mman.h>
+
+#include <cerrno>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/math-util.h"
+
+namespace icing {
+namespace lib {
+
+// Stores the dependencies and strategy; no region is mapped until Remap()
+// is called. `filesystem` must outlive this object (held by pointer).
+MemoryMappedFile::MemoryMappedFile(const Filesystem& filesystem,
+                                   const std::string_view file_path,
+                                   Strategy mmap_strategy)
+    : filesystem_(&filesystem),
+      file_path_(file_path),
+      strategy_(mmap_strategy) {}
+
+// Releases any active mapping when the object is destroyed.
+MemoryMappedFile::~MemoryMappedFile() { Unmap(); }
+
+// Releases the active mapping (if any) and resets all bookkeeping to the
+// unmapped state. Safe to call when nothing is mapped.
+// (Also fixes the doubly-qualified `MemoryMappedFile::MemoryMappedFile::`
+// in the original definition.)
+void MemoryMappedFile::Unmap() {
+  if (mmap_result_ != nullptr) {
+    // Unmap the full mapped extent. Remap() maps adjusted_mmap_size_ bytes
+    // (alignment adjustment + requested size), so unmapping only
+    // region_size_ could leak the final page whenever the alignment
+    // adjustment pushes the mapping across a page boundary.
+    munmap(mmap_result_, adjusted_mmap_size_);
+    mmap_result_ = nullptr;
+  }
+
+  file_offset_ = 0;
+  region_ = nullptr;
+  region_size_ = 0;
+  adjusted_mmap_size_ = 0;
+}
+
+// Maps the byte range [file_offset, file_offset + mmap_size) of the file into
+// memory according to strategy_, replacing any previous mapping. Returns OK
+// on success (including the trivial mmap_size == 0 case), INTERNAL if the
+// file cannot be opened or mmap() fails.
+libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset,
+                                                   size_t mmap_size) {
+  // First unmap any previously mmapped region.
+  Unmap();
+
+  if (mmap_size == 0) {
+    // Nothing more to do.
+    return libtextclassifier3::Status::OK;
+  }
+
+  // mmap() requires a page-aligned file offset, so round down to the nearest
+  // page boundary and map the extra `alignment_adjustment` bytes in front of
+  // the caller's requested offset.
+  size_t aligned_offset =
+      math_util::RoundDownTo(file_offset, system_page_size());
+  size_t alignment_adjustment = file_offset - aligned_offset;
+  size_t adjusted_mmap_size = alignment_adjustment + mmap_size;
+
+  int mmap_flags = 0;
+  // Determines if the mapped region should just be readable or also writable.
+  int protection_flags = 0;
+  ScopedFd fd;
+  switch (strategy_) {
+    case Strategy::READ_ONLY: {
+      mmap_flags = MAP_PRIVATE;
+      protection_flags = PROT_READ;
+      fd.reset(filesystem_->OpenForRead(file_path_.c_str()));
+      break;
+    }
+    case Strategy::READ_WRITE_AUTO_SYNC: {
+      // MAP_SHARED lets the kernel write dirty pages back to the file.
+      mmap_flags = MAP_SHARED;
+      protection_flags = PROT_READ | PROT_WRITE;
+      fd.reset(filesystem_->OpenForWrite(file_path_.c_str()));
+      break;
+    }
+    case Strategy::READ_WRITE_MANUAL_SYNC: {
+      // MAP_PRIVATE gives copy-on-write pages that never reach the file;
+      // PersistToDisk() writes them back explicitly.
+      mmap_flags = MAP_PRIVATE;
+      protection_flags = PROT_READ | PROT_WRITE;
+      // TODO(cassiewang) MAP_PRIVATE effectively makes it a read-only file.
+      // figure out if we can open this file in read-only mode.
+      fd.reset(filesystem_->OpenForWrite(file_path_.c_str()));
+      break;
+    }
+    default:
+      return absl_ports::UnknownError(IcingStringUtil::StringPrintf(
+          "Invalid value in switch statement: %d", strategy_));
+  }
+
+  if (!fd.is_valid()) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Unable to open file meant to be mmapped: ", file_path_));
+  }
+
+  // The fd may be closed immediately after mmap() returns; the mapping keeps
+  // its own reference to the underlying file.
+  mmap_result_ = mmap(nullptr, adjusted_mmap_size, protection_flags, mmap_flags,
+                      fd.get(), aligned_offset);
+
+  if (mmap_result_ == MAP_FAILED) {
+    mmap_result_ = nullptr;
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Failed to mmap region due to error: ", strerror(errno)));
+  }
+
+  // region_ points at the caller's requested offset, inside the (possibly
+  // larger) page-aligned mapping.
+  file_offset_ = file_offset;
+  region_ = reinterpret_cast<char*>(mmap_result_) + alignment_adjustment;
+  region_size_ = mmap_size;
+  adjusted_mmap_size_ = adjusted_mmap_size;
+  return libtextclassifier3::Status::OK;
+}
+
+// Flushes in-memory changes in the mapped region to the underlying file.
+// Returns FAILED_PRECONDITION for read-only mappings, OK if nothing is
+// mapped, INTERNAL if msync()/PWrite() fails.
+libtextclassifier3::Status MemoryMappedFile::PersistToDisk() {
+  if (strategy_ == Strategy::READ_ONLY) {
+    return absl_ports::FailedPreconditionError(absl_ports::StrCat(
+        "Attempting to PersistToDisk on a read-only file: ", file_path_));
+  }
+
+  if (region_ == nullptr) {
+    // Nothing mapped to sync.
+    return libtextclassifier3::Status::OK;
+  }
+
+  // MAP_SHARED mappings can be flushed synchronously with msync().
+  if (strategy_ == Strategy::READ_WRITE_AUTO_SYNC &&
+      msync(mmap_result_, adjusted_mmap_size_, MS_SYNC) != 0) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Unable to sync file using msync(): ", file_path_));
+  }
+
+  // In order to prevent automatic syncing of changes, files that use the
+  // READ_WRITE_MANUAL_SYNC strategy are mmapped using MAP_PRIVATE. Such files
+  // can't be synced using msync(). So, we have to directly write to the
+  // underlying file to update it.
+  if (strategy_ == Strategy::READ_WRITE_MANUAL_SYNC &&
+      !filesystem_->PWrite(file_path_.c_str(), 0, region(), region_size())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Unable to sync file using PWrite(): ", file_path_));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Advises the kernel (madvise) about the expected access pattern for the
+// currently mapped region. Returns OK when nothing is mapped or the advice
+// succeeds, INTERNAL if madvise() fails.
+libtextclassifier3::Status MemoryMappedFile::OptimizeFor(
+    AccessPattern access_pattern) {
+  if (mmap_result_ == nullptr) {
+    // Nothing is mapped; there is no region to advise the kernel about.
+    // Previously madvise() was invoked on a null region, relying on the
+    // kernel tolerating a zero-length range.
+    return libtextclassifier3::Status::OK;
+  }
+
+  // Translate the access pattern into the corresponding madvise() flag.
+  int madvise_flag = 0;
+  if (access_pattern == AccessPattern::ACCESS_ALL) {
+    madvise_flag = MADV_WILLNEED;
+  } else if (access_pattern == AccessPattern::ACCESS_NONE) {
+    madvise_flag = MADV_DONTNEED;
+  } else if (access_pattern == AccessPattern::ACCESS_RANDOM) {
+    madvise_flag = MADV_RANDOM;
+  } else if (access_pattern == AccessPattern::ACCESS_SEQUENTIAL) {
+    madvise_flag = MADV_SEQUENTIAL;
+  }
+
+  if (madvise(mmap_result_, adjusted_mmap_size_, madvise_flag) != 0) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Unable to madvise file ", file_path_, "; Error: ", strerror(errno)));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/memory-mapped-file.h b/icing/file/memory-mapped-file.h
new file mode 100644
index 0000000..1be3dd8
--- /dev/null
+++ b/icing/file/memory-mapped-file.h
@@ -0,0 +1,155 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Allows memory-mapping a full file or a specific region within the file.
+// It also supports efficiently switching the region being mapped.
+//
+// Note on Performance:
+// It supports different optimized strategies for common patterns on both
+// read-only and read-write files. This includes using read-ahead buffers for
+// faster reads as well as background-sync vs manual-sync of changes to disk.
+// For more details, see comments at MemoryMappedFile::Strategy.
+//
+// Usage:
+//
+// MemoryMappedFile mmapped_file(filesystem, "/file.pb", READ_WRITE_AUTO_SYNC);
+// mmapped_file.Remap(0, 16 * 1024);  // Load the first 16KiB of the file.
+//
+// char read_byte = mmapped_file.region()[100];
+// mmapped_file.mutable_region()[10] = write_byte;
+//
+// mmapped_file.PersistToDisk();  // Optional; immediately writes changes to
+//                                // disk.
+//
+// mmapped_file.Remap(16 * 1024, 16 * 1024);  // Load the next 16KiB.
+// mmapped_file.mutable_region()[10] = write_byte;
+// The mapped region is unmapped when mmapped_file goes out of scope.
+
+#ifndef ICING_FILE_MEMORY_MAPPED_FILE_H_
+#define ICING_FILE_MEMORY_MAPPED_FILE_H_
+
+#include <unistd.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "utils/base/status.h"
+#include "icing/file/filesystem.h"
+
+namespace icing {
+namespace lib {
+
+class MemoryMappedFile {
+ public:
+  // Returns the system's memory page size, cached after the first call. The
+  // gcc `const` attribute tells the compiler the result never changes, so
+  // repeated calls may be folded.
+  static size_t __attribute__((const)) system_page_size() {
+    static const size_t page_size = sysconf(_SC_PAGE_SIZE);
+    return page_size;
+  }
+
+  enum Strategy {
+    // Memory map a read-only file into a read-only memory region.
+    READ_ONLY,
+
+    // Memory map a read-write file into a writable memory region. Any changes
+    // made to the region are automatically flushed to the underlying file in
+    // the background.
+    READ_WRITE_AUTO_SYNC,
+
+    // Memory map a read-write file into a writable memory region. Changes made
+    // to this region will never be auto-synced to the underlying file. Unless
+    // the caller explicitly calls PersistToDisk(), all changes will be lost
+    // when the MemoryMappedFile is destroyed.
+    READ_WRITE_MANUAL_SYNC,
+  };
+
+  // filesystem    : Filesystem implementation used for manual-sync writes;
+  //                 must outlive this instance (only a pointer is stored).
+  // file_path     : Full path of the file that needs to be memory-mapped.
+  // mmap_strategy : Read/write/sync behavior; see Strategy above.
+  MemoryMappedFile(const Filesystem& filesystem, std::string_view file_path,
+                   Strategy mmap_strategy);
+
+  // Frees any region that is still memory-mapped.
+  ~MemoryMappedFile();
+
+  // Memory-map the newly specified region within the file specified by
+  // file_offset and mmap_size. Unmaps any previously mmapped region.
+  //
+  // Returns any encountered IO error.
+  libtextclassifier3::Status Remap(size_t file_offset, size_t mmap_size);
+
+  // Unmaps and frees up the region that has currently been memory-mapped.
+  void Unmap();
+
+  // Explicitly persist any changes made to the currently mapped region to
+  // disk.
+  //
+  // NOTE: This is only valid for the READ_WRITE_AUTO_SYNC and
+  // READ_WRITE_MANUAL_SYNC strategies; calling it on a READ_ONLY file
+  // returns FAILED_PRECONDITION.
+  libtextclassifier3::Status PersistToDisk();
+
+  // Advise the system to help it optimize the memory-mapped region for
+  // upcoming read/write operations.
+  //
+  // NOTE: See linux documentation of madvise() for additional details.
+  enum AccessPattern {
+    // Future memory accesses are expected to be in random order. So,
+    // readahead will have limited impact on latency.
+    ACCESS_RANDOM,
+
+    // Future memory accesses are expected to be sequential. So, some
+    // readahead can greatly improve latency.
+    ACCESS_SEQUENTIAL,
+
+    // Future memory access is expected to be high-volume and all over the file.
+    // So, preloading the whole region into memory would greatly improve
+    // latency.
+    ACCESS_ALL,
+
+    // Future memory access is expected to be rare. So, it is best to free up
+    // as much of preloaded memory as possible.
+    ACCESS_NONE,
+  };
+  libtextclassifier3::Status OptimizeFor(AccessPattern access_pattern);
+
+  // Accessors to the memory-mapped region. Returns null if nothing is mapped.
+  const char* region() const { return region_; }
+  char* mutable_region() { return region_; }
+
+  size_t region_size() const { return region_size_; }
+  Strategy strategy() const { return strategy_; }
+
+ private:
+  // Cached constructor params.
+  const Filesystem* const filesystem_;
+  const std::string file_path_;
+  const Strategy strategy_;
+
+  // Offset within the file at which the current memory-mapped region starts.
+  size_t file_offset_ = 0;
+
+  // Region that is currently memory-mapped. Points into the page-aligned
+  // mmap_result_ buffer at the caller-requested offset.
+  char* region_ = nullptr;
+  size_t region_size_ = 0;
+
+  // The actual size of the region we mmapped. As the requested region might not
+  // align with system pages, we often mmap more bytes than requested.
+  size_t adjusted_mmap_size_ = 0;
+
+  // Raw pointer (or error) returned by calls to mmap().
+  void* mmap_result_ = nullptr;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_MEMORY_MAPPED_FILE_H_
diff --git a/icing/file/mock-filesystem.h b/icing/file/mock-filesystem.h
new file mode 100644
index 0000000..a82f253
--- /dev/null
+++ b/icing/file/mock-filesystem.h
@@ -0,0 +1,325 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_MOCK_FILESYSTEM_H_
+#define ICING_FILE_MOCK_FILESYSTEM_H_
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "icing/file/filesystem.h"
+
+namespace icing {
+namespace lib {
+using ::testing::_;
+using ::testing::A;
+
+// gMock mock of Filesystem for tests. Every method is mocked, but the
+// constructor installs ON_CALL defaults that delegate to a real Filesystem
+// instance, so a test only needs EXPECT_CALL overrides for the calls it
+// wants to fail or observe; everything else behaves like the real thing.
+class MockFilesystem : public Filesystem {
+ public:
+  MockFilesystem() {
+    // For all methods, we always delegate calls to a real Filesystem instance
+    // by default.
+    //
+    // NOTE(review): some lambdas below use unqualified `string` /
+    // `unordered_set` — presumably filesystem.h brings these into scope;
+    // confirm against that header.
+
+    ON_CALL(*this, DeleteFile).WillByDefault([this](const char* file_name) {
+      return real_filesystem_.DeleteFile(file_name);
+    });
+
+    ON_CALL(*this, DeleteDirectory).WillByDefault([this](const char* dir_name) {
+      return real_filesystem_.DeleteDirectory(dir_name);
+    });
+
+    ON_CALL(*this, DeleteDirectoryRecursively)
+        .WillByDefault([this](const char* dir_name) {
+          return real_filesystem_.DeleteDirectoryRecursively(dir_name);
+        });
+
+    ON_CALL(*this, FileExists).WillByDefault([this](const char* file_name) {
+      return real_filesystem_.FileExists(file_name);
+    });
+
+    ON_CALL(*this, DirectoryExists).WillByDefault([this](const char* dir_name) {
+      return real_filesystem_.DirectoryExists(dir_name);
+    });
+
+    ON_CALL(*this, GetBasenameIndex)
+        .WillByDefault([this](const char* file_name) {
+          return real_filesystem_.GetBasenameIndex(file_name);
+        });
+
+    ON_CALL(*this, GetBasename).WillByDefault([this](const char* file_name) {
+      return real_filesystem_.GetBasename(file_name);
+    });
+
+    ON_CALL(*this, GetDirname).WillByDefault([this](const char* file_name) {
+      return real_filesystem_.GetDirname(file_name);
+    });
+
+    // Overloaded methods need explicit matchers (A<T>() / _) so each ON_CALL
+    // binds to the right overload.
+    ON_CALL(*this, ListDirectory(_, _))
+        .WillByDefault(
+            [this](const char* dir_name, std::vector<string>* entries) {
+              return real_filesystem_.ListDirectory(dir_name, entries);
+            });
+
+    ON_CALL(*this, ListDirectory(_, _, _, _))
+        .WillByDefault([this](const char* dir_name,
+                              const std::unordered_set<string>& exclude,
+                              bool recursive, std::vector<string>* entries) {
+          return real_filesystem_.ListDirectory(dir_name, exclude, recursive,
+                                                entries);
+        });
+
+    ON_CALL(*this, GetMatchingFiles)
+        .WillByDefault([this](const char* glob, std::vector<string>* matches) {
+          return real_filesystem_.GetMatchingFiles(glob, matches);
+        });
+
+    ON_CALL(*this, OpenForWrite).WillByDefault([this](const char* file_name) {
+      return real_filesystem_.OpenForWrite(file_name);
+    });
+
+    ON_CALL(*this, OpenForAppend).WillByDefault([this](const char* file_name) {
+      return real_filesystem_.OpenForAppend(file_name);
+    });
+
+    ON_CALL(*this, OpenForRead).WillByDefault([this](const char* file_name) {
+      return real_filesystem_.OpenForRead(file_name);
+    });
+
+    ON_CALL(*this, GetFileSize(A<int>())).WillByDefault([this](int fd) {
+      return real_filesystem_.GetFileSize(fd);
+    });
+
+    ON_CALL(*this, GetFileSize(A<const char*>()))
+        .WillByDefault([this](const char* file_name) {
+          return real_filesystem_.GetFileSize(file_name);
+        });
+
+    ON_CALL(*this, Truncate(A<int>(), _))
+        .WillByDefault([this](int fd, int64_t new_size) {
+          return real_filesystem_.Truncate(fd, new_size);
+        });
+
+    ON_CALL(*this, Truncate(A<const char*>(), _))
+        .WillByDefault([this](const char* filename, int64_t new_size) {
+          return real_filesystem_.Truncate(filename, new_size);
+        });
+
+    ON_CALL(*this, Grow(A<int>(), _))
+        .WillByDefault([this](int fd, int64_t new_size) {
+          return real_filesystem_.Grow(fd, new_size);
+        });
+
+    ON_CALL(*this, Grow(A<const char*>(), _))
+        .WillByDefault([this](const char* filename, int64_t new_size) {
+          return real_filesystem_.Grow(filename, new_size);
+        });
+
+    ON_CALL(*this, Write(A<int>(), _, _))
+        .WillByDefault([this](int fd, const void* data, size_t data_size) {
+          return real_filesystem_.Write(fd, data, data_size);
+        });
+
+    ON_CALL(*this, Write(A<const char*>(), _, _))
+        .WillByDefault(
+            [this](const char* filename, const void* data, size_t data_size) {
+              return real_filesystem_.Write(filename, data, data_size);
+            });
+
+    ON_CALL(*this, PWrite(A<int>(), _, _, _))
+        .WillByDefault(
+            [this](int fd, off_t offset, const void* data, size_t data_size) {
+              return real_filesystem_.PWrite(fd, offset, data, data_size);
+            });
+
+    ON_CALL(*this, PWrite(A<const char*>(), _, _, _))
+        .WillByDefault([this](const char* filename, off_t offset,
+                              const void* data, size_t data_size) {
+          return real_filesystem_.PWrite(filename, offset, data, data_size);
+        });
+
+    ON_CALL(*this, Read(A<int>(), _, _))
+        .WillByDefault([this](int fd, void* buf, size_t buf_size) {
+          return real_filesystem_.Read(fd, buf, buf_size);
+        });
+
+    ON_CALL(*this, Read(A<const char*>(), _, _))
+        .WillByDefault(
+            [this](const char* filename, void* buf, size_t buf_size) {
+              return real_filesystem_.Read(filename, buf, buf_size);
+            });
+
+    ON_CALL(*this, PRead(A<int>(), _, _, _))
+        .WillByDefault(
+            [this](int fd, void* buf, size_t buf_size, off_t offset) {
+              return real_filesystem_.PRead(fd, buf, buf_size, offset);
+            });
+
+    ON_CALL(*this, PRead(A<const char*>(), _, _, _))
+        .WillByDefault([this](const char* filename, void* buf, size_t buf_size,
+                              off_t offset) {
+          return real_filesystem_.PRead(filename, buf, buf_size, offset);
+        });
+
+    ON_CALL(*this, DataSync).WillByDefault([this](int fd) {
+      return real_filesystem_.DataSync(fd);
+    });
+
+    ON_CALL(*this, RenameFile)
+        .WillByDefault([this](const char* old_name, const char* new_name) {
+          return real_filesystem_.RenameFile(old_name, new_name);
+        });
+
+    ON_CALL(*this, SwapFiles)
+        .WillByDefault([this](const char* one, const char* two) {
+          return real_filesystem_.SwapFiles(one, two);
+        });
+
+    ON_CALL(*this, CreateDirectory).WillByDefault([this](const char* dir_name) {
+      return real_filesystem_.CreateDirectory(dir_name);
+    });
+
+    ON_CALL(*this, CreateDirectoryRecursively)
+        .WillByDefault([this](const char* dir_name) {
+          return real_filesystem_.CreateDirectoryRecursively(dir_name);
+        });
+
+    ON_CALL(*this, GetDiskUsage(A<int>())).WillByDefault([this](int fd) {
+      return real_filesystem_.GetDiskUsage(fd);
+    });
+
+    ON_CALL(*this, GetFileDiskUsage).WillByDefault([this](const char* path) {
+      return real_filesystem_.GetFileDiskUsage(path);
+    });
+
+    ON_CALL(*this, GetDiskUsage(A<const char*>()))
+        .WillByDefault([this](const char* path) {
+          return real_filesystem_.GetDiskUsage(path);
+        });
+
+    ON_CALL(*this, GetCurrentPosition).WillByDefault([this](int fd) {
+      return real_filesystem_.GetCurrentPosition(fd);
+    });
+
+    ON_CALL(*this, SetPosition).WillByDefault([this](int fd, int offset) {
+      return real_filesystem_.SetPosition(fd, offset);
+    });
+  }
+
+  // Mock declarations mirroring the Filesystem interface one-for-one.
+  MOCK_METHOD(bool, DeleteFile, (const char* file_name), (const));
+
+  MOCK_METHOD(bool, DeleteDirectory, (const char* dir_name), (const));
+
+  MOCK_METHOD(bool, DeleteDirectoryRecursively, (const char* dir_name),
+              (const));
+
+  MOCK_METHOD(bool, FileExists, (const char* file_name), (const));
+
+  MOCK_METHOD(bool, DirectoryExists, (const char* dir_name), (const));
+
+  MOCK_METHOD(int, GetBasenameIndex, (const char* file_name), (const));
+
+  MOCK_METHOD(std::string, GetBasename, (const char* file_name), (const));
+
+  MOCK_METHOD(std::string, GetDirname, (const char* file_name), (const));
+
+  MOCK_METHOD(bool, ListDirectory,
+              (const char* dir_name, std::vector<std::string>* entries),
+              (const));
+
+  MOCK_METHOD(bool, ListDirectory,
+              (const char* dir_name,
+               const std::unordered_set<std::string>& exclude, bool recursive,
+               std::vector<std::string>* entries),
+              (const));
+
+  MOCK_METHOD(bool, GetMatchingFiles,
+              (const char* glob, std::vector<std::string>* matches), (const));
+
+  MOCK_METHOD(int, OpenForWrite, (const char* file_name), (const));
+
+  MOCK_METHOD(int, OpenForAppend, (const char* file_name), (const));
+
+  MOCK_METHOD(int, OpenForRead, (const char* file_name), (const));
+
+  MOCK_METHOD(int64_t, GetFileSize, (int fd), (const));
+
+  MOCK_METHOD(int64_t, GetFileSize, (const char* filename), (const));
+
+  MOCK_METHOD(bool, Truncate, (int fd, int64_t new_size), (const));
+
+  MOCK_METHOD(bool, Truncate, (const char* filename, int64_t new_size),
+              (const));
+
+  MOCK_METHOD(bool, Grow, (int fd, int64_t new_size), (const));
+
+  MOCK_METHOD(bool, Grow, (const char* filename, int64_t new_size), (const));
+
+  MOCK_METHOD(bool, Write, (int fd, const void* data, size_t data_size),
+              (const));
+
+  MOCK_METHOD(bool, Write,
+              (const char* filename, const void* data, size_t data_size),
+              (const));
+
+  MOCK_METHOD(bool, PWrite,
+              (int fd, off_t offset, const void* data, size_t data_size),
+              (const));
+
+  MOCK_METHOD(bool, PWrite,
+              (const char* filename, off_t offset, const void* data,
+               size_t data_size),
+              (const));
+
+  MOCK_METHOD(bool, Read, (int fd, void* buf, size_t buf_size), (const));
+
+  MOCK_METHOD(bool, Read, (const char* filename, void* buf, size_t buf_size),
+              (const));
+
+  MOCK_METHOD(bool, PRead, (int fd, void* buf, size_t buf_size, off_t offset),
+              (const));
+
+  MOCK_METHOD(bool, PRead,
+              (const char* filename, void* buf, size_t buf_size, off_t offset),
+              (const));
+
+  MOCK_METHOD(bool, DataSync, (int fd), (const));
+
+  MOCK_METHOD(bool, RenameFile, (const char* old_name, const char* new_name),
+              (const));
+
+  MOCK_METHOD(bool, SwapFiles, (const char* one, const char* two), (const));
+
+  MOCK_METHOD(bool, CreateDirectory, (const char* dir_name), (const));
+
+  MOCK_METHOD(bool, CreateDirectoryRecursively, (const char* dir_name),
+              (const));
+
+  MOCK_METHOD(int64_t, GetDiskUsage, (int fd), (const));
+
+  MOCK_METHOD(int64_t, GetFileDiskUsage, (const char* path), (const));
+
+  MOCK_METHOD(int64_t, GetDiskUsage, (const char* path), (const));
+
+  MOCK_METHOD(int64_t, GetCurrentPosition, (int fd), (const));
+
+  MOCK_METHOD(int64_t, SetPosition, (int fd, int offset), (const));
+
+ private:
+  // Real implementation that backs all of the ON_CALL defaults above.
+  Filesystem real_filesystem_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_MOCK_FILESYSTEM_H_
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
new file mode 100644
index 0000000..8e1d469
--- /dev/null
+++ b/icing/icing-search-engine.cc
@@ -0,0 +1,649 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/icing-search-engine.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/annotate.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/index-processor.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/icing-search-engine-options.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/query/query-processor.h"
+#include "icing/result-retriever.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scoring-processor.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr std::string_view kDocumentAndIndexSubfolderName =
+ "document_index_dir";
+constexpr std::string_view kSchemaSubfolderName = "schema_dir";
+constexpr std::string_view kIcingSearchEngineHeaderFilename =
+ "icing_search_engine_header";
+
+// Validates user-supplied engine options.
+//
+// Returns INVALID_ARGUMENT if any option is out of range; OK otherwise.
+libtextclassifier3::Status ValidateOptions(
+    const IcingSearchEngineOptions& options) {
+  // These options are only used in IndexProcessor, which won't be created
+  // until the first Put call. So they must be checked here, so that any
+  // errors can be surfaced in Initialize.
+  if (options.max_tokens_per_doc() <= 0) {
+    return absl_ports::InvalidArgumentError(
+        "Options::max_tokens_per_doc must be greater than zero.");
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Validates a per-query ResultSpec.
+//
+// Returns INVALID_ARGUMENT if num_to_retrieve is negative; OK otherwise.
+libtextclassifier3::Status ValidateResultSpec(
+    const ResultSpecProto& result_spec) {
+  if (result_spec.num_to_retrieve() < 0) {
+    return absl_ports::InvalidArgumentError(
+        "ResultSpec::num_to_retrieve cannot be negative.");
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Builds IndexProcessor options from the engine options. Documents exceeding
+// the token limit are truncated but indexing still succeeds
+// (kSuppressError), rather than failing the Put.
+IndexProcessor::Options CreateIndexProcessorOptions(
+    const IcingSearchEngineOptions& options) {
+  IndexProcessor::Options index_processor_options;
+  index_processor_options.max_tokens_per_document =
+      options.max_tokens_per_doc();
+  index_processor_options.token_limit_behavior =
+      IndexProcessor::Options::TokenLimitBehavior::kSuppressError;
+  return index_processor_options;
+}
+
+// Returns the full path of the engine header file under `base_dir`.
+std::string MakeHeaderFilename(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kIcingSearchEngineHeaderFilename);
+}
+
+// Document store and index files are in a standalone subfolder because they
+// can be re-generated at the same time during full optimization. Others like
+// schema store can be optimized separately.
+std::string MakeDocumentAndIndexDirectoryPath(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kDocumentAndIndexSubfolderName);
+}
+
+// Makes a temporary folder path for document and index which will be used
+// during full optimization (see IcingSearchEngine::Optimize); the optimized
+// copy is built here and then swapped with the live directory.
+std::string MakeDocumentAndIndexTemporaryDirectoryPath(
+    const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kDocumentAndIndexSubfolderName,
+                            "_optimize_tmp");
+}
+
+// SchemaStore files are in a standalone subfolder for easier file management.
+// We can delete and recreate the subfolder and not touch/affect anything
+// else.
+std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
+}
+
+// Helper function to wrap results in ScoredDocumentHit without changing the
+// order. Consumes the iterator, emitting at most num_to_return hits; each
+// hit's score is a placeholder 0 since no ranking was requested.
+std::vector<ScoredDocumentHit> WrapResults(
+    std::unique_ptr<DocHitInfoIterator> result_iterator, int num_to_return) {
+  std::vector<ScoredDocumentHit> document_hits;
+  // NOTE: Advance() is evaluated before the count check, so the iterator may
+  // be advanced one extra time past the last returned hit.
+  while (result_iterator->Advance().ok() && num_to_return-- > 0) {
+    const DocHitInfo& doc_hit_info = result_iterator->doc_hit_info();
+    // Score is just a placeholder here and has no meaning.
+    document_hits.emplace_back(doc_hit_info.document_id(),
+                               doc_hit_info.hit_section_ids_mask(),
+                               /*score=*/0);
+  }
+  return document_hits;
+}
+
+// Scores and ranks up to num_to_return hits from result_iterator according to
+// scoring_spec. With RankingStrategy::NONE the hits are returned in iterator
+// order with placeholder scores; otherwise a ScoringProcessor is created and
+// used. Propagates any error from ScoringProcessor::Create.
+libtextclassifier3::StatusOr<std::vector<ScoredDocumentHit>> RunScoring(
+    std::unique_ptr<DocHitInfoIterator> result_iterator,
+    const ScoringSpecProto& scoring_spec, int num_to_return,
+    const DocumentStore* document_store) {
+  if (scoring_spec.rank_by() == ScoringSpecProto::RankingStrategy::NONE) {
+    // No scoring needed, return in original order
+    return WrapResults(std::move(result_iterator), num_to_return);
+  }
+
+  ICING_ASSIGN_OR_RETURN(
+      std::unique_ptr<ScoringProcessor> scoring_processor,
+      ScoringProcessor::Create(scoring_spec, document_store));
+  return scoring_processor->ScoreAndRank(std::move(result_iterator),
+                                         num_to_return);
+}
+
+} // namespace
+
+// Public constructor: delegates to the testing constructor with production
+// Filesystem and Clock implementations.
+IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options)
+    : IcingSearchEngine(options, std::make_unique<Filesystem>(),
+                        std::make_unique<Clock>()) {}
+
+// Constructor with injectable Filesystem/Clock (used by tests). Only caches
+// dependencies; all real setup happens in Initialize().
+IcingSearchEngine::IcingSearchEngine(
+    IcingSearchEngineOptions options,
+    std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock)
+    : options_(std::move(options)),
+      filesystem_(std::move(filesystem)),
+      icing_filesystem_(std::make_unique<IcingFilesystem>()),
+      clock_(std::move(clock)) {
+  ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
+}
+
+// Destructor: best-effort flush of in-memory state. A PersistToDisk failure
+// here is only logged — destructors cannot propagate errors.
+IcingSearchEngine::~IcingSearchEngine() {
+  if (initialized_) {
+    if (!PersistToDisk().ok()) {
+      ICING_LOG(ERROR)
+          << "Error persisting to disk in IcingSearchEngine destructor";
+    }
+  }
+}
+
+// Creates all on-disk directories and subcomponents (schema store, document
+// store, segmenter, normalizer, index), then cross-checks them for
+// consistency, regenerating derived data if the check fails. Must succeed
+// before any other public method is usable (sets initialized_).
+//
+// Returns:
+//   OK on success
+//   INVALID_ARGUMENT if options are bad
+//   INTERNAL if a directory cannot be created
+//   Any error propagated from subcomponent creation or regeneration
+libtextclassifier3::Status IcingSearchEngine::Initialize() {
+  ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
+                << options_.base_dir();
+
+  ICING_RETURN_IF_ERROR(ValidateOptions(options_));
+
+  // This method does both read and write so we need a writer lock. Using two
+  // locks (reader and writer) has the chance to be interrupted during
+  // switching.
+  absl_ports::unique_lock l(&mutex_);
+
+  // Make sure the base directory exists
+  if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Could not create directory: ", options_.base_dir()));
+  }
+
+  const std::string schema_store_dir =
+      MakeSchemaDirectoryPath(options_.base_dir());
+  // Make sure the sub-directory exists
+  if (!filesystem_->CreateDirectoryRecursively(schema_store_dir.c_str())) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Could not create directory: ", schema_store_dir));
+  }
+  ICING_ASSIGN_OR_RETURN(
+      schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir));
+
+  const std::string document_store_and_index_dir =
+      MakeDocumentAndIndexDirectoryPath(options_.base_dir());
+  // Make sure the sub-directory exists
+  if (!filesystem_->CreateDirectoryRecursively(
+          document_store_and_index_dir.c_str())) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Could not create directory: ", document_store_and_index_dir));
+  }
+  ICING_ASSIGN_OR_RETURN(
+      document_store_,
+      DocumentStore::Create(filesystem_.get(), document_store_and_index_dir,
+                            clock_.get(), schema_store_.get()));
+
+  ICING_ASSIGN_OR_RETURN(language_segmenter_,
+                         LanguageSegmenter::Create(options_.lang_model_path()));
+
+  ICING_ASSIGN_OR_RETURN(normalizer_,
+                         Normalizer::Create(options_.max_token_length()));
+
+  Index::Options index_options(document_store_and_index_dir,
+                               options_.index_merge_size());
+  ICING_ASSIGN_OR_RETURN(index_,
+                         Index::Create(index_options, icing_filesystem_.get()));
+
+  // Even if each subcomponent initialized fine independently, we need to
+  // check if they're consistent with each other.
+  if (!CheckConsistency().ok()) {
+    ICING_VLOG(1)
+        << "IcingSearchEngine in inconsistent state, regenerating all "
+           "derived data";
+    ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
+  }
+
+  initialized_ = true;
+  return libtextclassifier3::Status::OK;
+}
+
+// Verifies that the persisted header exists, carries the expected magic, and
+// that its stored checksum matches the freshly computed one across all
+// subcomponents. A non-OK result triggers RegenerateDerivedFiles in
+// Initialize.
+//
+// Returns:
+//   OK if everything is consistent
+//   NOT_FOUND if there is no header to check against
+//   INTERNAL on unreadable/corrupt header or checksum mismatch
+libtextclassifier3::Status IcingSearchEngine::CheckConsistency() {
+  if (!HeaderExists()) {
+    // Without a header file, we have no checksum and can't even detect
+    // inconsistencies
+    return absl_ports::NotFoundError("No header file found.");
+  }
+
+  // Header does exist, verify that the header looks fine.
+  IcingSearchEngine::Header header;
+  if (!filesystem_->Read(MakeHeaderFilename(options_.base_dir()).c_str(),
+                         &header, sizeof(header))) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Couldn't read: ", MakeHeaderFilename(options_.base_dir())));
+  }
+
+  if (header.magic != IcingSearchEngine::Header::kMagic) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Invalid header kMagic for file: ",
+                           MakeHeaderFilename(options_.base_dir())));
+  }
+
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  if (checksum.Get() != header.checksum) {
+    return absl_ports::InternalError(
+        "IcingSearchEngine checksum doesn't match");
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Rebuilds all derived data from ground truth: re-syncs the document store
+// with the schema store, wipes and re-populates the index, then rewrites the
+// header with a fresh checksum. Called when CheckConsistency fails.
+libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles() {
+  ICING_RETURN_IF_ERROR(
+      document_store_->UpdateSchemaStore(schema_store_.get()));
+  ICING_RETURN_IF_ERROR(index_->Reset());
+  ICING_RETURN_IF_ERROR(RestoreIndex());
+
+  // NOTE(review): the .c_str() here is unnecessary — MakeHeaderFilename takes
+  // a const std::string&, so this just constructs a temporary std::string.
+  const std::string header_file =
+      MakeHeaderFilename(options_.base_dir().c_str());
+  if (HeaderExists()) {
+    if (!filesystem_->DeleteFile(header_file.c_str())) {
+      return absl_ports::InternalError(
+          absl_ports::StrCat("Unable to delete file: ", header_file));
+    }
+  }
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Copying overload: forwards a copy of new_schema to the move overload below.
+libtextclassifier3::Status IcingSearchEngine::SetSchema(
+    const SchemaProto& new_schema, bool ignore_errors_and_delete_documents) {
+  return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
+}
+
+// Validates and installs a new schema. On an accepted schema change, updates
+// the document store for any type-id changes/deletions and rebuilds the index
+// if the change made it incompatible.
+//
+// Returns:
+//   OK if the schema was applied
+//   FAILED_PRECONDITION if the new schema is incompatible and
+//     ignore_errors_and_delete_documents did not force it through
+//   Any validation or propagation error
+libtextclassifier3::Status IcingSearchEngine::SetSchema(
+    SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) {
+  ICING_VLOG(1) << "Setting new Schema";
+
+  ICING_RETURN_IF_ERROR(SchemaUtil::Validate(new_schema));
+
+  absl_ports::unique_lock l(&mutex_);
+
+  ICING_ASSIGN_OR_RETURN(
+      const SchemaStore::SetSchemaResult set_schema_result,
+      schema_store_->SetSchema(std::move(new_schema),
+                               ignore_errors_and_delete_documents));
+
+  if (set_schema_result.success) {
+    // Only touch the document store if the schema change actually affected
+    // existing types.
+    if (!set_schema_result.old_schema_type_ids_changed.empty() ||
+        !set_schema_result.schema_types_incompatible_by_id.empty() ||
+        !set_schema_result.schema_types_deleted_by_id.empty()) {
+      ICING_RETURN_IF_ERROR(document_store_->OptimizedUpdateSchemaStore(
+          schema_store_.get(), set_schema_result));
+    }
+
+    if (set_schema_result.index_incompatible) {
+      // Clears all index files
+      ICING_RETURN_IF_ERROR(index_->Reset());
+      ICING_RETURN_IF_ERROR(RestoreIndex());
+    }
+
+    return libtextclassifier3::Status::OK;
+  }
+
+  // TODO(cassiewang): Instead of returning a Status, consider returning some
+  // of the information we have in SetSchemaResult such as which types were
+  // deleted and which types were incompatible.
+  return absl_ports::FailedPreconditionError("Schema is incompatible.");
+}
+
+// Returns a copy of the currently installed schema, or the schema store's
+// error if none is set. Read-only, so a shared (reader) lock suffices.
+libtextclassifier3::StatusOr<SchemaProto> IcingSearchEngine::GetSchema() {
+  absl_ports::shared_lock l(&mutex_);
+  ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, schema_store_->GetSchema());
+  return *schema;
+}
+
+// Returns a copy of the config for one schema type, or the schema store's
+// error (e.g. unknown type). Read-only, so a shared (reader) lock suffices.
+libtextclassifier3::StatusOr<SchemaTypeConfigProto>
+IcingSearchEngine::GetSchemaType(std::string schema_type) {
+  absl_ports::shared_lock l(&mutex_);
+  ICING_ASSIGN_OR_RETURN(const SchemaTypeConfigProto* type_config,
+                         schema_store_->GetSchemaTypeConfig(schema_type));
+  return *type_config;
+}
+
+// Copying overload: forwards a copy of document to the move overload below.
+libtextclassifier3::Status IcingSearchEngine::Put(
+    const DocumentProto& document) {
+  return Put(DocumentProto(document));
+}
+
+// Stores the document and indexes its content. Any error from the document
+// store or the index processor is propagated to the caller.
+libtextclassifier3::Status IcingSearchEngine::Put(DocumentProto&& document) {
+  ICING_VLOG(1) << "Writing document to document store";
+
+  // Lock must be acquired before validation because the DocumentStore uses
+  // the schema file to validate, and the schema could be changed in
+  // SetSchema() which is protected by the same mutex.
+  absl_ports::unique_lock l(&mutex_);
+
+  ICING_ASSIGN_OR_RETURN(DocumentId document_id,
+                         document_store_->Put(document));
+
+  // The processor is cheap to construct, so one is built per Put rather than
+  // cached across calls.
+  IndexProcessor index_processor(schema_store_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), index_.get(),
+                                 CreateIndexProcessorOptions(options_));
+  ICING_RETURN_IF_ERROR(index_processor.IndexDocument(document, document_id));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Retrieves a document by (namespace, uri); the result/error semantics are
+// those of DocumentStore::Get. Read-only, so a shared lock suffices.
+libtextclassifier3::StatusOr<DocumentProto> IcingSearchEngine::Get(
+    const std::string_view name_space, const std::string_view uri) {
+  absl_ports::shared_lock l(&mutex_);
+
+  return document_store_->Get(name_space, uri);
+}
+
+// Deletes a document by (namespace, uri) from the document store. Failures
+// are logged with context before being propagated to the caller.
+libtextclassifier3::Status IcingSearchEngine::Delete(
+    const std::string_view name_space, const std::string_view uri) {
+  ICING_VLOG(1) << "Deleting document from doc store";
+
+  absl_ports::unique_lock l(&mutex_);
+
+  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // that can support error logging.
+  libtextclassifier3::Status status = document_store_->Delete(name_space, uri);
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete Document. namespace: " << name_space
+                     << ", uri: " << uri;
+    return status;
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Public flush entry point: takes the writer lock and delegates to
+// InternalPersistToDisk (which assumes the lock is already held).
+libtextclassifier3::Status IcingSearchEngine::PersistToDisk() {
+  ICING_VLOG(1) << "Persisting data to disk";
+  absl_ports::unique_lock l(&mutex_);
+  return InternalPersistToDisk();
+}
+
+// Optimizes storage for document store and index.
+//
+// Steps:
+// 1. Flush data to disk.
+// 2. Copy data needed to a tmp directory.
+// 3. Swap current directory and tmp directory.
+//
+// TODO(b/143724846) Optimize schema store here as well.
+// TODO(b/143724541) Signal the caller if the failure is unrecoverable.
+libtextclassifier3::Status IcingSearchEngine::Optimize() {
+  ICING_VLOG(1) << "Optimizing icing storage";
+
+  absl_ports::unique_lock l(&mutex_);
+
+  // Flushes data to disk before doing optimization
+  ICING_RETURN_IF_ERROR(InternalPersistToDisk());
+
+  // Gets the current directory path and an empty tmp directory path for
+  // document store and index optimization.
+  std::string current_dir =
+      MakeDocumentAndIndexDirectoryPath(options_.base_dir());
+  std::string temporary_dir =
+      MakeDocumentAndIndexTemporaryDirectoryPath(options_.base_dir());
+  if (!filesystem_->DeleteDirectoryRecursively(temporary_dir.c_str()) ||
+      !filesystem_->CreateDirectoryRecursively(temporary_dir.c_str())) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Failed to create a tmp directory: ", temporary_dir));
+  }
+
+  // Copies valid document data to tmp directory
+  auto optimize_status = document_store_->OptimizeInto(temporary_dir);
+
+  // Handles error if any
+  if (!optimize_status.ok()) {
+    // Discard the partial copy; the original directory has not been touched
+    // yet, so the engine keeps operating on the old data.
+    filesystem_->DeleteDirectoryRecursively(temporary_dir.c_str());
+    return absl_ports::Annotate(optimize_status,
+                                "Failed to optimize document store.");
+  }
+
+  // Resets before swapping
+  // NOTE(review): presumably required so the instances release their open
+  // file handles before the directories are renamed underneath them --
+  // confirm against Filesystem::SwapFiles semantics.
+  document_store_.reset();
+  index_.reset();
+
+  // When swapping files, always put the current working directory at the
+  // second place because it is renamed at the latter position so we're less
+  // vulnerable to errors.
+  if (!filesystem_->SwapFiles(temporary_dir.c_str(), current_dir.c_str())) {
+    // Try to rebuild document store and index if swapping fails, to avoid
+    // leaving the system in the broken state for future operations.
+    // TODO(b/144458732): Implement a more robust version of
+    // TC_ASSIGN_OR_RETURN that can support error logging.
+    auto document_store_or = DocumentStore::Create(
+        filesystem_.get(), current_dir, clock_.get(), schema_store_.get());
+    if (!document_store_or.ok()) {
+      ICING_LOG(ERROR)
+          << document_store_or.status().error_message()
+          << "Failed to swap files, no document store instance available";
+      return document_store_or.status();
+    }
+    document_store_ = std::move(document_store_or).ValueOrDie();
+
+    Index::Options index_options(current_dir, options_.index_merge_size());
+    // TODO(b/144458732): Implement a more robust version of
+    // TC_ASSIGN_OR_RETURN that can support error logging.
+    auto index_or = Index::Create(index_options, icing_filesystem_.get());
+    if (!index_or.ok()) {
+      ICING_LOG(ERROR) << index_or.status().error_message()
+                       << "Failed to swap files, no index instance available";
+      return index_or.status();
+    }
+    index_ = std::move(index_or).ValueOrDie();
+    // Instances were rebuilt from the (unswapped) original data, but the
+    // optimization itself failed, so surface that to the caller.
+    return absl_ports::InternalError("Failed to rename files");
+  }
+
+  // Recreates the doc store instance
+  ICING_ASSIGN_OR_RETURN(
+      document_store_,
+      DocumentStore::Create(filesystem_.get(), current_dir, clock_.get(),
+                            schema_store_.get()));
+
+  // Deletes tmp directory
+  if (!filesystem_->DeleteDirectoryRecursively(temporary_dir.c_str())) {
+    return absl_ports::InternalError("Failed to delete temporary directory");
+  }
+
+  // Recreates the index instance and re-indexes all the documents.
+  // TODO(b/143646633): figure out if we need to optimize index and doc store
+  // at the same time.
+  Index::Options index_options(current_dir, options_.index_merge_size());
+  ICING_ASSIGN_OR_RETURN(index_,
+                         Index::Create(index_options, icing_filesystem_.get()));
+  ICING_RETURN_IF_ERROR(RestoreIndex());
+
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk() {
+  // Flush each subcomponent's data first so the checksum written below
+  // reflects what is actually on disk.
+  ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
+  ICING_RETURN_IF_ERROR(document_store_->PersistToDisk());
+  ICING_RETURN_IF_ERROR(index_->PersistToDisk());
+
+  // Update the combined checksum and write to header file.
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
+
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<Crc32> IcingSearchEngine::ComputeChecksum() {
+  Crc32 total_checksum;
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto checksum_or = schema_store_->ComputeChecksum();
+  if (!checksum_or.ok()) {
+    ICING_LOG(ERROR) << checksum_or.status().error_message()
+                     << "Failed to compute checksum of SchemaStore";
+    return checksum_or.status();
+  }
+
+  Crc32 schema_store_checksum = std::move(checksum_or).ValueOrDie();
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  checksum_or = document_store_->ComputeChecksum();
+  if (!checksum_or.ok()) {
+    ICING_LOG(ERROR) << checksum_or.status().error_message()
+                     << "Failed to compute checksum of DocumentStore";
+    return checksum_or.status();
+  }
+  Crc32 document_store_checksum = std::move(checksum_or).ValueOrDie();
+
+  // Unlike the stores above, the index checksum API is infallible here.
+  Crc32 index_checksum = index_->ComputeChecksum();
+
+  // NOTE: the append order (document store, schema store, index) must stay
+  // stable -- changing it changes the combined CRC and would invalidate
+  // headers written by previous versions.
+  total_checksum.Append(std::to_string(document_store_checksum.Get()));
+  total_checksum.Append(std::to_string(schema_store_checksum.Get()));
+  total_checksum.Append(std::to_string(index_checksum.Get()));
+
+  return total_checksum;
+}
+
+bool IcingSearchEngine::HeaderExists() {
+  // Compute the filename once; it was previously rebuilt (string
+  // concatenation) for both the existence check and the size check.
+  const std::string header_filename = MakeHeaderFilename(options_.base_dir());
+  if (!filesystem_->FileExists(header_filename.c_str())) {
+    return false;
+  }
+
+  int64_t file_size = filesystem_->GetFileSize(header_filename.c_str());
+
+  // If it's been truncated to size 0 before, we consider it to be a new file
+  return file_size != 0 && file_size != Filesystem::kBadFileSize;
+}
+
+libtextclassifier3::Status IcingSearchEngine::UpdateHeader(
+    const Crc32& checksum) {
+  // Write the header
+  IcingSearchEngine::Header header;
+  header.magic = IcingSearchEngine::Header::kMagic;
+  header.checksum = checksum.Get();
+
+  // This should overwrite the header. The struct is written as raw bytes, so
+  // readers must share the same Header layout (magic + checksum).
+  if (!filesystem_->Write(MakeHeaderFilename(options_.base_dir()).c_str(),
+                          &header, sizeof(header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to write IcingSearchEngine header: ",
+                           MakeHeaderFilename(options_.base_dir())));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<SearchResultProto> IcingSearchEngine::Search(
+    const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+    const ResultSpecProto& result_spec) {
+  // Validate before taking the lock; no engine state is needed for this.
+  ICING_RETURN_IF_ERROR(ValidateResultSpec(result_spec));
+
+  // TODO(b/146008613) Explore ideas to make this function read-only.
+  absl_ports::unique_lock l(&mutex_);
+
+  // Gets unordered results from query processor
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), clock_.get());
+  ICING_ASSIGN_OR_RETURN(QueryProcessor::QueryResults query_results,
+                         query_processor.ParseSearch(search_spec));
+
+  // Generates the final list of document hits, scored and capped at
+  // result_spec.num_to_retrieve().
+  ICING_ASSIGN_OR_RETURN(
+      std::vector<ScoredDocumentHit> result_document_hits,
+      RunScoring(std::move(query_results.root_iterator), scoring_spec,
+                 result_spec.num_to_retrieve(), document_store_.get()));
+
+  // Retrieves the document protos and snippets if requested
+  ResultRetriever result_retriever(document_store_.get(), schema_store_.get(),
+                                   language_segmenter_.get());
+  ICING_ASSIGN_OR_RETURN(
+      std::vector<SearchResultProto::ResultProto> results,
+      result_retriever.RetrieveResults(result_spec, query_results.query_terms,
+                                       search_spec.term_match_type(),
+                                       result_document_hits));
+  // Assembles the final search result proto, moving each retrieved result
+  // into the repeated field to avoid copies.
+  SearchResultProto search_results;
+  search_results.mutable_results()->Reserve(results.size());
+  for (SearchResultProto::ResultProto& result : results) {
+    search_results.mutable_results()->Add(std::move(result));
+  }
+  return search_results;
+}
+
+libtextclassifier3::Status IcingSearchEngine::RestoreIndex() {
+  // Re-indexes every stored document in document-id order. Note (per the
+  // header comment) this does NOT clear the index first; callers wanting a
+  // clean rebuild should reset the index before calling.
+  DocumentId last_stored_document_id =
+      document_store_->last_added_document_id();
+
+  if (last_stored_document_id == kInvalidDocumentId) {
+    // Nothing to index
+    return libtextclassifier3::Status::OK;
+  }
+
+  IndexProcessor index_processor(schema_store_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), index_.get(),
+                                 CreateIndexProcessorOptions(options_));
+
+  for (DocumentId document_id = kMinDocumentId;
+       document_id <= last_stored_document_id; document_id++) {
+    libtextclassifier3::StatusOr<DocumentProto> document_or =
+        document_store_->Get(document_id);
+
+    if (!document_or.ok()) {
+      if (absl_ports::IsInvalidArgument(document_or.status()) ||
+          absl_ports::IsNotFound(document_or.status())) {
+        // Skips invalid and non-existing documents.
+        continue;
+      } else {
+        // Returns other errors
+        return document_or.status();
+      }
+    }
+
+    ICING_RETURN_IF_ERROR(
+        index_processor.IndexDocument(document_or.ValueOrDie(), document_id));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
new file mode 100644
index 0000000..4069020
--- /dev/null
+++ b/icing/icing-search-engine.h
@@ -0,0 +1,319 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ICING_SEARCH_ENGINE_H_
+#define ICING_ICING_SEARCH_ENGINE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/absl_ports/thread_annotations.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/index.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/icing-search-engine-options.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// TODO(cassiewang) Top-level comments and links to design-doc.
+class IcingSearchEngine {
+ public:
+  // On-disk header used to detect corruption / inconsistency on startup.
+  struct Header {
+    static constexpr int32_t kMagic = 0x6e650d0a;
+
+    // Holds the magic as a quick sanity check against file corruption.
+    int32_t magic;
+
+    // Checksum of the IcingSearchEngine's sub-component's checksums.
+    uint32_t checksum;
+  };
+
+  explicit IcingSearchEngine(const IcingSearchEngineOptions& options);
+
+  // Calculates integrity checks and persists files to disk.
+  ~IcingSearchEngine();
+
+  // Loads & verifies the contents previously indexed from disk and gets ready
+  // to handle read/write requests.
+  //
+  // WARNING: This is expected to be fast if Icing had a clean shutdown.
+  // Otherwise, it can take longer as it runs integrity checks and attempts
+  // to bring the index to a consistent state. If the data on disk is not
+  // consistent, it restores the state when PersistToDisk() was last called.
+  //
+  // Returns OK on success, ie, Icing was initialized and all data verified.
+  // Returns DATA_LOSS on partial success, when Icing encountered
+  // data-inconsistency and had to restore its state back to the last call
+  // to PersistToDisk().
+  // Returns any other error encountered due to which the call couldn't be
+  // completed. The instance of IcingSearchEngine is not usable if this
+  // happens.
+  libtextclassifier3::Status Initialize() LOCKS_EXCLUDED(mutex_);
+
+  // Specifies the schema to be applied on all Documents that are already
+  // stored as well as future documents. A schema can be 'invalid' and/or
+  // 'incompatible'. These are two independent concepts.
+  //
+  // An 'invalid' schema is one that is not constructed properly. For example,
+  // a PropertyConfigProto is missing the property name field. A schema can be
+  // 'invalid' even if there is no previously existing schema.
+  //
+  // An 'incompatible' schema is one that is incompatible with a previously
+  // existing schema. If there is no previously existing schema, then a new
+  // schema cannot be incompatible. An incompatible schema is one that
+  // invalidates pre-existing data. For example, a previously OPTIONAL field is
+  // now REQUIRED in the new schema, and pre-existing data is considered invalid
+  // against the new schema now.
+  //
+  // Default behavior will not allow a new schema to be set if it is invalid or
+  // incompatible.
+  //
+  // The argument 'ignore_errors_and_delete_documents' can be set to true to
+  // force set an incompatible schema. In that case, documents that are
+  // invalidated by the new schema would be deleted from Icing. This cannot be
+  // used to force set an invalid schema.
+  //
+  // This schema is persisted to disk and used across multiple instances.
+  // So, callers should only have to call this if the schema changed.
+  // However, calling it multiple times with the same schema is a no-op.
+  //
+  // On any error, Icing will keep using the older schema.
+  //
+  // Returns:
+  //   OK on success
+  //   INVALID_ARGUMENT if 'new_schema' is invalid
+  //   FAILED_PRECONDITION if 'new_schema' is incompatible
+  //   INTERNAL_ERROR if Icing failed to store the new schema or upgrade
+  //     existing data based on the new schema.
+  //
+  // TODO(cassiewang) Figure out, document (and maybe even enforce) the best
+  // way ordering of calls between Initialize() and SetSchema(), both when
+  // the caller is creating an instance of IcingSearchEngine for the first
+  // time and when the caller is reinitializing an existing index on disk.
+  libtextclassifier3::Status SetSchema(
+      SchemaProto&& new_schema, bool ignore_errors_and_delete_documents = false)
+      LOCKS_EXCLUDED(mutex_);
+
+  // This function makes a copy of the schema and calls SetSchema(SchemaProto&&
+  // new_schema, bool ignore_errors_and_delete_documents)
+  //
+  // NOTE: It's recommended to call SetSchema(SchemaProto&& new_schema, bool
+  // ignore_errors_and_delete_documents) directly to avoid a copy if the caller
+  // can make an rvalue SchemaProto.
+  libtextclassifier3::Status SetSchema(
+      const SchemaProto& new_schema,
+      bool ignore_errors_and_delete_documents = false) LOCKS_EXCLUDED(mutex_);
+
+  // Get Icing's current copy of the schema.
+  //
+  // Returns:
+  //   SchemaProto on success
+  //   NOT_FOUND if a schema has not been set yet
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<SchemaProto> GetSchema() LOCKS_EXCLUDED(mutex_);
+
+  // Get Icing's copy of the SchemaTypeConfigProto of name schema_type
+  //
+  // NOTE(review): schema_type is taken by value; a const reference or
+  // std::string_view would avoid a copy -- confirm against the .cc definition
+  // before changing the signature.
+  //
+  // Returns:
+  //   SchemaTypeConfigProto on success
+  //   NOT_FOUND if a schema has not been set yet or if there is no
+  //     SchemaTypeConfig of schema_type in the SchemaProto
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<SchemaTypeConfigProto> GetSchemaType(
+      std::string schema_type) LOCKS_EXCLUDED(mutex_);
+
+  // Puts the document into icing search engine so that it's stored and
+  // indexed. Documents are automatically written to disk, callers can also
+  // call PersistToDisk() to flush changes immediately.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status Put(DocumentProto&& document)
+      LOCKS_EXCLUDED(mutex_);
+
+  // This function makes a copy of document and calls Put(DocumentProto&&
+  // document).
+  //
+  // NOTE: It's recommended to call Put(DocumentProto&& document) directly to
+  // avoid a copy if the caller can make an rvalue DocumentProto.
+  libtextclassifier3::Status Put(const DocumentProto& document)
+      LOCKS_EXCLUDED(mutex_);
+
+  // Finds and returns the document identified by the given key (namespace +
+  // uri)
+  //
+  // Returns:
+  //   The document found on success
+  //   NOT_FOUND if the key doesn't exist or doc has been deleted
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<DocumentProto> Get(std::string_view name_space,
+                                                  std::string_view uri);
+
+  // Deletes the Document specified by the given namespace / uri pair from the
+  // search engine. Delete changes are automatically applied to disk, callers
+  // can also call PersistToDisk() to flush changes immediately.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status Delete(std::string_view name_space,
+                                    std::string_view uri)
+      LOCKS_EXCLUDED(mutex_);
+
+  // Retrieves, scores, ranks, and returns the results according to the specs.
+  // Please refer to each proto file for spec definitions.
+  //
+  // Returns:
+  //   A SearchResultProto on success
+  //   INVALID_ARGUMENT if any of specs is invalid
+  //   INTERNAL_ERROR on any other errors
+  libtextclassifier3::StatusOr<SearchResultProto> Search(
+      const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+      const ResultSpecProto& result_spec) LOCKS_EXCLUDED(mutex_);
+
+  // Makes sure that every update/delete received till this point is flushed
+  // to disk. If the app crashes after a call to PersistToDisk(), Icing
+  // would be able to fully recover all data written up to this point.
+  //
+  // NOTE: It is not necessary to call PersistToDisk() to read back data
+  // that was recently written. All read APIs will include the most recent
+  // updates/deletes regardless of the data being flushed to disk.
+  libtextclassifier3::Status PersistToDisk() LOCKS_EXCLUDED(mutex_);
+
+  // Allows Icing to run tasks that are too expensive and/or unnecessary to be
+  // executed in real-time, but are useful to keep it fast and be
+  // resource-efficient. This method purely optimizes the internal files and
+  // has no functional impact on what gets accepted/returned.
+  //
+  // NOTE: This method should be called about once every 24 hours when the
+  // device is idle and charging. It can also be called when the system needs
+  // to free up extra disk-space.
+  //
+  // WARNING: This method is CPU and IO intensive and depending on the
+  // contents stored, it can take from a few seconds to a few minutes.
+  // This call also blocks all read/write operations on Icing.
+  libtextclassifier3::Status Optimize() LOCKS_EXCLUDED(mutex_);
+
+  // Disallow copy and move.
+  IcingSearchEngine(const IcingSearchEngine&) = delete;
+  IcingSearchEngine& operator=(const IcingSearchEngine&) = delete;
+
+ protected:
+  // Dependency-injection constructor: lets tests substitute a mock Filesystem
+  // and a fake Clock (see TestIcingSearchEngine in the unit tests).
+  IcingSearchEngine(IcingSearchEngineOptions options,
+                    std::unique_ptr<const Filesystem> filesystem,
+                    std::unique_ptr<Clock> clock);
+
+ private:
+  const IcingSearchEngineOptions options_;
+  const std::unique_ptr<const Filesystem> filesystem_;
+  const std::unique_ptr<const IcingFilesystem> icing_filesystem_;
+  // NOTE(review): appears intended to track whether Initialize() succeeded;
+  // its reads/writes are not visible in this chunk -- confirm in the .cc.
+  bool initialized_ = false;
+
+  // Abstraction for accessing time values.
+  std::unique_ptr<Clock> clock_;
+
+  // Used to provide reader and writer locks
+  absl_ports::shared_mutex mutex_;
+
+  // Stores and processes the schema
+  std::unique_ptr<SchemaStore> schema_store_ GUARDED_BY(mutex_);
+
+  // Used to store all valid documents
+  std::unique_ptr<DocumentStore> document_store_ GUARDED_BY(mutex_);
+
+  std::unique_ptr<const LanguageSegmenter> language_segmenter_
+      GUARDED_BY(mutex_);
+
+  std::unique_ptr<const Normalizer> normalizer_ GUARDED_BY(mutex_);
+
+  // Storage for all hits of content from the document store.
+  std::unique_ptr<Index> index_ GUARDED_BY(mutex_);
+
+  // Helper method to do the actual work to persist data to disk. We need this
+  // separate method so that other public methods don't need to call
+  // PersistToDisk(). Public methods calling each other may cause deadlock
+  // issues.
+  libtextclassifier3::Status InternalPersistToDisk()
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Many of the internal components rely on other components' derived data.
+  // Check that everything is consistent with each other so that we're not using
+  // outdated derived data in some parts of our system.
+  //
+  // Returns:
+  //   OK on success
+  //   NOT_FOUND if missing header file
+  //   INTERNAL_ERROR on any IO errors or if header is inconsistent
+  libtextclassifier3::Status CheckConsistency()
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Repopulates derived data off our ground truths.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on any IO errors
+  libtextclassifier3::Status RegenerateDerivedFiles()
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Helper method to restore missing document data in index_. All documents
+  // will be reindexed. This does not clear the index, so it is recommended to
+  // call Index::Reset first.
+  //
+  // Returns:
+  //   OK on success
+  //   RESOURCE_EXHAUSTED if the index fills up before finishing indexing
+  //   NOT_FOUND if some Document's schema type is not in the SchemaStore
+  //   INTERNAL_ERROR on any IO errors
+  libtextclassifier3::Status RestoreIndex() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Computes the combined checksum of the IcingSearchEngine - includes all its
+  // subcomponents
+  //
+  // Returns:
+  //   Combined checksum on success
+  //   INTERNAL_ERROR on compute error
+  libtextclassifier3::StatusOr<Crc32> ComputeChecksum()
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Checks if the header exists already. This does not create the header file
+  // if it doesn't exist.
+  bool HeaderExists() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Update and replace the header file. Creates the header file if it doesn't
+  // exist.
+  libtextclassifier3::Status UpdateHeader(const Crc32& checksum)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_ICING_SEARCH_ENGINE_H_
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
new file mode 100644
index 0000000..a7f6adc
--- /dev/null
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -0,0 +1,102 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/document-builder.h"
+#include "icing/icing-search-engine.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/icing-search-engine-options.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+// Builds engine options rooted in the test temp dir, loading the ICU data
+// file as a side effect.
+IcingSearchEngineOptions Setup() {
+  IcingSearchEngineOptions icing_options;
+  // NOTE(review): the status is assigned but never checked; presumably an
+  // ICU load failure is acceptable in a fuzzer and surfaces later via
+  // Initialize() -- confirm.
+  libtextclassifier3::Status status =
+      SetUpICUDataFile("icing/icu.dat");
+  icing_options.set_base_dir(GetTestTempDir() + "/icing");
+  icing_options.set_lang_model_path(GetLangIdModelPath());
+  return icing_options;
+}
+
+// Builds a schema with a single "Message" type holding one required,
+// prefix-indexed, plain-tokenized string property named "body".
+SchemaProto SetTypes() {
+  SchemaProto schema;
+  SchemaTypeConfigProto* message_type = schema.add_types();
+  message_type->set_schema_type("Message");
+  PropertyConfigProto* body_property = message_type->add_properties();
+  body_property->set_property_name("body");
+  body_property->set_data_type(PropertyConfigProto::DataType::STRING);
+  body_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+  body_property->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::PREFIX);
+  body_property->mutable_indexing_config()->set_tokenizer_type(
+      IndexingConfig::TokenizerType::PLAIN);
+  return schema;
+}
+
+// Builds a "Message" document whose body is the raw fuzz input.
+DocumentProto MakeDocument(const uint8_t* data, size_t size) {
+  // TODO (sidchhabra): Add more optimized fuzzing techniques.
+  // Fixed: removed an unused local `DocumentProto document;` and qualified
+  // std::string explicitly.
+  std::string string_prop(reinterpret_cast<const char*>(data), size);
+  return DocumentBuilder()
+      .SetKey("namespace", "uri1")
+      .SetSchema("Message")
+      .AddStringProperty("body", string_prop)
+      .Build();
+}
+
+// Builds a prefix-match search spec whose query is the raw fuzz input.
+SearchSpecProto SetSearchSpec(const uint8_t* data, size_t size) {
+  SearchSpecProto search_spec;
+  search_spec.set_term_match_type(TermMatchType::PREFIX);
+  // TODO (sidchhabra): Add more optimized fuzzing techniques.
+  // Fixed: qualified std::string explicitly (unqualified `string` relied on
+  // a non-standard alias).
+  std::string query_string(reinterpret_cast<const char*>(data), size);
+  search_spec.set_query(query_string);
+  return search_spec;
+}
+
+// Fuzzer entry point: index the input as a document body, then search for it.
+// Statuses are deliberately unchecked -- the fuzzer only cares that no call
+// crashes or corrupts memory.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  // Initialize
+  IcingSearchEngineOptions icing_options = Setup();
+  IcingSearchEngine icing(icing_options);
+  // Fixed: local was named `filesystem_` (member-style trailing underscore).
+  const Filesystem filesystem;
+  // TODO (b/145758378): Deleting directory should not be required.
+  filesystem.DeleteDirectoryRecursively(icing_options.base_dir().c_str());
+  libtextclassifier3::Status status = icing.Initialize();
+  SchemaProto schema_proto = SetTypes();
+  status = icing.SetSchema(schema_proto);
+
+  // Index
+  DocumentProto document = MakeDocument(data, size);
+  status = icing.Put(document);
+
+  // Query
+  SearchSpecProto search_spec = SetSearchSpec(data, size);
+  ScoringSpecProto scoring_spec;
+  scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+  ResultSpecProto result_spec;
+  libtextclassifier3::StatusOr<SearchResultProto> result =
+      icing.Search(search_spec, scoring_spec, result_spec);
+  return 0;
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
new file mode 100644
index 0000000..e389b57
--- /dev/null
+++ b/icing/icing-search-engine_test.cc
@@ -0,0 +1,1848 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/icing-search-engine.h"
+
+#include <cstdint>
+#include <ctime>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/icing-search-engine-options.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/snippet-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::_;
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Lt;
+using ::testing::Return;
+using ::testing::SizeIs;
+
+// For mocking purpose, we allow tests to provide a custom Filesystem.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<FakeClock> clock)
+ : IcingSearchEngine(options, std::move(filesystem), std::move(clock)) {}
+};
+
+// Root directory under which all test data for this suite lives.
+std::string GetTestBaseDir() {
+  return GetTestTempDir() + "/icing";
+}
+
+// Test fixture: creates a fresh base directory (and ICU data) before each
+// test and removes it afterwards so tests don't leak state into each other.
+class IcingSearchEngineTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    ICING_ASSERT_OK(
+        // File generated via icu_data_file rule in //icing/BUILD.
+        SetUpICUDataFile("icing/icu.dat"));
+    filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+  }
+
+  void TearDown() override {
+    filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+  }
+
+  const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+  Filesystem filesystem_;
+};
+
+// (1u << 24) - 1 bytes, i.e. 16 MiB - 1.
+constexpr int kMaxSupportedDocumentSize = (1u << 24) - 1;
+
+// Non-zero value so we don't override it to be the current time
+constexpr std::time_t kDefaultCreationTimestampSecs = 1575492852;
+
+// Paths of the engine's on-disk artifacts, all under the test base dir.
+std::string GetDocumentIndexDir() {
+  return GetTestBaseDir() + "/document_index_dir";
+}
+
+std::string GetSchemaDir() { return GetTestBaseDir() + "/schema_dir"; }
+
+std::string GetHeaderFilename() {
+  return GetTestBaseDir() + "/icing_search_engine_header";
+}
+
+// Default engine options: base dir under the test temp dir, plus the
+// checked-in LangId model path.
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+  IcingSearchEngineOptions icing_options;
+  icing_options.set_base_dir(GetTestBaseDir());
+  icing_options.set_lang_model_path(GetLangIdModelPath());
+  return icing_options;
+}
+
+// A minimal valid "Message" document matching GetDefaultSchema(), with a
+// fixed creation timestamp so results are deterministic.
+DocumentProto GetDefaultDocument() {
+  return DocumentBuilder()
+      .SetKey("namespace", "uri")
+      .SetSchema("Message")
+      .AddStringProperty("body", "message body")
+      .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+      .Build();
+}
+
+// Schema with one "Message" type: a single required, prefix-indexed,
+// plain-tokenized string property named "body".
+SchemaProto GetDefaultSchema() {
+  SchemaProto schema;
+  SchemaTypeConfigProto* message_type = schema.add_types();
+  message_type->set_schema_type("Message");
+
+  PropertyConfigProto* body_property = message_type->add_properties();
+  body_property->set_property_name("body");
+  body_property->set_data_type(PropertyConfigProto::DataType::STRING);
+  body_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+  body_property->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::PREFIX);
+  body_property->mutable_indexing_config()->set_tokenizer_type(
+      IndexingConfig::TokenizerType::PLAIN);
+
+  return schema;
+}
+
+// Scoring spec that ranks results by their document score.
+ScoringSpecProto GetDefaultScoringSpec() {
+  ScoringSpecProto scoring_spec;
+  scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+  return scoring_spec;
+}
+
+TEST_F(IcingSearchEngineTest, SimpleInitialization) {
+  IcingSearchEngine icing(GetDefaultIcingOptions());
+  ICING_ASSERT_OK(icing.Initialize());
+  ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+
+  DocumentProto document = GetDefaultDocument();
+  ICING_ASSERT_OK(icing.Put(document));
+  // Put should also accept an rvalue copy of the same document.
+  ICING_ASSERT_OK(icing.Put(DocumentProto(document)));
+}
+
+// INT32_MAX merge size is rejected at initialization.
+TEST_F(IcingSearchEngineTest, MaxIndexMergeSizeReturnsInvalidArgument) {
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  options.set_index_merge_size(std::numeric_limits<int32_t>::max());
+  IcingSearchEngine icing(options);
+  EXPECT_THAT(icing.Initialize(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Negative merge size is rejected at initialization.
+TEST_F(IcingSearchEngineTest, NegativeMergeSizeReturnsInvalidArgument) {
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  options.set_index_merge_size(-1);
+  IcingSearchEngine icing(options);
+  EXPECT_THAT(icing.Initialize(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Zero merge size is rejected at initialization.
+TEST_F(IcingSearchEngineTest, ZeroMergeSizeReturnsInvalidArgument) {
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  options.set_index_merge_size(0);
+  IcingSearchEngine icing(options);
+  EXPECT_THAT(icing.Initialize(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineTest, GoodIndexMergeSizeReturnsOk) {
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  // One is fine, if a bit weird. It just means that the lite index will be
+  // smaller and will request a merge any time content is added to it.
+  options.set_index_merge_size(1);
+  IcingSearchEngine icing(options);
+  ICING_EXPECT_OK(icing.Initialize());
+}
+
+// Negative max-tokens-per-doc is rejected at initialization.
+TEST_F(IcingSearchEngineTest,
+       NegativeMaxTokensPerDocSizeReturnsInvalidArgument) {
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  options.set_max_tokens_per_doc(-1);
+  IcingSearchEngine icing(options);
+  EXPECT_THAT(icing.Initialize(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Zero max-tokens-per-doc is rejected at initialization.
+TEST_F(IcingSearchEngineTest, ZeroMaxTokensPerDocSizeReturnsInvalidArgument) {
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  options.set_max_tokens_per_doc(0);
+  IcingSearchEngine icing(options);
+  EXPECT_THAT(icing.Initialize(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineTest, GoodMaxTokensPerDocSizeReturnsOk) {
+  IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  // INT_MAX is valid - it just means that we shouldn't limit the number of
+  // tokens per document. It would be pretty inconceivable that anyone would
+  // produce such a document - the text being indexed alone would take up at
+  // least ~4.3 GiB! - and the document would be rejected before indexing
+  // for exceeding max_document_size, but there's no reason to explicitly
+  // bar it.
+  options.set_max_tokens_per_doc(std::numeric_limits<int32_t>::max());
+  IcingSearchEngine icing(options);
+  ICING_EXPECT_OK(icing.Initialize());
+}
+
+TEST_F(IcingSearchEngineTest, NegativeMaxTokenLenReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_max_token_length(-1);
+ IcingSearchEngine icing(options);
+ EXPECT_THAT(icing.Initialize(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineTest, ZeroMaxTokenLenReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_max_token_length(0);
+ IcingSearchEngine icing(options);
+ EXPECT_THAT(icing.Initialize(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// A max_token_length of 1 is accepted, and truncation applies symmetrically:
+// both the tokens indexed from documents and the tokens in the query are
+// clipped to the configured length before matching.
+TEST_F(IcingSearchEngineTest, MaxTokenLenReturnsOkAndTruncatesTokens) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ // A length of 1 is allowed - even though it would be strange to want
+ // this.
+ options.set_max_token_length(1);
+ IcingSearchEngine icing(options);
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ ICING_EXPECT_OK(icing.Put(document));
+
+ // "message" should have been truncated to "m"
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ // The indexed tokens were truncated to length of 1, so "m" will match
+ search_spec.set_query("m");
+
+ SearchResultProto exp_result;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document;
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(exp_result)));
+
+ // The query token is also truncated to length of 1, so "me"->"m" matches "m"
+ search_spec.set_query("me");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(exp_result)));
+
+ // The query token is still truncated to length of 1, so "massage"->"m"
+ // matches "m"
+ search_spec.set_query("massage");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(exp_result)));
+}
+
+// With truncation effectively disabled (max_token_length == INT32_MAX), a
+// single enormous token can exhaust the lexicon: Put() fails with
+// RESOURCE_EXHAUSTED and, consequently, nothing is searchable afterwards.
+TEST_F(IcingSearchEngineTest,
+ MaxIntMaxTokenLenReturnsOkTooLargeTokenReturnsResourceExhausted) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ // Set token length to max. This is allowed (it just means never to
+ // truncate tokens). However, this does mean that tokens that exceed the
+ // size of the lexicon will cause indexing to fail.
+ options.set_max_token_length(std::numeric_limits<int32_t>::max());
+ IcingSearchEngine icing(options);
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ // Add a document that just barely fits under the max document limit.
+ // This will still fail to index because we won't actually have enough
+ // room in the lexicon to fit this content.
+ std::string enormous_string(kMaxSupportedDocumentSize - 256, 'p');
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", std::move(enormous_string))
+ .Build();
+ EXPECT_THAT(icing.Put(document),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+ // The failed Put() must not have left any partial content in the index.
+ SearchSpecProto search_spec;
+ search_spec.set_query("p");
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(SearchResultProto::default_instance())));
+}
+
+// Filesystem failures during Initialize() are surfaced as INTERNAL errors
+// with a descriptive message rather than being swallowed.
+TEST_F(IcingSearchEngineTest, FailToCreateDocStore) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ // This fails DocumentStore::Create()
+ ON_CALL(*mock_filesystem, CreateDirectoryRecursively(_))
+ .WillByDefault(Return(false));
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+
+ ASSERT_THAT(icing.Initialize(),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+ HasSubstr("Could not create directory")));
+}
+
+// A bad language-model path makes language-segmenter creation (and therefore
+// Initialize()) fail with INVALID_ARGUMENT.
+TEST_F(IcingSearchEngineTest,
+ InvalidFileCreateLangSegmenterReturnsInvalidArgument) {
+ IcingSearchEngineOptions options(GetDefaultIcingOptions());
+ options.set_lang_model_path("notarealfile");
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::make_unique<FakeClock>());
+ EXPECT_THAT(icing.Initialize(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Two types that each contain a REQUIRED document property of the other form
+// a cycle that the section manager cannot resolve, so SetSchema() must
+// reject the schema with INVALID_ARGUMENT.
+TEST_F(IcingSearchEngineTest,
+ CircularReferenceCreateSectionManagerReturnsInvalidArgument) {
+ // Create a type config with a circular reference.
+ SchemaProto schema;
+ auto* type = schema.add_types();
+ type->set_schema_type("Message");
+
+ // "Message" holds a required "Person"...
+ auto* body = type->add_properties();
+ body->set_property_name("recipient");
+ body->set_schema_type("Person");
+ body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ body->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ type = schema.add_types();
+ type->set_schema_type("Person");
+
+ // ...and "Person" holds a required "Message", closing the cycle.
+ body = type->add_properties();
+ body->set_property_name("recipient");
+ body->set_schema_type("Message");
+ body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ body->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ EXPECT_THAT(icing.SetSchema(schema),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Put() before any schema has been set fails with NOT_FOUND for the
+// document's type. The second call passes a temporary copy to also exercise
+// the rvalue path.
+TEST_F(IcingSearchEngineTest, NoSchemaSet) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ DocumentProto document = GetDefaultDocument();
+ EXPECT_THAT(icing.Put(document),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("'Message' not found")));
+ EXPECT_THAT(icing.Put(DocumentProto(document)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("'Message' not found")));
+}
+
+// An unreadable previously-persisted schema file makes re-initialization fail
+// with INTERNAL.
+TEST_F(IcingSearchEngineTest, FailToReadSchema) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+ // Successfully initialize and set a schema
+ IcingSearchEngine icing(icing_options);
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+ }
+
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+
+ // This fails FileBackedProto::Read() when we try to check the schema we
+ // had previously set
+ ON_CALL(*mock_filesystem,
+ OpenForRead(Eq(icing_options.base_dir() + "/schema_dir/schema.pb")))
+ .WillByDefault(Return(-1));
+
+ TestIcingSearchEngine test_icing(icing_options, std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+
+ ASSERT_THAT(test_icing.Initialize(),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+ HasSubstr("Unable to open file for read")));
+}
+
+// An unwritable schema file makes SetSchema() (not Initialize()) fail with
+// INTERNAL.
+TEST_F(IcingSearchEngineTest, FailToWriteSchema) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ // This fails FileBackedProto::Write()
+ ON_CALL(*mock_filesystem,
+ OpenForWrite(Eq(icing_options.base_dir() + "/schema_dir/schema.pb")))
+ .WillByDefault(Return(-1));
+
+ TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+
+ ICING_ASSERT_OK(icing.Initialize());
+ ASSERT_THAT(icing.SetSchema(GetDefaultSchema()),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+ HasSubstr("Unable to open file for write")));
+}
+
+// Covers the SetSchema() contract end to end: invalid schemas are rejected,
+// incompatible (type-removing) schema changes fail with FAILED_PRECONDITION,
+// compatible extensions succeed, and documents can only be Put() for types
+// present in the current schema.
+TEST_F(IcingSearchEngineTest, SetSchema) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ auto message_document = GetDefaultDocument();
+
+ auto schema_with_message = GetDefaultSchema();
+
+ SchemaProto schema_with_email;
+ SchemaTypeConfigProto* type = schema_with_email.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ SchemaProto schema_with_email_and_message = schema_with_email;
+ type = schema_with_email_and_message.add_types();
+ type->set_schema_type("Message");
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ // Create an arbitrary invalid schema
+ SchemaProto invalid_schema;
+ SchemaTypeConfigProto* empty_type = invalid_schema.add_types();
+ empty_type->set_schema_type("");
+
+ // Make sure we can't set invalid schemas
+ EXPECT_THAT(icing.SetSchema(invalid_schema),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Can add a document of a set schema
+ ICING_EXPECT_OK(icing.SetSchema(schema_with_message));
+ ICING_EXPECT_OK(icing.Put(message_document));
+
+ // Schema with Email doesn't have Message, so would result in incompatible
+ // data
+ EXPECT_THAT(icing.SetSchema(schema_with_email),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+ // Can expand the set of schema types and add a document of a new
+ // schema type
+ ICING_EXPECT_OK(icing.SetSchema(SchemaProto(schema_with_email_and_message)));
+ ICING_EXPECT_OK(icing.Put(message_document));
+
+ // Can't add a document whose schema isn't set
+ auto photo_document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Photo")
+ .AddStringProperty("creator", "icing")
+ .Build();
+ EXPECT_THAT(icing.Put(photo_document),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("'Photo' not found")));
+}
+
+// Switching from a schema with no indexed properties to one that indexes the
+// "body" property must re-index (restore) the already-stored documents so
+// they become searchable without being Put() again.
+TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ SchemaProto schema_with_no_indexed_property = GetDefaultSchema();
+ schema_with_no_indexed_property.mutable_types(0)
+ ->mutable_properties(0)
+ ->clear_indexing_config();
+
+ ICING_EXPECT_OK(icing.SetSchema(schema_with_no_indexed_property));
+ // Nothing will be indexed and Search() won't return anything.
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto empty_result;
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(empty_result)));
+
+ SchemaProto schema_with_indexed_property = GetDefaultSchema();
+ // Index restoration should be triggered here because new schema requires more
+ // properties to be indexed.
+ ICING_EXPECT_OK(icing.SetSchema(schema_with_indexed_property));
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ GetDefaultDocument();
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+// Tightening a property from OPTIONAL to REQUIRED is incompatible; forcing
+// the change with ignore_errors_and_delete_documents=true revalidates stored
+// documents and deletes the ones that fail against the new schema.
+TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ SchemaProto schema_with_optional_subject;
+ auto type = schema_with_optional_subject.add_types();
+ type->set_schema_type("email");
+
+ // Add an OPTIONAL property
+ auto property = type->add_properties();
+ property->set_property_name("subject");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ ICING_EXPECT_OK(icing.SetSchema(schema_with_optional_subject));
+
+ DocumentProto email_document_without_subject =
+ DocumentBuilder()
+ .SetKey("namespace", "without_subject")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto email_document_with_subject =
+ DocumentBuilder()
+ .SetKey("namespace", "with_subject")
+ .SetSchema("email")
+ .AddStringProperty("subject", "foo")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+
+ ICING_EXPECT_OK(icing.Put(email_document_without_subject));
+ ICING_EXPECT_OK(icing.Put(email_document_with_subject));
+
+ SchemaProto schema_with_required_subject;
+ type = schema_with_required_subject.add_types();
+ type->set_schema_type("email");
+
+ // Add a REQUIRED property
+ property = type->add_properties();
+ property->set_property_name("subject");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+ // Can't set the schema since it's incompatible
+ EXPECT_THAT(icing.SetSchema(schema_with_required_subject),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ // Force set it
+ ICING_EXPECT_OK(icing.SetSchema(schema_with_required_subject,
+ /*ignore_errors_and_delete_documents=*/true));
+
+ EXPECT_THAT(icing.Get("namespace", "with_subject"),
+ IsOkAndHolds(EqualsProto(email_document_with_subject)));
+
+ // The document without a subject got deleted because it failed validation
+ // against the new schema
+ EXPECT_THAT(icing.Get("namespace", "without_subject"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// Removing a type from the schema is incompatible; forcing the change with
+// ignore_errors_and_delete_documents=true deletes all documents of the
+// removed type while documents of surviving types remain retrievable.
+TEST_F(IcingSearchEngineTest, SetSchemaDeletesDocumentsAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("email");
+ type = schema.add_types();
+ type->set_schema_type("message");
+
+ ICING_EXPECT_OK(icing.SetSchema(schema));
+
+ DocumentProto email_document =
+ DocumentBuilder()
+ .SetKey("namespace", "email_uri")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto message_document =
+ DocumentBuilder()
+ .SetKey("namespace", "message_uri")
+ .SetSchema("message")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+
+ ICING_EXPECT_OK(icing.Put(email_document));
+ ICING_EXPECT_OK(icing.Put(message_document));
+
+ // Clear the schema and only add the "email" type, essentially deleting the
+ // "message" type
+ SchemaProto new_schema;
+ type = new_schema.add_types();
+ type->set_schema_type("email");
+
+ // Can't set the schema since it's incompatible
+ EXPECT_THAT(icing.SetSchema(new_schema),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ // Force set it
+ ICING_EXPECT_OK(icing.SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/true));
+
+ // "email" document is still there
+ EXPECT_THAT(icing.Get("namespace", "email_uri"),
+ IsOkAndHolds(EqualsProto(email_document)));
+
+ // "message" document got deleted
+ EXPECT_THAT(icing.Get("namespace", "message_uri"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// GetSchema()/GetSchemaType() accessors: NOT_FOUND before anything is set,
+// and round-trip equality with what SetSchema() stored afterwards.
+TEST_F(IcingSearchEngineTest, GetSchemaNotFound) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ EXPECT_THAT(icing.GetSchema(),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(IcingSearchEngineTest, GetSchemaOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ EXPECT_THAT(icing.GetSchema(), IsOkAndHolds(EqualsProto(GetDefaultSchema())));
+}
+
+TEST_F(IcingSearchEngineTest, GetSchemaTypeNotFound) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ EXPECT_THAT(icing.GetSchemaType("nonexistent_schema"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(IcingSearchEngineTest, GetSchemaTypeOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ EXPECT_THAT(icing.GetSchemaType(GetDefaultSchema().types(0).schema_type()),
+ IsOkAndHolds(EqualsProto(GetDefaultSchema().types(0))));
+}
+
+// Basic Put()/Get() round trip, plus the two failure modes: an invalid
+// (empty-namespace) document is rejected with INVALID_ARGUMENT, and looking
+// up a key that was never stored yields NOT_FOUND.
+TEST_F(IcingSearchEngineTest, GetDocument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ // Simple put and get
+ ICING_ASSERT_OK(icing.Put(GetDefaultDocument()));
+ ASSERT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+
+ // Put an invalid document
+ ASSERT_THAT(icing.Put(DocumentProto()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("'namespace' is empty")));
+
+ // Get a non-existing key
+ ASSERT_THAT(icing.Get("wrong", "uri"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// Search() returns both matching documents with snippets attached only to the
+// first num_to_snippet results, and an empty SearchResultProto for a
+// non-matching query.
+TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ ICING_ASSERT_OK(icing.Put(document_one));
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ ICING_ASSERT_OK(icing.Put(document_two));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ // Only snippet the single highest-ranked result.
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SearchResultProto results,
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec));
+ EXPECT_THAT(results.results(), SizeIs(2));
+ // First result carries a snippet; the second (beyond num_to_snippet) does
+ // not.
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+ EXPECT_THAT(GetMatch(results.results(0).document(),
+ results.results(0).snippet(), "body",
+ /*snippet_index=*/0),
+ Eq("message"));
+ EXPECT_THAT(
+ GetWindow(results.results(0).document(), results.results(0).snippet(),
+ "body", /*snippet_index=*/0),
+ Eq("message body"));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+ EXPECT_THAT(
+ GetMatch(results.results(1).document(), results.results(1).snippet(),
+ "body", /*snippet_index=*/0),
+ IsEmpty());
+ EXPECT_THAT(
+ GetWindow(results.results(1).document(), results.results(1).snippet(),
+ "body", /*snippet_index=*/0),
+ IsEmpty());
+
+ search_spec.set_query("foo");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(SearchResultProto::default_instance())));
+}
+
+// ResultSpec.num_to_retrieve caps the result count: 1 returns only the
+// top-ranked document, 0 returns an empty result set, and a negative value is
+// rejected with INVALID_ARGUMENT.
+TEST_F(IcingSearchEngineTest, SearchReturnsOneResult) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ ICING_ASSERT_OK(icing.Put(document_one));
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ ICING_ASSERT_OK(icing.Put(document_two));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_to_retrieve(1);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ document_two;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), result_spec),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchZeroResultLimitReturnsEmptyResults) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_to_retrieve(0);
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), result_spec),
+ IsOkAndHolds(EqualsProto(SearchResultProto::default_instance())));
+}
+
+TEST_F(IcingSearchEngineTest, SearchNegativeResultLimitReturnsInvalidArgument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_to_retrieve(-5);
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), result_spec),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Persistence across engine instances: schema, documents, and index are each
+// written to disk when an IcingSearchEngine is destroyed, and a fresh
+// instance over the same base_dir can serve searches from that state.
+TEST_F(IcingSearchEngineTest, SearchWithPersistenceReturnsValidResults) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+ // Set the schema up beforehand.
+ IcingSearchEngine icing(icing_options);
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+ // Schema will be persisted to disk when icing goes out of scope.
+ }
+
+ {
+ // Ensure that icing initializes the schema and section_manager
+ // properly from the pre-existing file.
+ IcingSearchEngine icing(icing_options);
+ ICING_ASSERT_OK(icing.Initialize());
+
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ // The index and document store will be persisted to disk when icing goes
+ // out of scope.
+ }
+
+ {
+ // Ensure that the index is brought back up without problems and we
+ // can query for the content that we expect.
+ IcingSearchEngine icing(icing_options);
+ ICING_ASSERT_OK(icing.Initialize());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ GetDefaultDocument();
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+
+ search_spec.set_query("foo");
+ EXPECT_THAT(
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(SearchResultProto::default_instance())));
+ }
+}
+
+// Optimize() physically reclaims space for deleted documents: the on-disk
+// document log shrinks, and the deletion survives a restart.
+TEST_F(IcingSearchEngineTest, OptimizationShouldRemoveDeletedDocs) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .Build();
+ {
+ IcingSearchEngine icing(icing_options);
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_ASSERT_OK(icing.Put(document1));
+
+ // Deletes document1
+ ICING_ASSERT_OK(icing.Delete("namespace", "uri1"));
+ const std::string document_log_path =
+ icing_options.base_dir() + "/document_index_dir/document_log";
+ int64_t document_log_size_before =
+ filesystem()->GetFileSize(document_log_path.c_str());
+ ICING_ASSERT_OK(icing.Optimize());
+ int64_t document_log_size_after =
+ filesystem()->GetFileSize(document_log_path.c_str());
+
+ // Validates that document can't be found right after Optimize()
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // Validates that document is actually removed from document log
+ EXPECT_THAT(document_log_size_after, Lt(document_log_size_before));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(icing_options);
+ ICING_EXPECT_OK(icing.Initialize());
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// Optimize() must clean up any leftover temporary swap directory (e.g. from a
+// previously interrupted run) instead of tripping over it.
+TEST_F(IcingSearchEngineTest, OptimizationShouldDeleteTemporaryDirectory) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ IcingSearchEngine icing(icing_options);
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ // Create a tmp dir that will be used in Optimize() to swap files,
+ // this validates that any tmp dirs will be deleted before using.
+ const std::string tmp_dir =
+ icing_options.base_dir() + "/document_index_dir" + "_optimize_tmp";
+ const std::string tmp_file = tmp_dir + "/file";
+ ASSERT_TRUE(filesystem()->CreateDirectory(tmp_dir.c_str()));
+ ScopedFd fd(filesystem()->OpenForWrite(tmp_file.c_str()));
+ ASSERT_TRUE(fd.is_valid());
+ ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4));
+ fd.reset();
+ ICING_ASSERT_OK(icing.Optimize());
+
+ EXPECT_FALSE(filesystem()->DirectoryExists(tmp_dir.c_str()));
+ EXPECT_FALSE(filesystem()->FileExists(tmp_file.c_str()));
+}
+
+// After Optimize(), the engine must remain fully functional for Get()/Put(),
+// both within the same instance and after a restart from disk.
+TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body3")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ ICING_ASSERT_OK(icing.Put(document1));
+ ICING_ASSERT_OK(icing.Optimize());
+
+ // Validates that Get() and Put() are good right after Optimize()
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
+ IsOkAndHolds(EqualsProto(document1)));
+ ICING_EXPECT_OK(icing.Put(document2));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
+ IsOkAndHolds(EqualsProto(document1)));
+ EXPECT_THAT(icing.Get("namespace", "uri2"),
+ IsOkAndHolds(EqualsProto(document2)));
+ ICING_EXPECT_OK(icing.Put(document3));
+}
+
+// Same shape as above, but for Delete(): deletions work right after
+// Optimize() and after a restart.
+TEST_F(IcingSearchEngineTest, DeleteShouldWorkAfterOptimization) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_ASSERT_OK(icing.Put(document1));
+ ICING_ASSERT_OK(icing.Put(document2));
+ ICING_ASSERT_OK(icing.Optimize());
+
+ // Validates that Delete() works right after Optimize()
+ ICING_EXPECT_OK(icing.Delete("namespace", "uri1"));
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(icing.Get("namespace", "uri2"),
+ IsOkAndHolds(EqualsProto(document2)));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.Delete("namespace", "uri2"));
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(icing.Get("namespace", "uri2"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// After Optimize(), compatible SetSchema() updates still succeed — both in
+// the same instance and after restarting from disk.
+TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) {
+ // Creates 3 test schemas, each a compatible extension of the previous one
+ // (one additional OPTIONAL indexed property).
+ SchemaProto schema1 = SchemaProto(GetDefaultSchema());
+
+ SchemaProto schema2 = SchemaProto(schema1);
+ auto new_property2 = schema2.mutable_types(0)->add_properties();
+ new_property2->set_property_name("property2");
+ new_property2->set_data_type(PropertyConfigProto::DataType::STRING);
+ new_property2->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ new_property2->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ new_property2->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ SchemaProto schema3 = SchemaProto(schema2);
+ auto new_property3 = schema3.mutable_types(0)->add_properties();
+ new_property3->set_property_name("property3");
+ new_property3->set_data_type(PropertyConfigProto::DataType::STRING);
+ new_property3->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ new_property3->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ new_property3->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(schema1));
+ ICING_ASSERT_OK(icing.Optimize());
+
+ // Validates that SetSchema() works right after Optimize()
+ ICING_EXPECT_OK(icing.SetSchema(schema2));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(schema3));
+}
+
+// After Optimize(), Search() still returns the optimized content — both in
+// the same instance and after restarting from disk.
+TEST_F(IcingSearchEngineTest, SearchShouldWorkAfterOptimization) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) = document;
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_ASSERT_OK(icing.Put(document));
+ ICING_ASSERT_OK(icing.Optimize());
+
+ // Validates that Search() works right after Optimize()
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+// A document whose TTL has not yet expired (FakeClock time < creation
+// timestamp + ttl) is still returned by Search().
+TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) {
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
+ body->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(100)
+ .SetTtlSecs(500)
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) = document;
+
+ // Time just has to be less than the document's creation timestamp (100) + the
+ // schema's ttl (500)
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSeconds(400);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(fake_clock));
+
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(schema));
+ ICING_EXPECT_OK(icing.Put(document));
+
+ // Check that the document is returned as part of search results
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) {
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
+ body->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(100)
+ .SetTtlSecs(500)
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) = document;
+
+ // Time just has to be greater than the document's creation timestamp (100) +
+ // the schema's ttl (500)
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSeconds(700);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(fake_clock));
+
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(schema));
+ ICING_EXPECT_OK(icing.Put(document));
+
+  // Check that the document is NOT returned in search results since it has
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(SearchResultProto::default_instance())));
+}
+
+TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ auto property = type_config->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ DocumentProto message_document =
+ DocumentBuilder()
+ .SetKey("namespace", "message_uri")
+ .SetSchema("message")
+ .AddStringProperty("body", "foo")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_ASSERT_OK(icing.Initialize());
+ ICING_ASSERT_OK(icing.SetSchema(schema));
+ ICING_ASSERT_OK(icing.Put(message_document));
+
+ // Make sure we can search for message document
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ // The message isn't indexed, so we get nothing
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(SearchResultProto::default_instance())));
+
+ // With just the schema type filter, we can search for the message
+ search_spec.Clear();
+ search_spec.add_schema_type_filters("message");
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ message_document;
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+
+ // Since SchemaTypeIds are assigned based on order in the SchemaProto, this
+ // will force a change in the DocumentStore's cached SchemaTypeIds
+ schema.clear_types();
+ type_config = schema.add_types();
+ type_config->set_schema_type("email");
+
+ // Adding a new indexed property will require reindexing
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ property = type_config->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ ICING_EXPECT_OK(icing.SetSchema(schema));
+
+ search_spec.Clear();
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.add_schema_type_filters("message");
+
+ // We can still search for the message document
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+TEST_F(IcingSearchEngineTest, RecoverFromMissingHeaderFile) {
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ GetDefaultDocument();
+
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ EXPECT_TRUE(filesystem()->DeleteFile(GetHeaderFilename().c_str()));
+
+ // We should be able to recover from this and access all our previous data
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+
+ // Checks that DocumentLog is still ok
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+
+ // Checks that the index is still ok so we can search over it
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+
+  // Checks that the schema is still there since it'll be needed to validate
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+}
+
+TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderMagic) {
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ GetDefaultDocument();
+
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ // Change the header's magic value
+ int32_t invalid_magic = 1; // Anything that's not the actual kMagic value.
+ filesystem()->PWrite(GetHeaderFilename().c_str(),
+ offsetof(IcingSearchEngine::Header, magic),
+ &invalid_magic, sizeof(invalid_magic));
+
+ // We should be able to recover from this and access all our previous data
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+
+ // Checks that DocumentLog is still ok
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+
+ // Checks that the index is still ok so we can search over it
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+
+  // Checks that the schema is still there since it'll be needed to validate
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+}
+
+TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderChecksum) {
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ GetDefaultDocument();
+
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ // Change the header's checksum value
+ uint32_t invalid_checksum =
+ 1; // Anything that's not the actual checksum value
+ filesystem()->PWrite(GetHeaderFilename().c_str(),
+ offsetof(IcingSearchEngine::Header, checksum),
+ &invalid_checksum, sizeof(invalid_checksum));
+
+ // We should be able to recover from this and access all our previous data
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+
+ // Checks that DocumentLog is still ok
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+
+ // Checks that the index is still ok so we can search over it
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+
+  // Checks that the schema is still there since it'll be needed to validate
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+}
+
+TEST_F(IcingSearchEngineTest, UnableToRecoverFromCorruptSchema) {
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ const std::string schema_file =
+ absl_ports::StrCat(GetSchemaDir(), "/schema.pb");
+ const std::string corrupt_data = "1234";
+ EXPECT_TRUE(filesystem()->Write(schema_file.c_str(), corrupt_data.data(),
+ corrupt_data.size()));
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize(),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(IcingSearchEngineTest, UnableToRecoverFromCorruptDocumentLog) {
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ const std::string document_log_file =
+ absl_ports::StrCat(GetDocumentIndexDir(), "/document_log");
+ const std::string corrupt_data = "1234";
+ EXPECT_TRUE(filesystem()->Write(document_log_file.c_str(),
+ corrupt_data.data(), corrupt_data.size()));
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize(),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) {
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("additional", "content")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ {
+ // Initializes folder and schema
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ property = type->add_properties();
+ property->set_property_name("additional");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ ICING_EXPECT_OK(icing.SetSchema(schema));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ ICING_EXPECT_OK(icing.Put(document2));
+
+ // Won't get us anything because "additional" isn't marked as an indexed
+ // property in the schema
+ SearchSpecProto search_spec;
+ search_spec.set_query("additional:content");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ {
+    // This schema will change the SchemaTypeIds from the previous schema
+ // (since SchemaTypeIds are assigned based on order of the types, and this
+ // new schema changes the ordering of previous types)
+ SchemaProto new_schema;
+ auto type = new_schema.add_types();
+ type->set_schema_type("Email");
+
+ type = new_schema.add_types();
+ type->set_schema_type("Message");
+
+ // Adding a new property changes the SectionIds (since SectionIds are
+ // assigned based on alphabetical order of indexed sections, marking
+ // "additional" as an indexed property will push the "body" property to a
+ // different SectionId)
+ auto property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ property = type->add_properties();
+ property->set_property_name("additional");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(filesystem(), GetSchemaDir()));
+ ICING_EXPECT_OK(schema_store->SetSchema(new_schema));
+ } // Will persist new schema
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+
+ // We can insert a Email document since we kept the new schema
+ DocumentProto email_document =
+ DocumentBuilder()
+ .SetKey("namespace", "email_uri")
+ .SetSchema("Email")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ ICING_EXPECT_OK(icing.Put(email_document));
+ EXPECT_THAT(icing.Get("namespace", "email_uri"),
+ IsOkAndHolds(EqualsProto(email_document)));
+
+ SearchSpecProto search_spec;
+
+ // The section restrict will ensure we are using the correct, updated
+ // SectionId in the Index
+ search_spec.set_query("additional:content");
+
+ // Schema type filter will ensure we're using the correct, updated
+ // SchemaTypeId in the DocumentStore
+ search_spec.add_schema_type_filters("Message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) = document2;
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+TEST_F(IcingSearchEngineTest, RecoverFromInconsistentDocumentStore) {
+ {
+ // Initializes folder and schema, index one document
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(filesystem(), GetSchemaDir()));
+ ICING_EXPECT_OK(schema_store->SetSchema(GetDefaultSchema()));
+
+ // Puts a second document into DocumentStore but doesn't index it.
+ FakeClock fake_clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(filesystem(), GetDocumentIndexDir(), &fake_clock,
+ schema_store.get()));
+ ICING_EXPECT_OK(document_store->Put(document2));
+ }
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ // Index Restoration should be triggered here and document2 should be
+ // indexed.
+ ICING_EXPECT_OK(icing.Initialize());
+
+ // DocumentStore kept the additional document
+ EXPECT_THAT(icing.Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(GetDefaultDocument())));
+ EXPECT_THAT(icing.Get("namespace", "uri2"),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ // We indexed the additional document
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) = document2;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ GetDefaultDocument();
+
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+TEST_F(IcingSearchEngineTest, RecoverFromInconsistentIndex) {
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_result;
+ (*expected_result.mutable_results()->Add()->mutable_document()) =
+ GetDefaultDocument();
+
+ {
+ // Initializes folder and schema, index one document
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(icing.Put(GetDefaultDocument()));
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ // Pretend we lost the entire index
+ EXPECT_TRUE(filesystem()->DeleteDirectoryRecursively(
+ absl_ports::StrCat(GetDocumentIndexDir(), "/idx/lite.").c_str()));
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+
+ // Check that our index is ok by searching over the restored index
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(expected_result)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByDocumentScore) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ // Creates 3 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+
+ // Intentionally inserts the documents in the order that is different than
+ // their score order
+ ICING_ASSERT_OK(icing.Put(document2));
+ ICING_ASSERT_OK(icing.Put(document3));
+ ICING_ASSERT_OK(icing.Put(document1));
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+
+ // Result should be in descending score order
+ SearchResultProto exp_result;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document3;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document2;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document1;
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ EXPECT_THAT(icing.Search(search_spec, scoring_spec,
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(exp_result)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchShouldAllowNoScoring) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ // Creates 3 documents and ensures the relationship of them is:
+ // document1 < document2 < document3
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampSecs(1571111111)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampSecs(1572222222)
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampSecs(1573333333)
+ .Build();
+
+ // Intentionally inserts the documents in the order that is different than
+ // their score order
+ ICING_ASSERT_OK(icing.Put(document3));
+ ICING_ASSERT_OK(icing.Put(document1));
+ ICING_ASSERT_OK(icing.Put(document2));
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+
+ SearchResultProto exp_result;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document2;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document1;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document3;
+
+ // Results should not be ranked by score but returned in reverse insertion
+ // order.
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
+ EXPECT_THAT(icing.Search(search_spec, scoring_spec,
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(exp_result)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByCreationTimestamp) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ // Creates 3 documents and ensures the relationship in terms of creation
+ // timestamp score is: document1 < document2 < document3
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetCreationTimestampSecs(1571111111)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetCreationTimestampSecs(1572222222)
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetCreationTimestampSecs(1573333333)
+ .Build();
+
+ // Intentionally inserts the documents in the order that is different than
+ // their score order
+ ICING_ASSERT_OK(icing.Put(document3));
+ ICING_ASSERT_OK(icing.Put(document1));
+ ICING_ASSERT_OK(icing.Put(document2));
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+
+ // Result should be in descending timestamp order
+ SearchResultProto exp_result;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document3;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document2;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document1;
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ EXPECT_THAT(icing.Search(search_spec, scoring_spec,
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(exp_result)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedAscendingly) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ICING_EXPECT_OK(icing.Initialize());
+ ICING_EXPECT_OK(icing.SetSchema(GetDefaultSchema()));
+
+ // Creates 3 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampSecs(kDefaultCreationTimestampSecs)
+ .Build();
+
+ // Intentionally inserts the documents in the order that is different than
+ // their score order
+ ICING_ASSERT_OK(icing.Put(document2));
+ ICING_ASSERT_OK(icing.Put(document3));
+ ICING_ASSERT_OK(icing.Put(document1));
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+
+ // Result should be in ascending score order
+ SearchResultProto exp_result;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document1;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document2;
+ (*exp_result.mutable_results()->Add()->mutable_document()) = document3;
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ scoring_spec.set_order_by(ScoringSpecProto::Order::ASC);
+ EXPECT_THAT(icing.Search(search_spec, scoring_spec,
+ ResultSpecProto::default_instance()),
+ IsOkAndHolds(EqualsProto(exp_result)));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/hit/doc-hit-info.cc b/icing/index/hit/doc-hit-info.cc
new file mode 100644
index 0000000..80dbbde
--- /dev/null
+++ b/icing/index/hit/doc-hit-info.cc
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/hit/doc-hit-info.h"
+
+#include "icing/legacy/core/icing-string-util.h"
+
+namespace icing {
+namespace lib {
+
+bool DocHitInfo::operator<(const DocHitInfo& other) const {
+ if (document_id() != other.document_id()) {
+ // Sort by document_id descending. This mirrors how the individual hits that
+ // are collapsed into this DocHitInfo would sort with other hits -
+ // document_ids are inverted when encoded in hits. Hits are encoded this way
+ // because they are appended to posting lists and the most recent value
+ // appended to a posting list must have the smallest encoded value of any
+ // hit on the posting list.
+ return document_id() > other.document_id();
+ }
+ if (hit_section_ids_mask() != other.hit_section_ids_mask()) {
+ return hit_section_ids_mask() < other.hit_section_ids_mask();
+ }
+ // Doesn't matter which way we compare this array, as long as
+ // DocHitInfo is unequal when it is unequal.
+ return memcmp(max_hit_score_, other.max_hit_score_, sizeof(max_hit_score_)) <
+ 0;
+}
+
+void DocHitInfo::UpdateSection(SectionId section_id, Hit::Score hit_score) {
+ SectionIdMask section_id_mask = (1u << section_id);
+ if (hit_section_ids_mask() & section_id_mask) {
+ max_hit_score_[section_id] =
+ std::max(max_hit_score_[section_id], hit_score);
+ } else {
+ max_hit_score_[section_id] = hit_score;
+ hit_section_ids_mask_ |= section_id_mask;
+ }
+}
+
+void DocHitInfo::MergeSectionsFrom(const DocHitInfo& other) {
+ SectionIdMask other_mask = other.hit_section_ids_mask();
+ while (other_mask) {
+ SectionId section_id = __builtin_ctz(other_mask);
+ UpdateSection(section_id, other.max_hit_score(section_id));
+ other_mask &= ~(1u << section_id);
+ }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/hit/doc-hit-info.h b/icing/index/hit/doc-hit-info.h
new file mode 100644
index 0000000..386822d
--- /dev/null
+++ b/icing/index/hit/doc-hit-info.h
@@ -0,0 +1,88 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_HIT_DOC_HIT_INFO_H_
+#define ICING_INDEX_HIT_DOC_HIT_INFO_H_
+
+#include <limits>
+
+#include "icing/index/hit/hit.h"
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// DocHitInfo provides a collapsed view of all hits for a specific term and doc.
+// Hits contain a document_id, section_id and a hit score. The information in
+// multiple hits is collapsed into a DocHitInfo by providing a SectionIdMask of
+// all sections that contained a hit for this term as well as the highest hit
+// score of any hit for each section.
+class DocHitInfo {
+ public:
+ explicit DocHitInfo(DocumentId document_id_in = kInvalidDocumentId,
+ SectionIdMask hit_section_ids_mask = kSectionIdMaskNone)
+ : document_id_(document_id_in),
+ hit_section_ids_mask_(hit_section_ids_mask) {
+ memset(max_hit_score_, Hit::kMaxHitScore, sizeof(max_hit_score_)); // byte-fill; relies on sizeof(Score) == 1 (static_assert below)
+ }
+
+ DocumentId document_id() const { return document_id_; }
+
+ void set_document_id(DocumentId document_id) { document_id_ = document_id; }
+
+ SectionIdMask hit_section_ids_mask() const { return hit_section_ids_mask_; }
+
+ void set_hit_section_ids_mask(SectionIdMask section_id_mask) {
+ hit_section_ids_mask_ = section_id_mask;
+ }
+
+ Hit::Score max_hit_score(SectionId section_id) const {
+ return max_hit_score_[section_id];
+ }
+
+ bool operator<(const DocHitInfo& other) const;
+ bool operator==(const DocHitInfo& other) const {
+ return (*this < other) == (other < *this); // equal iff neither orders before the other
+ }
+
+ // Updates the hit_section_ids_mask and max_hit_score for the section, if
+ // necessary.
+ void UpdateSection(SectionId section_id, Hit::Score hit_score);
+
+ // Merges the sections of other into this. The hit_section_ids_masks are or'd
+ // and the max hit score for each section between the two is set.
+ //
+ // This does not affect the DocumentId of this or other. If callers care about
+ // only merging sections for DocHitInfos with the same DocumentId, callers
+ // should check this themselves.
+ void MergeSectionsFrom(const DocHitInfo& other);
+
+ private:
+ DocumentId document_id_;
+ SectionIdMask hit_section_ids_mask_;
+ Hit::Score max_hit_score_[kMaxSectionId + 1]; // reads as kMaxHitScore for sections not in the mask
+} __attribute__((packed));
+static_assert(sizeof(DocHitInfo) == 22, "");
+// TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
+static_assert(icing_is_packed_pod<DocHitInfo>::value, "go/icing-ubsan");
+static_assert(sizeof(Hit::Score) == 1,
+ "Change how max_hit_score_ is initialized if changing the type "
+ "of Hit::Score");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_HIT_DOC_HIT_INFO_H_
diff --git a/icing/index/hit/doc-hit-info_test.cc b/icing/index/hit/doc-hit-info_test.cc
new file mode 100644
index 0000000..d8adbc1
--- /dev/null
+++ b/icing/index/hit/doc-hit-info_test.cc
@@ -0,0 +1,170 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/hit/doc-hit-info.h"
+
+#include "icing/index/hit/hit.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsTrue;
+using ::testing::Ne;
+
+constexpr DocumentId kSomeDocumentId = 12;
+constexpr DocumentId kSomeOtherDocumentId = 54;
+
+TEST(DocHitInfoTest, InitialMaxHitScores) {
+ DocHitInfo info(kSomeDocumentId);
+ for (SectionId i = 0; i <= kMaxSectionId; ++i) {
+ EXPECT_THAT(info.max_hit_score(i), Eq(Hit::kMaxHitScore)); // every section defaults to kMaxHitScore
+ }
+}
+
+TEST(DocHitInfoTest, UpdateHitScores) {
+ DocHitInfo info(kSomeDocumentId);
+ ASSERT_THAT(info.max_hit_score(3), Eq(Hit::kMaxHitScore));
+
+ // Updating a section for the first time, should change its max hit score,
+ // even though the hit score (16) may be lower than the current value returned
+ // by info.max_hit_score(3) (kMaxHitScore)
+ info.UpdateSection(3, 16);
+ EXPECT_THAT(info.max_hit_score(3), Eq(16));
+
+ // Updating a section with a hit score lower than the previously set one
+ // should not update max hit score.
+ info.UpdateSection(3, 15);
+ EXPECT_THAT(info.max_hit_score(3), Eq(16));
+
+ // Updating a section with a hit score higher than the previously set one
+ // should update the max hit score.
+ info.UpdateSection(3, 17);
+ EXPECT_THAT(info.max_hit_score(3), Eq(17));
+
+ // Updating a section with kMaxHitScore should *always* set the max hit
+ // score to kMaxHitScore (regardless of what value kMaxHitScore is
+ // defined with).
+ info.UpdateSection(3, Hit::kMaxHitScore);
+ EXPECT_THAT(info.max_hit_score(3), Eq(Hit::kMaxHitScore));
+
+ // Updating a section that has had kMaxHitScore explicitly set, should
+ // *never* change the max hit score (regardless of what value kMaxHitScore
+ // is defined with).
+ info.UpdateSection(3, 16);
+ EXPECT_THAT(info.max_hit_score(3), Eq(Hit::kMaxHitScore));
+}
+
+TEST(DocHitInfoTest, UpdateSectionIdMask) {
+ DocHitInfo info(kSomeDocumentId);
+ EXPECT_THAT(info.hit_section_ids_mask(), Eq(kSectionIdMaskNone));
+
+ info.UpdateSection(3, 16);
+ EXPECT_THAT(info.hit_section_ids_mask() & 1U << 3, IsTrue());
+
+ // Calling update again shouldn't do anything
+ info.UpdateSection(3, 15);
+ EXPECT_THAT(info.hit_section_ids_mask() & 1U << 3, IsTrue());
+
+ // Updating another section shouldn't do anything
+ info.UpdateSection(2, 77);
+ EXPECT_THAT(info.hit_section_ids_mask() & 1U << 3, IsTrue());
+}
+
+TEST(DocHitInfoTest, MergeSectionsFromDifferentDocumentId) {
+ // Merging infos with different document_ids works.
+ DocHitInfo info1(kSomeDocumentId);
+ DocHitInfo info2(kSomeOtherDocumentId);
+ info2.UpdateSection(7, 12);
+ info1.MergeSectionsFrom(info2);
+ EXPECT_THAT(info1.max_hit_score(7), Eq(12));
+ EXPECT_THAT(info1.document_id(), Eq(kSomeDocumentId)); // merge never changes the DocumentId
+}
+
+TEST(DocHitInfoTest, MergeSectionsFromKeepsOldSection) {
+ // Merging shouldn't override sections that are present in info1, but not
+ // present in info2.
+ DocHitInfo info1(kSomeDocumentId);
+ info1.UpdateSection(3, 16);
+ DocHitInfo info2(kSomeDocumentId);
+ info1.MergeSectionsFrom(info2);
+ EXPECT_THAT(info1.max_hit_score(3), Eq(16));
+}
+
+TEST(DocHitInfoTest, MergeSectionsFromAddsNewSection) {
+ // Merging should add sections that were not present in info1, but are present
+ // in info2.
+ DocHitInfo info1(kSomeDocumentId);
+ DocHitInfo info2(kSomeDocumentId);
+ info2.UpdateSection(7, 12);
+ info1.MergeSectionsFrom(info2);
+ EXPECT_THAT(info1.max_hit_score(7), Eq(12));
+}
+
+TEST(DocHitInfoTest, MergeSectionsFromSetsHigherHitScore) {
+ // Merging should override the value of a section in info1 if the same section
+ // is present in info2 with a higher hit score.
+ DocHitInfo info1(kSomeDocumentId);
+ info1.UpdateSection(2, 77);
+ DocHitInfo info2(kSomeDocumentId);
+ info2.UpdateSection(2, 89);
+ info1.MergeSectionsFrom(info2);
+ EXPECT_THAT(info1.max_hit_score(2), Eq(89));
+}
+
+TEST(DocHitInfoTest, MergeSectionsFromDoesNotSetLowerHitScore) {
+ // Merging should not override the hit score of a section in info1 if the same
+ // section is present in info2 but with a lower hit score.
+ DocHitInfo info1(kSomeDocumentId);
+ info1.UpdateSection(5, 108);
+ DocHitInfo info2(kSomeDocumentId);
+ info2.UpdateSection(5, 13);
+ info1.MergeSectionsFrom(info2);
+ EXPECT_THAT(info1.max_hit_score(5), Eq(108));
+}
+
+TEST(DocHitInfoTest, Comparison) {
+ constexpr DocumentId kDocumentId = 1;
+ DocHitInfo info(kDocumentId);
+ info.UpdateSection(1, 12);
+
+ constexpr DocumentId kHighDocumentId = 15;
+ DocHitInfo high_document_id_info(kHighDocumentId);
+ high_document_id_info.UpdateSection(1, 12);
+
+ DocHitInfo high_section_id_info(kDocumentId);
+ high_section_id_info.UpdateSection(1, 12);
+ high_section_id_info.UpdateSection(6, Hit::kMaxHitScore);
+
+ std::vector<DocHitInfo> infos{info, high_document_id_info,
+ high_section_id_info};
+ std::sort(infos.begin(), infos.end());
+ EXPECT_THAT(infos, // higher document_id sorts first (document_id descending)
+ ElementsAre(high_document_id_info, info, high_section_id_info));
+
+ // There are no requirements for how DocHitInfos with the same DocumentIds and
+ // hit masks will compare, but they must not be equal.
+ DocHitInfo different_hit_score_info(kDocumentId);
+ different_hit_score_info.UpdateSection(1, 76);
+ EXPECT_THAT(info < different_hit_score_info,
+ Ne(different_hit_score_info < info));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc
new file mode 100644
index 0000000..be2df5c
--- /dev/null
+++ b/icing/index/hit/hit.cc
@@ -0,0 +1,100 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/hit/hit.h"
+
+#include "icing/store/document-id.h"
+#include "icing/util/bit-util.h"
+
+namespace icing {
+namespace lib {
+
+namespace { // file-local helpers for encoding/decoding Hit::Value
+
+enum FlagOffset {
+ // This hit, whether exact or not, came from a prefixed section and will
+ // need to be backfilled into branching posting lists if/when those are
+ // created.
+ kInPrefixSection = 0,
+ // This hit represents a prefix of a longer term. If exact matches are
+ // required, then this hit should be ignored.
+ kPrefixHit = 1,
+ // Whether or not the hit has a hit score other than kMaxHitScore.
+ kHasScore = 2,
+ kNumFlags = 3,
+};
+static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <=
+ sizeof(Hit::Value) * 8,
+ "HitOverflow");
+
+inline DocumentId InvertDocumentId(DocumentId document_id) {
+ static_assert(kMaxDocumentId <= (std::numeric_limits<DocumentId>::max() - 1),
+ "(kMaxDocumentId + 1) must not overflow.");
+ static_assert(
+ (kMaxDocumentId + 1) < (1U << kDocumentIdBits),
+ "(kMaxDocumentId + 1) must also fit in kDocumentIdBits wide bitfield");
+ // Invert the document_id value. +1 is added so the resulting range is [1,
+ // kMaxDocumentId + 1].
+ return (kMaxDocumentId + 1) - document_id; // maps 0 -> kMaxDocumentId + 1 and kMaxDocumentId -> 1; self-inverse
+}
+
+} // namespace
+
+Hit::Hit(SectionId section_id, DocumentId document_id, Hit::Score score,
+ bool in_prefix_section, bool is_prefix_hit)
+ : score_(score) {
+ // Values are stored so that when sorted, they appear in document_id
+ // descending, section_id ascending, order. Also, all else being
+ // equal, non-prefix hits sort before prefix hits. So inverted
+ // document_id appears in the most significant bits, followed by
+ // (uninverted) section_id.
+ value_ = 0; // start from a clean slate before OR-ing in the bitfields
+ bit_util::BitfieldSet(InvertDocumentId(document_id),
+ kSectionIdBits + kNumFlags, kDocumentIdBits, &value_);
+ bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &value_);
+ bit_util::BitfieldSet(score != kMaxHitScore, kHasScore, 1, &value_); // flag set only for non-default scores
+ bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, 1, &value_);
+ bit_util::BitfieldSet(in_prefix_section, kInPrefixSection, 1, &value_);
+}
+
+DocumentId Hit::document_id() const {
+ DocumentId inverted_document_id = bit_util::BitfieldGet(
+ value(), kSectionIdBits + kNumFlags, kDocumentIdBits); // stored in the most significant used bits
+ // Undo the document_id inversion.
+ return InvertDocumentId(inverted_document_id);
+}
+
+SectionId Hit::section_id() const {
+ return bit_util::BitfieldGet(value(), kNumFlags, kSectionIdBits); // section id sits just above the flag bits
+}
+
+bool Hit::has_score() const {
+ return bit_util::BitfieldGet(value(), kHasScore, 1);
+}
+
+bool Hit::is_prefix_hit() const {
+ return bit_util::BitfieldGet(value(), kPrefixHit, 1);
+}
+
+bool Hit::is_in_prefix_section() const {
+ return bit_util::BitfieldGet(value(), kInPrefixSection, 1);
+}
+
+bool Hit::EqualsDocumentIdAndSectionId::operator()(const Hit& hit1,
+ const Hit& hit2) const {
+ return (hit1.value() >> kNumFlags) == (hit2.value() >> kNumFlags); // discard flag bits; compare doc id + section id only
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
new file mode 100644
index 0000000..f84dc24
--- /dev/null
+++ b/icing/index/hit/hit.h
@@ -0,0 +1,98 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_HIT_HIT_H_
+#define ICING_INDEX_HIT_HIT_H_
+
+#include <cstdint>
+#include <limits>
+
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Hit is a specific encoding that refers to content within a document. A hit
+// consists of:
+// - a DocumentId
+// - a SectionId
+// referring to the document and section that the hit corresponds to, as well as
+// metadata about the hit:
+// - whether the Hit has a Score other than the default value
+// - whether the Hit does not appear exactly in the document, but instead
+// represents a term that is a prefix of a term in the document
+// - whether the Hit came from a section that has prefix expansion enabled
+// and a score for the hit. Ranging from [0,255] a higher score indicates a
+// higher quality hit.
+// The hit is the most basic unit of the index and, when grouped together by
+// term, can be used to encode what terms appear in what documents.
+class Hit {
+ public:
+ // The datatype used to encode Hit information: the document_id, section_id
+ // and the has_score, prefix hit and in prefix section flags.
+ using Value = uint32_t;
+
+ // WARNING: Changing this value will invalidate any pre-existing posting lists
+ // on user devices.
+ static constexpr Value kInvalidValue = std::numeric_limits<Value>::max();
+ // Docs are sorted in reverse, and 0 is never used as the inverted
+ // DocumentId (because it is the inverse of kInvalidValue), so it is always
+ // the max in a descending sort.
+ static constexpr Value kMaxDocumentIdSortValue = 0;
+
+ // A score reflecting the "quality" of this hit. The higher the score, the
+ // higher quality the hit.
+ using Score = uint8_t;
+ // By default, hits are given the highest possible score.
+ static constexpr Score kMaxHitScore = std::numeric_limits<Score>::max();
+
+ explicit Hit(Value value = kInvalidValue, Score score = kMaxHitScore)
+ : value_(value), score_(score) {}
+ Hit(SectionId section_id, DocumentId document_id, Score score,
+ bool in_prefix_section = false, bool is_prefix_hit = false);
+
+ bool is_valid() const { return value() != kInvalidValue; }
+ Value value() const { return value_; }
+ DocumentId document_id() const;
+ SectionId section_id() const;
+ // Whether or not the hit contains a non-default score. Hits with non-default
+ // score are considered to be of lower quality.
+ bool has_score() const;
+ Score score() const { return score_; }
+ bool is_prefix_hit() const;
+ bool is_in_prefix_section() const;
+
+ bool operator<(const Hit& h2) const { return value() < h2.value(); } // compares encoded value only; score_ is ignored
+ bool operator==(const Hit& h2) const { return value() == h2.value(); }
+
+ struct EqualsDocumentIdAndSectionId {
+ bool operator()(const Hit& hit1, const Hit& hit2) const;
+ };
+
+ private:
+ // Value and score must be in this order.
+ // Value bits layout: 5 unused + 20 document_id + 4 section id + 3 flags.
+ Value value_;
+ Score score_; // stored outside value_; does not participate in ordering/equality
+} __attribute__((packed));
+static_assert(sizeof(Hit) == 5, "");
+// TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
+static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_HIT_HIT_H_
diff --git a/icing/index/hit/hit_test.cc b/icing/index/hit/hit_test.cc
new file mode 100644
index 0000000..982bfcf
--- /dev/null
+++ b/icing/index/hit/hit_test.cc
@@ -0,0 +1,137 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/hit/hit.h"
+
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Lt;
+using ::testing::Not;
+
+static constexpr DocumentId kSomeDocumentId = 24;
+static constexpr SectionId kSomeSectionid = 5;
+static constexpr Hit::Score kSomeHitScore = 57;
+
+TEST(HitTest, HasScoreFlag) {
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
+ EXPECT_THAT(h1.has_score(), IsFalse()); // default score => flag unset
+ EXPECT_THAT(h1.score(), Eq(Hit::kMaxHitScore));
+
+ Hit h2(kSomeSectionid, kSomeDocumentId, kSomeHitScore);
+ EXPECT_THAT(h2.has_score(), IsTrue());
+ EXPECT_THAT(h2.score(), Eq(kSomeHitScore));
+}
+
+TEST(HitTest, IsPrefixHitFlag) {
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
+ EXPECT_THAT(h1.is_prefix_hit(), IsFalse());
+
+ Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ /*in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ EXPECT_THAT(h2.is_prefix_hit(), IsFalse());
+
+ Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ /*in_prefix_section=*/false, /*is_prefix_hit=*/true);
+ EXPECT_THAT(h3.is_prefix_hit(), IsTrue());
+}
+
+TEST(HitTest, IsInPrefixSectionFlag) {
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
+ EXPECT_THAT(h1.is_in_prefix_section(), IsFalse());
+
+ Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ /*in_prefix_section=*/false);
+ EXPECT_THAT(h2.is_in_prefix_section(), IsFalse());
+
+ Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ /*in_prefix_section=*/true);
+ EXPECT_THAT(h3.is_in_prefix_section(), IsTrue());
+}
+
+TEST(HitTest, Accessors) {
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
+ EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId)); // round-trips through the inverted encoding
+ EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid));
+}
+
+TEST(HitTest, Valid) {
+ Hit def;
+ EXPECT_THAT(def.is_valid(), IsFalse());
+
+ Hit explicit_invalid(Hit::kInvalidValue);
+ EXPECT_THAT(explicit_invalid.is_valid(), IsFalse());
+
+ static constexpr Hit::Value kSomeValue = 65372;
+ Hit explicit_valid(kSomeValue);
+ EXPECT_THAT(explicit_valid.is_valid(), IsTrue());
+
+ Hit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId, kSomeHitScore);
+ EXPECT_THAT(maximum_document_id_hit.is_valid(), IsTrue());
+
+ Hit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId, kSomeHitScore);
+ EXPECT_THAT(maximum_section_id_hit.is_valid(), IsTrue());
+
+ Hit minimum_document_id_hit(kSomeSectionid, 0, kSomeHitScore);
+ EXPECT_THAT(minimum_document_id_hit.is_valid(), IsTrue());
+
+ Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeHitScore);
+ EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue());
+}
+
+TEST(HitTest, Comparison) {
+ Hit hit(1, 243, Hit::kMaxHitScore);
+ // DocumentIds are sorted in descending order. So a hit with a lower
+ // document_id should be considered greater than one with a higher
+ // document_id.
+ Hit higher_document_id_hit(1, 2409, Hit::kMaxHitScore);
+ Hit higher_section_id_hit(15, 243, Hit::kMaxHitScore);
+ // Whether or not a hit score was set is considered, but the score itself is
+ // not.
+ Hit hitscore_hit(1, 243, 12);
+ Hit prefix_hit(1, 243, Hit::kMaxHitScore, /*in_prefix_section=*/false,
+ /*is_prefix_hit=*/true);
+ Hit hit_in_prefix_section(1, 243, Hit::kMaxHitScore,
+ /*in_prefix_section=*/true,
+ /*is_prefix_hit=*/false);
+
+ std::vector<Hit> hits{
+ hit, higher_document_id_hit, higher_section_id_hit, hitscore_hit,
+ prefix_hit, hit_in_prefix_section};
+ std::sort(hits.begin(), hits.end());
+ EXPECT_THAT(hits,
+ ElementsAre(higher_document_id_hit, hit, hit_in_prefix_section,
+ prefix_hit, hitscore_hit, higher_section_id_hit));
+
+ Hit higher_hitscore_hit(1, 243, 108);
+ // Hit score value is not considered when comparing hits.
+ EXPECT_THAT(hitscore_hit, Not(Lt(higher_hitscore_hit)));
+ EXPECT_THAT(higher_hitscore_hit, Not(Lt(hitscore_hit)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
new file mode 100644
index 0000000..c9e07be
--- /dev/null
+++ b/icing/index/index-processor.cc
@@ -0,0 +1,96 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/index-processor.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/index.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section-manager.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/token.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::Status IndexProcessor::IndexDocument(
+ const DocumentProto& document, DocumentId document_id) {
+ if (index_->last_added_document_id() != kInvalidDocumentId &&
+ document_id <= index_->last_added_document_id()) { // document_ids must be indexed in strictly increasing order
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "DocumentId %d must be greater than last added document_id %d",
+ document_id, index_->last_added_document_id()));
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
+ schema_store_.ExtractSections(document));
+ uint32_t num_tokens = 0; // counted across all sections of this document
+ libtextclassifier3::Status overall_status;
+ for (const Section& section : sections) {
+ Index::Editor editor = index_->Edit(document_id, section.metadata.id,
+ section.metadata.term_match_type); // one editor per section
+ for (std::string_view subcontent : section.content) {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ section.metadata.tokenizer, &lang_segmenter_)); // NOTE(review): recreated per subcontent; could be hoisted per-section
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
+ tokenizer->Tokenize(subcontent));
+ while (itr->Advance()) {
+ if (++num_tokens > options_.max_tokens_per_document) {
+ switch (options_.token_limit_behavior) {
+ case Options::TokenLimitBehavior::kReturnError:
+ return absl_ports::ResourceExhaustedError(
+ "Max number of tokens reached!");
+ case Options::TokenLimitBehavior::kSuppressError:
+ return libtextclassifier3::Status::OK; // NOTE(review): drops overall_status; a prior AddHit failure is silently lost — confirm intended
+ }
+ }
+ std::string term = normalizer_.NormalizeTerm(itr->GetToken().text);
+ // Add this term to the index. Even if adding this hit fails, we keep
+ // trying to add more hits because it's possible that future hits could
+ // still be added successfully. For instance if the lexicon is full, we
+ // might fail to add a hit for a new term, but should still be able to
+ // add hits for terms that are already in the index.
+ auto status = editor.AddHit(term.c_str());
+ if (overall_status.ok() && !status.ok()) {
+ // If we've succeeded to add everything so far, set overall_status to
+ // represent this new failure. If we've already failed, no need to
+ // update the status - we're already going to return a resource
+ // exhausted error.
+ overall_status = status;
+ }
+ }
+ }
+ }
+ return overall_status; // OK only if every hit was added successfully
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
new file mode 100644
index 0000000..612fdfe
--- /dev/null
+++ b/icing/index/index-processor.h
@@ -0,0 +1,94 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_INDEX_PROCESSOR_H_
+#define ICING_INDEX_INDEX_PROCESSOR_H_
+
+#include <cstdint>
+#include <string>
+
+#include "utils/base/status.h"
+#include "icing/index/index.h"
+#include "icing/proto/document.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section-manager.h"
+#include "icing/store/document-id.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/token.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+class IndexProcessor {
+ public:
+ struct Options {
+ int32_t max_tokens_per_document; // cap enforced across all sections of one document
+
+ // Indicates how a document exceeding max_tokens_per_document should be
+ // handled.
+ enum class TokenLimitBehavior {
+ // When set, the first max_tokens_per_document will be indexed. If the
+ // token count exceeds max_tokens_per_document, a ResourceExhausted error
+ // will be returned.
+ kReturnError,
+ // When set, the first max_tokens_per_document will be indexed. If the
+ // token count exceeds max_tokens_per_document, OK will be returned.
+ kSuppressError,
+ };
+ TokenLimitBehavior token_limit_behavior;
+ };
+
+ // Does not take any ownership, and all pointers must refer to valid objects
+ // that outlive the one constructed.
+ // TODO(b/141180665): Add nullptr checks for the raw pointers
+ IndexProcessor(const SchemaStore* schema_store,
+ const LanguageSegmenter* lang_segmenter,
+ const Normalizer* normalizer, Index* index,
+ const Options& options)
+ : schema_store_(*schema_store),
+ lang_segmenter_(*lang_segmenter),
+ normalizer_(*normalizer),
+ index_(index),
+ options_(options) {}
+
+ // Add document to the index, associated with document_id. If the number of
+ // tokens in the document exceeds max_tokens_per_document, then only the first
+ // max_tokens_per_document will be added to the index. All tokens of length
+ // exceeding max_token_length will be shortened to max_token_length.
+ //
+ // Returns:
+ // INVALID_ARGUMENT if document_id is less than the document_id of a
+ // previously indexed
+ // document or tokenization fails.
+ // RESOURCE_EXHAUSTED if the index is full and can't add anymore content.
+ // NOT_FOUND if there is no definition for the document's schema type.
+ // INTERNAL_ERROR if any other errors occur
+ libtextclassifier3::Status IndexDocument(const DocumentProto& document,
+ DocumentId document_id);
+
+ private:
+ std::string NormalizeToken(const Token& token); // NOTE(review): no definition in the accompanying .cc — confirm still needed
+
+ const SchemaStore& schema_store_; // not owned; must outlive this
+ const LanguageSegmenter& lang_segmenter_; // not owned
+ const Normalizer& normalizer_; // not owned
+ Index* const index_; // not owned
+ const Options options_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_INDEX_PROCESSOR_H_
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
new file mode 100644
index 0000000..f22d2f2
--- /dev/null
+++ b/icing/index/index-processor_benchmark.cc
@@ -0,0 +1,379 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/index-processor.h"
+#include "icing/index/index.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/logging.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/index:index-processor_benchmark
+//
+// $ blaze-bin/icing/index/index-processor_benchmark
+// --benchmarks=all
+//
+// Run on an Android device:
+// Make target //icing/tokenization:language-segmenter depend on
+// //third_party/icu
+//
+// Make target //icing/transform:normalizer depend on
+// //third_party/icu
+//
+// Download LangId model file from
+// //nlp/saft/components/lang_id/mobile/fb_model:models/latest_model.smfb and
+// put it into your device:
+// $ adb push [your model path] /data/local/tmp/
+//
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/index:index-processor_benchmark
+//
+// $ adb push blaze-bin/icing/index/index-processor_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/index-processor_benchmark --benchmarks=all
+// --adb
+
+// Flag to tell the benchmark that it'll be run on an Android device via adb,
+// the benchmark will set up data files accordingly.
+ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Creates a fake type config with 10 properties (p0 - p9)
+void CreateFakeTypeConfig(SchemaTypeConfigProto* type_config) {
+ type_config->set_schema_type("Fake_Type");
+
+ for (int i = 0; i < 10; i++) {
+ auto property = type_config->add_properties();
+ property->set_property_name(
+ IcingStringUtil::StringPrintf("p%d", i)); // p0 - p9
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+ }
+}
+
+DocumentProto CreateDocumentWithOneProperty(int content_length) {
+ return DocumentBuilder()
+ .SetKey("icing", "fake/1")
+ .SetSchema("Fake_Type")
+ .AddStringProperty("p0", std::string(content_length, 'A'))
+ .Build();
+}
+
+DocumentProto CreateDocumentWithTenProperties(int content_length) {
+ int property_length = content_length / 10;
+ return DocumentBuilder()
+ .SetKey("icing", "fake/1")
+ .SetSchema("Fake_Type")
+ .AddStringProperty("p0", std::string(property_length, 'A'))
+ .AddStringProperty("p1", std::string(property_length, 'B'))
+ .AddStringProperty("p2", std::string(property_length, 'C'))
+ .AddStringProperty("p3", std::string(property_length, 'D'))
+ .AddStringProperty("p4", std::string(property_length, 'E'))
+ .AddStringProperty("p5", std::string(property_length, 'F'))
+ .AddStringProperty("p6", std::string(property_length, 'G'))
+ .AddStringProperty("p7", std::string(property_length, 'H'))
+ .AddStringProperty("p8", std::string(property_length, 'I'))
+ .AddStringProperty("p9", std::string(property_length, 'J'))
+ .Build();
+}
+
+DocumentProto CreateDocumentWithDiacriticLetters(int content_length) {
+ std::string content;
+ while (content.length() < content_length) {
+ content.append("àáâãā");
+ }
+ return DocumentBuilder()
+ .SetKey("icing", "fake/1")
+ .SetSchema("Fake_Type")
+ .AddStringProperty("p0", content)
+ .Build();
+}
+
+DocumentProto CreateDocumentWithHiragana(int content_length) {
+ std::string content;
+ while (content.length() < content_length) {
+ content.append("あいうえお");
+ }
+ return DocumentBuilder()
+ .SetKey("icing", "fake/1")
+ .SetSchema("Fake_Type")
+ .AddStringProperty("p0", content)
+ .Build();
+}
+
+std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem,
+ const std::string& index_dir) {
+ Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10);
+ return Index::Create(options, &filesystem).ValueOrDie();
+}
+
+std::unique_ptr<LanguageSegmenter> CreateLanguageSegmenter() {
+ if (absl::GetFlag(FLAGS_adb)) {
+ return LanguageSegmenter::Create("/data/local/tmp/latest_model.smfb")
+ .ValueOrDie();
+ } else {
+ return LanguageSegmenter::Create(GetLangIdModelPath()).ValueOrDie();
+ }
+}
+
+std::unique_ptr<Normalizer> CreateNormalizer() {
+ return Normalizer::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max())
+ .ValueOrDie();
+}
+
+std::unique_ptr<SchemaStore> CreateSchemaStore() {
+ Filesystem filesystem;
+ std::unique_ptr<SchemaStore> schema_store =
+ SchemaStore::Create(&filesystem, GetTestTempDir()).ValueOrDie();
+
+ SchemaProto schema;
+ CreateFakeTypeConfig(schema.add_types());
+ auto set_schema_status = schema_store->SetSchema(schema);
+
+ if (!set_schema_status.ok()) {
+ ICING_LOG(ERROR) << set_schema_status.status().error_message();
+ }
+
+ return schema_store;
+}
+
+void CleanUp(const IcingFilesystem& filesystem, const std::string& index_dir) {
+ filesystem.DeleteDirectoryRecursively(index_dir.c_str());
+}
+
+std::unique_ptr<IndexProcessor> CreateIndexProcessor(
+ const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
+ Index* index) {
+ IndexProcessor::Options processor_options{};
+ processor_options.max_tokens_per_document = 1024 * 1024 * 10;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+
+ return std::make_unique<IndexProcessor>(schema_store, language_segmenter,
+ normalizer, index, processor_options);
+}
+
+void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem filesystem;
+ std::string index_dir = GetTestTempDir() + "/index_test/";
+
+ CleanUp(filesystem, index_dir);
+
+ std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
+ normalizer.get(), index.get());
+
+ DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0));
+
+ DocumentId document_id = 0;
+ for (auto _ : state) {
+ ICING_ASSERT_OK(
+ index_processor->IndexDocument(input_document, document_id++));
+ }
+
+ CleanUp(filesystem, index_dir);
+}
+BENCHMARK(BM_IndexDocumentWithOneProperty)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem filesystem;
+ std::string index_dir = GetTestTempDir() + "/index_test/";
+
+ CleanUp(filesystem, index_dir);
+
+ std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
+ normalizer.get(), index.get());
+
+ DocumentProto input_document =
+ CreateDocumentWithTenProperties(state.range(0));
+
+ DocumentId document_id = 0;
+ for (auto _ : state) {
+ ICING_ASSERT_OK(
+ index_processor->IndexDocument(input_document, document_id++));
+ }
+
+ CleanUp(filesystem, index_dir);
+}
+BENCHMARK(BM_IndexDocumentWithTenProperties)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem filesystem;
+ std::string index_dir = GetTestTempDir() + "/index_test/";
+
+ CleanUp(filesystem, index_dir);
+
+ std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
+ normalizer.get(), index.get());
+
+ DocumentProto input_document =
+ CreateDocumentWithDiacriticLetters(state.range(0));
+
+ DocumentId document_id = 0;
+ for (auto _ : state) {
+ ICING_ASSERT_OK(
+ index_processor->IndexDocument(input_document, document_id++));
+ }
+
+ CleanUp(filesystem, index_dir);
+}
+BENCHMARK(BM_IndexDocumentWithDiacriticLetters)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_IndexDocumentWithHiragana(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem filesystem;
+ std::string index_dir = GetTestTempDir() + "/index_test/";
+
+ CleanUp(filesystem, index_dir);
+
+ std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
+ std::unique_ptr<IndexProcessor> index_processor =
+ CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
+ normalizer.get(), index.get());
+
+ DocumentProto input_document = CreateDocumentWithHiragana(state.range(0));
+
+ DocumentId document_id = 0;
+ for (auto _ : state) {
+ ICING_ASSERT_OK(
+ index_processor->IndexDocument(input_document, document_id++));
+ }
+
+ CleanUp(filesystem, index_dir);
+}
+BENCHMARK(BM_IndexDocumentWithHiragana)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
new file mode 100644
index 0000000..c58898b
--- /dev/null
+++ b/icing/index/index-processor_test.cc
@@ -0,0 +1,569 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/index-processor.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// type and property names of FakeType
+constexpr std::string_view kFakeType = "FakeType";
+constexpr std::string_view kExactProperty = "exact";
+constexpr std::string_view kPrefixedProperty = "prefixed";
+constexpr std::string_view kUnindexedProperty1 = "unindexed1";
+constexpr std::string_view kUnindexedProperty2 = "unindexed2";
+constexpr std::string_view kSubProperty = "submessage";
+constexpr std::string_view kNestedProperty = "nested";
+constexpr std::string_view kRepeatedProperty = "repeated";
+
+constexpr DocumentId kDocumentId0 = 0;
+constexpr DocumentId kDocumentId1 = 1;
+
+constexpr SectionId kExactSectionId = 0;
+constexpr SectionId kPrefixedSectionId = 1;
+constexpr SectionId kRepeatedSectionId = 2;
+constexpr SectionId kNestedSectionId = 3;
+
+using Cardinality = PropertyConfigProto::Cardinality;
+using DataType = PropertyConfigProto::DataType;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::Test;
+
+class IndexProcessorTest : public Test {
+ protected:
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ SetUpICUDataFile("icing/icu.dat"));
+
+ index_dir_ = GetTestTempDir() + "/index_test/";
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(index_,
+ Index::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_,
+ LanguageSegmenter::Create(GetLangIdModelPath()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ normalizer_,
+ Normalizer::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_, SchemaStore::Create(&filesystem_, GetTestTempDir()));
+ SchemaProto schema;
+ CreateFakeTypeConfig(schema.add_types());
+ ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+ index_processor_ = std::make_unique<IndexProcessor>(
+ schema_store_.get(), lang_segmenter_.get(), normalizer_.get(),
+ index_.get(), processor_options);
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(index_dir_.c_str());
+ }
+
+ std::unique_ptr<IndexProcessor> index_processor_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<Index> index_;
+ std::unique_ptr<SchemaStore> schema_store_;
+
+ private:
+ static void AddProperty(std::string_view name, DataType::Code type,
+ Cardinality::Code cardinality,
+ TermMatchType::Code term_match_type,
+ SchemaTypeConfigProto* type_config) {
+ auto* prop = type_config->add_properties();
+ prop->set_property_name(std::string(name));
+ prop->set_data_type(type);
+ prop->set_cardinality(cardinality);
+ prop->mutable_indexing_config()->set_term_match_type(term_match_type);
+ prop->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+ }
+
+ static void CreateFakeTypeConfig(SchemaTypeConfigProto* type_config) {
+ type_config->set_schema_type(std::string(kFakeType));
+
+ AddProperty(std::string(kExactProperty), DataType::STRING,
+ Cardinality::REQUIRED, TermMatchType::EXACT_ONLY, type_config);
+
+ AddProperty(std::string(kPrefixedProperty), DataType::STRING,
+ Cardinality::OPTIONAL, TermMatchType::PREFIX, type_config);
+
+ // Don't set IndexingConfig
+ auto* prop = type_config->add_properties();
+ prop->set_property_name(std::string(kUnindexedProperty1));
+ prop->set_data_type(DataType::STRING);
+ prop->set_cardinality(Cardinality::OPTIONAL);
+
+ AddProperty(std::string(kUnindexedProperty2), DataType::BYTES,
+ Cardinality::OPTIONAL, TermMatchType::UNKNOWN, type_config);
+
+ AddProperty(std::string(kRepeatedProperty), DataType::STRING,
+ Cardinality::REPEATED, TermMatchType::PREFIX, type_config);
+
+ AddProperty(kSubProperty, DataType::DOCUMENT, Cardinality::OPTIONAL,
+ TermMatchType::UNKNOWN, type_config);
+
+ std::string recipients_name =
+ absl_ports::StrCat(kSubProperty, kPropertySeparator, kNestedProperty);
+ AddProperty(recipients_name, DataType::STRING, Cardinality::OPTIONAL,
+ TermMatchType::PREFIX, type_config);
+ }
+
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::string index_dir_;
+};
+
+std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+}
+
+TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUnindexedProperty1), "foo bar baz")
+ .AddBytesProperty(std::string(kUnindexedProperty2),
+ "attachment bytes")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
+}
+
+TEST_F(IndexProcessorTest, OneDoc) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("hello", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kExactSectionId})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("hello", 1U << kPrefixedSectionId,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IndexProcessorTest, MultipleDocs) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "pitbull")
+ .AddStringProperty(std::string(kPrefixedProperty), "mr. world wide")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("world", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(kDocumentId1,
+ std::vector<SectionId>{kPrefixedSectionId}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kExactSectionId})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("world", 1U << kPrefixedSectionId,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kPrefixedSectionId})));
+}
+
+TEST_F(IndexProcessorTest, DocWithNestedProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddDocumentProperty(
+ std::string(kSubProperty),
+ DocumentBuilder()
+ .AddStringProperty(std::string(kNestedProperty),
+ "rocky raccoon")
+ .Build())
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("rocky", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kNestedSectionId})));
+}
+
+TEST_F(IndexProcessorTest, DocWithRepeatedProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddStringProperty(std::string(kRepeatedProperty), "rocky",
+ "italian stallion")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("italian", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kRepeatedSectionId})));
+}
+
+TEST_F(IndexProcessorTest, TooManyTokensReturnError) {
+ // Only allow the first four tokens ("hello", "world", "good", "night") to be
+ // indexed.
+ IndexProcessor::Options options;
+ options.max_tokens_per_document = 4;
+ options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+ index_processor_ = std::make_unique<IndexProcessor>(
+ schema_store_.get(), lang_segmenter_.get(), normalizer_.get(),
+ index_.get(), options);
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // "night" should have been indexed.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("night", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+
+ // "moon" should not have been.
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("moon", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IndexProcessorTest, TooManyTokensSuppressError) {
+ // Only allow the first four tokens ("hello", "world", "good", "night") to be
+ // indexed.
+ IndexProcessor::Options options;
+ options.max_tokens_per_document = 4;
+ options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kSuppressError;
+ index_processor_ = std::make_unique<IndexProcessor>(
+ schema_store_.get(), lang_segmenter_.get(), normalizer_.get(),
+ index_.get(), options);
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // "night" should have been indexed.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("night", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+
+ // "moon" should not have been.
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("moon", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IndexProcessorTest, TooLongTokens) {
+  // Only allow tokens of length four or shorter, truncating "hello", "world"
+  // and "night".
+ IndexProcessor::Options options;
+ options.max_tokens_per_document = 1000;
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer,
+                             Normalizer::Create(/*max_term_byte_size=*/4));
+
+ index_processor_ = std::make_unique<IndexProcessor>(
+ schema_store_.get(), lang_segmenter_.get(), normalizer.get(),
+ index_.get(), options);
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "hello world")
+ .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // "good" should have been indexed normally.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("good", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+
+ // "night" should not have been.
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("night", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+
+ // "night" should have been truncated to "nigh".
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("nigh", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+}
+
+TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "best rocky movies")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+
+ // Only document_id 1 should surface in a prefix query for "Rock"
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("rock", kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kPrefixedSectionId})));
+}
+
+TEST_F(IndexProcessorTest, TokenNormalization) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "all lower case")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("case", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(kDocumentId1,
+ std::vector<SectionId>{kExactSectionId}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kExactSectionId})));
+}
+
+TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+
+ // Indexing a document with document_id < last_added_document_id should cause
+ // a failure.
+ document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "all lower case")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  // As should indexing a document with document_id == last_added_document_id.
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+}
+
+TEST_F(IndexProcessorTest, NonAsciiIndexing) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty),
+ "你好,世界!你好:世界。“你好”世界?")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("你好", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kExactSectionId})));
+}
+
+// TODO(b/142508211) Re-enable this test once a proper limit on max content
+// has been determined.
+/*
+TEST_F(IndexProcessorTest,
+ LexiconFullIndexesSmallerTokensReturnsResourceExhausted) {
+ IndexProcessor::Options processor_options;
+ processor_options.max_tokens_per_document = 1000;
+ processor_options.token_limit_behavior =
+ IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+ index_processor_ = std::make_unique<IndexProcessor>(
+ section_manager_.get(), lang_segmenter_.get(), normalizer_.get(),
+ index_.get(), processor_options);
+
+ // This is the maximum token length that an empty lexicon constructed for a
+ // lite index with merge size of 1MiB can support.
+ constexpr int kMaxTokenLength = 16777217;
+ // Create a string "ppppppp..." with a length that is too large to fit into
+ // the lexicon.
+ std::string enormous_string(kMaxTokenLength + 1, 'p');
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(kFakeType)
+ .AddStringProperty(std::string(kExactProperty),
+ absl_ports::StrCat(enormous_string, " foo"))
+ .AddStringProperty(std::string(kPrefixedProperty), "bar baz")
+ .Build();
+ EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kExactSectionId})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator("baz", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+}
+*/
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/index.cc b/icing/index/index.cc
new file mode 100644
index 0000000..7fdb70d
--- /dev/null
+++ b/icing/index/index.cc
@@ -0,0 +1,128 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/index.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/iterator/doc-hit-info-iterator-term.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Validates the caller-requested hit buffer size and translates the
+// index-level options into the options consumed by the LiteIndex.
+//
+// Returns:
+//   LiteIndex::Options on success
+//   INVALID_ARGUMENT if options.index_merge_size is non-positive or exceeds
+//     the maximum hit buffer size supported by the LiteIndex
+libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions(
+    const Index::Options& options) {
+  const int32_t merge_size = options.index_merge_size;
+  if (merge_size <= 0) {
+    return absl_ports::InvalidArgumentError(
+        "Requested hit buffer size must be greater than 0.");
+  }
+  if (merge_size > LiteIndex::max_hit_buffer_size()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Requested hit buffer size %d is too large.", merge_size));
+  }
+  // The lite index's files all live under the "<base_dir>/idx/lite." prefix.
+  return LiteIndex::Options(options.base_dir + "/idx/lite.", merge_size);
+}
+
+// TODO(tjbarron) implement for real when the main index is added.
+// Until the main index exists, its lexicon is configured with default trie
+// options; the returned value currently only feeds TermIdCodec sizing in
+// Index::Create.
+IcingDynamicTrie::Options GetMainLexiconOptions() {
+  return IcingDynamicTrie::Options();
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
+    const Options& options, const IcingFilesystem* filesystem) {
+  // Translate (and validate) the caller-provided options for the lite index.
+  ICING_ASSIGN_OR_RETURN(LiteIndex::Options lite_options,
+                         CreateLiteIndexOptions(options));
+  // The codec needs the value-index space of both lexicons so that it can map
+  // a term-value-index (tvi) from either lexicon to a globally unique term id.
+  ICING_ASSIGN_OR_RETURN(
+      std::unique_ptr<TermIdCodec> codec,
+      TermIdCodec::Create(
+          IcingDynamicTrie::max_value_index(GetMainLexiconOptions()),
+          IcingDynamicTrie::max_value_index(lite_options.lexicon_options)));
+  ICING_ASSIGN_OR_RETURN(std::unique_ptr<LiteIndex> lite,
+                         LiteIndex::Create(lite_options, filesystem));
+  // The Index constructor is private, so wrap the raw new explicitly instead
+  // of using std::make_unique.
+  return std::unique_ptr<Index>(
+      new Index(options, std::move(codec), std::move(lite)));
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+Index::GetIterator(const std::string& term, SectionIdMask section_id_mask,
+                   TermMatchType::Code term_match_type) {
+  // Exact and prefix queries are served by distinct iterator types; any other
+  // match type is a caller error.
+  if (term_match_type == TermMatchType::EXACT_ONLY) {
+    return std::make_unique<DocHitInfoIteratorTermExact>(
+        term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
+  }
+  if (term_match_type == TermMatchType::PREFIX) {
+    return std::make_unique<DocHitInfoIteratorTermPrefix>(
+        term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
+  }
+  return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+      "Invalid TermMatchType: ", TermMatchType::Code_Name(term_match_type)));
+}
+
+// Adds a hit for `term` at this editor's document/section.
+//
+// The term is first resolved against (or inserted into) the lite lexicon to
+// obtain its term-value-index (tvi), which is then encoded into a global
+// term id. Terms this editor has already indexed are deduped via
+// seen_tokens_ (the editor's document and section are fixed, so an equal
+// term id implies an identical hit), fulfilling the dedupe contract
+// documented on the Editor class and avoiding a redundant hit-buffer write.
+//
+// Returns:
+//   OK on success (including a deduped no-op)
+//   Any error propagated from the lexicon, codec or hit buffer (e.g.
+//   RESOURCE_EXHAUSTED when the index is full).
+libtextclassifier3::Status Index::Editor::AddHit(const char* term,
+                                                 Hit::Score score) {
+  // Step 1: See if this term is already in the lexicon
+  uint32_t tvi;
+  auto tvi_or = lite_index_->FindTerm(term);
+
+  // Step 2: Update the lexicon, either add the term or update its properties
+  if (tvi_or.ok()) {
+    ICING_VLOG(1) << "Term " << term
+                  << " is already present in lexicon. Updating.";
+    tvi = tvi_or.ValueOrDie();
+    // Already in the lexicon. Just update the properties.
+    ICING_RETURN_IF_ERROR(lite_index_->UpdateTerm(tvi, term_match_type_));
+  } else {
+    ICING_VLOG(1) << "Term " << term << " is not in lexicon. Inserting.";
+    // Haven't seen this term before. Add it to the lexicon.
+    ICING_ASSIGN_OR_RETURN(tvi,
+                           lite_index_->InsertTerm(term, term_match_type_));
+  }
+
+  // Step 3: Add the hit itself, deduping terms this editor already indexed.
+  ICING_ASSIGN_OR_RETURN(uint32_t term_id,
+                         term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+  if (!seen_tokens_.insert(term_id).second) {
+    // insert() returned {it, false}: this editor already added a hit for this
+    // term; adding another would duplicate it within the same doc/section.
+    return libtextclassifier3::Status::OK;
+  }
+  Hit hit(section_id_, document_id_, score,
+          term_match_type_ == TermMatchType::PREFIX);
+  return lite_index_->AddHit(term_id, hit);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/index.h b/icing/index/index.h
new file mode 100644
index 0000000..498ce89
--- /dev/null
+++ b/icing/index/index.h
@@ -0,0 +1,171 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_INDEX_H_
+#define ICING_INDEX_INDEX_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// The class representing the Icing search index. This index maps terms to hits
+// (document_ids, section_ids).
+// Content is added to the index through the Editor class - which also dedupes
+// hits (calling Editor::AddHit with the same arguments will only result in the
+// creation of a single hit).
+// Ex.
+//   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index,
+//                          Index::Create(MakeIndexOptions()));
+//   Index::Editor editor = index->Edit(document_id, section_id,
+//                                      TermMatchType::EXACT_ONLY);
+//   ICING_RETURN_IF_ERROR(editor.AddHit("foo"));
+//   ICING_RETURN_IF_ERROR(editor.AddHit("baz"));
+//
+// Content is retrieved from the index through the DocHitInfoIterator class.
+// Ex.
+//   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index,
+//                          Index::Create(MakeIndexOptions()));
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<DocHitInfoIterator> iterator,
+//       index->GetIterator("foo", kSectionIdMaskAll,
+//                          TermMatchType::EXACT_ONLY));
+//   while (iterator->Advance().ok()) {
+//     ProcessResult(iterator->doc_hit_info());
+//   }
+class Index {
+ public:
+  struct Options {
+    // base_dir: directory under which all index files are created.
+    // index_merge_size: byte size at which the lite index should be merged
+    //   into the main index. Index::Create rejects non-positive values and
+    //   values larger than the LiteIndex's maximum hit buffer size.
+    //
+    // The parameter is int32_t to match the member below; a wider unsigned
+    // parameter would silently wrap values > INT32_MAX on assignment.
+    explicit Options(const std::string& base_dir, int32_t index_merge_size)
+        : base_dir(base_dir), index_merge_size(index_merge_size) {}
+
+    std::string base_dir;
+    int32_t index_merge_size;
+  };
+
+  // Creates an instance of Index in the directory pointed by file_dir.
+  //
+  // Returns:
+  //   A valid Index on success
+  //   INVALID_ARGUMENT if options.index_merge_size is invalid
+  //   Any error encountered while creating the underlying lite index
+  static libtextclassifier3::StatusOr<std::unique_ptr<Index>> Create(
+      const Options& options, const IcingFilesystem* filesystem);
+
+  // Clears all files created by the index. Returns OK if all files were
+  // cleared.
+  libtextclassifier3::Status Reset() { return lite_index_->Reset(); }
+
+  // Brings components of the index into memory in anticipation of a query in
+  // order to reduce latency.
+  void Warm() { lite_index_->Warm(); }
+
+  // Syncs all the data and metadata changes to disk.
+  // Returns any encountered IO errors.
+  libtextclassifier3::Status PersistToDisk() {
+    return lite_index_->PersistToDisk();
+  }
+
+  // Compute the checksum over the entire Index's subcomponents.
+  Crc32 ComputeChecksum() { return lite_index_->ComputeChecksum(); }
+
+  // DocumentIds are always inserted in increasing order. Returns the largest
+  // document_id added to the index.
+  DocumentId last_added_document_id() const {
+    return lite_index_->last_added_document_id();
+  }
+
+  // Returns debug information for the index in out.
+  // verbosity <= 0, simplest debug information - just the lexicons and lite
+  // index.
+  // verbosity > 0, more detailed debug information including raw postings
+  // lists.
+  void GetDebugInfo(int verbosity, std::string* out) const {
+    lite_index_->GetDebugInfo(verbosity, out);
+  }
+
+  // Create an iterator to iterate through all doc hit infos in the index that
+  // match the term. section_id_mask can be set to ignore hits from sections not
+  // listed in the mask. Eg. section_id_mask = 1U << 3; would only return hits
+  // that occur in section 3.
+  //
+  // Returns:
+  //   unique ptr to a valid DocHitInfoIterator that matches the term
+  //   INVALID_ARGUMENT if given an invalid term_match_type
+  libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
+      const std::string& term, SectionIdMask section_id_mask,
+      TermMatchType::Code term_match_type);
+
+  // A class that can be used to add hits to the index.
+  //
+  // An editor groups hits from a particular section within a document together
+  // and dedupes hits for the same term within a section. This removes the
+  // burden of deduping from the caller and direct access to the index
+  // implementation allows for more efficient deduping.
+  class Editor {
+   public:
+    // Does not take any ownership, and all pointers must refer to valid objects
+    // that outlive the one constructed.
+    // TODO(b/141180665): Add nullptr checks for the raw pointers
+    Editor(const TermIdCodec* term_id_codec, LiteIndex* lite_index,
+           DocumentId document_id, SectionId section_id,
+           TermMatchType::Code term_match_type)
+        : term_id_codec_(term_id_codec),
+          lite_index_(lite_index),
+          document_id_(document_id),
+          term_match_type_(term_match_type),
+          section_id_(section_id) {}
+
+    // Adds a hit for term at this editor's document/section; duplicate terms
+    // within the same editor result in a single hit.
+    libtextclassifier3::Status AddHit(const char* term,
+                                      Hit::Score score = Hit::kMaxHitScore);
+
+   private:
+    // The Editor is able to store previously seen terms as TermIds. This is
+    // more efficient than a client doing this externally because TermIds are
+    // not exposed to clients.
+    std::unordered_set<uint32_t> seen_tokens_;
+    const TermIdCodec* term_id_codec_;
+    LiteIndex* lite_index_;
+    DocumentId document_id_;
+    TermMatchType::Code term_match_type_;
+    SectionId section_id_;
+  };
+
+  // Returns an Editor that adds hits for the given document/section. See the
+  // Editor class comment for the deduping guarantees.
+  Editor Edit(DocumentId document_id, SectionId section_id,
+              TermMatchType::Code term_match_type) {
+    return Editor(term_id_codec_.get(), lite_index_.get(), document_id,
+                  section_id, term_match_type);
+  }
+
+ private:
+  Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec,
+        std::unique_ptr<LiteIndex>&& lite_index)
+      : lite_index_(std::move(lite_index)),
+        options_(options),
+        term_id_codec_(std::move(term_id_codec)) {}
+
+  std::unique_ptr<LiteIndex> lite_index_;
+  const Options options_;
+  std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_INDEX_H_
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
new file mode 100644
index 0000000..536f9fb
--- /dev/null
+++ b/icing/index/index_test.cc
@@ -0,0 +1,551 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/index.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsTrue;
+using ::testing::NiceMock;
+using ::testing::Test;
+
+// Test fixture that creates a fresh Index (1 MiB merge size) in a temporary
+// directory for each test and deletes the directory afterwards.
+class IndexTest : public Test {
+ protected:
+  void SetUp() override {
+    index_dir_ = GetTestTempDir() + "/index_test/";
+    Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
+    ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
+  }
+
+  void TearDown() override {
+    filesystem_.DeleteDirectoryRecursively(index_dir_.c_str());
+  }
+
+  // Index under test; some tests replace it with a differently-sized one.
+  std::unique_ptr<Index> index_;
+  std::string index_dir_;
+  IcingFilesystem filesystem_;
+};
+
+// Document and section ids shared by the tests below.
+constexpr DocumentId kDocumentId0 = 0;
+constexpr DocumentId kDocumentId1 = 1;
+constexpr DocumentId kDocumentId2 = 2;
+constexpr SectionId kSectionId2 = 2;
+constexpr SectionId kSectionId3 = 3;
+
+// Drains `iterator`, returning every DocHitInfo it produces in iteration
+// order (descending document id for index-backed iterators).
+std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
+  std::vector<DocHitInfo> hits;
+  while (iterator->Advance().ok()) {
+    hits.push_back(iterator->doc_hit_info());
+  }
+  return hits;
+}
+
+// gMock matcher: passes when the DocHitInfo argument carries the given
+// document_id and a hit-section mask containing exactly `sections`.
+MATCHER_P2(EqualsDocHitInfo, document_id, sections, "") {
+  const DocHitInfo& actual = arg;
+  SectionIdMask section_mask = kSectionIdMaskNone;
+  // Build the expected mask from the list of section ids.
+  for (SectionId section : sections) {
+    section_mask |= 1U << section;
+  }
+  *result_listener << "actual is {document_id=" << actual.document_id()
+                   << ", section_mask=" << actual.hit_section_ids_mask()
+                   << "}, but expected was {document_id=" << document_id
+                   << ", section_mask=" << section_mask << "}.";
+  return actual.document_id() == document_id &&
+         actual.hit_section_ids_mask() == section_mask;
+}
+
+// Verifies that querying a freshly created, empty index yields NOT_FOUND on
+// the first Advance() and that no document has been added yet.
+TEST_F(IndexTest, EmptyIndex) {
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->Advance(),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->Advance(),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
+}
+
+// Verifies iterator behavior past the last hit: a missing term reports
+// NOT_FOUND, an exhausted term reports RESOURCE_EXHAUSTED, and
+// doc_hit_info() is reset to the invalid document id in both cases.
+TEST_F(IndexTest, AdvancePastEnd) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("bar", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->Advance(),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(itr->doc_hit_info(),
+              EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>()));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->Advance(), IsOk());
+  EXPECT_THAT(itr->Advance(),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+  EXPECT_THAT(itr->doc_hit_info(),
+              EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>()));
+}
+
+// One term, one document, one section: the hit is retrievable.
+TEST_F(IndexTest, SingleHitSingleTermIndex) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+// Multiple distinct terms in the same section: querying one term returns only
+// its own hit.
+TEST_F(IndexTest, SingleHitMultiTermIndex) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+  EXPECT_THAT(edit.AddHit("bar"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+// Querying a term that was never added reports NOT_FOUND even when other
+// terms exist in the index.
+TEST_F(IndexTest, NoHitMultiTermIndex) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+  EXPECT_THAT(edit.AddHit("bar"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->Advance(),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+// The same term in multiple documents: hits are returned for each matching
+// document, in descending document-id order.
+TEST_F(IndexTest, MultiHitMultiTermIndex) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+
+  edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("bar"), IsOk());
+
+  edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(
+      GetHits(std::move(itr)),
+      ElementsAre(
+          EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+          EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
+}
+
+// A section mask restricts query results to hits from the masked sections.
+TEST_F(IndexTest, MultiHitSectionRestrict) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+
+  edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+
+  // Assert
+  SectionIdMask desired_section = 1U << kSectionId2;
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", desired_section, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+}
+
+// Adding the same term twice through one editor results in a single hit.
+TEST_F(IndexTest, SingleHitDedupeIndex) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+  EXPECT_THAT(edit.AddHit("foo"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+// A prefix query matches a term indexed with PREFIX match type.
+TEST_F(IndexTest, PrefixHit) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("fool"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+// A prefix query matches both an exact term equal to the query and a longer
+// PREFIX-indexed term, in descending document-id order.
+TEST_F(IndexTest, MultiPrefixHit) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("fool"), IsOk());
+
+  edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+  EXPECT_THAT(
+      GetHits(std::move(itr)),
+      ElementsAre(
+          EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+          EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+}
+
+// A longer term indexed EXACT_ONLY must not be returned by a prefix query.
+TEST_F(IndexTest, NoExactHitInPrefixQuery) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+  ASSERT_THAT(edit.AddHit("fool"), IsOk());
+
+  edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId1, std::vector<SectionId>{kSectionId3})));
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+}
+
+// Two terms sharing a prefix in the same doc/section produce one doc hit for
+// a prefix query, not two.
+TEST_F(IndexTest, PrefixHitDedupe) {
+  // Act
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+  ASSERT_THAT(edit.AddHit("fool"), IsOk());
+
+  // Assert
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+// ToString() of a prefix iterator renders the 16-bit section mask followed by
+// the term with a trailing '*'.
+TEST_F(IndexTest, PrefixToString) {
+  SectionIdMask id_mask = (1U << kSectionId2) | (1U << kSectionId3);
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", id_mask, TermMatchType::PREFIX));
+  EXPECT_THAT(itr->ToString(), Eq("0000000000001100:foo*"));
+
+  ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator("foo", kSectionIdMaskAll,
+                                                      TermMatchType::PREFIX));
+  EXPECT_THAT(itr->ToString(), Eq("1111111111111111:foo*"));
+
+  ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator("foo", kSectionIdMaskNone,
+                                                      TermMatchType::PREFIX));
+  EXPECT_THAT(itr->ToString(), Eq("0000000000000000:foo*"));
+}
+
+// ToString() of an exact iterator renders the mask and the bare term (no '*').
+TEST_F(IndexTest, ExactToString) {
+  SectionIdMask id_mask = (1U << kSectionId2) | (1U << kSectionId3);
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("foo", id_mask, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->ToString(), Eq("0000000000001100:foo"));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      itr,
+      index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->ToString(), Eq("1111111111111111:foo"));
+
+  ICING_ASSERT_OK_AND_ASSIGN(itr,
+                             index_->GetIterator("foo", kSectionIdMaskNone,
+                                                 TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(itr->ToString(), Eq("0000000000000000:foo"));
+}
+
+// Exact and prefix retrieval work for multi-byte (UTF-8) terms.
+TEST_F(IndexTest, NonAsciiTerms) {
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("こんにちは"), IsOk());
+  ASSERT_THAT(edit.AddHit("あなた"), IsOk());
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("こんに", kSectionIdMaskAll, TermMatchType::PREFIX));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  ICING_ASSERT_OK_AND_ASSIGN(itr,
+                             index_->GetIterator("あなた", kSectionIdMaskAll,
+                                                 TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+// Fills a small (1 KiB merge size) index until it reports RESOURCE_EXHAUSTED,
+// then checks that further writes keep failing while previously-added terms
+// remain queryable.
+TEST_F(IndexTest, FullIndex) {
+  // Make a smaller index so that it's easier to fill up.
+  Index::Options options(index_dir_, /*index_merge_size=*/1024);
+  ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
+  std::default_random_engine random;
+  libtextclassifier3::Status status = libtextclassifier3::Status::OK;
+  constexpr int kTokenSize = 5;
+  DocumentId document_id = 0;
+  std::vector<std::string> query_terms;
+  while (status.ok()) {
+    for (int i = 0; i < 100; ++i) {
+      Index::Editor edit =
+          index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY);
+      std::string term = RandomString(kAlNumAlphabet, kTokenSize, &random);
+      status = edit.AddHit(term.c_str());
+      if (i % 50 == 0) {
+        // Remember one out of every fifty terms to query for later.
+        query_terms.push_back(std::move(term));
+      }
+      if (!status.ok()) {
+        break;
+      }
+    }
+    ++document_id;
+  }
+
+  // Assert
+  // Adding more hits should fail.
+  Index::Editor edit =
+      index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(edit.AddHit("foo"),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+  EXPECT_THAT(edit.AddHit("bar"),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+  EXPECT_THAT(edit.AddHit("baz"),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+  for (const std::string& term : query_terms) {
+    ICING_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<DocHitInfoIterator> itr,
+        index_->GetIterator(term.c_str(), kSectionIdMaskAll,
+                            TermMatchType::EXACT_ONLY));
+    // Each query term should contain at least one hit - there may have been
+    // other hits for this term that were added.
+    EXPECT_THAT(itr->Advance(), IsOk());
+  }
+  EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1));
+}
+
+// I/O failures during creation surface as INTERNAL.
+TEST_F(IndexTest, IndexCreateIOFailure) {
+  // Create the index with mock filesystem. By default, Mock will return false,
+  // so the first attempted file operation will fail.
+  NiceMock<IcingMockFilesystem> mock_filesystem;
+  Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
+  EXPECT_THAT(Index::Create(options, &mock_filesystem),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+// A corrupted hit buffer is detected at creation time and reported as
+// DATA_LOSS.
+TEST_F(IndexTest, IndexCreateCorruptionFailure) {
+  // Add some content to the index
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+  ASSERT_THAT(edit.AddHit("bar"), IsOk());
+
+  // Close the index.
+  index_.reset();
+
+  // Corrupt the index file.
+  std::string hit_buffer_filename = index_dir_ + "/idx/lite.hb";
+  ScopedFd sfd(filesystem_.OpenForWrite(hit_buffer_filename.c_str()));
+  ASSERT_THAT(sfd.is_valid(), IsTrue());
+
+  constexpr std::string_view kCorruptBytes = "ffffffffffffffffffffff";
+  // The first page of the hit_buffer is taken up by the header. Overwrite the
+  // first page of content.
+  constexpr int kHitBufferStartOffset = 4096;
+  ASSERT_THAT(filesystem_.PWrite(sfd.get(), kHitBufferStartOffset,
+                                 kCorruptBytes.data(), kCorruptBytes.length()),
+              IsTrue());
+
+  // Recreate the index.
+  Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
+  EXPECT_THAT(Index::Create(options, &filesystem_),
+              StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
+}
+
+// Hits written before PersistToDisk() survive closing and reopening the
+// index.
+TEST_F(IndexTest, IndexPersistence) {
+  // Add some content to the index
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+  ASSERT_THAT(edit.AddHit("bar"), IsOk());
+  EXPECT_THAT(index_->PersistToDisk(), IsOk());
+
+  // Close the index.
+  index_.reset();
+
+  // Recreate the index.
+  Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
+  ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
+
+  // Check that the hits are present.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocHitInfoIterator> itr,
+      index_->GetIterator("f", kSectionIdMaskAll, TermMatchType::PREFIX));
+  EXPECT_THAT(GetHits(std::move(itr)),
+              ElementsAre(EqualsDocHitInfo(
+                  kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+// A hit buffer size that wraps to a non-positive int32 is rejected with
+// INVALID_ARGUMENT.
+TEST_F(IndexTest, InvalidHitBufferSize) {
+  Index::Options options(
+      index_dir_, /*index_merge_size=*/std::numeric_limits<uint32_t>::max());
+  EXPECT_THAT(Index::Create(options, &filesystem_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// ComputeChecksum() is a pure observation: repeated calls agree.
+TEST_F(IndexTest, ComputeChecksumSameBetweenCalls) {
+  // Add some content to the index.
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+
+  Crc32 foo_checksum(757666244U);
+  EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+
+  // Calling it again shouldn't change the checksum
+  EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+}
+
+// The checksum depends only on persisted content, not on the Index instance.
+TEST_F(IndexTest, ComputeChecksumSameAcrossInstances) {
+  // Add some content to the index.
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+
+  Crc32 foo_checksum(757666244U);
+  EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+
+  // Recreate the index, checksum should still be the same across instances
+  index_.reset();
+  Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
+  ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
+
+  EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+}
+
+// Adding new content changes the checksum.
+TEST_F(IndexTest, ComputeChecksumChangesOnModification) {
+  // Add some content to the index.
+  Index::Editor edit =
+      index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+  ASSERT_THAT(edit.AddHit("foo"), IsOk());
+
+  Crc32 foo_checksum(757666244U);
+  EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+
+  // Modifying the index changes the checksum.
+  EXPECT_THAT(edit.AddHit("bar"), IsOk());
+
+  Crc32 foo_bar_checksum(1228959551U);
+  EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_bar_checksum));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc b/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc
new file mode 100644
index 0000000..0d5bfea
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Iteration starts at document_id_limit (the most recently added document)
+// and walks down towards 0.
+DocHitInfoIteratorAllDocumentId::DocHitInfoIteratorAllDocumentId(
+    const DocumentId document_id_limit)
+    : document_id_limit_(document_id_limit),
+      current_document_id_(document_id_limit) {}
+
+// Emits the next (lower) document id, or RESOURCE_EXHAUSTED once every id in
+// [0, document_id_limit_] has been returned.
+libtextclassifier3::Status DocHitInfoIteratorAllDocumentId::Advance() {
+  if (IsDocumentIdValid(current_document_id_)) {
+    // Hits are synthesized, so only the document id field is meaningful.
+    doc_hit_info_.set_document_id(current_document_id_--);
+    return libtextclassifier3::Status::OK;
+  }
+  // Reached the end, set these to invalid values and return
+  doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+  hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+  return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id.h b/icing/index/iterator/doc-hit-info-iterator-all-document-id.h
new file mode 100644
index 0000000..97ba5f2
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id.h
@@ -0,0 +1,59 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_ALL_DOCUMENT_ID_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_ALL_DOCUMENT_ID_H_
+
+#include <cstdint>
+#include <string>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Iterator over all DocumentIds in the range [0, document_id_limit_], both
+// endpoints inclusive. Returns DocumentIds in descending order.
+class DocHitInfoIteratorAllDocumentId : public DocHitInfoIterator {
+ public:
+ explicit DocHitInfoIteratorAllDocumentId(DocumentId document_id_limit);
+
+ libtextclassifier3::Status Advance() override;
+
+ int32_t GetNumBlocksInspected() const override { return 0; }
+
+ int32_t GetNumLeafAdvanceCalls() const override {
+ return document_id_limit_ - current_document_id_;
+ }
+
+ std::string ToString() const override {
+ return IcingStringUtil::StringPrintf("(ALL document_id_limit:%d)",
+ document_id_limit_);
+ }
+
+ private:
+ const DocumentId document_id_limit_;
+
+ // An internal value for the iterator to track the current doc id.
+ DocumentId current_document_id_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_ALL_DOCUMENT_ID_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc b/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc
new file mode 100644
index 0000000..7366b97
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc
@@ -0,0 +1,113 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Not;
+
+TEST(DocHitInfoIteratorAllDocumentIdTest, Initialize) {
+ {
+ DocHitInfoIteratorAllDocumentId all_it(100);
+
+ // We'll always start with an invalid document_id, need to Advance before we
+ // get anything out of this.
+ EXPECT_THAT(all_it.doc_hit_info().document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(all_it.hit_intersect_section_ids_mask(),
+ Eq(kSectionIdMaskNone));
+ }
+
+ {
+ // Can initialize with negative values, but won't ever be able to Advance to
+ // a proper document_id
+ DocHitInfoIteratorAllDocumentId all_it(-5);
+ EXPECT_THAT(all_it.Advance(), Not(IsOk()));
+ }
+}
+
+TEST(DocHitInfoIteratorAllDocumentIdTest, GetNumBlocksInspected) {
+ DocHitInfoIteratorAllDocumentId all_it(100);
+ EXPECT_THAT(all_it.GetNumBlocksInspected(), Eq(0));
+
+ // Number of iterations is chosen arbitrarily. Just meant to demonstrate that
+ // no matter how many Advance calls are made, GetNumBlocksInspected should
+ // always return 0.
+ for (int i = 0; i < 5; ++i) {
+ EXPECT_THAT(all_it.Advance(), IsOk());
+ EXPECT_THAT(all_it.GetNumBlocksInspected(), Eq(0));
+ }
+}
+
+TEST(DocHitInfoIteratorAllDocumentIdTest, GetNumLeafAdvanceCalls) {
+ DocHitInfoIteratorAllDocumentId all_it(100);
+ EXPECT_THAT(all_it.GetNumLeafAdvanceCalls(), Eq(0));
+
+ for (int i = 1; i <= 5; ++i) {
+ EXPECT_THAT(all_it.Advance(), IsOk());
+ EXPECT_THAT(all_it.GetNumLeafAdvanceCalls(), Eq(i));
+ }
+}
+
+TEST(DocHitInfoIteratorAllDocumentIdTest, Advance) {
+ {
+ // Can't advance beyond an invalid DocumentId
+ EXPECT_THAT(DocHitInfoIteratorAllDocumentId(-1).Advance(), Not(IsOk()));
+ }
+
+ {
+ // Test one advance
+ DocHitInfoIteratorAllDocumentId all_it(5);
+ EXPECT_THAT(all_it.Advance(), IsOk());
+ EXPECT_THAT(all_it.doc_hit_info().document_id(), Eq(5));
+
+ // Advancing shouldn't affect the intersect section ids mask, since there's
+ // no intersecting going on
+ EXPECT_THAT(all_it.hit_intersect_section_ids_mask(),
+ Eq(kSectionIdMaskNone));
+ }
+
+ {
+ std::vector<DocumentId> expected_document_ids;
+ expected_document_ids.reserve(125);
+ for (int i = 124; i >= 0; --i) {
+ expected_document_ids.push_back(i);
+ }
+
+ // Many advances
+ DocHitInfoIteratorAllDocumentId all_it(124);
+ EXPECT_THAT(GetDocumentIds(&all_it),
+ ElementsAreArray(expected_document_ids));
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc
new file mode 100644
index 0000000..276b78a
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-and.cc
@@ -0,0 +1,230 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
+
+#include <stddef.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// When combining ANDed iterators, n-ary operator has better performance when
+// number of operands > 3 according to benchmark cl/243720660
+// TODO(samzheng): Tune this number when necessary, e.g. if the implementation
+// changes.
+inline constexpr int kBinaryAndIteratorPerformanceThreshold = 3;
+
+// The minimum number of iterators needed to construct an And iterator. The And
+// constructor currently takes 2 iterators.
+inline constexpr int kMinBinaryIterators = 2;
+
+} // namespace
+
+std::unique_ptr<DocHitInfoIterator> CreateAndIterator(
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators) {
+ if (iterators.size() == 1) {
+ return std::move(iterators.at(0));
+ }
+
+ std::unique_ptr<DocHitInfoIterator> iterator;
+ if (iterators.size() <= kBinaryAndIteratorPerformanceThreshold &&
+ iterators.size() >= kMinBinaryIterators) {
+ // Accumulate the iterators that need to be ANDed together.
+ iterator = std::move(iterators.at(0));
+ for (size_t i = 1; i < iterators.size(); ++i) {
+ std::unique_ptr<DocHitInfoIterator> temp_iterator = std::move(iterator);
+ iterator = std::make_unique<DocHitInfoIteratorAnd>(
+ std::move(temp_iterator), std::move(iterators[i]));
+ }
+ } else {
+ // If the vector is too small, the AndNary iterator can handle it and return
+ // an error on the Advance call
+ iterator =
+ std::make_unique<DocHitInfoIteratorAndNary>(std::move(iterators));
+ }
+
+ return iterator;
+}
+
+DocHitInfoIteratorAnd::DocHitInfoIteratorAnd(
+ std::unique_ptr<DocHitInfoIterator> short_it,
+ std::unique_ptr<DocHitInfoIterator> long_it)
+ : short_(std::move(short_it)), long_(std::move(long_it)) {}
+
+libtextclassifier3::Status DocHitInfoIteratorAnd::Advance() {
+ // Advance on short first
+ if (!short_->Advance().ok()) {
+ // Didn't find anything for the first iterator, reset to invalid values and
+ // return.
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+ hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ DocumentId short_doc_id = short_->doc_hit_info().document_id();
+
+ // Then AdvanceTo on long
+ ICING_ASSIGN_OR_RETURN(DocumentId long_doc_id,
+ AdvanceTo(long_.get(), short_doc_id));
+
+ // Now try to align DocHitInfos by moving one or the other.
+ while (short_doc_id != long_doc_id) {
+ if (short_doc_id > long_doc_id) {
+ ICING_ASSIGN_OR_RETURN(short_doc_id,
+ AdvanceTo(short_.get(), long_doc_id));
+ } else {
+ ICING_ASSIGN_OR_RETURN(long_doc_id, AdvanceTo(long_.get(), short_doc_id));
+ }
+ }
+
+ // Guaranteed that short_doc_id and long_doc_id match now
+ doc_hit_info_ = short_->doc_hit_info();
+ doc_hit_info_.MergeSectionsFrom(long_->doc_hit_info());
+ hit_intersect_section_ids_mask_ = short_->hit_intersect_section_ids_mask() &
+ long_->hit_intersect_section_ids_mask();
+ return libtextclassifier3::Status::OK;
+}
+
+int32_t DocHitInfoIteratorAnd::GetNumBlocksInspected() const {
+ return short_->GetNumBlocksInspected() + long_->GetNumBlocksInspected();
+}
+
+int32_t DocHitInfoIteratorAnd::GetNumLeafAdvanceCalls() const {
+ return short_->GetNumLeafAdvanceCalls() + long_->GetNumLeafAdvanceCalls();
+}
+
+std::string DocHitInfoIteratorAnd::ToString() const {
+ return absl_ports::StrCat("(", short_->ToString(), " AND ", long_->ToString(),
+ ")");
+}
+
+DocHitInfoIteratorAndNary::DocHitInfoIteratorAndNary(
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators)
+ : iterators_(std::move(iterators)) {}
+
+libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() {
+ if (iterators_.size() < 2) {
+ return absl_ports::InvalidArgumentError(
+ "Not enough iterators to AND together");
+ }
+
+ // Advance on the first iterator to get a potential hit
+ if (!iterators_.at(0)->Advance().ok()) {
+ // Didn't find anything for the first iterator, reset to invalid values and
+ // return
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+ hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ DocumentId potential_document_id =
+ iterators_.at(0)->doc_hit_info().document_id();
+
+ // Our goal is to find the next document_id that exists on all the iterators
+ // by advancing the iterators one by one. We start with some
+ // "potential_document_id", check if it actually matches the above goal. If
+ // yes, return. If not, find the next best "potential" and repeat till we hit
+ // the end.
+
+ // Has the current potential_document_id been found in all the iterators?
+ bool found_document_id = false;
+ while (!found_document_id) {
+ for (auto& iterator : iterators_) {
+ if (iterator->doc_hit_info().document_id() > potential_document_id) {
+ // Advance the current iterator until it's equal to or smaller than the
+ // potential hit doc id
+ DocumentId unused;
+ ICING_ASSIGN_OR_RETURN(
+ unused, AdvanceTo(iterator.get(), potential_document_id));
+ }
+
+ if (iterator->doc_hit_info().document_id() == potential_document_id) {
+ // The potential hit got matched on the iterators so far
+ found_document_id = true;
+ continue;
+ } else if (iterator->doc_hit_info().document_id() <
+ potential_document_id) {
+ // This iterator doesn't have potential_document_id as we've gone past
+ // it already. Use the current document_id as the new
+ // "potential_document_id" and start checking all iterators again.
+ found_document_id = false;
+ potential_document_id = iterator->doc_hit_info().document_id();
+ break;
+ }
+ }
+ }
+
+ // Found a DocumentId which exists in all the iterators
+ doc_hit_info_ = iterators_.at(0)->doc_hit_info();
+ hit_intersect_section_ids_mask_ =
+ iterators_.at(0)->hit_intersect_section_ids_mask();
+
+ for (size_t i = 1; i < iterators_.size(); i++) {
+ doc_hit_info_.MergeSectionsFrom(iterators_.at(i)->doc_hit_info());
+ hit_intersect_section_ids_mask_ &=
+ iterators_.at(i)->hit_intersect_section_ids_mask();
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+int32_t DocHitInfoIteratorAndNary::GetNumBlocksInspected() const {
+ int32_t blockCount = 0;
+ for (const std::unique_ptr<DocHitInfoIterator>& iter : iterators_) {
+ blockCount += iter->GetNumBlocksInspected();
+ }
+ return blockCount;
+}
+
+int32_t DocHitInfoIteratorAndNary::GetNumLeafAdvanceCalls() const {
+ int32_t leafCount = 0;
+ for (const std::unique_ptr<DocHitInfoIterator>& iter : iterators_) {
+ leafCount += iter->GetNumLeafAdvanceCalls();
+ }
+ return leafCount;
+}
+
+std::string DocHitInfoIteratorAndNary::ToString() const {
+ std::string ret = "(";
+
+ for (int i = 0; i < iterators_.size(); ++i) {
+ if (i == iterators_.size() - 1) {
+ // Last element in vector
+ absl_ports::StrAppend(&ret, iterators_.at(i)->ToString(), ")");
+ } else {
+ absl_ports::StrAppend(&ret, iterators_.at(i)->ToString(), " AND ");
+ }
+ }
+
+ return ret;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h
new file mode 100644
index 0000000..5c4c07e
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-and.h
@@ -0,0 +1,77 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_AND_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_AND_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+
+namespace icing {
+namespace lib {
+
+// Given n iterators, will decide what the fastest And-iterator implementation
+// will be.
+std::unique_ptr<DocHitInfoIterator> CreateAndIterator(
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators);
+
+// Iterate over a logical AND of two child iterators.
+class DocHitInfoIteratorAnd : public DocHitInfoIterator {
+ public:
+ // Set the shorter iterator to short_it to get performance benefits
+ // when an underlying iterator has a more efficient AdvanceTo.
+ explicit DocHitInfoIteratorAnd(std::unique_ptr<DocHitInfoIterator> short_it,
+ std::unique_ptr<DocHitInfoIterator> long_it);
+ libtextclassifier3::Status Advance() override;
+
+ int32_t GetNumBlocksInspected() const override;
+
+ int32_t GetNumLeafAdvanceCalls() const override;
+
+ std::string ToString() const override;
+
+ private:
+ std::unique_ptr<DocHitInfoIterator> short_;
+ std::unique_ptr<DocHitInfoIterator> long_;
+};
+
+// Iterate over a logical AND of multiple child iterators.
+// NOTE: DocHitInfoIteratorAnd is a faster alternative for ANDing exactly 2
+// iterators.
+class DocHitInfoIteratorAndNary : public DocHitInfoIterator {
+ public:
+ explicit DocHitInfoIteratorAndNary(
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators);
+
+ libtextclassifier3::Status Advance() override;
+
+ int32_t GetNumBlocksInspected() const override;
+
+ int32_t GetNumLeafAdvanceCalls() const override;
+
+ std::string ToString() const override;
+
+ private:
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_AND_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-and_test.cc b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
new file mode 100644
index 0000000..35574b7
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
@@ -0,0 +1,351 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+TEST(CreateAndIteratorTest, And) {
+ // Basic test that we can create a working And iterator. Further testing of
+ // the And iterator should be done separately below.
+ std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(10)};
+ std::unique_ptr<DocHitInfoIterator> first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ std::unique_ptr<DocHitInfoIterator> second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ std::unique_ptr<DocHitInfoIterator> and_iter =
+ CreateAndIterator(std::move(iterators));
+
+ EXPECT_THAT(GetDocumentIds(and_iter.get()), ElementsAre(10));
+}
+
+TEST(CreateAndIteratorTest, AndNary) {
+ // Basic test that we can create a working AndNary iterator. Further testing
+ // of the AndNary iterator should be done separately below.
+ std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(10)};
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+
+ std::unique_ptr<DocHitInfoIterator> and_iter =
+ CreateAndIterator(std::move(iterators));
+
+ EXPECT_THAT(GetDocumentIds(and_iter.get()), ElementsAre(10));
+}
+
+TEST(DocHitInfoIteratorAndTest, Initialize) {
+ DocHitInfoIteratorAnd and_iter(std::make_unique<DocHitInfoIteratorDummy>(),
+ std::make_unique<DocHitInfoIteratorDummy>());
+
+ // We start out with invalid values
+ EXPECT_THAT(and_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
+ EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(),
+ Eq(kSectionIdMaskNone));
+}
+
+TEST(DocHitInfoIteratorAndTest, GetNumBlocksInspected) {
+ int first_iter_blocks = 4; // arbitrary value
+ auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ first_iter->SetNumBlocksInspected(first_iter_blocks);
+
+ int second_iter_blocks = 7; // arbitrary value
+ auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ second_iter->SetNumBlocksInspected(second_iter_blocks);
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+
+ EXPECT_THAT(and_iter.GetNumBlocksInspected(),
+ Eq(first_iter_blocks + second_iter_blocks));
+}
+
+TEST(DocHitInfoIteratorAndTest, GetNumLeafAdvanceCalls) {
+ int first_iter_leaves = 4; // arbitrary value
+ auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
+
+ int second_iter_leaves = 7; // arbitrary value
+ auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+
+ EXPECT_THAT(and_iter.GetNumLeafAdvanceCalls(),
+ Eq(first_iter_leaves + second_iter_leaves));
+}
+
+TEST(DocHitInfoIteratorAndTest, AdvanceNoOverlap) {
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(10), DocHitInfo(9)};
+
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(8), DocHitInfo(7)};
+
+ std::unique_ptr<DocHitInfoIterator> first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+ std::unique_ptr<DocHitInfoIterator> second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+
+ EXPECT_THAT(GetDocumentIds(&and_iter), IsEmpty());
+}
+
+TEST(DocHitInfoIteratorAndTest, Advance) {
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(10), DocHitInfo(8),
+ DocHitInfo(6), DocHitInfo(4),
+ DocHitInfo(2), DocHitInfo(0)};
+
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(8), DocHitInfo(4),
+ DocHitInfo(0)};
+
+ std::unique_ptr<DocHitInfoIterator> first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+ std::unique_ptr<DocHitInfoIterator> second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+
+ EXPECT_THAT(GetDocumentIds(&and_iter), ElementsAre(8, 4, 0));
+}
+
+TEST(DocHitInfoIteratorAndTest, AdvanceNestedIterators) {
+ std::vector<DocHitInfo> first_vector = {
+ DocHitInfo(10), DocHitInfo(9), DocHitInfo(8), DocHitInfo(7),
+ DocHitInfo(6), DocHitInfo(5), DocHitInfo(4), DocHitInfo(3),
+ DocHitInfo(2), DocHitInfo(1), DocHitInfo(0)};
+
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(10), DocHitInfo(8),
+ DocHitInfo(6), DocHitInfo(4),
+ DocHitInfo(2), DocHitInfo(0)};
+
+ std::vector<DocHitInfo> third_vector = {DocHitInfo(10), DocHitInfo(7),
+ DocHitInfo(6), DocHitInfo(2),
+ DocHitInfo(1)};
+
+ std::unique_ptr<DocHitInfoIterator> first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+ std::unique_ptr<DocHitInfoIterator> second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+ std::unique_ptr<DocHitInfoIterator> third_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector);
+
+ std::unique_ptr<DocHitInfoIterator> inner_iter =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(first_iter),
+ std::move(second_iter));
+ std::unique_ptr<DocHitInfoIterator> outer_iter =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(inner_iter),
+ std::move(third_iter));
+
+ EXPECT_THAT(GetDocumentIds(outer_iter.get()), ElementsAre(10, 6, 2));
+}
+
+TEST(DocHitInfoIteratorAndTest, SectionIdMask) {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ SectionIdMask mask_anded_result = 0b00000100;
+ SectionIdMask mask_ored_result = 0b01010111;
+
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)};
+
+ auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+ first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+ second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().hit_section_ids_mask(),
+ Eq(mask_ored_result));
+ EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, Initialize) {
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+ DocHitInfoIteratorAndNary and_iter(std::move(iterators));
+
+ // We start out with invalid values
+ EXPECT_THAT(and_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
+ EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(),
+ Eq(kSectionIdMaskNone));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, InitializeEmpty) {
+ // We can initialize it fine even with an empty vector
+ std::vector<std::unique_ptr<DocHitInfoIterator>> empty_vector;
+ DocHitInfoIteratorAndNary empty_iter(std::move(empty_vector));
+
+ // But it won't be able to advance anywhere
+ EXPECT_THAT(empty_iter.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, GetNumBlocksInspected) {
+ int first_iter_blocks = 4; // arbitrary value
+ auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ first_iter->SetNumBlocksInspected(first_iter_blocks);
+
+ int second_iter_blocks = 7; // arbitrary value
+ auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ second_iter->SetNumBlocksInspected(second_iter_blocks);
+
+ int third_iter_blocks = 13; // arbitrary value
+ auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ third_iter->SetNumBlocksInspected(third_iter_blocks);
+
+ int fourth_iter_blocks = 1; // arbitrary value
+ auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ fourth_iter->SetNumBlocksInspected(fourth_iter_blocks);
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ iterators.push_back(std::move(third_iter));
+ iterators.push_back(std::move(fourth_iter));
+ DocHitInfoIteratorAndNary and_iter(std::move(iterators));
+
+ EXPECT_THAT(and_iter.GetNumBlocksInspected(),
+ Eq(first_iter_blocks + second_iter_blocks + third_iter_blocks +
+ fourth_iter_blocks));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, GetNumLeafAdvanceCalls) {
+ int first_iter_leaves = 4; // arbitrary value
+ auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
+
+ int second_iter_leaves = 7; // arbitrary value
+ auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
+
+ int third_iter_leaves = 13; // arbitrary value
+ auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ third_iter->SetNumLeafAdvanceCalls(third_iter_leaves);
+
+ int fourth_iter_leaves = 13; // arbitrary value
+ auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
+ fourth_iter->SetNumLeafAdvanceCalls(fourth_iter_leaves);
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ iterators.push_back(std::move(third_iter));
+ iterators.push_back(std::move(fourth_iter));
+ DocHitInfoIteratorAndNary and_iter(std::move(iterators));
+
+ EXPECT_THAT(and_iter.GetNumLeafAdvanceCalls(),
+ Eq(first_iter_leaves + second_iter_leaves + third_iter_leaves +
+ fourth_iter_leaves));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, Advance) {
+ std::vector<DocHitInfo> first_vector = {
+ DocHitInfo(10), DocHitInfo(9), DocHitInfo(8), DocHitInfo(7),
+ DocHitInfo(6), DocHitInfo(5), DocHitInfo(4), DocHitInfo(3),
+ DocHitInfo(2), DocHitInfo(1), DocHitInfo(0)};
+
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(10), DocHitInfo(8),
+ DocHitInfo(6), DocHitInfo(4),
+ DocHitInfo(2), DocHitInfo(0)};
+
+ std::vector<DocHitInfo> third_vector = {DocHitInfo(9), DocHitInfo(6),
+ DocHitInfo(3), DocHitInfo(0)};
+
+ std::vector<DocHitInfo> fourth_vector = {DocHitInfo(6), DocHitInfo(5),
+ DocHitInfo(0)};
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(first_vector));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(second_vector));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(third_vector));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(fourth_vector));
+ DocHitInfoIteratorAndNary and_iter(std::move(iterators));
+
+ EXPECT_THAT(GetDocumentIds(&and_iter), ElementsAre(6, 0));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask section_id_mask1 = 0b01000101; // hits in sections 0, 2, 6
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ SectionIdMask section_id_mask3 = 0b00001100; // hits in sections 2, 3
+ SectionIdMask section_id_mask4 = 0b00100100; // hits in sections 2, 5
+ SectionIdMask mask_anded_result = 0b00000100;
+ SectionIdMask mask_ored_result = 0b01101111;
+
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)};
+ std::vector<DocHitInfo> third_vector = {DocHitInfo(4, section_id_mask3)};
+ std::vector<DocHitInfo> fourth_vector = {DocHitInfo(4, section_id_mask4)};
+
+ auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+ first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+ second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+ auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(third_vector);
+ third_iter->set_hit_intersect_section_ids_mask(section_id_mask3);
+
+ auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(fourth_vector);
+ fourth_iter->set_hit_intersect_section_ids_mask(section_id_mask4);
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ iterators.push_back(std::move(third_iter));
+ iterators.push_back(std::move(fourth_iter));
+
+ DocHitInfoIteratorAndNary and_iter(std::move(iterators));
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().hit_section_ids_mask(),
+ Eq(mask_ored_result));
+ EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc
new file mode 100644
index 0000000..a19c1b1
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc
@@ -0,0 +1,142 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+
+// Wraps `delegate` with the filters described by `options`. Does not take
+// ownership of document_store, schema_store or clock; they must outlive this
+// iterator. The current time is sampled once here, not per Advance() call.
+DocHitInfoIteratorFilter::DocHitInfoIteratorFilter(
+    std::unique_ptr<DocHitInfoIterator> delegate,
+    const DocumentStore* document_store, const SchemaStore* schema_store,
+    const Clock* clock, const Options& options)
+    : delegate_(std::move(delegate)),
+      document_store_(*document_store),
+      schema_store_(*schema_store),
+      options_(options),
+      current_seconds_(clock->GetCurrentSeconds()) {
+  // Precompute all the NamespaceIds so Advance() can filter with integer id
+  // set lookups instead of string comparisons.
+  for (std::string_view name_space : options_.namespaces) {
+    auto namespace_id_or = document_store_.GetNamespaceId(name_space);
+
+    // If we can't find the NamespaceId, just throw it away
+    if (namespace_id_or.ok()) {
+      target_namespace_ids_.emplace(namespace_id_or.ValueOrDie());
+    }
+  }
+
+  // Precompute all the SchemaTypeIds, same rationale as above.
+  for (std::string_view schema_type : options_.schema_types) {
+    auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type);
+
+    // If we can't find the SchemaTypeId, just throw it away
+    if (schema_type_id_or.ok()) {
+      target_schema_type_ids_.emplace(schema_type_id_or.ValueOrDie());
+    }
+  }
+}
+
+// Advances the delegate until a DocHitInfo passes every configured filter
+// (existence, namespace, schema type, expiration), mirroring it into
+// doc_hit_info_/hit_intersect_section_ids_mask_. Returns RESOURCE_EXHAUSTED
+// when the delegate is exhausted, INTERNAL if the current time is unknown.
+//
+// NOTE(review): each rejected hit recurses into Advance(); a long run of
+// filtered-out documents grows the call stack proportionally — consider an
+// iterative loop if that ever becomes a concern.
+libtextclassifier3::Status DocHitInfoIteratorFilter::Advance() {
+  if (!delegate_->Advance().ok()) {
+    // Didn't find anything on the delegate iterator.
+    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+
+  // current_seconds_ was sampled once in the constructor; -1 signals that the
+  // clock failed.
+  if (current_seconds_ == -1) {
+    // We couldn't get the current time, meaning we can't tell which documents
+    // are expired or not. So just don't return anything.
+    return absl_ports::InternalError(
+        "Couldn't get current time. Try again in a bit");
+  }
+
+  if (options_.filter_deleted) {
+    if (!document_store_.DoesDocumentExist(
+            delegate_->doc_hit_info().document_id())) {
+      // Document doesn't exist, keep searching
+      return Advance();
+    }
+  }
+
+  // Try to get the DocumentFilterData
+  auto document_filter_data_or = document_store_.GetDocumentFilterData(
+      delegate_->doc_hit_info().document_id());
+  if (!document_filter_data_or.ok()) {
+    // Didn't find the DocumentFilterData in the filter cache. This could be
+    // because the DocumentId isn't valid or the filter cache is in some invalid
+    // state. This is bad, but not the query's responsibility to fix, so just
+    // skip this result for now.
+    return Advance();
+  }
+  // We should be guaranteed that this exists now.
+  DocumentFilterData data = std::move(document_filter_data_or).ValueOrDie();
+
+  // Empty namespaces/schema_types options mean "accept everything"; only
+  // consult the precomputed id sets when a restriction was requested.
+  if (!options_.namespaces.empty() &&
+      target_namespace_ids_.count(data.namespace_id()) == 0) {
+    // Doesn't match one of the specified namespaces. Keep searching
+    return Advance();
+  }
+
+  if (!options_.schema_types.empty() &&
+      target_schema_type_ids_.count(data.schema_type_id()) == 0) {
+    // Doesn't match one of the specified schema types. Keep searching
+    return Advance();
+  }
+
+  if (current_seconds_ >= data.expiration_timestamp_secs()) {
+    // Current time has exceeded the document's expiration time
+    return Advance();
+  }
+
+  // Satisfied all our specified filters
+  doc_hit_info_ = delegate_->doc_hit_info();
+  hit_intersect_section_ids_mask_ = delegate_->hit_intersect_section_ids_mask();
+  return libtextclassifier3::Status::OK;
+}
+
+// Pure pass-through to the wrapped iterator; filtering adds no block reads.
+int32_t DocHitInfoIteratorFilter::GetNumBlocksInspected() const {
+  return delegate_->GetNumBlocksInspected();
+}
+
+// Pure pass-through to the wrapped iterator's leaf-advance counter.
+int32_t DocHitInfoIteratorFilter::GetNumLeafAdvanceCalls() const {
+  return delegate_->GetNumLeafAdvanceCalls();
+}
+
+// Delegates string representation; the filter itself adds no decoration.
+std::string DocHitInfoIteratorFilter::ToString() const {
+  return delegate_->ToString();
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
new file mode 100644
index 0000000..954a973
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -0,0 +1,88 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_FILTER_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_FILTER_H_
+
+#include <cstdint>
+#include <ctime>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-store.h"
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+
+// An iterator decorator that filters out DocHitInfos whose documents are
+// deleted/nonexistent, expired, or outside the requested namespaces and
+// schema types (see Options).
+class DocHitInfoIteratorFilter : public DocHitInfoIterator {
+ public:
+  struct Options {
+    // Filter out/don't return DocHitInfos that are associated with nonexistent
+    // Documents.
+    bool filter_deleted = true;
+
+    // List of namespaces that documents must have. An empty vector means that
+    // all namespaces are valid, and no documents will be filtered out.
+    //
+    // Note that if we want to reference the strings in namespaces later, ensure
+    // that the caller who passed the Options class outlives the
+    // DocHitInfoIteratorFilter.
+    std::vector<std::string_view> namespaces;
+
+    // List of schema types that documents must have. An empty vector means that
+    // all schema types are valid, and no documents will be filtered out.
+    //
+    // Note that if we want to reference the strings in schema types later,
+    // ensure that the caller who passed the Options class outlives the
+    // DocHitInfoIteratorFilter.
+    std::vector<std::string_view> schema_types;
+  };
+
+  // Does not take ownership of document_store, schema_store or clock; all
+  // three must outlive this iterator.
+  explicit DocHitInfoIteratorFilter(
+      std::unique_ptr<DocHitInfoIterator> delegate,
+      const DocumentStore* document_store, const SchemaStore* schema_store,
+      const Clock* clock, const Options& options);
+
+  libtextclassifier3::Status Advance() override;
+
+  int32_t GetNumBlocksInspected() const override;
+
+  int32_t GetNumLeafAdvanceCalls() const override;
+
+  std::string ToString() const override;
+
+ private:
+  std::unique_ptr<DocHitInfoIterator> delegate_;
+  const DocumentStore& document_store_;
+  const SchemaStore& schema_store_;
+  const Options options_;
+  // Ids precomputed from Options::namespaces/schema_types at construction;
+  // names that don't resolve are silently dropped.
+  std::unordered_set<NamespaceId> target_namespace_ids_;
+  std::unordered_set<SchemaTypeId> target_schema_type_ids_;
+  // Sampled once at construction; -1 indicates the clock couldn't provide a
+  // current time (Advance() then returns INTERNAL).
+  const std::time_t current_seconds_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_FILTER_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
new file mode 100644
index 0000000..9c71d54
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
@@ -0,0 +1,887 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/document.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+// Fixture for exercising the deleted/nonexistent-document filter. Builds a
+// DocumentStore/SchemaStore with a single "email" schema type and three test
+// documents in the "icing" namespace.
+class DocHitInfoIteratorDeletedFilterTest : public ::testing::Test {
+ protected:
+  DocHitInfoIteratorDeletedFilterTest()
+      : test_dir_(GetTestTempDir() + "/icing") {}
+
+  void SetUp() override {
+    filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+    test_document1_ =
+        DocumentBuilder().SetKey("icing", "email/1").SetSchema("email").Build();
+    test_document2_ =
+        DocumentBuilder().SetKey("icing", "email/2").SetSchema("email").Build();
+    test_document3_ =
+        DocumentBuilder().SetKey("icing", "email/3").SetSchema("email").Build();
+
+    SchemaProto schema;
+    auto type_config = schema.add_types();
+    type_config->set_schema_type("email");
+
+    ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+    ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        document_store_,
+        DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                              schema_store_.get()));
+  }
+
+  void TearDown() override {
+    // Destroy objects before the whole directory is removed because they
+    // persist data in the destructor.
+    document_store_.reset();
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  std::unique_ptr<SchemaStore> schema_store_;
+  std::unique_ptr<DocumentStore> document_store_;
+  FakeClock fake_clock_;
+  const Filesystem filesystem_;
+  const std::string test_dir_;
+  DocumentProto test_document1_;
+  DocumentProto test_document2_;
+  DocumentProto test_document3_;
+  // Default-constructed options: filter_deleted=true, no namespace/schema
+  // restrictions. Individual tests tweak fields before building the filter.
+  DocHitInfoIteratorFilter::Options options_;
+};
+
+// An empty delegate yields no documents regardless of what's in the store.
+TEST_F(DocHitInfoIteratorDeletedFilterTest, EmptyOriginalIterator) {
+  ICING_ASSERT_OK(document_store_->Put(test_document1_));
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator_empty =
+      std::make_unique<DocHitInfoIteratorDummy>();
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator_empty), document_store_.get(),
+      schema_store_.get(), &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+// With filter_deleted=false, a deleted document is still returned.
+TEST_F(DocHitInfoIteratorDeletedFilterTest, TurnOffDeletedFilterOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(test_document2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             document_store_->Put(test_document3_));
+
+  // Deletes test document 2
+  ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(),
+                                          test_document2_.uri()));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
+                                           DocHitInfo(document_id2),
+                                           DocHitInfo(document_id3)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.filter_deleted = false;
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator),
+              ElementsAre(document_id1, document_id2, document_id3));
+}
+
+// Default options drop hits whose documents were deleted from the store.
+TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(test_document2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             document_store_->Put(test_document3_));
+  // Deletes test document 2
+  ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(),
+                                          test_document2_.uri()));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
+                                           DocHitInfo(document_id2),
+                                           DocHitInfo(document_id3)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator),
+              ElementsAre(document_id1, document_id3));
+}
+
+// Hits referencing document ids never put in the store are silently skipped.
+TEST_F(DocHitInfoIteratorDeletedFilterTest, NonExistingDocumentsAreFiltered) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(test_document2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             document_store_->Put(test_document3_));
+
+  // Document ids 7, 8, 9 are not existing
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
+                                           DocHitInfo(document_id2),
+                                           DocHitInfo(document_id3),
+                                           DocHitInfo(7),
+                                           DocHitInfo(8),
+                                           DocHitInfo(9)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator),
+              ElementsAre(document_id1, document_id2, document_id3));
+}
+
+// A negative (invalid) document id is skipped, exhausting the iterator.
+TEST_F(DocHitInfoIteratorDeletedFilterTest, NegativeDocumentIdIsIgnored) {
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(-1)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(filtered_iterator.Advance(),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+// kInvalidDocumentId in the delegate is skipped, exhausting the iterator.
+TEST_F(DocHitInfoIteratorDeletedFilterTest, InvalidDocumentIdIsIgnored) {
+  // kInvalidDocumentId should be skipped.
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(kInvalidDocumentId)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(filtered_iterator.Advance(),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST_F(DocHitInfoIteratorDeletedFilterTest, GreaterThanMaxDocumentIdIsIgnored) {
+  // Document ids that are greater than the max value are invalid and should be
+  // skipped.
+  DocumentId invalid_greater_than_max = kMaxDocumentId + 2;
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(invalid_greater_than_max)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(filtered_iterator.Advance(),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+// Fixture for exercising the namespace filter: four documents spread across
+// three namespaces, all of the single "email" schema type.
+class DocHitInfoIteratorNamespaceFilterTest : public ::testing::Test {
+ protected:
+  DocHitInfoIteratorNamespaceFilterTest()
+      : test_dir_(GetTestTempDir() + "/icing") {}
+
+  void SetUp() override {
+    filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+    document1_namespace1_ = DocumentBuilder()
+                                .SetKey(namespace1_, "email/1")
+                                .SetSchema("email")
+                                .Build();
+    document2_namespace1_ = DocumentBuilder()
+                                .SetKey(namespace1_, "email/2")
+                                .SetSchema("email")
+                                .Build();
+    document1_namespace2_ = DocumentBuilder()
+                                .SetKey(namespace2_, "email/1")
+                                .SetSchema("email")
+                                .Build();
+    document1_namespace3_ = DocumentBuilder()
+                                .SetKey(namespace3_, "email/1")
+                                .SetSchema("email")
+                                .Build();
+
+    SchemaProto schema;
+    auto type_config = schema.add_types();
+    type_config->set_schema_type("email");
+
+    ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+    ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        document_store_,
+        DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                              schema_store_.get()));
+  }
+
+  void TearDown() override {
+    // Destroy objects before the whole directory is removed because they
+    // persist data in the destructor.
+    document_store_.reset();
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  std::unique_ptr<SchemaStore> schema_store_;
+  std::unique_ptr<DocumentStore> document_store_;
+  FakeClock fake_clock_;
+  const Filesystem filesystem_;
+  const std::string test_dir_;
+  const std::string namespace1_ = "namespace1";
+  const std::string namespace2_ = "namespace2";
+  const std::string namespace3_ = "namespace3";
+  DocumentProto document1_namespace1_;
+  DocumentProto document2_namespace1_;
+  DocumentProto document1_namespace2_;
+  DocumentProto document1_namespace3_;
+  // Per-test options; tests assign options_.namespaces before filtering.
+  DocHitInfoIteratorFilter::Options options_;
+};
+
+// An empty delegate stays empty even with no namespace restriction.
+TEST_F(DocHitInfoIteratorNamespaceFilterTest, EmptyOriginalIterator) {
+  std::unique_ptr<DocHitInfoIterator> original_iterator_empty =
+      std::make_unique<DocHitInfoIteratorDummy>();
+
+  options_.namespaces = std::vector<std::string_view>{};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator_empty), document_store_.get(),
+      schema_store_.get(), &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+// A namespace name that was never registered filters out everything.
+TEST_F(DocHitInfoIteratorNamespaceFilterTest,
+       NonexistentNamespacesReturnsEmpty) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_namespace1_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.namespaces = std::vector<std::string_view>{"nonexistent_namespace"};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+// An empty namespaces list means "no namespace restriction".
+TEST_F(DocHitInfoIteratorNamespaceFilterTest, NoNamespacesReturnsAll) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_namespace1_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.namespaces = std::vector<std::string_view>{};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
+}
+
+// Documents in namespaces other than the requested one are dropped even
+// though they exist in the store.
+TEST_F(DocHitInfoIteratorNamespaceFilterTest,
+       FilterOutExistingDocumentFromDifferentNamespace) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_namespace1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(document2_namespace1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             document_store_->Put(document1_namespace2_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
+                                           DocHitInfo(document_id2),
+                                           DocHitInfo(document_id3)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.namespaces = std::vector<std::string_view>{namespace1_};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator),
+              ElementsAre(document_id1, document_id2));
+}
+
+// Multiple requested namespaces are OR'd: a document matching any passes.
+TEST_F(DocHitInfoIteratorNamespaceFilterTest, FilterForMultipleNamespacesOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_namespace1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(document2_namespace1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             document_store_->Put(document1_namespace2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+                             document_store_->Put(document1_namespace3_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(document_id1), DocHitInfo(document_id2),
+      DocHitInfo(document_id3), DocHitInfo(document_id4)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.namespaces = std::vector<std::string_view>{namespace1_, namespace3_};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator),
+              ElementsAre(document_id1, document_id2, document_id4));
+}
+
+// Fixture for exercising the schema-type filter: four documents in one
+// namespace spanning three schema types ("email", "message", "person").
+class DocHitInfoIteratorSchemaTypeFilterTest : public ::testing::Test {
+ protected:
+  DocHitInfoIteratorSchemaTypeFilterTest()
+      : test_dir_(GetTestTempDir() + "/icing") {}
+
+  void SetUp() override {
+    filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+    document1_schema1_ =
+        DocumentBuilder().SetKey("namespace", "1").SetSchema(schema1_).Build();
+    document2_schema2_ =
+        DocumentBuilder().SetKey("namespace", "2").SetSchema(schema2_).Build();
+    document3_schema3_ =
+        DocumentBuilder().SetKey("namespace", "3").SetSchema(schema3_).Build();
+    document4_schema1_ =
+        DocumentBuilder().SetKey("namespace", "4").SetSchema(schema1_).Build();
+
+    SchemaProto schema;
+    auto type_config = schema.add_types();
+    type_config->set_schema_type(schema1_);
+    type_config = schema.add_types();
+    type_config->set_schema_type(schema2_);
+    type_config = schema.add_types();
+    type_config->set_schema_type(schema3_);
+
+    ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+    ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        document_store_,
+        DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                              schema_store_.get()));
+  }
+
+  void TearDown() override {
+    // Destroy objects before the whole directory is removed because they
+    // persist data in the destructor.
+    document_store_.reset();
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  std::unique_ptr<SchemaStore> schema_store_;
+  std::unique_ptr<DocumentStore> document_store_;
+  FakeClock fake_clock_;
+  const Filesystem filesystem_;
+  const std::string test_dir_;
+  const std::string schema1_ = "email";
+  const std::string schema2_ = "message";
+  const std::string schema3_ = "person";
+  DocumentProto document1_schema1_;
+  DocumentProto document2_schema2_;
+  DocumentProto document3_schema3_;
+  DocumentProto document4_schema1_;
+  // Per-test options; tests assign options_.schema_types before filtering.
+  DocHitInfoIteratorFilter::Options options_;
+};
+
+// An empty delegate stays empty even with no schema-type restriction.
+TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, EmptyOriginalIterator) {
+  std::unique_ptr<DocHitInfoIterator> original_iterator_empty =
+      std::make_unique<DocHitInfoIteratorDummy>();
+
+  options_.schema_types = std::vector<std::string_view>{};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator_empty), document_store_.get(),
+      schema_store_.get(), &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+// A schema type not present in the schema filters out everything.
+TEST_F(DocHitInfoIteratorSchemaTypeFilterTest,
+       NonexistentSchemaTypeReturnsEmpty) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_schema1_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.schema_types =
+      std::vector<std::string_view>{"nonexistent_schema_type"};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+// An empty schema_types list means "no schema-type restriction".
+TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, NoSchemaTypesReturnsAll) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_schema1_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.schema_types = std::vector<std::string_view>{};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
+}
+
+// Documents of schema types other than the requested one are dropped even
+// though they exist in the store.
+TEST_F(DocHitInfoIteratorSchemaTypeFilterTest,
+       FilterOutExistingDocumentFromDifferentSchemaTypes) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_schema1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(document2_schema2_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
+                                           DocHitInfo(document_id2)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.schema_types = std::vector<std::string_view>{schema1_};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
+}
+
+// Multiple requested schema types are OR'd: a document matching any passes.
+TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, FilterForMultipleSchemaTypesOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document1_schema1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(document2_schema2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             document_store_->Put(document3_schema3_));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
+                                           DocHitInfo(document_id2),
+                                           DocHitInfo(document_id3)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  options_.schema_types = std::vector<std::string_view>{schema2_, schema3_};
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator),
+              ElementsAre(document_id2, document_id3));
+}
+
+// Fixture for exercising TTL/expiration filtering; tests create documents
+// with explicit creation timestamps and TTLs and control "now" via
+// fake_clock_.SetSeconds().
+class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test {
+ protected:
+  DocHitInfoIteratorExpirationFilterTest()
+      : test_dir_(GetTestTempDir() + "/icing") {}
+
+  void SetUp() override {
+    filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+
+    SchemaProto schema;
+    auto type_config = schema.add_types();
+    type_config->set_schema_type(email_schema_);
+
+    ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+    ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        document_store_,
+        DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                              schema_store_.get()));
+  }
+
+  void TearDown() override {
+    // Destroy objects before the whole directory is removed because they
+    // persist data in the destructor.
+    document_store_.reset();
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  std::unique_ptr<SchemaStore> schema_store_;
+  std::unique_ptr<DocumentStore> document_store_;
+  FakeClock fake_clock_;
+  const Filesystem filesystem_;
+  const std::string test_dir_;
+  const std::string email_schema_ = "email";
+  DocHitInfoIteratorFilter::Options options_;
+};
+
+// A TTL of 0 means "never expires"; the document survives the filter.
+TEST_F(DocHitInfoIteratorExpirationFilterTest, TtlZeroIsntFilteredOut) {
+  // Insert a document
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace", "1")
+                               .SetSchema(email_schema_)
+                               .SetCreationTimestampSecs(0)
+                               .SetTtlSecs(0)
+                               .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Arbitrary value
+  fake_clock_.SetSeconds(100);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
+}
+
+// A document whose expiration time lies in the future is not filtered out.
+TEST_F(DocHitInfoIteratorExpirationFilterTest, BeforeTtlNotFilteredOut) {
+  // Insert a document
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace", "1")
+                               .SetSchema(email_schema_)
+                               .SetCreationTimestampSecs(1)
+                               .SetTtlSecs(100)
+                               .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Arbitrary value, but must be less than document's creation_timestamp + ttl
+  fake_clock_.SetSeconds(50);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
+}
+
+// The expiration boundary is inclusive: now == creation + ttl is expired
+// (the filter uses >=).
+TEST_F(DocHitInfoIteratorExpirationFilterTest, EqualTtlFilteredOut) {
+  // Insert a document
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace", "1")
+                               .SetSchema(email_schema_)
+                               .SetCreationTimestampSecs(0)
+                               .SetTtlSecs(100)
+                               .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Current time is exactly the document's creation_timestamp + ttl
+  fake_clock_.SetSeconds(100);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+// A document whose expiration time has already passed is filtered out.
+TEST_F(DocHitInfoIteratorExpirationFilterTest, PastTtlFilteredOut) {
+  // Insert a document
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace", "1")
+                               .SetSchema(email_schema_)
+                               .SetCreationTimestampSecs(0)
+                               .SetTtlSecs(100)
+                               .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(document));
+
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Arbitrary value, but must be greater than the document's
+  // creation_timestamp + ttl
+  fake_clock_.SetSeconds(101);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+TEST_F(DocHitInfoIteratorExpirationFilterTest,
+       InvalidTimeFiltersReturnsInternalError) {
+  // Put something in the original iterator so we don't get a ResourceExhausted
+  // error
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(/*document_id_in=*/0)};
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // -1 is the value returned on std::time() error
+  fake_clock_.SetSeconds(-1);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options_);
+
+  // A negative current time cannot be compared against expirations, so the
+  // filter surfaces INTERNAL rather than silently filtering.
+  EXPECT_THAT(filtered_iterator.Advance(),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+// Fixture exercising DocHitInfoIteratorFilter with combined namespace,
+// schema-type, deletion, and expiration filtering. Five documents spanning
+// two namespaces and two schema types are built (but not Put) in SetUp().
+class DocHitInfoIteratorFilterTest : public ::testing::Test {
+ protected:
+  DocHitInfoIteratorFilterTest() : test_dir_(GetTestTempDir() + "/icing") {}
+
+  void SetUp() override {
+    filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+    document1_namespace1_schema1_ = DocumentBuilder()
+                                        .SetKey(namespace1_, "1")
+                                        .SetSchema(schema1_)
+                                        .SetCreationTimestampSecs(100)
+                                        .SetTtlSecs(100)
+                                        .Build();
+    document2_namespace1_schema1_ = DocumentBuilder()
+                                        .SetKey(namespace1_, "2")
+                                        .SetSchema(schema1_)
+                                        .SetCreationTimestampSecs(100)
+                                        .SetTtlSecs(100)
+                                        .Build();
+    document3_namespace2_schema1_ = DocumentBuilder()
+                                        .SetKey(namespace2_, "3")
+                                        .SetSchema(schema1_)
+                                        .SetCreationTimestampSecs(100)
+                                        .SetTtlSecs(100)
+                                        .Build();
+    document4_namespace1_schema2_ = DocumentBuilder()
+                                        .SetKey(namespace1_, "4")
+                                        .SetSchema(schema2_)
+                                        .SetCreationTimestampSecs(100)
+                                        .SetTtlSecs(100)
+                                        .Build();
+    // Created at t=0, so it expires earlier (t=100) than the others (t=200).
+    document5_namespace1_schema1_ = DocumentBuilder()
+                                        .SetKey(namespace1_, "5")
+                                        .SetSchema(schema1_)
+                                        .SetCreationTimestampSecs(0)
+                                        .SetTtlSecs(100)
+                                        .Build();
+
+    // Register both schema types before creating the DocumentStore.
+    SchemaProto schema;
+    auto type_config = schema.add_types();
+    type_config->set_schema_type(schema1_);
+    type_config = schema.add_types();
+    type_config->set_schema_type(schema2_);
+
+    ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+    ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        document_store_,
+        DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                              schema_store_.get()));
+  }
+
+  void TearDown() override {
+    // Destroy objects before the whole directory is removed because they
+    // persist data in the destructor.
+    document_store_.reset();
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  std::unique_ptr<SchemaStore> schema_store_;
+  std::unique_ptr<DocumentStore> document_store_;
+  FakeClock fake_clock_;
+  const Filesystem filesystem_;
+  const std::string test_dir_;
+  const std::string namespace1_ = "namespace1";
+  const std::string namespace2_ = "namespace2";
+  const std::string schema1_ = "email";
+  const std::string schema2_ = "message";
+  DocumentProto document1_namespace1_schema1_;
+  DocumentProto document2_namespace1_schema1_;
+  DocumentProto document3_namespace2_schema1_;
+  DocumentProto document4_namespace1_schema2_;
+  DocumentProto document5_namespace1_schema1_;  // earliest expiration (t=100)
+};
+
+TEST_F(DocHitInfoIteratorFilterTest, CombineAllFiltersOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id1,
+      document_store_->Put(document1_namespace1_schema1_));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id2,
+      document_store_->Put(document2_namespace1_schema1_));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id3,
+      document_store_->Put(document3_namespace2_schema1_));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id4,
+      document_store_->Put(document4_namespace1_schema2_));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id5,
+      document_store_->Put(document5_namespace1_schema1_));
+
+  // Deletes document2, causing it to be filtered out
+  ICING_ASSERT_OK(
+      document_store_->Delete(document2_namespace1_schema1_.namespace_(),
+                              document2_namespace1_schema1_.uri()));
+
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(document_id1), DocHitInfo(document_id2),
+      DocHitInfo(document_id3), DocHitInfo(document_id4),
+      DocHitInfo(document_id5)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorFilter::Options options;
+
+  // Filters out document3 by namespace
+  options.namespaces = std::vector<std::string_view>{namespace1_};
+
+  // Filters out document4 by schema type
+  options.schema_types = std::vector<std::string_view>{schema1_};
+
+  // Filters out document5 since it's expired (created t=0, ttl 100s < 199s)
+  FakeClock fake_clock;
+  fake_clock.SetSeconds(199);
+
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock, options);
+
+  // Only document1 passes every filter simultaneously.
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
+}
+
+TEST_F(DocHitInfoIteratorFilterTest, SectionIdMasksArePopulatedCorrectly) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id1,
+      document_store_->Put(document1_namespace1_schema1_));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id2,
+      document_store_->Put(document2_namespace1_schema1_));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id3,
+      document_store_->Put(document3_namespace2_schema1_));
+
+  SectionIdMask section_id_mask1 = 0b01001001;  // hits in sections 0, 3, 6
+  SectionIdMask section_id_mask2 = 0b10010010;  // hits in sections 1, 4, 7
+  SectionIdMask section_id_mask3 = 0b00100100;  // hits in sections 2, 5
+  std::vector<SectionId> section_ids1 = {0, 3, 6};
+  std::vector<SectionId> section_ids2 = {1, 4, 7};
+  std::vector<SectionId> section_ids3 = {2, 5};
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(document_id1, section_id_mask1),
+      DocHitInfo(document_id2, section_id_mask2),
+      DocHitInfo(document_id3, section_id_mask3)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Default options: no filtering criteria, so all hits pass through and the
+  // filter must preserve each hit's section id mask untouched.
+  DocHitInfoIteratorFilter::Options options;
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options);
+
+  EXPECT_THAT(GetDocHitInfos(&filtered_iterator),
+              ElementsAre(EqualsDocHitInfo(document_id1, section_ids1),
+                          EqualsDocHitInfo(document_id2, section_ids2),
+                          EqualsDocHitInfo(document_id3, section_ids3)));
+}
+
+TEST_F(DocHitInfoIteratorFilterTest, GetNumBlocksInspected) {
+  // Stats are delegated straight through to the wrapped iterator.
+  auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
+  original_iterator->SetNumBlocksInspected(5);
+
+  DocHitInfoIteratorFilter::Options options;
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options);
+
+  EXPECT_THAT(filtered_iterator.GetNumBlocksInspected(), Eq(5));
+}
+
+TEST_F(DocHitInfoIteratorFilterTest, GetNumLeafAdvanceCalls) {
+  // Stats are delegated straight through to the wrapped iterator.
+  auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
+  original_iterator->SetNumLeafAdvanceCalls(6);
+
+  DocHitInfoIteratorFilter::Options options;
+  DocHitInfoIteratorFilter filtered_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      &fake_clock_, options);
+
+  EXPECT_THAT(filtered_iterator.GetNumLeafAdvanceCalls(), Eq(6));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-not.cc b/icing/index/iterator/doc-hit-info-iterator-not.cc
new file mode 100644
index 0000000..ff39acc
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-not.cc
@@ -0,0 +1,79 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-not.h"
+
+#include <cstdint>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Takes ownership of `to_be_excluded`. Candidate DocumentIds are enumerated
+// from `document_id_limit` down to 0 by the internal AllDocumentId iterator.
+DocHitInfoIteratorNot::DocHitInfoIteratorNot(
+    std::unique_ptr<DocHitInfoIterator> to_be_excluded,
+    DocumentId document_id_limit)
+    : to_be_excluded_(std::move(to_be_excluded)),
+      all_document_id_iterator_(
+          DocHitInfoIteratorAllDocumentId(document_id_limit)) {}
+
+// Advances to the next candidate DocumentId (decreasing order) that is NOT
+// produced by `to_be_excluded_`. Returns RESOURCE_EXHAUSTED once the
+// [document_id_limit, 0] range is exhausted.
+libtextclassifier3::Status DocHitInfoIteratorNot::Advance() {
+  if (!all_document_id_iterator_.Advance().ok()) {
+    // Candidate range exhausted; reset to an invalid state.
+    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+
+  if (all_document_id_iterator_.doc_hit_info().document_id() <
+      to_be_excluded_->doc_hit_info().document_id()) {
+    // Since DocumentIds are returned from DocHitInfoIterators in decreasing
+    // order, we have passed the last NOT result if we're smaller than its
+    // DocumentId. Advance the NOT result if so.
+    // Errors are ignored: an exhausted exclusion iterator simply stops
+    // matching any further candidates.
+    to_be_excluded_->Advance().IgnoreError();
+  }
+
+  if (all_document_id_iterator_.doc_hit_info().document_id() ==
+      to_be_excluded_->doc_hit_info().document_id()) {
+    // This is a NOT result, skip and Advance to the next result.
+    // NOTE(review): one recursion frame per consecutive excluded DocumentId;
+    // a long run of exclusions deepens the stack accordingly.
+    return Advance();
+  }
+
+  // No errors, we've found a valid result
+  doc_hit_info_ = all_document_id_iterator_.doc_hit_info();
+
+  return libtextclassifier3::Status::OK;
+}
+
+int32_t DocHitInfoIteratorNot::GetNumBlocksInspected() const {
+  // Aggregate over both the exclusion child and the candidate enumerator.
+  return to_be_excluded_->GetNumBlocksInspected() +
+         all_document_id_iterator_.GetNumBlocksInspected();
+}
+
+int32_t DocHitInfoIteratorNot::GetNumLeafAdvanceCalls() const {
+  return to_be_excluded_->GetNumLeafAdvanceCalls() +
+         all_document_id_iterator_.GetNumLeafAdvanceCalls();
+}
+
+std::string DocHitInfoIteratorNot::ToString() const {
+  // Debug representation, e.g. "(NOT child)".
+  return absl_ports::StrCat("(NOT ", to_be_excluded_->ToString(), ")");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-not.h b/icing/index/iterator/doc-hit-info-iterator-not.h
new file mode 100644
index 0000000..52da3db
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-not.h
@@ -0,0 +1,67 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_NOT_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_NOT_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "utils/base/status.h"
+#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Iterator that will return all documents that are *not* specified by the
+// to_be_excluded_iterator.
+//
+// NOTE: The hit_intersect_section_ids_mask is meaningless for this iterator.
+// When this iterator produces a result, it's because the Document was not
+// present in the to_be_excluded_iterator. There is no concept of the Document
+// having been chosen because it's term was in a specific section. Since we
+// don't know anything about the sections for the Document, the
+// hit_intersect_section_ids_mask is always kSectionIdMaskNone. Correspondingly,
+// this means that the doc_hit_info.hit_section_ids_mask will also always be
+// kSectionIdMaskNone.
+class DocHitInfoIteratorNot : public DocHitInfoIterator {
+ public:
+  // to_be_excluded_iterator: The results of this iterator will be excluded
+  // from this iterator's results.
+  // document_id_limit: The DocumentId that represents the most recently added
+  // Document to the DocumentStore
+  explicit DocHitInfoIteratorNot(
+      std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator,
+      const DocumentId document_id_limit);
+
+  libtextclassifier3::Status Advance() override;
+
+  int32_t GetNumBlocksInspected() const override;
+
+  int32_t GetNumLeafAdvanceCalls() const override;
+
+  std::string ToString() const override;
+
+ private:
+  // DocumentIds produced by this child are skipped.
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_;
+  // Enumerates every DocumentId from document_id_limit down to 0; results
+  // are the enumerated ids minus those matched by to_be_excluded_.
+  DocHitInfoIteratorAllDocumentId all_document_id_iterator_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_NOT_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-not_test.cc b/icing/index/iterator/doc-hit-info-iterator-not_test.cc
new file mode 100644
index 0000000..5d0e4ac
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-not_test.cc
@@ -0,0 +1,161 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-not.h"
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+TEST(DocHitInfoIteratorNotTest, InvalidDocumentIdLimit) {
+  std::vector<DocHitInfo> exclude_doc_hit_infos = {DocHitInfo(5),
+                                                   DocHitInfo(4)};
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+  // A negative limit leaves no valid candidate DocumentIds at all.
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/-1);
+  EXPECT_THAT(not_iterator.Advance(),
+              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(DocHitInfoIteratorNotTest, NotFirstFewDocumentIdsOk) {
+  std::vector<DocHitInfo> exclude_doc_hit_infos = {DocHitInfo(5),
+                                                   DocHitInfo(4)};
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/5);
+  // 5 and 4 are excluded; remaining ids come back in decreasing order.
+  EXPECT_THAT(GetDocumentIds(&not_iterator), ElementsAre(3, 2, 1, 0));
+}
+
+TEST(DocHitInfoIteratorNotTest, NotLastFewDocumentIdsOk) {
+  std::vector<DocHitInfo> exclude_doc_hit_infos = {DocHitInfo(1),
+                                                   DocHitInfo(0)};
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/5);
+  EXPECT_THAT(GetDocumentIds(&not_iterator), ElementsAre(5, 4, 3, 2));
+}
+
+TEST(DocHitInfoIteratorNotTest, IntermittentDocumentIdOverlapOk) {
+  std::vector<DocHitInfo> exclude_doc_hit_infos = {
+      DocHitInfo(8), DocHitInfo(6), DocHitInfo(4), DocHitInfo(2)};
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/10);
+  EXPECT_THAT(GetDocumentIds(&not_iterator), ElementsAre(10, 9, 7, 5, 3, 1, 0));
+}
+
+TEST(DocHitInfoIteratorNotTest, NoDocumentIdOverlapOk) {
+  // Nothing excluded: every candidate id should be returned.
+  std::vector<DocHitInfo> exclude_doc_hit_infos = {};
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/5);
+  EXPECT_THAT(GetDocumentIds(&not_iterator), ElementsAre(5, 4, 3, 2, 1, 0));
+}
+
+TEST(DocHitInfoIteratorNotTest, AllDocumentIdOverlapOk) {
+  // Everything excluded: the iterator should produce no results.
+  std::vector<DocHitInfo> exclude_doc_hit_infos = {
+      DocHitInfo(5), DocHitInfo(4), DocHitInfo(3),
+      DocHitInfo(2), DocHitInfo(1), DocHitInfo(0)};
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/5);
+  EXPECT_THAT(GetDocumentIds(&not_iterator), IsEmpty());
+}
+
+TEST(DocHitInfoIteratorNotTest, GetNumBlocksInspected) {
+  int to_be_excluded_iterator_blocks = 4;  // arbitrary value
+  auto to_be_excluded_iterator = std::make_unique<DocHitInfoIteratorDummy>();
+  to_be_excluded_iterator->SetNumBlocksInspected(
+      to_be_excluded_iterator_blocks);
+
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/5);
+
+  // The AllDocumentId iterator doesn't count any blocks as being inspected
+  // since it's just decrementing 1 from the document_id_limit.
+  EXPECT_THAT(not_iterator.GetNumBlocksInspected(),
+              Eq(to_be_excluded_iterator_blocks));
+}
+
+TEST(DocHitInfoIteratorNotTest, GetNumLeafAdvanceCalls) {
+  int to_be_excluded_iterator_leaves = 4;  // arbitrary value
+  auto to_be_excluded_iterator = std::make_unique<DocHitInfoIteratorDummy>();
+  to_be_excluded_iterator->SetNumLeafAdvanceCalls(
+      to_be_excluded_iterator_leaves);
+
+  int all_document_id_limit = 5;
+  // Since we iterate from [limit, 0] inclusive, add 1 for the 0th advance call
+  int all_leaf_advance_calls = all_document_id_limit + 1;
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     all_document_id_limit);
+
+  while (not_iterator.Advance().ok()) {
+    // Advance through the whole not iterator
+  }
+
+  // The AllDocumentId iterator counts each DocumentId as a leaf advance call
+  EXPECT_THAT(not_iterator.GetNumLeafAdvanceCalls(),
+              Eq(to_be_excluded_iterator_leaves + all_leaf_advance_calls));
+}
+
+TEST(DocHitInfoIteratorNotTest, SectionIdsAlwaysNone) {
+  SectionIdMask section_id_mask5 = 1U << 5;  // arbitrary non-zero value
+  SectionIdMask section_id_mask4 = 1U << 4;  // arbitrary non-zero value
+  std::vector<DocHitInfo> exclude_doc_hit_infos = {
+      DocHitInfo(5, section_id_mask5), DocHitInfo(4, section_id_mask4)};
+  std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+  DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+                                     /*document_id_limit=*/5);
+  // NOT results carry no section information (see class comment in the
+  // header), so every produced hit has kSectionIdMaskNone.
+  EXPECT_THAT(GetDocHitInfos(&not_iterator),
+              ElementsAre(DocHitInfo(3, kSectionIdMaskNone),
+                          DocHitInfo(2, kSectionIdMaskNone),
+                          DocHitInfo(1, kSectionIdMaskNone),
+                          DocHitInfo(0, kSectionIdMaskNone)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.cc b/icing/index/iterator/doc-hit-info-iterator-or.cc
new file mode 100644
index 0000000..b4dc86a
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-or.cc
@@ -0,0 +1,239 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-or.h"
+
+#include <cstdint>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// When combining Or iterators, n-ary operator has better performance when
+// number of operands > 2 according to benchmark cl/243321264
+// TODO (samzheng): Tune this number when it's necessary, e.g. implementation
+// changes.
+// Operand counts equal to this threshold use the binary DocHitInfoIteratorOr;
+// anything else (other than a single operand) uses DocHitInfoIteratorOrNary.
+constexpr int kBinaryOrIteratorPerformanceThreshold = 2;
+
+}  // namespace
+
+// Chooses the fastest OR implementation for the given operand count:
+// one operand passes through unchanged, two use the binary iterator,
+// anything else goes to the n-ary iterator.
+std::unique_ptr<DocHitInfoIterator> CreateOrIterator(
+    std::vector<std::unique_ptr<DocHitInfoIterator>> iterators) {
+  if (iterators.size() == 1) {
+    // Nothing to OR together; return the single iterator as-is.
+    return std::move(iterators.at(0));
+  }
+
+  std::unique_ptr<DocHitInfoIterator> iterator;
+  if (iterators.size() == kBinaryOrIteratorPerformanceThreshold) {
+    iterator = std::make_unique<DocHitInfoIteratorOr>(std::move(iterators[0]),
+                                                      std::move(iterators[1]));
+  } else {
+    // If the vector is too small, the OrNary iterator can handle it and return
+    // an error on the Advance call
+    iterator = std::make_unique<DocHitInfoIteratorOrNary>(std::move(iterators));
+  }
+
+  return iterator;
+}
+
+// Takes ownership of both child iterators.
+DocHitInfoIteratorOr::DocHitInfoIteratorOr(
+    std::unique_ptr<DocHitInfoIterator> left_it,
+    std::unique_ptr<DocHitInfoIterator> right_it)
+    : left_(std::move(left_it)), right_(std::move(right_it)) {}
+
+// Produces the union of the two children in decreasing DocumentId order.
+// When both children hold the same DocumentId, their section data is merged
+// into a single result. Returns RESOURCE_EXHAUSTED when both children are
+// exhausted.
+libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
+  // Cache the document_id of the left iterator for comparison to the right.
+  DocumentId orig_left_document_id = left_document_id_;
+
+  // Advance the left iterator if necessary.
+  if (left_document_id_ != kInvalidDocumentId) {
+    if (right_document_id_ == kInvalidDocumentId ||
+        left_document_id_ >= right_document_id_) {
+      if (left_->Advance().ok()) {
+        left_document_id_ = left_->doc_hit_info().document_id();
+      } else {
+        // Left child is exhausted.
+        left_document_id_ = kInvalidDocumentId;
+      }
+    }
+  }
+
+  // Advance the right iterator if necessary, by comparing to the original
+  // left document_id (not the one which may have been updated).
+  if (right_document_id_ != kInvalidDocumentId) {
+    if (orig_left_document_id == kInvalidDocumentId ||
+        right_document_id_ >= orig_left_document_id) {
+      if (right_->Advance().ok()) {
+        right_document_id_ = right_->doc_hit_info().document_id();
+      } else {
+        // Right child is exhausted.
+        right_document_id_ = kInvalidDocumentId;
+      }
+    }
+  }
+
+  // Done, we either found a match or we reached the end of potential
+  // DocHitInfos
+  if (left_document_id_ == kInvalidDocumentId &&
+      right_document_id_ == kInvalidDocumentId) {
+    // Reached the end, set these to invalid values and return
+    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+
+  // Now chose the best one that is not invalid.
+  DocHitInfoIterator* chosen;
+  if (left_document_id_ == kInvalidDocumentId) {
+    chosen = right_.get();
+  } else if (right_document_id_ == kInvalidDocumentId) {
+    chosen = left_.get();
+  } else if (left_document_id_ < right_document_id_) {
+    chosen = right_.get();
+  } else {
+    chosen = left_.get();
+  }
+
+  doc_hit_info_ = chosen->doc_hit_info();
+  hit_intersect_section_ids_mask_ = chosen->hit_intersect_section_ids_mask();
+
+  // If equal, combine. Section masks are unioned into doc_hit_info_ while the
+  // intersect mask keeps only sections common to both children.
+  if (left_document_id_ == right_document_id_) {
+    doc_hit_info_.MergeSectionsFrom(right_->doc_hit_info());
+    hit_intersect_section_ids_mask_ &= right_->hit_intersect_section_ids_mask();
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Stats and debug helpers aggregate over both children.
+int32_t DocHitInfoIteratorOr::GetNumBlocksInspected() const {
+  return left_->GetNumBlocksInspected() + right_->GetNumBlocksInspected();
+}
+
+int32_t DocHitInfoIteratorOr::GetNumLeafAdvanceCalls() const {
+  return left_->GetNumLeafAdvanceCalls() + right_->GetNumLeafAdvanceCalls();
+}
+
+std::string DocHitInfoIteratorOr::ToString() const {
+  return absl_ports::StrCat("(", left_->ToString(), " OR ", right_->ToString(),
+                            ")");
+}
+
+// Takes ownership of all child iterators; Advance() rejects fewer than two.
+DocHitInfoIteratorOrNary::DocHitInfoIteratorOrNary(
+    std::vector<std::unique_ptr<DocHitInfoIterator>> iterators)
+    : iterators_(std::move(iterators)) {}
+
+// Produces the union of all children in decreasing DocumentId order by
+// picking, each call, the largest child DocumentId strictly below the
+// previously returned one, then merging section data from every child
+// positioned on that DocumentId.
+libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
+  if (iterators_.size() < 2) {
+    return absl_ports::InvalidArgumentError(
+        "Not enough iterators to OR together");
+  }
+
+  if (doc_hit_info_.document_id() == 0) {
+    // 0 is the smallest (last) DocumentId, can't advance further. Reset to
+    // invalid values and return directly
+    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+  // The maximum possible doc id for the current Advance() call.
+  const DocumentId next_document_id_max = doc_hit_info_.document_id() - 1;
+  doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+  DocumentId next_document_id = kInvalidDocumentId;
+  // Go through the iterators and try to find the maximum document_id that is
+  // equal to or smaller than next_document_id_max
+  for (const auto& iterator : iterators_) {
+    if (iterator->doc_hit_info().document_id() > next_document_id_max) {
+      // Advance the iterator until its value is equal to or smaller than
+      // next_document_id_max
+      if (ABSL_PREDICT_FALSE(
+              !AdvanceTo(iterator.get(), next_document_id_max).ok())) {
+        // This child is exhausted; skip it for this round.
+        continue;
+      }
+    }
+    // Now iterator->get_document_id() <= next_document_id_max
+    if (next_document_id == kInvalidDocumentId) {
+      next_document_id = iterator->doc_hit_info().document_id();
+    } else {
+      next_document_id =
+          std::max(next_document_id, iterator->doc_hit_info().document_id());
+    }
+  }
+  if (next_document_id == kInvalidDocumentId) {
+    // None of the iterators had a next document_id, reset to invalid values and
+    // return
+    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+
+  // Found the next hit DocumentId, now calculate the section info. Sections
+  // are unioned into doc_hit_info_; the intersect mask keeps only sections
+  // common to every matching child.
+  hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+  for (const auto& iterator : iterators_) {
+    if (iterator->doc_hit_info().document_id() == next_document_id) {
+      if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+        doc_hit_info_ = iterator->doc_hit_info();
+        hit_intersect_section_ids_mask_ =
+            iterator->hit_intersect_section_ids_mask();
+      } else {
+        doc_hit_info_.MergeSectionsFrom(iterator->doc_hit_info());
+        hit_intersect_section_ids_mask_ &=
+            iterator->hit_intersect_section_ids_mask();
+      }
+    }
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Stats and debug helpers aggregate over all children.
+int32_t DocHitInfoIteratorOrNary::GetNumBlocksInspected() const {
+  int32_t block_count = 0;
+  for (const auto& iter : iterators_) {
+    block_count += iter->GetNumBlocksInspected();
+  }
+  return block_count;
+}
+
+int32_t DocHitInfoIteratorOrNary::GetNumLeafAdvanceCalls() const {
+  int32_t leaf_count = 0;
+  for (const auto& iter : iterators_) {
+    leaf_count += iter->GetNumLeafAdvanceCalls();
+  }
+  return leaf_count;
+}
+
+// Renders as "(A OR B OR ... OR Z)".
+std::string DocHitInfoIteratorOrNary::ToString() const {
+  std::string ret = "(";
+
+  for (size_t i = 0; i < iterators_.size(); ++i) {
+    absl_ports::StrAppend(&ret, iterators_.at(i)->ToString());
+    if (i != iterators_.size() - 1) {
+      // Not the last element in vector
+      absl_ports::StrAppend(&ret, " OR ");
+    }
+  }
+
+  absl_ports::StrAppend(&ret, ")");
+  return ret;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h
new file mode 100644
index 0000000..4128e0f
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-or.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_OR_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_OR_H_
+
+#include <cstdint>
+#include <string>
+
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+
+namespace icing {
+namespace lib {
+
+// Given n iterators, decides the fastest Or-iterator implementation: a single
+// iterator is returned unchanged, two use the binary DocHitInfoIteratorOr,
+// and any other count uses DocHitInfoIteratorOrNary.
+std::unique_ptr<DocHitInfoIterator> CreateOrIterator(
+    std::vector<std::unique_ptr<DocHitInfoIterator>> iterators);
+
+// Iterate over a logical OR of two child iterators.
+class DocHitInfoIteratorOr : public DocHitInfoIterator {
+ public:
+  explicit DocHitInfoIteratorOr(std::unique_ptr<DocHitInfoIterator> left_it,
+                                std::unique_ptr<DocHitInfoIterator> right_it);
+
+  libtextclassifier3::Status Advance() override;
+
+  int32_t GetNumBlocksInspected() const override;
+
+  int32_t GetNumLeafAdvanceCalls() const override;
+
+  std::string ToString() const override;
+
+ private:
+  std::unique_ptr<DocHitInfoIterator> left_;
+  std::unique_ptr<DocHitInfoIterator> right_;
+  // Last DocumentId seen from each child, updated by Advance();
+  // kInvalidDocumentId once that child is exhausted.
+  DocumentId left_document_id_ = kMaxDocumentId;
+  DocumentId right_document_id_ = kMaxDocumentId;
+};
+
+// Iterate over a logical OR of multiple child iterators.
+//
+// NOTE: DocHitInfoIteratorOr is a faster alternative to OR exactly 2 iterators.
+class DocHitInfoIteratorOrNary : public DocHitInfoIterator {
+ public:
+  explicit DocHitInfoIteratorOrNary(
+      std::vector<std::unique_ptr<DocHitInfoIterator>> iterators);
+
+  // Returns INVALID_ARGUMENT if constructed with fewer than two iterators.
+  libtextclassifier3::Status Advance() override;
+
+  int32_t GetNumBlocksInspected() const override;
+
+  int32_t GetNumLeafAdvanceCalls() const override;
+
+  std::string ToString() const override;
+
+ private:
+  // Owned child iterators whose results are unioned.
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_OR_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-or_test.cc b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
new file mode 100644
index 0000000..3faa5ab
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
@@ -0,0 +1,322 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-or.h"
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+
+// Renamed from CreateAndIteratorTest (a copy-paste from the And test file):
+// this file exercises CreateOrIterator, and the sibling test below already
+// uses the CreateOrIteratorTest suite name.
+TEST(CreateOrIteratorTest, Or) {
+  // Basic test that we can create a working Or iterator. Further testing of
+  // the Or iterator should be done separately below.
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(10)};
+  std::unique_ptr<DocHitInfoIterator> first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+  std::unique_ptr<DocHitInfoIterator> second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::move(first_iter));
+  iterators.push_back(std::move(second_iter));
+  std::unique_ptr<DocHitInfoIterator> or_iter =
+      CreateOrIterator(std::move(iterators));
+
+  // Both children contain document 10, so the OR returns it exactly once.
+  EXPECT_THAT(GetDocumentIds(or_iter.get()), ElementsAre(10));
+}
+
+TEST(CreateOrIteratorTest, OrNary) {
+  // Basic test that we can create a working OrNary iterator. Further testing
+  // of the OrNary iterator should be done separately below.
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(10)};
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos));
+
+  std::unique_ptr<DocHitInfoIterator> or_iter =
+      CreateOrIterator(std::move(iterators));
+
+  // Document 10 appears in all four children but must be returned only once.
+  EXPECT_THAT(GetDocumentIds(or_iter.get()), ElementsAre(10));
+}
+
+TEST(DocHitInfoIteratorOrTest, Initialize) {
+  DocHitInfoIteratorOr or_iter(std::make_unique<DocHitInfoIteratorDummy>(),
+                               std::make_unique<DocHitInfoIteratorDummy>());
+
+  // We start out with invalid values
+  EXPECT_THAT(or_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
+  EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(kSectionIdMaskNone));
+}
+
+TEST(DocHitInfoIteratorOrTest, GetNumBlocksInspected) {
+  // The OR iterator should report the sum of its children's block counts.
+  int first_iter_blocks = 4;  // arbitrary value
+  auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  first_iter->SetNumBlocksInspected(first_iter_blocks);
+
+  int second_iter_blocks = 7;  // arbitrary value
+  auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  second_iter->SetNumBlocksInspected(second_iter_blocks);
+
+  DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+
+  EXPECT_THAT(or_iter.GetNumBlocksInspected(),
+              Eq(first_iter_blocks + second_iter_blocks));
+}
+
+TEST(DocHitInfoIteratorOrTest, GetNumLeafAdvanceCalls) {
+  // The OR iterator should report the sum of its children's leaf advances.
+  int first_iter_leaves = 4;  // arbitrary value
+  auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
+
+  int second_iter_leaves = 7;  // arbitrary value
+  auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
+
+  DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+
+  EXPECT_THAT(or_iter.GetNumLeafAdvanceCalls(),
+              Eq(first_iter_leaves + second_iter_leaves));
+}
+
+TEST(DocHitInfoIteratorOrTest, Advance) {
+  // Children hold disjoint, descending document id lists; the OR must merge
+  // them into a single descending sequence.
+  std::vector<DocHitInfo> first_vector = {DocHitInfo(10), DocHitInfo(8),
+                                          DocHitInfo(6), DocHitInfo(4),
+                                          DocHitInfo(2), DocHitInfo(0)};
+
+  std::vector<DocHitInfo> second_vector = {DocHitInfo(9), DocHitInfo(7),
+                                           DocHitInfo(5), DocHitInfo(3),
+                                           DocHitInfo(1)};
+
+  std::unique_ptr<DocHitInfoIterator> first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+  std::unique_ptr<DocHitInfoIterator> second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+  DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+
+  EXPECT_THAT(GetDocumentIds(&or_iter),
+              ElementsAre(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+TEST(DocHitInfoIteratorOrTest, AdvanceNestedIterators) {
+  // An OR iterator can itself be a child of another OR iterator.
+  std::vector<DocHitInfo> first_vector = {DocHitInfo(10), DocHitInfo(8)};
+
+  std::vector<DocHitInfo> second_vector = {DocHitInfo(9), DocHitInfo(5)};
+
+  std::vector<DocHitInfo> third_vector = {DocHitInfo(7), DocHitInfo(6)};
+
+  std::unique_ptr<DocHitInfoIterator> first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+  std::unique_ptr<DocHitInfoIterator> second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+  std::unique_ptr<DocHitInfoIterator> third_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(third_vector);
+
+  std::unique_ptr<DocHitInfoIterator> inner_iter =
+      std::make_unique<DocHitInfoIteratorOr>(std::move(first_iter),
+                                             std::move(second_iter));
+  std::unique_ptr<DocHitInfoIterator> outer_iter =
+      std::make_unique<DocHitInfoIteratorOr>(std::move(inner_iter),
+                                             std::move(third_iter));
+
+  EXPECT_THAT(GetDocumentIds(outer_iter.get()), ElementsAre(10, 9, 8, 7, 6, 5));
+}
+
+TEST(DocHitInfoIteratorOrTest, SectionIdMask) {
+  // Arbitrary section ids for the documents in the DocHitInfoIterators.
+  // Created to test correct section_id_mask behavior.
+  SectionIdMask section_id_mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
+  SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+  SectionIdMask mask_anded_result = 0b00000100;
+  SectionIdMask mask_ored_result = 0b01010111;
+
+  // Both children see the same document (4), so the hit masks are combined.
+  std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
+  std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)};
+
+  auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+  first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+  auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+  second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+  DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  // The per-document hit mask is the union; the intersect mask is the
+  // intersection across children.
+  EXPECT_THAT(or_iter.doc_hit_info().hit_section_ids_mask(),
+              Eq(mask_ored_result));
+  EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, Initialize) {
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
+  DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+
+  // We start out with invalid values
+  EXPECT_THAT(or_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
+  EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(kSectionIdMaskNone));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, InitializeEmpty) {
+  // We can initialize it fine even with an empty vector
+  std::vector<std::unique_ptr<DocHitInfoIterator>> empty_vector;
+  DocHitInfoIteratorOrNary empty_iter(std::move(empty_vector));
+
+  // But it won't be able to advance anywhere
+  EXPECT_THAT(empty_iter.Advance(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, GetNumBlocksInspected) {
+  // The n-ary OR should report the sum over all four children.
+  int first_iter_blocks = 4;  // arbitrary value
+  auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  first_iter->SetNumBlocksInspected(first_iter_blocks);
+
+  int second_iter_blocks = 7;  // arbitrary value
+  auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  second_iter->SetNumBlocksInspected(second_iter_blocks);
+
+  int third_iter_blocks = 13;  // arbitrary value
+  auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  third_iter->SetNumBlocksInspected(third_iter_blocks);
+
+  int fourth_iter_blocks = 1;  // arbitrary value
+  auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  fourth_iter->SetNumBlocksInspected(fourth_iter_blocks);
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::move(first_iter));
+  iterators.push_back(std::move(second_iter));
+  iterators.push_back(std::move(third_iter));
+  iterators.push_back(std::move(fourth_iter));
+  DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+
+  EXPECT_THAT(or_iter.GetNumBlocksInspected(),
+              Eq(first_iter_blocks + second_iter_blocks + third_iter_blocks +
+                 fourth_iter_blocks));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, GetNumLeafAdvanceCalls) {
+  // The n-ary OR should report the sum over all four children.
+  int first_iter_leaves = 4;  // arbitrary value
+  auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
+
+  int second_iter_leaves = 7;  // arbitrary value
+  auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
+
+  int third_iter_leaves = 13;  // arbitrary value
+  auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  third_iter->SetNumLeafAdvanceCalls(third_iter_leaves);
+
+  int fourth_iter_leaves = 13;  // arbitrary value
+  auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
+  fourth_iter->SetNumLeafAdvanceCalls(fourth_iter_leaves);
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::move(first_iter));
+  iterators.push_back(std::move(second_iter));
+  iterators.push_back(std::move(third_iter));
+  iterators.push_back(std::move(fourth_iter));
+  DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+
+  EXPECT_THAT(or_iter.GetNumLeafAdvanceCalls(),
+              Eq(first_iter_leaves + second_iter_leaves + third_iter_leaves +
+                 fourth_iter_leaves));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, Advance) {
+  // Four disjoint, descending id lists must merge into one descending run.
+  std::vector<DocHitInfo> first_vector = {DocHitInfo(7), DocHitInfo(0)};
+
+  std::vector<DocHitInfo> second_vector = {DocHitInfo(6), DocHitInfo(1)};
+
+  std::vector<DocHitInfo> third_vector = {DocHitInfo(5), DocHitInfo(2)};
+
+  std::vector<DocHitInfo> fourth_vector = {DocHitInfo(4), DocHitInfo(3)};
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(first_vector));
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(second_vector));
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(third_vector));
+  iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(fourth_vector));
+  DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+
+  EXPECT_THAT(GetDocumentIds(&or_iter), ElementsAre(7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) {
+  // Arbitrary section ids for the documents in the DocHitInfoIterators.
+  // Created to test correct section_id_mask behavior.
+  SectionIdMask section_id_mask1 = 0b01000101;  // hits in sections 0, 2, 6
+  SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+  SectionIdMask section_id_mask3 = 0b00001100;  // hits in sections 2, 3
+  SectionIdMask section_id_mask4 = 0b00100100;  // hits in sections 2, 5
+  SectionIdMask mask_anded_result = 0b00000100;
+  SectionIdMask mask_ored_result = 0b01101111;
+
+  // All four children see the same document (4), so their masks are combined.
+  std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
+  std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)};
+  std::vector<DocHitInfo> third_vector = {DocHitInfo(4, section_id_mask3)};
+  std::vector<DocHitInfo> fourth_vector = {DocHitInfo(4, section_id_mask4)};
+
+  auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+  first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+  auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+  second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+  auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(third_vector);
+  third_iter->set_hit_intersect_section_ids_mask(section_id_mask3);
+
+  auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(fourth_vector);
+  fourth_iter->set_hit_intersect_section_ids_mask(section_id_mask4);
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::move(first_iter));
+  iterators.push_back(std::move(second_iter));
+  iterators.push_back(std::move(third_iter));
+  iterators.push_back(std::move(fourth_iter));
+
+  DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  // Union for the document's hit mask, intersection for the intersect mask.
+  EXPECT_THAT(or_iter.doc_hit_info().hit_section_ids_mask(),
+              Eq(mask_ored_result));
+  EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
new file mode 100644
index 0000000..58e7f2a
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict(
+    std::unique_ptr<DocHitInfoIterator> delegate,
+    const DocumentStore* document_store, const SchemaStore* schema_store,
+    std::string_view target_section)
+    : delegate_(std::move(delegate)),
+      // Dereferenced into reference members: per the header contract the
+      // pointers must be non-null and outlive this iterator.
+      document_store_(*document_store),
+      schema_store_(*schema_store),
+      target_section_(target_section) {}
+
+// Advances to the next delegate hit that has at least one section whose
+// metadata path equals target_section_. Hits with missing filter data or no
+// matching section are skipped.
+//
+// Returns:
+//   OK when a matching hit is found (doc_hit_info_ / the intersect mask are
+//   updated from the delegate).
+//   RESOURCE_EXHAUSTED when the delegate is exhausted (members are reset to
+//   invalid values).
+//
+// NOTE: the original implementation skipped non-matching hits via tail
+// recursion (`return Advance();`), which grows the stack by one frame per
+// skipped hit and can overflow on a long run of non-matching hits. This
+// version iterates instead; the observable behavior is unchanged.
+libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() {
+  while (delegate_->Advance().ok()) {
+    DocumentId document_id = delegate_->doc_hit_info().document_id();
+
+    SectionIdMask section_id_mask =
+        delegate_->doc_hit_info().hit_section_ids_mask();
+
+    auto data_or = document_store_.GetDocumentFilterData(document_id);
+    if (!data_or.ok()) {
+      // Ran into some error retrieving information on this hit, skip
+      continue;
+    }
+
+    // Guaranteed that the DocumentFilterData exists at this point
+    DocumentFilterData data = std::move(data_or).ValueOrDie();
+    SchemaTypeId schema_type_id = data.schema_type_id();
+
+    // A hit can be in multiple sections at once, need to check that at least
+    // one of the confirmed section ids match the name of the target section
+    while (section_id_mask != 0) {
+      // There was a hit in this section id; __builtin_ctz finds the lowest
+      // set bit (mask is non-zero here, so the call is well-defined).
+      SectionId section_id = __builtin_ctz(section_id_mask);
+
+      auto section_metadata_or =
+          schema_store_.GetSectionMetadata(schema_type_id, section_id);
+
+      if (section_metadata_or.ok()) {
+        const SectionMetadata* section_metadata =
+            section_metadata_or.ValueOrDie();
+
+        if (section_metadata->path == target_section_) {
+          // The hit was in the target section name, return OK/found
+          doc_hit_info_ = delegate_->doc_hit_info();
+          hit_intersect_section_ids_mask_ =
+              delegate_->hit_intersect_section_ids_mask();
+          return libtextclassifier3::Status::OK;
+        }
+      }
+
+      // Mark this section as checked
+      section_id_mask &= ~(1U << section_id);
+    }
+
+    // Didn't find a matching section name for this hit, go to the next hit
+  }
+
+  // Didn't find anything on the delegate iterator.
+  doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+  hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+  return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator");
+}
+
+int32_t DocHitInfoIteratorSectionRestrict::GetNumBlocksInspected() const {
+  // Pure pass-through: this filter inspects no index blocks of its own.
+  return delegate_->GetNumBlocksInspected();
+}
+
+int32_t DocHitInfoIteratorSectionRestrict::GetNumLeafAdvanceCalls() const {
+  // Pure pass-through: only the delegate performs leaf advances.
+  return delegate_->GetNumLeafAdvanceCalls();
+}
+
+std::string DocHitInfoIteratorSectionRestrict::ToString() const {
+  // e.g. "subject: <delegate description>".
+  return absl_ports::StrCat(target_section_, ": ", delegate_->ToString());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
new file mode 100644
index 0000000..f9b9b04
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
@@ -0,0 +1,67 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_SECTION_RESTRICT_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_SECTION_RESTRICT_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "utils/base/status.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// An iterator that helps filter for DocHitInfos whose term was in a section
+// named target_section.
+//
+// NOTE: This is a little different from the DocHitInfoIteratorFilter class.
+// That class is meant to be applied to the root of a query tree and filter over
+// all results at the end. This class is more used in the limited scope of a
+// term or a small group of terms.
+class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator {
+ public:
+  // Does not take any ownership, and all pointers must refer to valid objects
+  // that outlive the one constructed.
+  explicit DocHitInfoIteratorSectionRestrict(
+      std::unique_ptr<DocHitInfoIterator> delegate,
+      const DocumentStore* document_store, const SchemaStore* schema_store,
+      std::string_view target_section);
+
+  libtextclassifier3::Status Advance() override;
+
+  // Stats are delegated unchanged to the wrapped iterator.
+  int32_t GetNumBlocksInspected() const override;
+
+  int32_t GetNumLeafAdvanceCalls() const override;
+
+  std::string ToString() const override;
+
+ private:
+  std::unique_ptr<DocHitInfoIterator> delegate_;
+  // Non-owning references; see the constructor's lifetime contract.
+  const DocumentStore& document_store_;
+  const SchemaStore& schema_store_;
+
+  // Ensure that this does not outlive the underlying string value.
+  std::string_view target_section_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_SECTION_RESTRICT_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
new file mode 100644
index 0000000..df79c6d
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
@@ -0,0 +1,244 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+// Fixture that builds a real SchemaStore/DocumentStore on disk with a single
+// "email" schema type whose only indexed property is "subject" (section id 0).
+class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
+ protected:
+  DocHitInfoIteratorSectionRestrictTest()
+      : test_dir_(GetTestTempDir() + "/icing") {}
+
+  void SetUp() override {
+    filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+    document_ =
+        DocumentBuilder().SetKey("namespace", "uri").SetSchema("email").Build();
+
+    auto type_config = schema_.add_types();
+    type_config->set_schema_type("email");
+
+    // Add an indexed property so we generate section metadata on it
+    auto property = type_config->add_properties();
+    property->set_property_name(indexed_property_);
+    property->set_data_type(PropertyConfigProto::DataType::STRING);
+    property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+    property->mutable_indexing_config()->set_term_match_type(
+        TermMatchType::EXACT_ONLY);
+    property->mutable_indexing_config()->set_tokenizer_type(
+        IndexingConfig::TokenizerType::PLAIN);
+
+    // First and only indexed property, so it gets the first id of 0
+    indexed_section_id_ = 0;
+
+    ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+    ICING_ASSERT_OK(schema_store_->SetSchema(schema_));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        document_store_,
+        DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                              schema_store_.get()));
+  }
+
+  void TearDown() override {
+    // Destroy the stores before deleting their backing directory.
+    document_store_.reset();
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  std::unique_ptr<SchemaStore> schema_store_;
+  std::unique_ptr<DocumentStore> document_store_;
+  const Filesystem filesystem_;
+  const std::string test_dir_;
+  SchemaProto schema_;
+  DocumentProto document_;
+  // Name of the single indexed property; its section id is stored below.
+  const std::string indexed_property_ = "subject";
+  int indexed_section_id_;
+  FakeClock fake_clock_;
+};
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) {
+  std::unique_ptr<DocHitInfoIterator> original_iterator_empty =
+      std::make_unique<DocHitInfoIteratorDummy>();
+
+  DocHitInfoIteratorSectionRestrict filtered_iterator(
+      std::move(original_iterator_empty), document_store_.get(),
+      schema_store_.get(), /*target_section=*/"");
+
+  // An empty delegate yields no results regardless of the target section.
+  EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest, IncludesHitWithMatchingSection) {
+  // Populate the DocumentStore's FilterCache with this document's data
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store_->Put(document_));
+
+  SectionIdMask section_id_mask = 1U << indexed_section_id_;
+
+  // Create a hit that was found in the indexed section
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(document_id, section_id_mask)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Filtering for the indexed section name should get a result
+  DocHitInfoIteratorSectionRestrict section_restrict_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      indexed_property_);
+
+  EXPECT_THAT(GetDocumentIds(&section_restrict_iterator),
+              ElementsAre(document_id));
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest, NoMatchingDocumentFilterData) {
+  // Create a hit with a document id that doesn't exist in the DocumentStore yet
+  std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(/*document_id_in=*/0)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // With no DocumentFilterData available for the hit, the iterator must skip
+  // it and yield nothing.
+  DocHitInfoIteratorSectionRestrict section_restrict_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_section=*/"");
+
+  EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+       DoesntIncludeHitWithWrongSectionName) {
+  // Populate the DocumentStore's FilterCache with this document's data
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store_->Put(document_));
+
+  SectionIdMask section_id_mask = 1U << indexed_section_id_;
+
+  // Create a hit that was found in the indexed section
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(document_id, section_id_mask)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Filtering for a section name that isn't in the schema should yield no
+  // results.
+  DocHitInfoIteratorSectionRestrict section_restrict_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      "some_section_name");
+
+  EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+       DoesntIncludeHitWithNoSectionIds) {
+  // Populate the DocumentStore's FilterCache with this document's data
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store_->Put(document_));
+
+  // Create a hit that doesn't exist in any sections, so it shouldn't match any
+  // section filters
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(document_id, kSectionIdMaskNone)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorSectionRestrict section_restrict_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      indexed_property_);
+
+  EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+       DoesntIncludeHitWithDifferentSectionId) {
+  // Populate the DocumentStore's FilterCache with this document's data
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store_->Put(document_));
+
+  // Anything that's not 0, which is the indexed property
+  SectionId not_matching_section_id = 2;
+
+  // Create a hit that exists in a different section, so it shouldn't match any
+  // section filters.
+  // NOTE: shift 1U, not kSectionIdMaskNone — kSectionIdMaskNone is 0, so
+  // shifting it produced an empty mask and made this test a duplicate of
+  // DoesntIncludeHitWithNoSectionIds instead of testing a hit that really is
+  // in a different (non-indexed) section.
+  std::vector<DocHitInfo> doc_hit_infos = {
+      DocHitInfo(document_id, 1U << not_matching_section_id)};
+
+  std::unique_ptr<DocHitInfoIterator> original_iterator =
+      std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  DocHitInfoIteratorSectionRestrict section_restrict_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      indexed_property_);
+
+  EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumBlocksInspected) {
+  // The restrict iterator should pass the delegate's count through unchanged.
+  auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
+  original_iterator->SetNumBlocksInspected(5);
+
+  DocHitInfoIteratorSectionRestrict section_restrict_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_section=*/"");
+
+  EXPECT_THAT(section_restrict_iterator.GetNumBlocksInspected(), Eq(5));
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumLeafAdvanceCalls) {
+  // The restrict iterator should pass the delegate's count through unchanged.
+  auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
+  original_iterator->SetNumLeafAdvanceCalls(6);
+
+  DocHitInfoIteratorSectionRestrict section_restrict_iterator(
+      std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_section=*/"");
+
+  EXPECT_THAT(section_restrict_iterator.GetNumLeafAdvanceCalls(), Eq(6));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.cc b/icing/index/iterator/doc-hit-info-iterator-term.cc
new file mode 100644
index 0000000..9cbb438
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-term.cc
@@ -0,0 +1,125 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-term.h"
+
+#include <cstdint>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Renders the mask as a fixed-width bit string, most-significant section
+// first (e.g. a mask with only section 0 set ends in "...01").
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+  std::string mask(kMaxSectionId + 1, '0');
+  for (SectionId id = kMaxSectionId; id >= 0; --id) {
+    if ((section_id_mask >> id) & 1U) {
+      mask[kMaxSectionId - id] = '1';
+    }
+  }
+  return mask;
+}
+
+} // namespace
+
+// Moves to the next cached hit, lazily filling the cache on the first call.
+// Returns RESOURCE_EXHAUSTED when there are no hits left, after resetting
+// doc_hit_info_ / hit_intersect_section_ids_mask_ to invalid values.
+libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() {
+  if (cached_hits_idx_ == -1) {
+    // First Advance(): populate cached_hits_ from the index.
+    ICING_RETURN_IF_ERROR(RetrieveMoreHits());
+  } else {
+    ++cached_hits_idx_;
+  }
+  // Count every leaf advance so GetNumLeafAdvanceCalls() reflects actual
+  // usage; previously num_advance_calls_ was never incremented and the
+  // getter always returned 0.
+  ++num_advance_calls_;
+  if (cached_hits_idx_ == -1 ||
+      cached_hits_idx_ >= static_cast<int>(cached_hits_.size())) {
+    // Nothing more for the iterator to return. Set these members to invalid
+    // values.
+    doc_hit_info_ = DocHitInfo();
+    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+  doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
+  hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() {
+  // Exact match only. All hits in lite lexicon are exact.
+  // Resolve the term to its lexicon value index, encode that into a term id,
+  // then pull all of the term's hits (restricted to section_restrict_mask_)
+  // into cached_hits_.
+  ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
+  ICING_ASSIGN_OR_RETURN(uint32_t term_id,
+                         term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+  lite_index_->AppendHits(term_id, section_restrict_mask_,
+                          /*only_from_prefix_sections=*/false, &cached_hits_);
+  // Position the cursor on the first cached hit.
+  cached_hits_idx_ = 0;
+  return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermExact::ToString() const {
+  // Format: "<section-bitmask>:<term>".
+  std::string mask_str = SectionIdMaskToString(section_restrict_mask_);
+  return absl_ports::StrCat(mask_str, ":", term_);
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() {
+  // Take the union of hits from every lite term that term_ is a prefix of.
+  const size_t prefix_len = term_.length();
+  int num_matching_terms = 0;
+  for (LiteIndex::PrefixIterator lexicon_it =
+           lite_index_->FindTermPrefixes(term_);
+       lexicon_it.IsValid(); lexicon_it.Advance()) {
+    // A key exactly as long as the query term is an exact match; its hits
+    // are included even when they are not from prefix sections.
+    const bool is_exact_match = strlen(lexicon_it.GetKey()) == prefix_len;
+    ICING_ASSIGN_OR_RETURN(
+        uint32_t term_id,
+        term_id_codec_->EncodeTvi(lexicon_it.GetValueIndex(), TviType::LITE));
+    lite_index_->AppendHits(term_id, section_restrict_mask_,
+                            /*only_from_prefix_sections=*/!is_exact_match,
+                            &cached_hits_);
+    ++num_matching_terms;
+  }
+  // Hits appended from different terms may contain duplicate document ids;
+  // collapse them when more than one term matched.
+  if (num_matching_terms > 1) {
+    SortAndDedupeDocumentIds();
+  }
+  cached_hits_idx_ = 0;
+  return libtextclassifier3::Status::OK;
+}
+
+// Re-sorts cached_hits_ and merges entries that share a document_id by
+// folding their section hits into a single DocHitInfo.
+void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() {
+  // Re-sort cached document_ids and merge sections.
+  std::sort(cached_hits_.begin(), cached_hits_.end());
+
+  // Guard the empty case: the unconditional resize(idx + 1) below would
+  // otherwise grow an empty vector to one default (invalid-docid) DocHitInfo.
+  if (cached_hits_.empty()) {
+    return;
+  }
+
+  int idx = 0;
+  for (size_t i = 1; i < cached_hits_.size(); ++i) {
+    const DocHitInfo& hit_info = cached_hits_.at(i);
+    DocHitInfo& collapsed_hit_info = cached_hits_.at(idx);
+    if (collapsed_hit_info.document_id() == hit_info.document_id()) {
+      // Same document seen again: fold its sections into the collapsed entry.
+      collapsed_hit_info.MergeSectionsFrom(hit_info);
+    } else {
+      // New document_id.
+      cached_hits_.at(++idx) = hit_info;
+    }
+  }
+  // idx points to last doc hit info.
+  cached_hits_.resize(idx + 1);
+}
+
+std::string DocHitInfoIteratorTermPrefix::ToString() const {
+  // Format: "<section-bitmask>:<term>*" — the '*' marks prefix matching.
+  std::string mask_str = SectionIdMaskToString(section_restrict_mask_);
+  return absl_ports::StrCat(mask_str, ":", term_, "*");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.h b/icing/index/iterator/doc-hit-info-iterator-term.h
new file mode 100644
index 0000000..f209f0d
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-term.h
@@ -0,0 +1,108 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+// Base iterator over the hits of a single term. Subclasses choose how the
+// term is matched against the lexicon (exact vs. prefix) by implementing
+// RetrieveMoreHits().
+class DocHitInfoIteratorTerm : public DocHitInfoIterator {
+ public:
+  explicit DocHitInfoIteratorTerm(const TermIdCodec* term_id_codec,
+                                  LiteIndex* lite_index, std::string term,
+                                  SectionIdMask section_restrict_mask)
+      // Take `term` by value and move it into place; the previous
+      // const-by-value parameter forced an extra string copy.
+      : term_(std::move(term)),
+        lite_index_(lite_index),
+        cached_hits_idx_(-1),
+        term_id_codec_(term_id_codec),
+        num_advance_calls_(0),
+        section_restrict_mask_(section_restrict_mask) {}
+
+  libtextclassifier3::Status Advance() override;
+
+  int32_t GetNumBlocksInspected() const override {
+    // TODO(b/137862424): Implement this once the main index is added.
+    return 0;
+  }
+  int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+
+ protected:
+  // Add DocHitInfos corresponding to term_ to cached_hits_.
+  virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+  // The term whose hits this iterator produces.
+  const std::string term_;
+  LiteIndex* const lite_index_;
+  // Stores hits retrieved from the index. This may only be a subset of the hits
+  // that are present in the index. Current value pointed to by the Iterator is
+  // tracked by cached_hits_idx_.
+  std::vector<DocHitInfo> cached_hits_;
+  // Index into cached_hits_ of the current hit; -1 until the first Advance().
+  int cached_hits_idx_;
+  const TermIdCodec* term_id_codec_;
+  // Number of Advance() calls made on this leaf iterator.
+  int num_advance_calls_;
+  // Mask indicating which sections hits should be considered for.
+  // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+  const SectionIdMask section_restrict_mask_;
+};
+
+// Iterator over the hits of `term` using exact matching only. All hits in
+// the lite lexicon are exact.
+class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm {
+ public:
+  explicit DocHitInfoIteratorTermExact(const TermIdCodec* term_id_codec,
+                                       LiteIndex* lite_index,
+                                       const std::string& term,
+                                       SectionIdMask section_id_mask)
+      : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
+                               section_id_mask) {}
+
+  std::string ToString() const override;
+
+ protected:
+  libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+// Iterator over the hits of every indexed term that `term` is a prefix of.
+class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm {
+ public:
+  explicit DocHitInfoIteratorTermPrefix(const TermIdCodec* term_id_codec,
+                                        LiteIndex* lite_index,
+                                        const std::string& term,
+                                        SectionIdMask section_id_mask)
+      : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
+                               section_id_mask) {}
+
+  std::string ToString() const override;
+
+ protected:
+  libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+  // After retrieving DocHitInfos from the index, the same document can appear
+  // once per matching term — e.g. a DocHitInfo for docid 1 and "foo" and a
+  // DocHitInfo for docid 1 and "fool". Such duplicates should be merged into
+  // a single DocHitInfo per document, which is what this helper does.
+  void SortAndDedupeDocumentIds();
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h
new file mode 100644
index 0000000..ac9a3a9
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h
@@ -0,0 +1,115 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Test double for DocHitInfoIterator. Like real iterators, it reports an
+// invalid document id until Advance() is called; it then yields the supplied
+// doc_hit_infos in order, and finally returns RESOURCE_EXHAUSTED from
+// Advance() once they are used up.
+class DocHitInfoIteratorDummy : public DocHitInfoIterator {
+ public:
+  DocHitInfoIteratorDummy() = default;
+  explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos)
+      : doc_hit_infos_(std::move(doc_hit_infos)) {}
+
+  libtextclassifier3::Status Advance() override {
+    // Guard clause: out of canned hits.
+    if (next_idx_ >= doc_hit_infos_.size()) {
+      return absl_ports::ResourceExhaustedError(
+          "No more DocHitInfos in iterator");
+    }
+    doc_hit_info_ = doc_hit_infos_.at(next_idx_);
+    ++next_idx_;
+    return libtextclassifier3::Status::OK;
+  }
+
+  void set_hit_intersect_section_ids_mask(
+      SectionIdMask hit_intersect_section_ids_mask) {
+    hit_intersect_section_ids_mask_ = hit_intersect_section_ids_mask;
+  }
+
+  int32_t GetNumBlocksInspected() const override {
+    return num_blocks_inspected_;
+  }
+
+  void SetNumBlocksInspected(int32_t num_blocks_inspected) {
+    num_blocks_inspected_ = num_blocks_inspected;
+  }
+
+  int32_t GetNumLeafAdvanceCalls() const override {
+    return num_leaf_advance_calls_;
+  }
+
+  void SetNumLeafAdvanceCalls(int32_t num_leaf_advance_calls) {
+    num_leaf_advance_calls_ = num_leaf_advance_calls;
+  }
+
+  std::string ToString() const override {
+    // Renders as "<[docid,section_mask][docid,section_mask]...>".
+    std::string ret = "<";
+    for (const DocHitInfo& info : doc_hit_infos_) {
+      absl_ports::StrAppend(
+          &ret, IcingStringUtil::StringPrintf("[%d,%d]", info.document_id(),
+                                              info.hit_section_ids_mask()));
+    }
+    absl_ports::StrAppend(&ret, ">");
+    return ret;
+  }
+
+ private:
+  // Index of the next element of doc_hit_infos_ to yield.
+  int32_t next_idx_ = 0;
+  int32_t num_blocks_inspected_ = 0;
+  int32_t num_leaf_advance_calls_ = 0;
+  std::vector<DocHitInfo> doc_hit_infos_;
+};
+
+// Drains `iterator` and returns the document ids it produced, in order.
+inline std::vector<DocumentId> GetDocumentIds(DocHitInfoIterator* iterator) {
+  std::vector<DocumentId> document_ids;
+  while (iterator->Advance().ok()) {
+    document_ids.push_back(iterator->doc_hit_info().document_id());
+  }
+  return document_ids;
+}
+
+// Drains `iterator` and returns every DocHitInfo it produced, in order.
+inline std::vector<DocHitInfo> GetDocHitInfos(DocHitInfoIterator* iterator) {
+  std::vector<DocHitInfo> infos;
+  while (iterator->Advance().ok()) {
+    infos.push_back(iterator->doc_hit_info());
+  }
+  return infos;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h
new file mode 100644
index 0000000..eace911
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator.h
@@ -0,0 +1,99 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
+
+#include <cstdint>
+#include <string>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
+//
+// NOTE: You must call Advance() before calling hit_info() or
+// hit_intersect_section_ids_mask().
+//
+// Example:
+// DocHitInfoIterator itr = GetIterator(...);
+// while (itr.Advance().ok()) {
+//   HandleDocHitInfo(itr.hit_info());
+// }
+class DocHitInfoIterator {
+ public:
+  virtual ~DocHitInfoIterator() = default;
+
+  // Returns:
+  //   OK if was able to advance to a new document_id.
+  //   RESOURCE_EXHAUSTED if we've run out of document_ids to iterate over
+  virtual libtextclassifier3::Status Advance() = 0;
+
+  // Returns the DocHitInfo that the iterator is currently at. The DocHitInfo
+  // will have a kInvalidDocumentId if Advance() was not called after
+  // construction or if Advance returned an error.
+  const DocHitInfo& doc_hit_info() const { return doc_hit_info_; }
+
+  // SectionIdMask representing which sections (if any) have matched *ALL* query
+  // terms for the current document_id.
+  SectionIdMask hit_intersect_section_ids_mask() const {
+    return hit_intersect_section_ids_mask_;
+  }
+
+  // Gets the number of flash index blocks that have been read as a
+  // result of operations on this object.
+  virtual int32_t GetNumBlocksInspected() const = 0;
+
+  // HitIterators may be constructed into trees. Internal nodes will return the
+  // sum of the number of Advance() calls to all leaf nodes. Leaf nodes will
+  // return the number of times Advance() was called on it.
+  virtual int32_t GetNumLeafAdvanceCalls() const = 0;
+
+  // A string representing the iterator.
+  virtual std::string ToString() const = 0;
+
+ protected:
+  DocHitInfo doc_hit_info_;
+  SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+
+  // Helper function to advance the given iterator to at most the given
+  // document_id. Returns the document id it stopped at, or
+  // RESOURCE_EXHAUSTED (after resetting this iterator's current state to
+  // invalid values) if `it` ran out of hits first.
+  libtextclassifier3::StatusOr<DocumentId> AdvanceTo(DocHitInfoIterator* it,
+                                                     DocumentId document_id) {
+    while (it->Advance().ok()) {
+      // Document ids are iterated in reverse order, so the first hit at or
+      // below `document_id` is the stopping point.
+      if (it->doc_hit_info().document_id() <= document_id) {
+        return it->doc_hit_info().document_id();
+      }
+    }
+
+    // Didn't find anything for the other iterator, reset to invalid values and
+    // return.
+    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+};  // class DocHitInfoIterator
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator_benchmark.cc b/icing/index/iterator/doc-hit-info-iterator_benchmark.cc
new file mode 100644
index 0000000..90e4888
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator_benchmark.cc
@@ -0,0 +1,185 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/index/iterator:doc-hit-info-iterator_benchmark
+//
+// $
+// blaze-bin/icing/index/iterator/doc-hit-info-iterator_benchmark
+// --benchmarks=all
+//
+// Run on an Android device:
+// $ blaze build --config=android_arm64 -c opt --dynamic_mode=off
+// --copt=-gmlt
+// //icing/index/iterator:doc-hit-info-iterator_benchmark
+//
+// $ adb push
+// blaze-bin/icing/index/iterator/doc-hit-info-iterator_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/doc-hit-info-iterator_benchmark
+// --benchmarks=all
+
+// Functor to be used with std::generate to create a container of DocHitInfos.
+// Each call emits a DocHitInfo whose docid starts at starting_docid and
+// decreases by n on every subsequent call.
+// Ex. std::vector<DocHitInfo> infos(6);
+//     std::generate(infos.begin(), infos.end(), GeneratorEveryOtherN(25, 5));
+//     infos will now hold: {DocHitInfo(25), DocHitInfo(20), DocHitInfo(15),
+//     DocHitInfo(10), DocHitInfo(5), DocHitInfo(0)}
+struct GeneratorEveryOtherN {
+  explicit GeneratorEveryOtherN(DocumentId starting_docid, int n)
+      : current_docid(starting_docid), interval(n) {}
+
+  DocHitInfo operator()() {
+    // Emit the current docid (with every section bit set), then step down.
+    DocHitInfo next(current_docid, kSectionIdMaskAll);
+    current_docid -= interval;
+    return next;
+  }
+
+  DocumentId current_docid;
+  int interval;
+};
+
+// Benchmarks a binary AND iterator over two dummy leaf iterators whose hit
+// densities differ by 4x. state.range(0) is the starting docid, state.range(1)
+// the docid interval of the denser iterator.
+void BM_DocHitInfoIteratorAndBenchmark(benchmark::State& state) {
+  // First iterator: If starting_docid is 1024 and interval is 2, docids
+  // [1024, 1022, 1020, 1018, ..., 2, 0]
+  DocumentId starting_docid = state.range(0);
+  int interval = state.range(1);
+  std::vector<DocHitInfo> first_infos((starting_docid / interval) + 1);
+  std::generate(first_infos.begin(), first_infos.end(),
+                GeneratorEveryOtherN(starting_docid, interval));
+  std::unique_ptr<DocHitInfoIterator> first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_infos);
+
+  // Second iterator: An iterator with 1/4 of the hits as first_iter. If
+  // starting_docid is 1024 and interval is 2, docids
+  // [1024, 1016, 1008, 1000, ..., 8, 0]
+  interval *= 4;
+  std::vector<DocHitInfo> second_infos((starting_docid / interval) + 1);
+  std::generate(second_infos.begin(), second_infos.end(),
+                GeneratorEveryOtherN(starting_docid, interval));
+  std::unique_ptr<DocHitInfoIterator> second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_infos);
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iters;
+  iters.push_back(std::move(first_iter));
+  iters.push_back(std::move(second_iter));
+  std::unique_ptr<DocHitInfoIterator> and_iter =
+      CreateAndIterator(std::move(iters));
+  // NOTE(review): and_iter is built once outside the timing loop, so only the
+  // first benchmark iteration walks a non-empty iterator; later iterations
+  // see an already-exhausted iterator and measure almost nothing. Confirm
+  // whether the iterators should be rebuilt inside the loop (with
+  // state.PauseTiming()/ResumeTiming()).
+  for (auto _ : state) {
+    while (and_iter->Advance().ok()) {
+      // Intentionally left blank.
+    }
+  }
+}
+BENCHMARK(BM_DocHitInfoIteratorAndBenchmark)
+    ->ArgPair(1024, 1)
+    ->ArgPair(1024, 2)
+    ->ArgPair(1024, 4)
+    ->ArgPair(1024, 16)
+    ->ArgPair(1024, 128)
+    ->ArgPair(8192, 1)
+    ->ArgPair(8192, 2)
+    ->ArgPair(8192, 4)
+    ->ArgPair(8192, 16)
+    ->ArgPair(8192, 128)
+    ->ArgPair(65536, 1)
+    ->ArgPair(65536, 2)
+    ->ArgPair(65536, 4)
+    ->ArgPair(65536, 16)
+    ->ArgPair(65536, 128);
+
+// Benchmarks an n-ary AND iterator over three dummy leaf iterators with hit
+// densities 1x, 1/2x, and 1/8x. state.range(0) is the starting docid,
+// state.range(1) the docid interval of the densest iterator.
+void BM_DocHitInfoIteratorAndNaryBenchmark(benchmark::State& state) {
+  // First iterator: If starting_docid is 1024 and interval is 2, docids
+  // [1024, 1022, 1020, 1018, ..., 2, 0]
+  DocumentId starting_docid = state.range(0);
+  int interval = state.range(1);
+  std::vector<DocHitInfo> first_infos((starting_docid / interval) + 1);
+  std::generate(first_infos.begin(), first_infos.end(),
+                GeneratorEveryOtherN(starting_docid, interval));
+  std::unique_ptr<DocHitInfoIterator> first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_infos);
+
+  // Second iterator: An iterator with 1/2 of the hits as first_iter. If
+  // starting_docid is 1024 and interval is 2, docids
+  // [1024, 1020, 1016, 1012, ..., 4, 0]
+  interval *= 2;
+  std::vector<DocHitInfo> second_infos((starting_docid / interval) + 1);
+  std::generate(second_infos.begin(), second_infos.end(),
+                GeneratorEveryOtherN(starting_docid, interval));
+  std::unique_ptr<DocHitInfoIterator> second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_infos);
+
+  // Third iterator: An iterator with 1/4 of the hits as first_iter. If
+  // starting_docid is 1024 and interval is 2, docids
+  // [1024, 1016, 1008, 1000, ..., 8, 0]
+  interval *= 4;
+  std::vector<DocHitInfo> third_infos((starting_docid / interval) + 1);
+  std::generate(third_infos.begin(), third_infos.end(),
+                GeneratorEveryOtherN(starting_docid, interval));
+  std::unique_ptr<DocHitInfoIterator> third_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(third_infos);
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iters;
+  iters.push_back(std::move(first_iter));
+  iters.push_back(std::move(second_iter));
+  iters.push_back(std::move(third_iter));
+  std::unique_ptr<DocHitInfoIterator> and_iter =
+      CreateAndIterator(std::move(iters));
+  // NOTE(review): as in the binary benchmark above, and_iter is exhausted
+  // after the first state iteration — confirm whether it should be rebuilt
+  // inside the timing loop.
+  for (auto _ : state) {
+    while (and_iter->Advance().ok()) {
+      // Intentionally left blank.
+    }
+  }
+}
+BENCHMARK(BM_DocHitInfoIteratorAndNaryBenchmark)
+    ->ArgPair(1024, 1)
+    ->ArgPair(1024, 2)
+    ->ArgPair(1024, 4)
+    ->ArgPair(1024, 16)
+    ->ArgPair(1024, 128)
+    ->ArgPair(8192, 1)
+    ->ArgPair(8192, 2)
+    ->ArgPair(8192, 4)
+    ->ArgPair(8192, 16)
+    ->ArgPair(8192, 128)
+    ->ArgPair(65536, 1)
+    ->ArgPair(65536, 2)
+    ->ArgPair(65536, 4)
+    ->ArgPair(65536, 16)
+    ->ArgPair(65536, 128);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/lite-index.cc b/icing/index/lite-index.cc
new file mode 100644
index 0000000..56b8def
--- /dev/null
+++ b/icing/index/lite-index.cc
@@ -0,0 +1,420 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite-index.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/mman.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/core/icing-timer.h"
+#include "icing/legacy/index/icing-array-storage.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-lite-index-header.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Point at which we declare the trie full.
+constexpr double kTrieFullFraction = 0.95;
+
+// Path of the hit buffer backing file: the index's base name suffixed "hb".
+std::string MakeHitBufferFilename(const std::string& filename_base) {
+  std::string filename = filename_base;
+  filename += "hb";
+  return filename;
+}
+
+// Size in bytes of the lite index header struct stored at the start of the
+// hit buffer file.
+size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); }
+
+} // namespace
+
+// Sentinel "invalid" element value: term id 0 paired with a
+// default-constructed (invalid) Hit.
+const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue =
+    LiteIndex::Element(0, Hit()).value();
+
+// Two-phase construction: build the instance, then run Initialize() and
+// surface any setup failure through the returned StatusOr.
+libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
+    const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
+  auto lite_index =
+      std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem));
+  ICING_RETURN_IF_ERROR(lite_index->Initialize());
+  return std::move(lite_index);
+}
+
+// size is max size in elements. An appropriate lexicon and display
+// mapping size will be chosen based on hit buffer size.
+// The constructor only wires up members; all file I/O happens in
+// Initialize().
+LiteIndex::LiteIndex(const LiteIndex::Options& options,
+                     const IcingFilesystem* filesystem)
+    : hit_buffer_(*filesystem),
+      hit_buffer_crc_(0),
+      // The lexicon trie lives next to the hit buffer, suffixed "lexicon".
+      lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
+               filesystem),
+      header_mmap_(false, MAP_SHARED),
+      options_(options),
+      filesystem_(filesystem) {}
+
+LiteIndex::~LiteIndex() {
+  // Best-effort flush on teardown; a destructor cannot propagate the status.
+  if (initialized()) {
+    libtextclassifier3::Status unused = PersistToDisk();
+  }
+}
+
+// Runtime options for the lexicon trie: shared mapping with crc checking.
+IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() {
+  return IcingDynamicTrie::RuntimeOptions().set_storage_policy(
+      IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc);
+}
+
+// Opens (or creates) the lexicon and the hit buffer file, maps the header,
+// and validates the on-disk state (magic + crc). Any failure jumps to the
+// `error` label, which tears all partially-initialized state back down before
+// returning a non-OK status.
+libtextclassifier3::Status LiteIndex::Initialize() {
+  // Size of hit buffer's header struct, rounded up to the nearest number of
+  // system memory pages.
+  const size_t header_padded_size =
+      IcingMMapper::page_aligned_size(header_size());
+
+  // Variable declarations cannot cross goto jumps, so declare these up top.
+  libtextclassifier3::Status status;
+  uint64_t file_size;
+  IcingTimer timer;
+
+  if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
+      !lexicon_.Init()) {
+    return absl_ports::InternalError("Failed to initialize lexicon trie");
+  }
+
+  hit_buffer_fd_.reset(filesystem_->OpenForWrite(
+      MakeHitBufferFilename(options_.filename_base).c_str()));
+  if (!hit_buffer_fd_.is_valid()) {
+    status = absl_ports::InternalError("Failed to open hit buffer file");
+    goto error;
+  }
+
+  file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
+  if (file_size == IcingFilesystem::kBadFileSize) {
+    status = absl_ports::InternalError("Failed to query hit buffer file size");
+    goto error;
+  }
+
+  // A file smaller than the padded header was never fully created: either
+  // brand new (size 0) or unexpectedly truncated.
+  if (file_size < header_padded_size) {
+    if (file_size != 0) {
+      status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
+          "Hit buffer had unexpected size %" PRIu64, file_size));
+      goto error;
+    }
+
+    ICING_VLOG(2) << "Creating new hit buffer";
+    // Make sure files are fresh.
+    if (!lexicon_.Remove() ||
+        !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
+        !lexicon_.Init()) {
+      status =
+          absl_ports::InternalError("Failed to refresh lexicon during clear");
+      goto error;
+    }
+
+    // Create fresh hit buffer by first emptying the hit buffer file and then
+    // allocating header_padded_size of the cleared space.
+    if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
+        !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
+      status = absl_ports::InternalError("Failed to truncate hit buffer file");
+      goto error;
+    }
+
+    // Set up header.
+    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
+    header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
+        reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
+            header_mmap_.address()));
+    header_->Reset();
+
+    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
+                          sizeof(Element::Value), header_->cur_size(),
+                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
+      status = absl_ports::InternalError("Failed to initialize new hit buffer");
+      goto error;
+    }
+
+    // Persist a checksum for the freshly created (empty) index.
+    UpdateChecksum();
+  } else {
+    // Existing index: map the header and attach to the existing hit buffer.
+    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
+    header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
+        reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
+            header_mmap_.address()));
+
+    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
+                          sizeof(Element::Value), header_->cur_size(),
+                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
+      status = absl_ports::InternalError(
+          "Failed to re-initialize existing hit buffer");
+      goto error;
+    }
+
+    // Check integrity.
+    if (!header_->check_magic()) {
+      status = absl_ports::InternalError("Lite index header magic mismatch");
+      goto error;
+    }
+    Crc32 crc = ComputeChecksum();
+    if (crc.Get() != header_->lite_index_crc()) {
+      status = absl_ports::DataLossError(
+          IcingStringUtil::StringPrintf("Lite index crc check failed: %u vs %u",
+                                        crc.Get(), header_->lite_index_crc()));
+      goto error;
+    }
+  }
+
+  ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index init ok in %.3fms",
+                                                 timer.Elapsed() * 1000);
+  return status;
+
+error:
+  // Unwind partially-initialized state so a later retry starts clean.
+  header_ = nullptr;
+  header_mmap_.Unmap();
+  lexicon_.Close();
+  hit_buffer_crc_ = 0;
+  hit_buffer_.Reset();
+  hit_buffer_fd_.reset();
+  if (status.ok()) {
+    return absl_ports::InternalError(
+        "Error handling code ran but status was ok");
+  }
+  return status;
+}
+
+// Computes the overall index checksum: the header crc (excluding the stored
+// crc field itself) extended with the hit buffer and lexicon crcs.
+Crc32 LiteIndex::ComputeChecksum() {
+  IcingTimer timer;
+
+  // Update crcs.
+  uint32_t dependent_crcs[2];
+  hit_buffer_.UpdateCrc();
+  dependent_crcs[0] = hit_buffer_crc_;
+  dependent_crcs[1] = lexicon_.UpdateCrc();
+
+  // Compute the master crc.
+
+  // Header crc, excluding the actual crc field.
+  Crc32 all_crc(header_->CalculateHeaderCrc());
+  // Fold the raw bytes of the dependent crcs into the master crc.
+  all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
+                                  sizeof(dependent_crcs)));
+  ICING_VLOG(2) << IcingStringUtil::StringPrintf(
+      "Lite index crc computed in %.3fms", timer.Elapsed() * 1000);
+
+  return all_crc;
+}
+
+// Wipes all indexed content (lexicon + hit buffer), resets the header, and
+// writes a fresh checksum, leaving an empty but valid index.
+libtextclassifier3::Status LiteIndex::Reset() {
+  IcingTimer timer;
+
+  // TODO(b/140436942): When these components have been changed to return errors
+  // they should be propagated from here.
+  lexicon_.Clear();
+  hit_buffer_.Clear();
+  header_->Reset();
+  UpdateChecksum();
+
+  ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index clear in %.3fms",
+                                                 timer.Elapsed() * 1000);
+  return libtextclassifier3::Status::OK;
+}
+
+// Warms up the hit buffer and lexicon (e.g. ahead of query traffic).
+void LiteIndex::Warm() {
+  hit_buffer_.Warm();
+  lexicon_.Warm();
+}
+
+// Flushes the lexicon, hit buffer, checksum, and header to disk. All syncs
+// are attempted even if an earlier one fails.
+libtextclassifier3::Status LiteIndex::PersistToDisk() {
+  bool success = true;
+  if (!lexicon_.Sync()) {
+    ICING_VLOG(1) << "Failed to sync the lexicon.";
+    success = false;
+  }
+  // NOTE(review): hit_buffer_.Sync()'s result is discarded while the
+  // lexicon's is checked — confirm whether a hit buffer sync failure should
+  // also flip `success`.
+  hit_buffer_.Sync();
+  UpdateChecksum();
+  header_mmap_.Sync();
+
+  return (success) ? libtextclassifier3::Status::OK
+                   : absl_ports::InternalError(
+                         "Unable to sync lite index components.");
+}
+
+// Recomputes the full index checksum and stores it in the header.
+void LiteIndex::UpdateChecksum() {
+  header_->set_lite_index_crc(ComputeChecksum().Get());
+}
+
+// Inserts `term` into the lexicon and applies the match-type-specific
+// property via UpdateTerm(). Returns the term's value index (tvi) in the
+// lexicon on success.
+libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
+    const std::string& term, TermMatchType::Code term_match_type) {
+  uint32_t tvi;
+  if (!lexicon_.Insert(term.c_str(), "", &tvi, false)) {
+    return absl_ports::ResourceExhaustedError(
+        absl_ports::StrCat("Unable to add term ", term, " to lexicon!"));
+  }
+  ICING_RETURN_IF_ERROR(UpdateTerm(tvi, term_match_type));
+  return tvi;
+}
+
+// Marks the term at `tvi` as having hits in prefix sections when the match
+// type is PREFIX; other match types need no lexicon property.
+libtextclassifier3::Status LiteIndex::UpdateTerm(
+    uint32_t tvi, TermMatchType::Code term_match_type) {
+  if (term_match_type == TermMatchType::PREFIX &&
+      !lexicon_.SetProperty(tvi, kHasHitsInPrefixSection)) {
+    return absl_ports::ResourceExhaustedError(
+        "Insufficient disk space to create property!");
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
+ if (is_full()) {
+ return absl_ports::ResourceExhaustedError("Hit buffer is full!");
+ }
+
+ header_->set_last_added_docid(hit.document_id());
+
+ Element elt(term_id, hit);
+ uint32_t cur_size = header_->cur_size();
+ Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1);
+ if (valp == nullptr) {
+ return absl_ports::ResourceExhaustedError(
+ "Allocating more space in hit buffer failed!");
+ }
+ *valp = elt.value();
+ header_->set_cur_size(cur_size + 1);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
+ const std::string& term) const {
+ char dummy;
+ uint32_t tvi;
+ if (!lexicon_.Find(term.c_str(), &dummy, &tvi)) {
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Could not find ", term, " in the lexicon."));
+ }
+ return tvi;
+}
+
+uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ std::vector<DocHitInfo>* hits_out) {
+ uint32_t count = 0;
+ DocumentId last_document_id = kInvalidDocumentId;
+ for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
+ Element elt(hit_buffer_.array_cast<Element>()[idx]);
+ if (elt.term_id() != term_id) break;
+
+ const Hit& hit = elt.hit();
+ // Check sections.
+ if (((1u << hit.section_id()) & section_id_mask) == 0) {
+ continue;
+ }
+ // Check prefix section only.
+ if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
+ continue;
+ }
+ DocumentId document_id = hit.document_id();
+ if (document_id != last_document_id) {
+ count++;
+ if (hits_out != nullptr) {
+ hits_out->push_back(DocHitInfo(document_id));
+ }
+ last_document_id = document_id;
+ }
+ if (hits_out != nullptr) {
+ hits_out->back().UpdateSection(hit.section_id(), hit.score());
+ }
+ }
+ return count;
+}
+
+bool LiteIndex::is_full() const {
+ return (header_->cur_size() == options_.hit_buffer_size ||
+ lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
+}
+
+void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
+ absl_ports::StrAppend(
+ out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n",
+ header_->cur_size(),
+ options_.hit_buffer_size));
+
+ // Lexicon.
+ out->append("Lexicon stats:\n");
+ lexicon_.GetDebugInfo(verbosity, out);
+}
+
+uint32_t LiteIndex::Seek(uint32_t term_id) {
+ // Make searchable by sorting by hit buffer.
+ uint32_t sort_len = header_->cur_size() - header_->searchable_end();
+ if (sort_len > 0) {
+ IcingTimer timer;
+
+ auto* array_start =
+ hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size());
+ Element::Value* sort_start = array_start + header_->searchable_end();
+ std::sort(sort_start, array_start + header_->cur_size());
+
+ // Now merge with previous region. Since the previous region is already
+ // sorted and deduplicated, optimize the merge by skipping everything less
+ // than the new region's smallest value.
+ if (header_->searchable_end() > 0) {
+ std::inplace_merge(array_start, array_start + header_->searchable_end(),
+ array_start + header_->cur_size());
+ }
+ ICING_VLOG(2) << IcingStringUtil::StringPrintf(
+ "Lite index sort and merge %u into %u in %.3fms", sort_len,
+ header_->searchable_end(), timer.Elapsed() * 1000);
+
+ // Now the entire array is sorted.
+ header_->set_searchable_end(header_->cur_size());
+
+ // Update crc in-line.
+ UpdateChecksum();
+ }
+
+ // Binary search for our term_id. Make sure we get the first
+ // element. Using kMaxDocumentIdSortValue ensures this for the hit value.
+ Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
+
+ const Element::Value* array = hit_buffer_.array_cast<Element::Value>();
+ const Element::Value* ptr =
+ std::lower_bound(array, array + header_->cur_size(), elt.value());
+ return ptr - array;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/lite-index.h b/icing/index/lite-index.h
new file mode 100644
index 0000000..ff573a0
--- /dev/null
+++ b/icing/index/lite-index.h
@@ -0,0 +1,223 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A small index with continuous updates (doesn't need explicit Flush
+// to persist) but has more possibility for corruption. It can always
+// detect corruption reliably.
+
+#ifndef ICING_INDEX_LITE_INDEX_H_
+#define ICING_INDEX_LITE_INDEX_H_
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/legacy/index/icing-array-storage.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-lite-index-header.h"
+#include "icing/legacy/index/icing-lite-index-options.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/bit-util.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+enum TermPropertyId {
+ kHasHitsInPrefixSection = 0,
+};
+
+class LiteIndex {
+ public:
+ // An entry in the hit buffer.
+ class Element {
+ public:
+ // Layout bits: 24 termid + 32 hit value + 8 hit score.
+ using Value = uint64_t;
+
+ static constexpr int kTermIdBits = 24;
+ static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
+ static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
+
+ static const Value kInvalidValue;
+
+ explicit Element(Value v = kInvalidValue) : value_(v) {}
+
+ Element(uint32_t term_id, const Hit& hit) {
+ static_assert(
+ kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
+ "LiteIndexElementTooBig");
+
+ value_ = 0;
+ // Term id goes into the most significant bits because it takes
+ // precedence in sorts.
+ bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
+ &value_);
+ bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
+ bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
+ }
+
+ uint32_t term_id() const {
+ return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
+ kTermIdBits);
+ }
+
+ Hit hit() const {
+ return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
+ bit_util::BitfieldGet(value_, 0, kHitScoreBits));
+ }
+
+ Value value() const { return value_; }
+
+ private:
+ Value value_;
+ };
+
+ using Options = IcingLiteIndexOptions;
+
+ // Updates checksum of subcomponents.
+ ~LiteIndex();
+
+ // Creates lite index from storage. The files will be created if they do not
+ // already exist.
+ // If Create() fails, a non-ok Status will be returned.
+ static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create(
+ const Options& options, const IcingFilesystem* filesystem);
+
+ // Resets all internal members of the index. Returns OK if all operations were
+ // successful.
+ libtextclassifier3::Status Reset();
+
+ // Advises the OS to cache pages in the index, which will be accessed for a
+ // query soon.
+ void Warm();
+
+ // Syncs all modified files in the index to disk. Returns non-OK status if any
+ // file fails to sync properly.
+ libtextclassifier3::Status PersistToDisk();
+
+ // Calculate the checksum of all sub-components of the LiteIndex
+ Crc32 ComputeChecksum();
+
+ // Returns term_id if term found, NOT_FOUND otherwise.
+ libtextclassifier3::StatusOr<uint32_t> FindTerm(
+ const std::string& term) const;
+
+ // Returns an iterator for all terms for which 'prefix' is a prefix.
+ class PrefixIterator {
+ public:
+ explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate)
+ : delegate_(delegate) {}
+ bool IsValid() const { return delegate_.IsValid(); }
+
+ void Advance() { delegate_.Advance(); }
+
+ const char* GetKey() const { return delegate_.GetKey(); }
+
+ uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); }
+
+ private:
+ IcingDynamicTrie::Iterator delegate_;
+ };
+
+ PrefixIterator FindTermPrefixes(const std::string& prefix) const {
+ return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str()));
+ }
+
+ // Insert a term. Returns non-OK if lexicon is full.
+ libtextclassifier3::StatusOr<uint32_t> InsertTerm(
+ const std::string& term, TermMatchType::Code term_match_type);
+
+ // Updates term properties by setting the bit for has_hits_in_prefix_section
+ // only if term_match_type == PREFIX. Otherwise, this does nothing.
+ libtextclassifier3::Status UpdateTerm(uint32_t tvi,
+ TermMatchType::Code term_match_type);
+
+ // Append hit to buffer. term_id must be encoded using the same term_id_codec
+ // supplied to the index constructor. Returns non-OK if hit cannot be added
+ // (either due to hit buffer or file system capacity reached).
+ libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit);
+
+ // Add all hits with term_id from the sections specified in section_id_mask,
+ // skipping hits in non-prefix sections if only_from_prefix_sections is true,
+ // to hits_out.
+ uint32_t AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ std::vector<DocHitInfo>* hits_out);
+
+ // Check if buffer has reached its capacity.
+ bool is_full() const;
+
+ constexpr static uint32_t max_hit_buffer_size() {
+ return std::numeric_limits<uint32_t>::max() / sizeof(LiteIndex::Element);
+ }
+
+ // We keep track of the last added document_id. This is always the largest
+ // document_id that has been added because hits can only be added in order of
+ // increasing document_id.
+ DocumentId last_added_document_id() const {
+ return header_->last_added_docid();
+ }
+
+ // Returns debug information for the index in out.
+ // verbosity <= 0, simplest debug information - size of lexicon, hit buffer
+ // verbosity > 0, more detailed debug information from the lexicon.
+ void GetDebugInfo(int verbosity, std::string* out) const;
+
+ private:
+ static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
+
+ LiteIndex(const Options& options, const IcingFilesystem* filesystem);
+
+ // Initializes lite index from storage. Must be called exactly once after
+ // object construction. If init fails, returns a non-ok Status.
+ libtextclassifier3::Status Initialize();
+
+ bool initialized() const { return header_ != nullptr; }
+
+ // Sets the computed checksum in the header
+ void UpdateChecksum();
+
+ // Returns the position of the first element with term_id, or the size of the
+ // hit buffer if term_id is not present.
+ uint32_t Seek(uint32_t term_id);
+
+ ScopedFd hit_buffer_fd_;
+
+ IcingArrayStorage hit_buffer_;
+ uint32_t hit_buffer_crc_;
+ IcingDynamicTrie lexicon_;
+ // TODO(b/140437260): Port over to MemoryMappedFile
+ IcingMMapper header_mmap_;
+ std::unique_ptr<IcingLiteIndex_Header> header_;
+ const Options options_;
+ // TODO(b/139087650) Move to icing::Filesystem
+ const IcingFilesystem* const filesystem_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_LITE_INDEX_H_
diff --git a/icing/index/term-id-codec.cc b/icing/index/term-id-codec.cc
new file mode 100644
index 0000000..49e75f6
--- /dev/null
+++ b/icing/index/term-id-codec.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/term-id-codec.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/legacy/core/icing-string-util.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<std::unique_ptr<TermIdCodec>> TermIdCodec::Create(
+ uint32_t max_main_tvi, uint32_t max_lite_tvi) {
+ uint64_t sum =
+ static_cast<uint64_t>(max_main_tvi) + static_cast<uint64_t>(max_lite_tvi);
+ if (sum > std::numeric_limits<uint32_t>::max()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Sum of max_main_tvi, %d, and max_lite_tvi, %d must be less than the "
+ "uint32_t max, %d.",
+ max_main_tvi, max_lite_tvi, std::numeric_limits<uint32_t>::max()));
+ }
+
+ // TODO(cassiewang): When we convert these values to signed ints, we should
+ // check to make sure they're >= 0.
+
+ return std::unique_ptr<TermIdCodec>(
+ new TermIdCodec(max_main_tvi, max_lite_tvi));
+}
+
+libtextclassifier3::StatusOr<uint32_t> TermIdCodec::EncodeTvi(
+ uint32_t tvi, TviType tvi_type) const {
+ switch (tvi_type) {
+ case TviType::MAIN:
+ if (tvi >= max_main_tvi_) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Main tvi %d is greater "
+ "than or equal to the max_main_tvi %d",
+ tvi, max_main_tvi_));
+ }
+ return tvi;
+ case TviType::LITE: {
+ if (tvi >= max_lite_tvi_) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Lite tvi %d is greater "
+ "than or equal to the max_lite_tvi %d",
+ tvi, max_lite_tvi_));
+ }
+ return max_main_tvi_ + tvi;
+ }
+ }
+}
+
+libtextclassifier3::StatusOr<TviType> TermIdCodec::DecodeTviType(
+ uint32_t term_id) const {
+ if (term_id < max_main_tvi_) {
+ return TviType::MAIN;
+ } else if (term_id < max_term_id()) {
+ return TviType::LITE;
+ }
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Given TermId %d is greater than or equal to the max TermId %d", term_id,
+ max_term_id()));
+}
+
+libtextclassifier3::StatusOr<TermIdCodec::DecodedTermInfo>
+TermIdCodec::DecodeTermInfo(uint32_t term_id) const {
+ DecodedTermInfo result;
+ ICING_ASSIGN_OR_RETURN(result.tvi_type, DecodeTviType(term_id));
+ switch (result.tvi_type) {
+ case TviType::MAIN:
+ result.tvi = term_id;
+ break;
+ case TviType::LITE:
+ result.tvi = term_id - max_main_tvi_;
+ break;
+ }
+ return result;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/term-id-codec.h b/icing/index/term-id-codec.h
new file mode 100644
index 0000000..cead108
--- /dev/null
+++ b/icing/index/term-id-codec.h
@@ -0,0 +1,107 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_TERM_ID_CODEC_H_
+#define ICING_INDEX_TERM_ID_CODEC_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "utils/base/statusor.h"
+
+// Encodes/decodes TermIds into different TviTypes. A "tvi" is a
+// term_value_index into some space, essentially a unique id within that space.
+// Across TviTypes, tvis are not necessarily unique (i.e. we can have a tvi of 0
+// in the LITE space and a tvi of 0 in the MAIN space). The codec maps tvis into
+// one overall TermId space so that tvis can be represented by a unique TermId
+// across all TviTypes (i.e. a MAIN tvi of 0 can be represented by 0, and a LITE
+// tvi of 0 can be represented by 10). The max TermId will be the sum of the max
+// MAIN tvi and the max LITE tvi.
+//
+// Example use:
+// ICING_ASSIGN_OR_RETURN(auto term_id_codec,
+//     TermIdCodec::Create(/*max_main_tvi=*/5, /*max_lite_tvi=*/5));
+//
+// term_id_codec->DecodeTviType(0);  // TviType::MAIN
+// term_id_codec->DecodeTviType(4);  // TviType::MAIN
+// term_id_codec->DecodeTviType(5);  // TviType::LITE
+// term_id_codec->DecodeTviType(9);  // TviType::LITE
+//
+// term_id_codec->DecodeTviType(100);  // INVALID_ARGUMENT, exceeds max TermId
+//
+// TODO(cassiewang): Use signed integers for the tvi values. Currently, the max
+// values that are passed in are ~5 million for max_main_tvi, and ~1 million for
+// max_lite_tvi. Since the sum of both of these is still well under the int32_t
+// max, we should use signed integers (go/totw/159). But since
+// we're getting these values from icing::DynamicTrie, we need to convert all
+// the uints at once to avoid even worse undefined conversion behavior.
+namespace icing {
+namespace lib {
+
+enum TviType { MAIN, LITE };
+
+class TermIdCodec {
+ public:
+ struct DecodedTermInfo {
+ TviType tvi_type;
+ uint32_t tvi;
+ };
+
+ // Encodes/decodes TermIds based on a max main tvi and a max lite tvi. The max
+ // tvis are an exclusive upper bound on the values. For example, Create(5, 5)
+ // creates a MAIN encoding that holds [0, 1, 2, 3, 4] TermIds and a LITE
+ // encoding that holds [5, 6, 7, 8, 9] TermIds.
+ //
+ // Returns:
+ // unique_ptr to a TermIdCodec on success
+ // INVALID_ARGUMENT if the sum of max_main_tvi and max_lite_tvi is greater
+ // than the max uint32_t value
+ static libtextclassifier3::StatusOr<std::unique_ptr<TermIdCodec>> Create(
+ uint32_t max_main_tvi, uint32_t max_lite_tvi);
+
+ // Returns:
+ // TermId that would represent the given tvi of tvi_type
+ // INVALID_ARGUMENT if the tvi of tvi_type would exceed the max TermId
+ libtextclassifier3::StatusOr<uint32_t> EncodeTvi(uint32_t tvi,
+ TviType tvi_type) const;
+
+ // Returns:
+ // TviType of the encoded TermId
+ // INVALID_ARGUMENT if the term_id exceeds the max TermId
+ libtextclassifier3::StatusOr<TviType> DecodeTviType(uint32_t term_id) const;
+
+ // Returns:
+ // Decoded info of the given term_id
+ // INVALID_ARGUMENT if the term_id exceeds the max TermId
+ libtextclassifier3::StatusOr<DecodedTermInfo> DecodeTermInfo(
+ uint32_t term_id) const;
+
+ uint32_t max_main_tvi() const { return max_main_tvi_; }
+
+ uint32_t max_lite_tvi() const { return max_lite_tvi_; }
+
+ uint32_t max_term_id() const { return max_main_tvi_ + max_lite_tvi_; }
+
+ private:
+ explicit TermIdCodec(uint32_t max_main_tvi, uint32_t max_lite_tvi)
+ : max_main_tvi_(max_main_tvi), max_lite_tvi_(max_lite_tvi) {}
+
+ uint32_t max_main_tvi_;
+ uint32_t max_lite_tvi_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_TERM_ID_CODEC_H_
diff --git a/icing/index/term-id-codec_test.cc b/icing/index/term-id-codec_test.cc
new file mode 100644
index 0000000..fa7c9e3
--- /dev/null
+++ b/icing/index/term-id-codec_test.cc
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/term-id-codec.h"
+
+#include <cstdint>
+#include <limits>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+
+TEST(TermIdCodecTest, CreateExceedsInt32MaxInvalid) {
+ EXPECT_THAT(TermIdCodec::Create(
+ /*max_main_tvi=*/std::numeric_limits<uint32_t>::max() - 10,
+ /*max_lite_tvi=*/std::numeric_limits<uint32_t>::max() - 10),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(TermIdCodecTest, CreateWithPositiveTvisOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIdCodec> codec,
+ TermIdCodec::Create(/*max_main_tvi=*/10, /*max_lite_tvi=*/10));
+}
+
+TEST(TermIdCodecTest, CreateWithZeroTvisOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIdCodec> codec,
+ TermIdCodec::Create(/*max_main_tvi=*/0, /*max_lite_tvi=*/0));
+}
+
+TEST(TermIdCodecTest, Encode) {
+ // Create a codec where main TVIs are: [0,3), lite: [3,10)
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIdCodec> codec,
+ TermIdCodec::Create(/*max_main_tvi=*/3, /*max_lite_tvi=*/7));
+
+ EXPECT_THAT(codec->EncodeTvi(0, TviType::MAIN), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(codec->EncodeTvi(2, TviType::MAIN), IsOkAndHolds(Eq(2)));
+ EXPECT_THAT(codec->EncodeTvi(3, TviType::MAIN),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(codec->EncodeTvi(0, TviType::LITE), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(codec->EncodeTvi(6, TviType::LITE), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(codec->EncodeTvi(7, TviType::LITE),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(TermIdCodecTest, DecodeTermInfo) {
+ // Create a codec where main TVIs are: [0,3), lite: [3,10)
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIdCodec> codec,
+ TermIdCodec::Create(/*max_main_tvi=*/3, /*max_lite_tvi=*/7));
+
+ ICING_ASSERT_OK_AND_ASSIGN(TermIdCodec::DecodedTermInfo decoded_term,
+ codec->DecodeTermInfo(0));
+ EXPECT_THAT(decoded_term.tvi_type, Eq(TviType::MAIN));
+ EXPECT_THAT(decoded_term.tvi, Eq(0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(decoded_term, codec->DecodeTermInfo(3));
+ EXPECT_THAT(decoded_term.tvi_type, Eq(TviType::LITE));
+ EXPECT_THAT(decoded_term.tvi, Eq(0));
+
+ EXPECT_THAT(codec->DecodeTermInfo(10),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(TermIdCodecTest, TviType) {
+ // Create a codec where main TVIs are: [0,3), lite: [3,10)
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIdCodec> codec,
+ TermIdCodec::Create(/*max_main_tvi=*/3, /*max_lite_tvi=*/7));
+
+ EXPECT_THAT(codec->DecodeTviType(0), IsOkAndHolds(Eq(TviType::MAIN)));
+ EXPECT_THAT(codec->DecodeTviType(2), IsOkAndHolds(Eq(TviType::MAIN)));
+ EXPECT_THAT(codec->DecodeTviType(3), IsOkAndHolds(Eq(TviType::LITE)));
+ EXPECT_THAT(codec->DecodeTviType(9), IsOkAndHolds(Eq(TviType::LITE)));
+ EXPECT_THAT(codec->DecodeTviType(10),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(TermIdCodecTest, Max) {
+ // Create a codec where main TVIs are: [0,3), lite: [3,10)
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIdCodec> codec,
+ TermIdCodec::Create(/*max_main_tvi=*/3, /*max_lite_tvi=*/7));
+
+ EXPECT_THAT(codec->max_main_tvi(), Eq(3));
+ EXPECT_THAT(codec->max_lite_tvi(), Eq(7));
+ EXPECT_THAT(codec->max_term_id(), Eq(10));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/core/icing-compat.h b/icing/legacy/core/icing-compat.h
new file mode 100644
index 0000000..4340707
--- /dev/null
+++ b/icing/legacy/core/icing-compat.h
@@ -0,0 +1,33 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Author: csyoung@google.com (C. Sean Young)
+
+#ifndef ICING_LEGACY_CORE_ICING_COMPAT_H_
+#define ICING_LEGACY_CORE_ICING_COMPAT_H_
+
+#ifdef __APPLE__
+// iOS does not allow dlopen/dlclose/dlsym.
+#define ICING_ALLOW_DYNAMIC_EXTENSIONS 0
+// Xcode 6.3/LLVM 3.6 removed std::hash specialization for std::basic_string,
+// so we need the versions from util/hash/hash.h. The implementations in
+// util/hash/* have added an AVOID_TRADE_SECRET_CODE macro that's defined on
+// Apple (and some other client targets) to prevent leaking proprietary hash
+// code.
+#else
+#define ICING_ALLOW_DYNAMIC_EXTENSIONS 1
+#endif // __APPLE__
+
+#endif // ICING_LEGACY_CORE_ICING_COMPAT_H_
diff --git a/icing/legacy/core/icing-core-types.h b/icing/legacy/core/icing-core-types.h
new file mode 100644
index 0000000..cc12663
--- /dev/null
+++ b/icing/legacy/core/icing-core-types.h
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: sbanacho@google.com (Scott Banachowski)
+//
+// This header is used to declare typedefs and constants
+// that are shared across several modules.
+
+#ifndef ICING_LEGACY_CORE_ICING_CORE_TYPES_H_
+#define ICING_LEGACY_CORE_ICING_CORE_TYPES_H_
+
+#include <stdint.h>
+
+#include <cstddef> // size_t not defined implicitly for all platforms.
+#include <vector>
+
+#include "icing/legacy/core/icing-compat.h"
+
+namespace icing {
+namespace lib {
+
+enum IcingTokenizerType {
+ // Group tokens by clustering characters that match IsRuneLetter
+ // together.
+ TOKENIZER_PLAIN,
+ // Marks tokens that look like html tags (enclosed in <>) or entities
+ // (enclosed in &;).
+ TOKENIZER_HTML,
+ // Skip over html tags.
+ TOKENIZER_HTML_IGNORE_TAGS,
+ // Email address parsing. Marks the address, local address and rfc token
+ // portion of from, to, cc headers. This assumes the rfc822 tokens
+ // were pre-processed to canonical form by the Android Java Rfc822Tokenizer.
+ TOKENIZER_RFC822,
+ // For tokenizing queries, recognizes query syntax.
+ TOKENIZER_QUERY,
+ // For tokenizing simple queries, which only breaks on whitespace tokens.
+ TOKENIZER_QUERY_SIMPLE,
+ // For tokenizing ST-like queries.
+ TOKENIZER_ST_QUERY,
+ // For tokenizing URLs.
+ TOKENIZER_URL,
+ // For not tokenizing and returning one token same as the input.
+ TOKENIZER_VERBATIM,
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_CORE_ICING_CORE_TYPES_H_
diff --git a/icing/legacy/core/icing-packed-pod.h b/icing/legacy/core/icing-packed-pod.h
new file mode 100644
index 0000000..b2db680
--- /dev/null
+++ b/icing/legacy/core/icing-packed-pod.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_LEGACY_CORE_ICING_PACKED_POD_H_
+#define ICING_LEGACY_CORE_ICING_PACKED_POD_H_
+
+#include <type_traits>
+
+namespace icing {
+namespace lib {
+
+// Any struct whose size is not a multiple of its alignment will lead to a
+// padding of sizeof(T) % alignof(T) bytes per element when creating an array of
+// that type.
+template <typename T>
+struct no_array_padding
+ : std::integral_constant<bool, sizeof(T) % alignof(T) == 0> {};
+
+// See go/icing-ubsan for details on requirements of packed objects.
+template <typename T>
+struct icing_is_packed_pod
+ : std::conjunction<std::integral_constant<bool, alignof(T) == 1>,
+ std::is_standard_layout<T>,
+ std::is_trivially_copyable<T>> {};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_CORE_ICING_PACKED_POD_H_
diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc
new file mode 100644
index 0000000..1954cd3
--- /dev/null
+++ b/icing/legacy/core/icing-string-util.cc
@@ -0,0 +1,105 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+// sbanacho@google.com (Scott Banachowski)
+//
+// This is a list of IsGoogleLetter letters. It is copied from
+// google3/util/utf8/proptables/letters.txt CL 19164202.
+#include "icing/legacy/core/icing-string-util.h"
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+
+#include "icing/legacy/portable/icing-zlib.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {} // namespace
+uint32_t IcingStringUtil::UpdateCrc32(uint32_t crc, const char *str, int len) {
+ if (len > 0) {
+ crc = ~crc32(~crc, reinterpret_cast<const Bytef *>(str), len);
+ }
+ return crc;
+}
+
+uint32_t IcingStringUtil::UpdateAtPositionCrc32(uint32_t crc, int orig_len,
+ int offset,
+ const char *xored_str,
+ int len) {
+ // For appending, use UpdateCrc32.
+ if (offset + len > orig_len) {
+ ICING_LOG(FATAL) << "offset: " << offset << " length: " << len
+ << "original_length: " << orig_len;
+ }
+
+ // We have CRC(A|U|B) and we want CRC(A|V|B) where U is the slice
+ // that updated to V.
+ //
+ // Then if xored_str = X = U ^ V:
+ //
+ // CRC(A|V|B) = CRC(0_lenA|X|0_lenB ^ A|U|B)
+ // = CRC(0_lenA|X|0_lenB) ^ CRC(A|U|B)
+ // = CRC_COMBINE(CRC(0_lenA), CRC_COMBINE(CRC(X), CRC(0_lenB)) ^ CRC(A|U|B)
+ //
+ // But CRC(0s) = 0, and CRC_COMBINE(0, X) = X, so this becomes
+ // = CRC_COMBINE(CRC(X), CRC(0_lenB)) ^ CRC(A|U|B)
+ uint32_t update_crc = UpdateCrc32(0, xored_str, len);
+ update_crc = crc32_combine(update_crc, 0, orig_len - (offset + len));
+ return crc ^ update_crc;
+}
+
+void IcingStringUtil::SStringAppendV(std::string *strp, int bufsize,
+ const char *fmt, va_list arglist) {
+ int capacity = bufsize;
+ if (capacity <= 0) {
+ va_list backup;
+ va_copy(backup, arglist);
+ capacity = vsnprintf(nullptr, 0, fmt, backup);
+ va_end(arglist);
+ }
+
+ size_t start = strp->size();
+ strp->resize(strp->size() + capacity + 1);
+
+ int written = vsnprintf(&(*strp)[start], capacity + 1, fmt, arglist);
+ va_end(arglist);
+ strp->resize(start + std::min(capacity, written));
+}
+
+void IcingStringUtil::SStringAppendF(std::string *strp, int bufsize,
+ const char *fmt, ...) {
+ va_list arglist;
+ va_start(arglist, fmt);
+ SStringAppendV(strp, bufsize, fmt, arglist);
+}
+
+std::string IcingStringUtil::StringPrintf(const char *fmt, ...) {
+ std::string s;
+ va_list arglist;
+ va_start(arglist, fmt);
+ SStringAppendV(&s, 0, fmt, arglist);
+ return s;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h
new file mode 100644
index 0000000..01c17f1
--- /dev/null
+++ b/icing/legacy/core/icing-string-util.h
@@ -0,0 +1,69 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+// sbanacho@google.com (Scott Banachowski)
+
+#ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
+#define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
+
+#include <stdarg.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "icing/legacy/core/icing-compat.h"
+
+namespace icing {
+namespace lib {
+
// Static-only collection of string helpers: UTF-8 byte classification,
// rolling crc32 maintenance, and printf-style formatting into std::string.
class IcingStringUtil {
 public:
  // Returns true if the character is not the first byte of
  // a multi-byte UTF8 character
  static bool IsContinuationByte(char byte) {
    return (static_cast<uint8_t>(byte) & 0xC0) == 0x80;
  }

  // Returns true for 7-bit ASCII bytes (high bit clear).
  static bool IsAsciiChar(char c) { return static_cast<signed char>(c) >= 0; }

  // Update a rolling crc32. This undoes the one's complement
  // pre-conditioning and post-conditioning of zlib's
  // crc32. Therefore, UpdateCrc32(0, str, len) != HashCrc32(str,
  // len).
  static uint32_t UpdateCrc32(uint32_t crc, const char *str, int len);

  // Update a string's rolling crc for when its value at offset is
  // xor'ed with the buffer [xored_str, xored_str + len).
  //
  // REQUIRES: orig_len >= offset + len.
  static uint32_t UpdateAtPositionCrc32(uint32_t crc, int orig_len, int offset,
                                        const char *xored_str, int len);

  // Append vsnprintf to strp. If bufsize hint is > 0 it is
  // used. Otherwise we compute the required bufsize (which is somewhat
  // expensive).
  // NOTE: SStringAppendV consumes arglist (it calls va_end on it).
  static void SStringAppendV(std::string *strp, int bufsize, const char *fmt,
                             va_list arglist);
  static void SStringAppendF(std::string *strp, int bufsize, const char *fmt,
                             ...) __attribute__((format(printf, 3, 4)));
  static std::string StringPrintf(const char *fmt, ...)
      __attribute__((format(printf, 1, 2)));
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
diff --git a/icing/legacy/core/icing-timer.cc b/icing/legacy/core/icing-timer.cc
new file mode 100644
index 0000000..f29ac40
--- /dev/null
+++ b/icing/legacy/core/icing-timer.cc
@@ -0,0 +1,43 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/core/icing-timer.h"
+
+namespace icing {
+namespace lib {
+
// Seconds since the Unix epoch, with microsecond resolution.
double IcingTimer::WallTimeNow() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return tv.tv_sec + tv.tv_usec / 1e6;
}

// Clock used for Elapsed(): monotonic where available (immune to wall-clock
// adjustments), wall time on Apple targets.
double IcingTimer::ClockTime() {
#ifdef __APPLE__
  // iOS targets can't rely on clock_gettime(). So, fallback to WallTimeNow().
  return WallTimeNow();
#else
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec / 1e9;
#endif  // __APPLE__
}

// A freshly constructed timer is already running.
IcingTimer::IcingTimer() { Reset(); }

// Restarts the stopwatch.
void IcingTimer::Reset() { start_ = ClockTime(); }

// Seconds elapsed since construction or the last Reset().
double IcingTimer::Elapsed() const { return ClockTime() - start_; }
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/core/icing-timer.h b/icing/legacy/core/icing-timer.h
new file mode 100644
index 0000000..49ba9ad
--- /dev/null
+++ b/icing/legacy/core/icing-timer.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_LEGACY_CORE_ICING_TIMER_H_
+#define ICING_LEGACY_CORE_ICING_TIMER_H_
+
+#include <sys/time.h>
+#include <time.h>
+
+namespace icing {
+namespace lib {
+
+// A simple stop-watch timer for performance measurement.
class IcingTimer {
 public:
  // Seconds since the Unix epoch (gettimeofday), microsecond resolution.
  static double WallTimeNow();

  // Construction starts the timer (calls Reset()).
  IcingTimer();

  // Restart the stopwatch.
  void Reset();

  // Elapsed wall time since last Reset().
  double Elapsed() const;

 private:
  // Monotonic clock where the platform supports it; wall time on Apple.
  static double ClockTime();

  // ClockTime() snapshot taken at the last Reset().
  double start_;
};
+
+} // namespace lib
+} // namespace icing
+#endif // ICING_LEGACY_CORE_ICING_TIMER_H_
diff --git a/icing/legacy/index/icing-array-storage.cc b/icing/legacy/index/icing-array-storage.cc
new file mode 100644
index 0000000..aeb3fa3
--- /dev/null
+++ b/icing/legacy/index/icing-array-storage.cc
@@ -0,0 +1,402 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-array-storage.h"
+
+#include <inttypes.h>
+#include <sys/mman.h>
+
+#include <algorithm>
+
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/core/icing-timer.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/util/logging.h"
+
+using std::max;
+using std::min;
+using std::vector;
+
+namespace icing {
+namespace lib {
+
namespace {

// Do the cast and const dance.
// Strips constness so memory handed out via const accessors can be passed
// to APIs (madvise/msync/pwrite) that take a mutable void*.
void *MakeVoidPtr(const void *ptr) { return const_cast<void *>(ptr); }

}  // namespace

// Discard partial-crc change tracking once the saved original buffers exceed
// 1/kPartialCrcLimitDiv of the tracked region (see GetMutableMemInternal).
const uint32_t IcingArrayStorage::kPartialCrcLimitDiv = 8;  // limit is 1/8th
// Backing file growth granularity, in elements.
const size_t IcingArrayStorage::kGrowElts = 1u << 14;  // 16KB
+
// Storage starts uninitialized; Init() must be called before use.
IcingArrayStorage::IcingArrayStorage(const IcingFilesystem &filesystem)
    : mmapper_(nullptr), filesystem_(filesystem) {
  Reset();
}

// Unmaps the file. The fd is owned by the caller and is not closed.
IcingArrayStorage::~IcingArrayStorage() { delete mmapper_; }
+
+bool IcingArrayStorage::Init(int fd, size_t fd_offset, bool map_shared,
+ uint32_t elt_size, uint32_t num_elts,
+ uint32_t max_num_elts, uint32_t *crc_ptr,
+ bool init_crc) {
+ if (is_initialized()) {
+ return true;
+ }
+
+ // Compute capacity_num_.
+ uint64_t file_size = filesystem_.GetFileSize(fd);
+ if (file_size == IcingFilesystem::kBadFileSize) {
+ ICING_LOG(ERROR) << "Array storage could not get file size";
+ return false;
+ }
+ if (file_size < fd_offset) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Array storage file size %" PRIu64 " less than offset %zu", file_size,
+ fd_offset);
+ return false;
+ }
+
+ uint32_t capacity_num_elts = (file_size - fd_offset) / elt_size;
+ if (capacity_num_elts < num_elts) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Array storage num elts %u > capacity num elts %u", num_elts,
+ capacity_num_elts);
+ return false;
+ }
+
+ // Map beyond the capacity. We will grow underlying file to avoid
+ // SIGBUS.
+ mmapper_ = new IcingMMapper(fd, false, fd_offset, max_num_elts * elt_size,
+ map_shared ? MAP_SHARED : MAP_PRIVATE);
+ if (!mmapper_->is_valid()) {
+ ICING_LOG(ERROR) << "Array storage map failed";
+ delete mmapper_;
+ mmapper_ = nullptr;
+ return false;
+ }
+
+ fd_ = fd;
+ fd_offset_ = fd_offset;
+ map_shared_ = map_shared;
+ elt_size_ = elt_size;
+ // changes_end_ refers to the last element that was included in the
+ // current crc. If we change it, we must also update *crc_ptr_ to
+ // 0. Otherwise UpdateCrc will fail.
+ cur_num_ = changes_end_ = num_elts;
+ max_num_ = max_num_elts;
+ capacity_num_ = capacity_num_elts;
+ crc_ptr_ = crc_ptr;
+
+ if (crc_ptr_) {
+ uint32_t crc = IcingStringUtil::UpdateCrc32(0, array_cast<char>(),
+ cur_num_ * elt_size_);
+ if (init_crc) {
+ *crc_ptr_ = crc;
+ } else if (crc != *crc_ptr_) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Array storage bad crc %u vs %u", crc, *crc_ptr_);
+ goto failed;
+ }
+ }
+ return true;
+
+failed:
+ Reset();
+ return false;
+}
+
// Undoes Init(): unmaps the file and returns to the uninitialized state.
// The fd itself is owned by the caller and is not closed here.
void IcingArrayStorage::Reset() {
  fd_ = -1;
  fd_offset_ = 0;
  map_shared_ = false;
  delete mmapper_;
  mmapper_ = nullptr;
  elt_size_ = 0;
  cur_num_ = 0;
  changes_end_ = 0;
  max_num_ = 0;
  capacity_num_ = 0;
  crc_ptr_ = nullptr;
  changes_.clear();
  saved_orig_buf_.clear();
  dirty_pages_.clear();
}
+
// Shrinks the valid region to the first len elements. The backing file and
// crc bookkeeping are untouched; tracked changes beyond cur_num_ are
// skipped later by UpdateCrc().
void IcingArrayStorage::Truncate(uint32_t len) {
  if (len > cur_num_) {
    ICING_LOG(FATAL) << "Length exceeds current size";
  }

  cur_num_ = len;
}
+
// Returns a writable pointer to elements [elt_idx, elt_idx + elt_len),
// growing the backing file if needed, or nullptr if that would exceed
// max_num_. Records the bookkeeping consumed later by UpdateCrc() (saved
// original bytes) and Sync() (dirty pages).
void *IcingArrayStorage::GetMutableMemInternal(uint32_t elt_idx,
                                               uint32_t elt_len) {
  uint32_t start_byte = elt_idx * elt_size_;
  uint32_t len_bytes = elt_len * elt_size_;

  if (ABSL_PREDICT_FALSE(!GrowIfNecessary(elt_idx + elt_len))) {
    return nullptr;
  }

  cur_num_ = max(cur_num_, elt_idx + elt_len);

  if (crc_ptr_) {
    // Cache original value to update crcs.
    if (elt_idx < changes_end_) {
      uint32_t change_len = min(changes_end_, elt_idx + elt_len) - elt_idx;

      // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
      // revert to full CRC.
      if ((saved_orig_buf_.size() + change_len * elt_size_) *
              kPartialCrcLimitDiv >
          changes_end_ * elt_size_) {
        ICING_VLOG(2) << "Array storage change tracking limit exceeded";
        changes_.clear();
        saved_orig_buf_.clear();
        changes_end_ = 0;
        *crc_ptr_ = 0;
      } else {
        changes_.push_back(Change(elt_idx, change_len));
        saved_orig_buf_.append(array_cast<char>() + start_byte,
                               change_len * elt_size_);
      }
    }
  }

  if (!map_shared_) {
    // Mark dirty pages. With a MAP_PRIVATE mapping, writes are not visible
    // to the file, so Sync() must pwrite these pages back explicitly.
    int start_page = start_byte / IcingMMapper::system_page_size();
    int end_page =
        (start_byte + len_bytes - 1) / IcingMMapper::system_page_size();

    for (int i = start_page; i <= end_page; i++) {
      if (static_cast<size_t>(i) >= dirty_pages_.size()) {
        dirty_pages_.resize(i + 1);
      }
      dirty_pages_[i] = true;
    }
  }

  return MakeVoidPtr(&(array())[start_byte]);
}
+
// Ensures the backing file can hold num_elts elements; returns false if
// that would exceed max_num_. Growth is rounded up to a kGrowElts boundary
// so the file is not extended on every append.
bool IcingArrayStorage::GrowIfNecessary(uint32_t num_elts) {
  if (ABSL_PREDICT_TRUE(num_elts <= capacity_num_)) return true;
  if (num_elts > max_num_) return false;

  // Need to grow.
  uint64_t new_file_size = fd_offset_ + uint64_t{num_elts} * elt_size_;
  // Grow to kGrowElts boundary.
  new_file_size = AlignUp(new_file_size, kGrowElts * elt_size_);
  if (!filesystem_.Grow(fd_, new_file_size)) {
    return false;
  }
  capacity_num_ = (new_file_size - fd_offset_) / elt_size_;
  return true;
}
+
// Folds all tracked changes (and any growth past changes_end_) into
// *crc_ptr_, using the saved original bytes for partial crc updates
// instead of re-crc'ing the whole array. No-op when crc_ptr_ is null.
void IcingArrayStorage::UpdateCrc() {
  if (!crc_ptr_) return;

  // First apply the modified area. Keep a bitmap of already updated
  // regions so we don't double-update.
  vector<bool> updated(changes_end_);
  uint32_t cur_offset = 0;
  uint32_t cur_crc = *crc_ptr_;
  int num_partial_crcs = 0;
  int num_truncated = 0;
  int num_overlapped = 0;
  int num_duplicate = 0;
  for (size_t i = 0; i < changes_.size(); i++) {
    const Change &change = changes_[i];
    if (change.elt_offset + change.elt_len > changes_end_) {
      ICING_LOG(FATAL) << "Off " << change.elt_offset << " len "
                       << change.elt_len << " end " << changes_end_;
    }

    // Skip truncated tracked changes.
    if (change.elt_offset >= cur_num_) {
      ++num_truncated;
      continue;
    }

    // Turn change buf into change^orig. The xor form is what
    // UpdateAtPositionCrc32 expects.
    const char *buf_end =
        &saved_orig_buf_[cur_offset + change.elt_len * elt_size_];
    const char *cur_array = array_cast<char>() + change.elt_offset * elt_size_;
    // Now xor in. SSE acceleration please?
    for (char *cur = &saved_orig_buf_[cur_offset]; cur < buf_end;
         cur++, cur_array++) {
      *cur ^= *cur_array;
    }

    // Skip over already updated bytes by setting update to 0.
    // (xor with 0 is the identity, so zeroed elements don't affect the crc.)
    bool new_update = false;
    bool overlap = false;
    uint32_t cur_elt = change.elt_offset;
    for (char *cur = &saved_orig_buf_[cur_offset]; cur < buf_end;
         cur_elt++, cur += elt_size_) {
      if (updated[cur_elt]) {
        memset(cur, 0, elt_size_);
        overlap = true;
      } else {
        updated[cur_elt] = true;
        new_update = true;
      }
    }

    // Apply update to crc.
    if (new_update) {
      cur_crc = IcingStringUtil::UpdateAtPositionCrc32(
          cur_crc, changes_end_ * elt_size_, change.elt_offset * elt_size_,
          buf_end - change.elt_len * elt_size_, change.elt_len * elt_size_);
      num_partial_crcs++;
      if (overlap) {
        num_overlapped++;
      }
    } else {
      num_duplicate++;
    }
    cur_offset += change.elt_len * elt_size_;
  }
  if (!changes_.empty()) {
    ICING_VLOG(2) << IcingStringUtil::StringPrintf(
        "Array update partial crcs %d truncated %d overlapped %d duplicate %d",
        num_partial_crcs, num_truncated, num_overlapped, num_duplicate);
  }

  // Now update with grown area (elements appended since the last call,
  // which were never covered by the old crc).
  if (changes_end_ < cur_num_) {
    cur_crc = IcingStringUtil::UpdateCrc32(
        cur_crc, array_cast<char>() + changes_end_ * elt_size_,
        (cur_num_ - changes_end_) * elt_size_);
    ICING_VLOG(2) << IcingStringUtil::StringPrintf(
        "Array update tail crc offset %u -> %u", changes_end_, cur_num_);
  }

  // Clear, now that we've applied changes.
  changes_.clear();
  saved_orig_buf_.clear();
  changes_end_ = cur_num_;

  // Commit new crc.
  *crc_ptr_ = cur_crc;
}
+
// Hints the kernel to page the mapped region into RAM (MADV_WILLNEED).
void IcingArrayStorage::Warm() const {
  if (madvise(MakeVoidPtr(array()),
              IcingMMapper::page_aligned_size(cur_num_ * elt_size_),
              MADV_WILLNEED) != 0) {
    ICING_LOG(FATAL) << "Failed to madvise()";
  }
}

// Empties the array and drops all crc/dirty-page bookkeeping. The mapping
// and backing file keep their current size.
void IcingArrayStorage::Clear() {
  cur_num_ = 0;
  changes_end_ = 0;
  changes_.clear();
  saved_orig_buf_.clear();
  dirty_pages_.clear();
  if (crc_ptr_) *crc_ptr_ = 0;
}
+
+// TODO(b/69383247): investigate strange behavior here
+// If map_shared_ is false (i.e. we are using MAP_PRIVATE), dirty pages are
+// flushed to the underlying file, but strangely a sync isn't done.
+// If map_shared_ is true, then we call sync.
+uint32_t IcingArrayStorage::Sync() {
+ if (!map_shared_) {
+ IcingTimer timer;
+ uint32_t num_flushed = 0; // pages flushed
+ uint32_t num_contiguous = 0; // contiguous series of pages flushed
+ uint32_t dirty_pages_size = dirty_pages_.size();
+
+ bool in_dirty = false;
+ uint32_t dirty_start = 0;
+ for (size_t i = 0; i < dirty_pages_size; i++) {
+ bool is_dirty = dirty_pages_[i];
+ if (in_dirty && !is_dirty) {
+ // Flush pages between dirty_start and this.
+ uint32_t dirty_end = i * IcingMMapper::system_page_size();
+ num_contiguous++;
+ num_flushed +=
+ (dirty_end - dirty_start) / IcingMMapper::system_page_size();
+
+ if (pwrite(fd_, array() + dirty_start, dirty_end - dirty_start,
+ fd_offset_ + dirty_start) !=
+ static_cast<ssize_t>(dirty_end - dirty_start)) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Flushing pages failed (%u, %u)", dirty_start, dirty_end);
+ }
+ in_dirty = false;
+ } else if (!in_dirty && is_dirty) {
+ dirty_start = i * IcingMMapper::system_page_size();
+ in_dirty = true;
+ }
+ }
+
+ // Flush remaining.
+ if (in_dirty) {
+ uint32_t dirty_end = dirty_pages_size * IcingMMapper::system_page_size();
+ num_contiguous++;
+ num_flushed +=
+ (dirty_end - dirty_start) / IcingMMapper::system_page_size();
+
+ if (pwrite(fd_, array() + dirty_start, dirty_end - dirty_start,
+ fd_offset_ + dirty_start) !=
+ static_cast<ssize_t>(dirty_end - dirty_start)) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Flushing pages failed (%u, %u)", dirty_start, dirty_end);
+ }
+ }
+
+ // Clear in one shot.
+ dirty_pages_.clear();
+
+ // Invalidate region so that we are rid of dirty private pages.
+ if (madvise(MakeVoidPtr(array()),
+ IcingMMapper::page_aligned_size(cur_num_ * elt_size_),
+ MADV_DONTNEED) != 0) {
+ ICING_LOG(FATAL) << "Failed to madvise()";
+ }
+
+ if (num_flushed > 0) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Flushing %u/%u %u contiguous pages in %.3fms", num_flushed,
+ dirty_pages_size, num_contiguous, timer.Elapsed() * 1000.);
+ }
+
+ return num_flushed;
+ } else {
+ // Changes have been applied. msync() to ensure they are written out.
+ // Don't sync 0-length, which is an error in iOS and a no-op on Android
+ const size_t sync_length =
+ IcingMMapper::page_aligned_size(cur_num_ * elt_size_);
+ if (sync_length > 0) {
+ if (msync(MakeVoidPtr(array()), sync_length, MS_SYNC) != 0) {
+ ICING_LOG(FATAL) << "Failed to msync()";
+ }
+ }
+
+ return 0;
+ }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-array-storage.h b/icing/legacy/index/icing-array-storage.h
new file mode 100644
index 0000000..fad0565
--- /dev/null
+++ b/icing/legacy/index/icing-array-storage.h
@@ -0,0 +1,168 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+//
+// A disk-backed array.
+
+#ifndef ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_
+#define ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_
+
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mmapper.h"
+
+namespace icing {
+namespace lib {
+
// Fixed-element-size array mmapped from a region of a file. Supports
// incremental crc maintenance and explicit write-back of dirty pages.
class IcingArrayStorage {
 public:
  explicit IcingArrayStorage(const IcingFilesystem &filesystem);
  ~IcingArrayStorage();

  // Mmap a disk-backed array at fd_offset in fd. fd is owned by the
  // caller and must be kept valid.
  //
  // If map_shared is true, changes to GetMutableMem immediately apply
  // to the backing store. Otherwise changes are kept private until an
  // explicit call to Flush.
  //
  // Each element in the array is elt_size bytes and the array is
  // valid up to num_elts. max_num_elts is the max that the array is
  // allowed to grow to.
  //
  // If crc_ptr is not NULL, explicit calls to UpdateCrc keep the crc
  // of the array in *crc_ptr.
  //
  // If init_crc is true, the crc of the array is recomputed and
  // written into crc_ptr. Else, the crc of the array is checked
  // against the current value in crc_ptr and Init fails if the crc
  // does not match.
  //
  // REQUIRES: !is_initialized()
  bool Init(int fd, size_t fd_offset, bool map_shared, uint32_t elt_size,
            uint32_t num_elts, uint32_t max_num_elts, uint32_t *crc_ptr,
            bool init_crc);

  // Undo Init. Make is_initialized() == false.
  void Reset();

  bool is_initialized() const { return mmapper_ != nullptr; }

  // Attempt to swap into RAM.
  void Warm() const;

  // Make array empty again.
  void Clear();

  // Intent to write memory at (elt_idx, elt_idx + elt_len). Returns
  // NULL if file cannot be grown to accommodate that offset.
  template <class T>
  T *GetMutableMem(uint32_t elt_idx, uint32_t elt_len) {
    return static_cast<T *>(GetMutableMemInternal(elt_idx, elt_len));
  }

  // Resizes to first len elements.
  // REQUIRES: len <= num_elts()
  void Truncate(uint32_t len);

  // Push changes to crc into crc_ptr. No effect if crc_ptr is NULL.
  void UpdateCrc();

  // Write and sync dirty pages to fd starting at offset. Returns
  // number of pages synced.
  uint32_t Sync();

  // Accessors.
  // NOTE: valid only after a successful Init() (dereferences mmapper_).
  const uint8_t *array() const { return mmapper_->address(); }
  template <class T>
  const T *array_cast() const {
    return reinterpret_cast<const T *>(array());
  }
  uint32_t num_elts() const { return cur_num_; }
  uint32_t max_num_elts() const { return max_num_; }
  uint32_t max_size() const { return max_num_elts() * elt_size_; }

  // For stats.
  uint32_t num_dirty_pages() const {
    uint32_t num = 0;
    for (size_t i = 0; i < dirty_pages_.size(); i++) {
      if (dirty_pages_[i]) num++;
    }
    return num;
  }

 private:
  // We track partial updates to the array for CRC updating. This
  // requires extra memory to keep track of original buffers but
  // allows for much faster CRC re-computation. This is the frac limit
  // of byte len after which we will discard recorded changes and
  // recompute the entire CRC instead.
  static const uint32_t kPartialCrcLimitDiv;  // 10 means limit is 1/10

  // Grow file by at least this many elts if array is growable.
  static const size_t kGrowElts;

  // A change record (somebody called GetMutableMem on this
  // region). We only keep changes <= changes_end_.
  struct Change {
    Change(uint32_t o, uint32_t l) : elt_offset(o), elt_len(l) {}

    uint32_t elt_offset;
    uint32_t elt_len;
  };
  static_assert(8 == sizeof(Change), "sizeof(Change) != 8");
  static_assert(4 == alignof(Change), "alignof(Change) != 4");

  void *GetMutableMemInternal(uint32_t elt_idx, uint32_t elt_len);

  bool GrowIfNecessary(uint32_t num_elts);

  int fd_;
  size_t fd_offset_;
  bool map_shared_;
  IcingMMapper *mmapper_;

  // Size of an element in the array.
  uint32_t elt_size_;

  // In bytes.
  uint32_t cur_num_;       // cur boundary of written elts
  uint32_t changes_end_;   // cur_num_ at last call to UpdateCrc
  uint32_t max_num_;       // size of array in elts
  uint32_t capacity_num_;  // num elts that can be accommodated by file size

  uint32_t *crc_ptr_;

  // Changes that have happened since the last update
  // (between [0, changes_end_)).
  std::vector<Change> changes_;
  std::string saved_orig_buf_;

  // Keep track of all pages we touched so we can write them back to
  // disk.
  std::vector<bool> dirty_pages_;

  const IcingFilesystem &filesystem_;
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_
diff --git a/icing/legacy/index/icing-bit-util.h b/icing/legacy/index/icing-bit-util.h
new file mode 100644
index 0000000..3273a68
--- /dev/null
+++ b/icing/legacy/index/icing-bit-util.h
@@ -0,0 +1,136 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+//
+// Utilities for fiddling bits.
+
+#ifndef ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_
+#define ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <limits>
+#include <vector>
+
+namespace icing {
+namespace lib {
+
+// Manipulating bit fields.
+//
+// x value containing the bit field(s)
+// offset offset of bit field in x
+// len len of bit field in x
+//
+// REQUIREMENTS
+//
+// - x an unsigned integer <= 64 bits
+// - offset + len <= sizeof(x) * 8
+//
+// There is no error checking so you will get garbage if you don't
+// ensure the above.
+//
+// To set a value, use BITFIELD_CLEAR then BITFIELD_OR.
+
// Shifting by more than the word length is undefined (on ARM it has the
// intended effect, but on Intel it shifts by % word length), so check the
// length.
// e.g. BITFIELD_MASK(3) == 0b111; BITFIELD_MASK(0) == 0.
#define BITFIELD_MASK(len) ((len == 0) ? 0U : ((~uint64_t{0}) >> (64 - (len))))
#define BITFIELD_GET(x, offset, len) (((x) >> (offset)) & BITFIELD_MASK(len))
// The following modify x.
#define BITFIELD_CLEAR(x, offset, len) (x) &= ~(BITFIELD_MASK(len) << (offset))
// We conservatively mask val at len so x won't be corrupted if val >=
// 1 << len.
#define BITFIELD_OR(x, offset, len, val) \
  (x) |= (uint64_t{val} & BITFIELD_MASK(len)) << (offset)
+
// Number of bits needed to store the range [0, n), i.e. the bit width of
// the largest value n - 1. Returns 0 for empty or single-value ranges.
inline uint8_t BitsToStore(uint32_t n) {
  if (n <= 1) {
    return 0;
  }
  uint32_t highest = n - 1;
  uint8_t bits = 0;
  while (highest != 0) {
    ++bits;
    highest >>= 1;
  }
  return bits;
}
+
// Rounds n up to the nearest multiple of alignment. Kept as a macro so it
// remains usable inside other preprocessor constant expressions.
#define ALIGN_UP(n, alignment) \
  ((((n) + (alignment)-1) / (alignment)) * (alignment))

// Align up to a multiple.
inline uint64_t AlignUp(uint64_t n, uint64_t alignment) {
  return ALIGN_UP(n, alignment);
}

// Returns true if the values sum to more than UINT32_MAX.
// Fixed: the previous accumulate-then-compare could wrap the uint64_t sum
// itself (e.g. two values of 2^63 summed to 0 and reported no overflow);
// the check below is overflow-safe. Also takes the vector by const
// reference instead of by value (call-site compatible, avoids a copy).
inline bool SumOverflowsUint32(const std::vector<uint64_t> &values) {
  constexpr uint64_t kLimit = std::numeric_limits<uint32_t>::max();
  uint64_t sum = 0;
  for (uint64_t value : values) {
    // Equivalent to sum + value > kLimit, without overflowing.
    if (value > kLimit - sum) {
      return true;
    }
    sum += value;
  }
  return false;
}
+
// VarInt (See
// https://developers.google.com/protocol-buffers/docs/encoding)
#define VAR_INT_MAX_ENCODED_LEN(n_size) ((8 * (n_size) + 6) / 7)

class VarInt {
 public:
  // 7 bits per byte.
  static size_t MaxEncodedLen(size_t n_size) {
    return VAR_INT_MAX_ENCODED_LEN(n_size);
  }
  static const int kMaxEncodedLen64 = VAR_INT_MAX_ENCODED_LEN(8);

  // Encode n into buf. Return encoded len. buf must be at least
  // kMaxEncodedLen64 long.
  static size_t Encode(uint64_t n, uint8_t *buf) {
    size_t written = 0;
    // Emit low-order 7-bit groups, setting the continuation (MSB) bit on
    // every byte except the last.
    while (n >= 0x80) {
      buf[written++] = static_cast<uint8_t>(n) | 0x80;
      n >>= 7;
    }
    buf[written++] = static_cast<uint8_t>(n);
    return written;
  }

  // Decode buf into unsigned integral type pn. Return length
  // decoded. buf must terminate with a byte with MSB cleared. No
  // error checking is done but if buf is null-terminated, Decode
  // won't crash. If decoded doesn't fit into *pn higher order bits
  // will be dropped.
  template <class T>
  static size_t Decode(const uint8_t *buf, T *pn) {
    T value = 0;
    size_t num_read = 0;
    int shift = 0;
    // Continuation bytes carry the MSB; accumulate their low 7 bits.
    while ((buf[num_read] & 0x80) != 0) {
      value |= static_cast<T>(buf[num_read] & 0x7f) << shift;
      shift += 7;
      ++num_read;
    }
    // Final byte has the MSB clear.
    value |= static_cast<T>(buf[num_read]) << shift;
    *pn = value;
    return num_read + 1;
  }
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_
diff --git a/icing/legacy/index/icing-common-types.h b/icing/legacy/index/icing-common-types.h
new file mode 100644
index 0000000..592b549
--- /dev/null
+++ b/icing/legacy/index/icing-common-types.h
@@ -0,0 +1,129 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Author: sbanacho@google.com (Scott Banachowski)
+// Author: csyoung@google.com (C. Sean Young)
+
+#ifndef ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
+#define ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
+
+#include "icing/legacy/core/icing-core-types.h"
+
+// Protocol buffers are shared across several components.
namespace com {
namespace google {
namespace android {
namespace gms {
namespace icing {
namespace lib {

// Forward declarations only; the proto definitions live elsewhere.
class ClientFileGroup;
class Document;
class Document_Section;
class DocumentStoreStatusProto;
class IMEUpdate;
class IMEUpdateResponse;
class IndexCorpusScoringConfig;
class IndexCorpusScoringConfig_Section;
class IndexScoringConfig;
class InitStatus;
class InitStatus_CorpusInitInfo;
class PendingDeleteUsageReport;
class PhraseAffinityRequest;
class QueryResponse;
class QueryResponse_Corpus;
class QueryResponse_Corpus_Section;
class QueryResponse_Corpus_Tag;
class QueryRequestSpec;
class QueryRequestSpec_CorpusSpec;
class QueryRequestSpec_SectionSpec;
class ResponseDebugInfo;
class ResultDebugInfo;
class SectionConfig;
class SuggestionResponse;
class SuggestionResponse_Suggestion;
class UsageReportsResponse;
class UsageStats;
class UsageStats_Corpus;

}  // namespace lib
}  // namespace icing
}  // namespace gms
}  // namespace android
}  // namespace google
}  // namespace com

namespace icing {
namespace lib {

// Typedefs.
using IcingDocId = uint32_t;

using IcingSectionId = uint32_t;

using IcingCorpusId = uint16_t;
using IcingSectionIdMask = uint16_t;

using IcingTagsCount = uint16_t;

using IcingSequenceNumber = int64_t;

using IcingScore = uint64_t;

constexpr size_t kIcingMaxTokenLen = 30;  // default shared between query
                                          // processor and indexer
constexpr int kIcingQueryTermLimit = 50;  // Maximum number of terms in a query
constexpr int kIcingMaxVariantsPerToken = 10;  // Maximum number of variants

// LINT.IfChange
constexpr int kIcingDocIdBits = 20;  // 1M docs
constexpr IcingDocId kIcingInvalidDocId = (1u << kIcingDocIdBits) - 1;
constexpr IcingDocId kIcingMaxDocId = kIcingInvalidDocId - 1;
// LINT.ThenChange(//depot/google3/wireless/android/icing/plx/google_sql_common_macros.sql)

constexpr int kIcingDocScoreBits = 32;

constexpr int kIcingSectionIdBits = 4;  // 4 bits for 16 values
constexpr IcingSectionId kIcingMaxSectionId = (1u << kIcingSectionIdBits) - 1;
constexpr IcingSectionId kIcingInvalidSectionId = kIcingMaxSectionId + 1;
constexpr IcingSectionIdMask kIcingSectionIdMaskAll = ~IcingSectionIdMask{0};
constexpr IcingSectionIdMask kIcingSectionIdMaskNone = IcingSectionIdMask{0};

constexpr int kIcingCorpusIdBits = 15;  // 32K
constexpr IcingCorpusId kIcingInvalidCorpusId = (1u << kIcingCorpusIdBits) - 1;
constexpr IcingCorpusId kIcingMaxCorpusId = kIcingInvalidCorpusId - 1;

constexpr size_t kIcingMaxSearchableDocumentSize = (1u << 16) - 1;  // 64K
// Max num tokens per document. 64KB is our original maximum (searchable)
// document size. We clip if document exceeds this.
constexpr uint32_t kIcingMaxNumTokensPerDoc =
    kIcingMaxSearchableDocumentSize / 5;
constexpr uint32_t kIcingMaxNumHitsPerDocument =
    kIcingMaxNumTokensPerDoc * kIcingMaxVariantsPerToken;

constexpr IcingTagsCount kIcingInvalidTagCount = ~IcingTagsCount{0};
constexpr IcingTagsCount kIcingMaxTagCount = kIcingInvalidTagCount - 1;

// Location refers to document storage.
constexpr uint64_t kIcingInvalidLocation = ~uint64_t{0};
constexpr uint64_t kIcingMaxDocStoreWriteLocation = uint64_t{1}
                                                    << 32;  // 4bytes.

// Dump symbols in the proto namespace.
// NOTE(review): a using-directive in a header leaks the proto namespace into
// every includer; kept as-is because existing callers rely on the unqualified
// names.
using namespace ::com::google::android::gms::icing;  // NOLINT(build/namespaces)
}  // namespace lib
}  // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
new file mode 100644
index 0000000..a3d6316
--- /dev/null
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -0,0 +1,2349 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+//
+// We store the trie in three areas: nodes, nexts and suffixes.
+//
+// Nodes contain an index to a children array (kept in nexts) or to
+// suffixes (for leaf nodes). Nexts contain children arrays of
+// different sizes. Each child entry has the matched char and an index
+// back into the nodes. Leaf nodes index into suffixes instead of the
+// nexts array. Each suffix is a NULL-terminated suffix off the trie,
+// followed by a 4-byte value associated with that key.
+//
+// Allocation
+//
+// Nodes are allocated and never removed. Nexts contain arrays of
+// sizes in power-of-2 increments, i.e. 1, 2, 4, ..., 256. When the
+// number of children of a node increases, it is relocated to an array
+// with the proper size. The (smaller) unused array is added to a free
+// list. A free list is kept for each array size. Allocations happen
+// from the free list first, and then from the end of the nexts
+// array. Suffixes are never freed or compacted. If a node wants to
+// refer to a smaller suffix, it moves the pointer forward and the
+// characters before the new pointer are wasted.
+//
+// Keys can contain any character except '\0'. The '\0' char is
+// special in that it specifies an end-of-key in the child array.
+//
+// Ideas to try:
+//
+// - Put suffix index in a Next instead of creating a leaf node.
+// - Change allocation buckets to 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, ..., 256
+// - Compact next array
+// - GroupVarByte and delta-encode the next array
+// - Collapse nodes with single children
+//
+// Persistence
+//
+// We persist the trie in a binary format such that resurrecting the
+// trie is simply a few file reads. The file is laid out as such:
+//
+// - Header
+// - Nodes
+// - Nexts
+// - Suffixes
+//
+// Each section is aligned to IcingMMapper::system_page_size(). The max
+// requested value for each array is pre-allocated in the file. When
+// we make modifications to the arrays, we set bits in a dirty bitmap
+// of pages. No changes get written to disk until an explicit call to
+// Flush. Then we only write the pages that have their dirty bit set.
+
+#include "icing/legacy/index/icing-dynamic-trie.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/core/icing-timer.h"
+#include "icing/legacy/index/icing-array-storage.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-flash-bitmap.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/util/logging.h"
+#include "icing/util/math-util.h"
+
+using std::inplace_merge;
+using std::lower_bound;
+using std::max;
+using std::mismatch;
+using std::pair;
+using std::sort;
+using std::vector;
+
+namespace icing {
+namespace lib {
+
+// Based on the bit field widths.
+const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1;
+const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1;
+const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27;
+const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16;
+
+const uint32_t IcingDynamicTrie::kInvalidNodeIndex = (1U << 24) - 1;
+const uint32_t IcingDynamicTrie::kInvalidNextIndex = ~0U;
+const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U;
+
+// Out-of-line definitions for static members declared (with initializers) in
+// the header; required pre-C++17 when the members are ODR-used.
+const int IcingDynamicTrie::kMaxNextArraySize;
+const int IcingDynamicTrie::kNumNextAllocationBuckets;
+
+const uint32_t IcingDynamicTrie::kMaxPropertyId;
+
+const uint32_t IcingDynamicTrie::kInvalidValueIndex;
+
+const uint32_t IcingDynamicTrie::kNoCrc;
+
+// Manages logical node candidates while searching for possible
+// variant matches. Currently implemented as depth first search. The
+// max stack depth is key length * variant fanout. Since max variant
+// fanout is 3, we don't need to worry about blowup of the depth first
+// search stack.
+//
+// Keeps track of original matched string (the string actually present
+// in the trie) for every candidate.
+class IcingDynamicTrie::CandidateSet {
+ public:
+  // One pending DFS entry: position in the trie (logical_node), remaining
+  // unconsumed query key (key), and the matched text assembled so far
+  // (matched_prefix_len chars of cur_prefix_ plus matched_span).
+  struct Candidate {
+    LogicalNode logical_node;
+    const char *key;
+    int matched_prefix_len;
+    std::string matched_span;
+
+    Candidate() {}
+
+    Candidate(const LogicalNode &logical_node_in, const char *key_in,
+              int matched_prefix_len_in, const char *matched_span_in,
+              int matched_span_len_in)
+        : logical_node(logical_node_in),
+          key(key_in),
+          matched_prefix_len(matched_prefix_len_in),
+          matched_span(matched_span_in, matched_span_len_in) {}
+
+    int matched_len() const { return matched_prefix_len + matched_span.size(); }
+  };
+
+  // prefix: if true, partially-consumed suffixes also count as matches.
+  explicit CandidateSet(bool prefix) : prefix_(prefix) {}
+
+  bool IsTerminal(const char *key, uint32_t value_index) const {
+    // Terminal match condition:
+    //
+    // 1. Key was entirely consumed.
+    // 2. The entire suffix was consumed (hence value index is
+    // valid). OR, we are ok with prefix matches.
+    return *key == 0 && (value_index != kInvalidValueIndex || prefix_);
+  }
+
+  // Push a terminal or non-terminal.
+  void Push(const LogicalNode &logical_node, const char *key,
+            uint32_t value_index, int matched_prefix_len,
+            const char *matched_span, int matched_span_len) {
+    if (!AddMatchIfTerminal(key, value_index, matched_span, matched_span_len)) {
+      PushNonTerminal(logical_node, key, matched_prefix_len, matched_span,
+                      matched_span_len);
+    }
+  }
+
+  // Records a match if (key, value_index) is terminal. Returns whether a
+  // match was recorded.
+  bool AddMatchIfTerminal(const char *key, uint32_t value_index,
+                          const char *matched_span, int matched_span_len) {
+    if (!IsTerminal(key, value_index)) {
+      return false;
+    }
+
+    // Terminal match. The original key is the current prefix plus the span
+    // matched at this node.
+    matches_.push_back(OriginalMatch());
+    OriginalMatch *match = &matches_.back();
+    match->value_index = value_index;
+    match->orig.reserve(cur_prefix_.size() + matched_span_len);
+    match->orig.append(cur_prefix_).append(matched_span, matched_span_len);
+    return true;
+  }
+
+  // Push a definite non-terminal.
+  void PushNonTerminal(const LogicalNode &logical_node, const char *key,
+                       int matched_prefix_len, const char *matched_span,
+                       int matched_span_len) {
+    candidates_.push_back(Candidate(logical_node, key, matched_prefix_len,
+                                    matched_span, matched_span_len));
+  }
+
+  // Pops the most recent candidate into *candidate and rewinds cur_prefix_
+  // to that candidate's matched prefix before appending its span.
+  // REQUIRES: !empty().
+  void Pop(Candidate *candidate) {
+    *candidate = candidates_.back();
+    // Cast avoids a signed/unsigned comparison warning; matched_prefix_len
+    // is never negative (it is derived from a matched-string length).
+    if (cur_prefix_.size() <
+        static_cast<size_t>(candidate->matched_prefix_len)) {
+      ICING_LOG(FATAL)
+          << "Length of current prefix is smaller than length of matched "
+             "prefix, there're inconsistencies in dynamic trie.";
+    }
+
+    cur_prefix_.resize(candidate->matched_prefix_len);
+    cur_prefix_.append(candidate->matched_span);
+    candidates_.pop_back();
+  }
+
+  bool empty() const { return candidates_.empty(); }
+
+  // Moves the accumulated matches into *ret and resets all internal state.
+  // REQUIRES: empty() — every candidate must have been popped.
+  void Release(vector<OriginalMatch> *ret) {
+    if (!empty()) {
+      ICING_LOG(FATAL) << "Candidate set not empty before releasing matches";
+    }
+
+    ret->swap(matches_);
+
+    cur_prefix_.clear();
+    candidates_.clear();
+    matches_.clear();
+  }
+
+ private:
+  const bool prefix_;
+
+  // Concatenation of the matched spans of all candidates currently on the
+  // DFS stack; rewound in Pop().
+  std::string cur_prefix_;
+  vector<Candidate> candidates_;
+
+  vector<IcingDynamicTrie::OriginalMatch> matches_;
+};
+
+// Options.
+// Validates user-supplied sizing options against the bit-field limits.
+bool IcingDynamicTrie::Options::is_valid() const {
+  // Every array bound must be non-zero and within its limit; value_size only
+  // has an upper bound (zero-length values are permitted).
+  return max_nodes > 0 && max_nodes <= kMaxNodes && max_nexts > 0 &&
+         max_nexts <= kMaxNexts && max_suffixes_size > 0 &&
+         max_suffixes_size <= kMaxSuffixesSize && value_size <= kMaxValueSize;
+}
+
+// IcingDynamicTrieStorage
+// Owns the three mmapped arrays (nodes, nexts, suffixes) plus the header
+// page, and mediates all reads/writes and crc maintenance for them.
+class IcingDynamicTrie::IcingDynamicTrieStorage {
+ public:
+  IcingDynamicTrieStorage(const std::string &file_basename,
+                          const RuntimeOptions &runtime_options,
+                          const IcingFilesystem *filesystem);
+  ~IcingDynamicTrieStorage();
+
+  bool is_initialized() const { return hdr_mmapper_.is_valid(); }
+
+  bool CreateIfNotExist(const Options &options);
+  bool Init();
+  static bool Remove(const std::string &file_basename,
+                     const IcingFilesystem &filesystem);
+  bool Sync();
+  uint64_t GetDiskUsage() const;
+  void Warm();
+
+  void Clear();
+
+  bool empty() const { return hdr().num_nodes() == 0; }
+
+  // Never cast off these consts when writing to the arrays. Always
+  // use the GetMutable* helpers above.
+  const Node *GetNode(uint32_t idx) const {
+    return &array_storage_[NODE].array_cast<Node>()[idx];
+  }
+  const Node *GetRootNode() const { return GetNode(0); }
+  const Next *GetNext(uint32_t idx, int child) const {
+    return &array_storage_[NEXT].array_cast<Next>()[idx + child];
+  }
+  const char *GetSuffix(uint32_t idx) const {
+    return &array_storage_[SUFFIX].array_cast<char>()[idx];
+  }
+
+  // Inverse of the accessors above: recover an array index from a pointer
+  // into the mmapped region (pointer arithmetic against element 0).
+  uint32_t GetNodeIndex(const Node *node) const { return node - GetNode(0); }
+  uint32_t GetNextArrayIndex(const Next *next) const {
+    return next - GetNext(0, 0);
+  }
+  uint32_t GetSuffixIndex(const char *suffix) const {
+    return suffix - GetSuffix(0);
+  }
+
+  // By default, nodes_, nexts_ and suffixes_ are read-only. This
+  // returns a writable element or array within and sets
+  // dirty_pages_[array_type] as a side effect, assuming the mutable
+  // area will get written to.
+  Node *GetMutableNode(uint32_t idx);
+  Next *GetMutableNextArray(uint32_t idx, uint32_t len);
+  char *GetMutableSuffix(uint32_t idx, uint32_t len);
+
+  // Update crcs based on current contents. Returns all_crc or kNoCrc.
+  uint32_t UpdateCrc();
+
+  // Allocators.
+  uint32_t nodes_left() const;
+  uint32_t nexts_left() const;
+  uint32_t suffixes_left() const;
+
+  // REQUIRES: nodes_left() > 0.
+  Node *AllocNode();
+  // REQUIRES: nexts_left() >= kMaxNextArraySize.
+  Next *AllocNextArray(int size);
+  void FreeNextArray(Next *next, int log2_size);
+  // REQUIRES: suffixes_left() >= strlen(suffix) + 1 + value_size()
+  uint32_t MakeSuffix(const char *suffix, const void *value,
+                      uint32_t *value_index);
+
+  const IcingDynamicTrieHeader &hdr() const { return hdr_.hdr; }
+
+  uint32_t value_size() const { return hdr().value_size(); }
+
+  void FillDirtyPageStats(Stats *stats) const;
+
+  void inc_num_keys() { hdr_.hdr.set_num_keys(hdr_.hdr.num_keys() + 1); }
+
+ private:
+  friend void IcingDynamicTrie::SetHeader(
+      const IcingDynamicTrieHeader &new_hdr);
+
+  enum ArrayType { NODE, NEXT, SUFFIX, NUM_ARRAY_TYPES };
+
+  // Returns all filenames that are part of the storage. First
+  // filename is the header and the rest correspond to ArrayType enum
+  // values.
+  static void GetFilenames(const std::string &file_basename,
+                           vector<std::string> *filenames);
+  static std::string GetHeaderFilename(const std::string &file_basename);
+
+  uint32_t GetHeaderCrc() const;
+
+  uint32_t GetAllCrc() const;
+
+  uint32_t UpdateCrcInternal(bool write_hdr);
+
+  // Initializes hdr_ with options and writes the resulting header to disk.
+  bool CreateNewHeader(IcingScopedFd sfd, const Options &options);
+  bool WriteHeader();
+
+  // Header block. On-disk header block format is as follows:
+  //
+  // |serialized-header|pad|crcs|
+  // <--- system_page_size() --->
+
+  // Wrapper for header protobuf.
+  class Header {
+    // Serialized format:
+    //
+    // magic(4)|size(4)|serialized hdr(size)
+    static const uint32_t kMagic;
+    // TODO(b/77482303) : Remove version from the IcingFlashBitmap header -
+    // magic makes it unnecessary.
+    static const uint32_t kCurVersion;
+
+   public:
+    void Init(const Options &options);
+    bool Init(const uint8_t *buf, uint32_t buf_size);
+    void Invalidate() { hdr.Clear(); }
+    bool SerializeToArray(uint8_t *buf, uint32_t buf_size) const;
+    bool Verify();
+
+    IcingDynamicTrieHeader hdr;
+  };
+
+  std::string file_basename_;
+
+  Header hdr_;
+
+  IcingMMapper hdr_mmapper_;
+
+  // Lives at the tail of the mmapped header page (see layout above);
+  // crcs_ points into that mapping, so writes here hit the file on sync.
+  struct Crcs {
+    uint32_t all_crc;
+    uint32_t header_crc;
+    uint32_t array_crcs[NUM_ARRAY_TYPES];
+  };
+  Crcs *crcs_;
+
+  static uint32_t serialized_header_max() {
+    return IcingMMapper::system_page_size() - sizeof(Crcs);
+  }
+
+  RuntimeOptions runtime_options_;
+
+  // Info kept about each array (NODE, NEXT, SUFFIX) to manage
+  // storage.
+  IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
+  std::vector<IcingArrayStorage> array_storage_;
+  const IcingFilesystem *filesystem_;
+};
+
+// Constructor only records configuration; no files are touched until
+// CreateIfNotExist()/Init().
+IcingDynamicTrie::IcingDynamicTrieStorage::IcingDynamicTrieStorage(
+    const std::string &file_basename, const RuntimeOptions &runtime_options,
+    const IcingFilesystem *filesystem)
+    : file_basename_(file_basename),
+      hdr_mmapper_(false, MAP_SHARED),
+      crcs_(nullptr),
+      runtime_options_(runtime_options),
+      array_storage_(NUM_ARRAY_TYPES, IcingArrayStorage(*filesystem)),
+      filesystem_(filesystem) {}
+
+// Tears down the array mappings, but only when Init() previously succeeded
+// (is_initialized() keys off the header mapping being valid).
+IcingDynamicTrie::IcingDynamicTrieStorage::~IcingDynamicTrieStorage() {
+  if (is_initialized()) {
+    for (IcingArrayStorage &storage : array_storage_) {
+      storage.Reset();
+    }
+  }
+}
+
+// Fills *filenames with every file backing this storage: the header file
+// first, then one file per ArrayType (NODE, NEXT, SUFFIX) in enum order.
+void IcingDynamicTrie::IcingDynamicTrieStorage::GetFilenames(
+    const std::string &file_basename, vector<std::string> *filenames) {
+  // Suffixes are parallel to the ArrayType enum.
+  static const char *const kArrayFilenameSuffixes[NUM_ARRAY_TYPES] = {
+      ".n", ".x", ".s"};
+
+  filenames->clear();
+  filenames->reserve(1 + NUM_ARRAY_TYPES);
+  filenames->push_back(GetHeaderFilename(file_basename));
+  for (const char *suffix : kArrayFilenameSuffixes) {
+    filenames->push_back(file_basename + suffix);
+  }
+}
+
+// The header file sits beside the array files, distinguished by suffix.
+std::string IcingDynamicTrie::IcingDynamicTrieStorage::GetHeaderFilename(
+    const std::string &file_basename) {
+  return file_basename + ".h";
+}
+
+// Opens and mmaps the header and array files, then either initializes crcs
+// (first run after creation: header_crc is still kNoCrc) or verifies them.
+// On any failure all partially-acquired resources are released via the
+// `failed` label and false is returned.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::Init() {
+  bool init_crcs = false;
+  const bool map_shared =
+      runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc;
+
+  // Open files.
+  vector<std::string> filenames;
+  GetFilenames(file_basename_, &filenames);
+  for (size_t i = 0; i < filenames.size(); i++) {
+    uint64_t file_size = filesystem_->GetFileSize(filenames[i].c_str());
+    if (file_size == IcingFilesystem::kBadFileSize) {
+      goto failed;
+    }
+    IcingScopedFd sfd(filesystem_->OpenForWrite(filenames[i].c_str()));
+    if (!sfd.is_valid()) {
+      goto failed;
+    }
+    // The first filename is the header and the rest correspond to ArrayType
+    // enum values. The header's fd can be closed immediately after mmapping
+    // (see b/114830334). Other files' fds are tracked in array_fds_ for later
+    // closing.
+    if (i == 0) {
+      // Header.
+      if (file_size != IcingMMapper::system_page_size()) {
+        ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+            "Trie hdr wrong size: %" PRIu64, file_size);
+        goto failed;
+      }
+
+      // Open hdr.
+      hdr_mmapper_.Remap(sfd.get(), 0, IcingMMapper::system_page_size());
+      if (!hdr_mmapper_.is_valid()) {
+        ICING_LOG(ERROR) << "Trie map header failed";
+        goto failed;
+      }
+    } else {
+      array_fds_[i - 1] = std::move(sfd);
+    }
+  }
+
+  // Point crcs_ to correct region (the tail of the header page, after the
+  // serialized header and padding).
+  crcs_ = reinterpret_cast<Crcs *>(hdr_mmapper_.address() +
+                                   serialized_header_max());
+  if (crcs_->header_crc == kNoCrc) {
+    // Create crcs.
+    crcs_->header_crc = GetHeaderCrc();
+
+    // Do the same for the arrays.
+    init_crcs = true;
+  } else {
+    // Verify crc.
+    if (crcs_->header_crc != GetHeaderCrc()) {
+      ICING_LOG(ERROR) << "Trie header crc failed";
+      goto failed;
+    }
+  }
+
+  // Deserialize and verify header.
+  if (!hdr_.Init(hdr_mmapper_.address(),
+                 IcingMMapper::system_page_size() - sizeof(Crcs)) ||
+      !hdr_.Verify()) {
+    ICING_LOG(ERROR) << "Trie reading header failed";
+    goto failed;
+  }
+
+  // We have the header set up. Now read in the arrays.
+  if (!array_storage_[NODE].Init(array_fds_[NODE].get(), 0, map_shared,
+                                 sizeof(Node), hdr_.hdr.num_nodes(),
+                                 hdr_.hdr.max_nodes(), &crcs_->array_crcs[NODE],
+                                 init_crcs)) {
+    ICING_LOG(ERROR) << "Trie mmap node failed";
+    goto failed;
+  }
+
+  if (!array_storage_[NEXT].Init(array_fds_[NEXT].get(), 0, map_shared,
+                                 sizeof(Next), hdr_.hdr.num_nexts(),
+                                 hdr_.hdr.max_nexts(), &crcs_->array_crcs[NEXT],
+                                 init_crcs)) {
+    ICING_LOG(ERROR) << "Trie mmap next failed";
+    goto failed;
+  }
+
+  if (!array_storage_[SUFFIX].Init(array_fds_[SUFFIX].get(), 0, map_shared,
+                                   sizeof(char), hdr_.hdr.suffixes_size(),
+                                   hdr_.hdr.max_suffixes_size(),
+                                   &crcs_->array_crcs[SUFFIX], init_crcs)) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Trie mmap suffix failed");
+    goto failed;
+  }
+
+  // Overall crc.
+  if (init_crcs) {
+    crcs_->all_crc = GetAllCrc();
+  } else {
+    // Verify crc.
+    if (crcs_->all_crc != GetAllCrc()) {
+      ICING_LOG(ERROR) << "Trie all crc failed";
+      goto failed;
+    }
+  }
+
+  return true;
+
+failed:
+  // Unwind everything acquired so far so a failed Init leaves the object in
+  // the uninitialized state (is_initialized() == false).
+  crcs_ = nullptr;
+  hdr_mmapper_.Unmap();
+  hdr_.Invalidate();
+  for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
+    array_storage_[i].Reset();
+    array_fds_[i].reset();
+  }
+
+  return false;
+}
+
+// Creates the backing files (header + three array files) if they do not
+// already exist. Existence of the header file alone is treated as "already
+// created". On any failure, all files created so far are removed.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::CreateIfNotExist(
+    const Options &options) {
+  vector<std::string> filenames;
+  GetFilenames(file_basename_, &filenames);
+
+  // Check already exists. Just header file check is enough.
+  if (filesystem_->FileExists(filenames[0].c_str())) {
+    return true;
+  }
+
+  // Ensure the storage directory exists
+  std::string storage_dir = filesystem_->GetDirname(filenames[0].c_str());
+  if (!filesystem_->CreateDirectoryRecursively(storage_dir.c_str())) {
+    return false;
+  }
+
+  // Create files.
+  for (size_t i = 0; i < filenames.size(); i++) {
+    IcingScopedFd sfd(filesystem_->OpenForWrite(filenames[i].c_str()));
+    if (!sfd.is_valid()) {
+      Remove(file_basename_, *filesystem_);
+      return false;
+    }
+
+    if (i == 0) {
+      if (!CreateNewHeader(std::move(sfd), options)) {
+        ICING_LOG(ERROR) << "Serialize trie header failed";
+        Remove(file_basename_, *filesystem_);
+        return false;
+      }
+    } else {
+      // Crcs are automatically kNoCrc so they will be initialized
+      // upon first call to Init.
+      if (!filesystem_->Truncate(*sfd, 0)) {
+        Remove(file_basename_, *filesystem_);
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Serializes a fresh header (built from options) into a page-sized buffer
+// and writes + syncs it through the passed-in fd.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::CreateNewHeader(
+    IcingScopedFd sfd, const Options &options) {
+  ICING_VLOG(1) << "Creating header with write+sync";
+  hdr_.Init(options);
+  auto buf = std::make_unique<uint8_t[]>(IcingMMapper::system_page_size());
+  // serialized_header_max must be less than system_page_size so we don't
+  // overflow buf when serializing the header.
+  if (serialized_header_max() > IcingMMapper::system_page_size()) {
+    ICING_LOG(FATAL) << "serialized_header_max exceeds system page size";
+  }
+
+  return hdr_.SerializeToArray(buf.get(), serialized_header_max()) &&
+         filesystem_->Write(sfd.get(), buf.get(),
+                            IcingMMapper::system_page_size()) &&
+         filesystem_->DataSync(sfd.get());
+}
+
+// Deletes every file backing the storage. Best-effort: keeps deleting even
+// after a failure so as much as possible is cleaned up, and returns false
+// if any deletion failed.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::Remove(
+    const std::string &file_basename, const IcingFilesystem &filesystem) {
+  vector<std::string> files;
+  GetFilenames(file_basename, &files);
+  bool success = true;
+  for (const std::string &file : files) {
+    if (!filesystem.DeleteFile(file.c_str())) {
+      success = false;
+    }
+  }
+  return success;
+}
+
+// Pre-faults the mmapped arrays by delegating to each storage's Warm().
+void IcingDynamicTrie::IcingDynamicTrieStorage::Warm() {
+  for (IcingArrayStorage &storage : array_storage_) {
+    storage.Warm();
+  }
+}
+
+// Empties the trie in place: zeroes all element counts and free lists in the
+// header and clears the array contents. The max_* capacities are preserved.
+void IcingDynamicTrie::IcingDynamicTrieStorage::Clear() {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  // Clear header.
+  hdr_.hdr.set_num_nodes(0);
+  hdr_.hdr.set_num_nexts(0);
+  hdr_.hdr.set_suffixes_size(0);
+  for (int i = 0; i < hdr_.hdr.free_lists_size(); i++) {
+    hdr_.hdr.set_free_lists(i, kInvalidNextIndex);
+  }
+  hdr_.hdr.set_num_keys(0);
+
+  // Clear array storage.
+  for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
+    array_storage_[i].Clear();
+  }
+
+  // Copy to persistence.
+  WriteHeader();
+}
+
+// Flushes dirty array pages and the header to disk. Returns false if any
+// step failed, but attempts every step regardless so a single failure does
+// not block the rest of the flush.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::Sync() {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  uint32_t total_flushed = 0;
+  bool success = true;
+
+  // Sync all array types.
+  for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
+    total_flushed += array_storage_[i].Sync();
+    if (!filesystem_->DataSync(array_fds_[i].get())) {
+      ICING_LOG(ERROR) << "Unable to sync data for flushing";
+      success = false;
+    }
+  }
+
+  if (!WriteHeader()) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Flushing trie header failed: %s", strerror(errno));
+    success = false;
+  }
+
+  // Need to update CRCs before we sync the header mmap.
+  UpdateCrcInternal(false);
+
+  // Sync header.
+  if (!hdr_mmapper_.Sync()) {
+    ICING_LOG(ERROR) << "Unable to sync trie header for flushing";
+    success = false;
+  }
+
+  if (total_flushed > 0) {
+    ICING_VLOG(1) << IcingStringUtil::StringPrintf("Flushing %u pages of trie",
+                                                   total_flushed);
+  }
+
+  return success;
+}
+
+// Sums on-disk usage of the three array files plus the header file. Uses
+// IncrementByOrSetInvalid so one invalid size marks the whole total invalid.
+uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetDiskUsage() const {
+  // Trie files themselves.
+  uint64_t total = 0;
+  for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
+    IcingFilesystem::IncrementByOrSetInvalid(
+        filesystem_->GetDiskUsage(array_fds_[i].get()), &total);
+  }
+
+  // Header.
+  std::string header_filename = GetHeaderFilename(file_basename_);
+  IcingFilesystem::IncrementByOrSetInvalid(
+      filesystem_->GetFileDiskUsage(header_filename.c_str()), &total);
+
+  return total;
+}
+
+// Allocates the next unused node slot. Nodes are append-only and never
+// freed (see the file-top design comment).
+IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() {
+  if (nodes_left() == 0) {
+    ICING_LOG(FATAL) << "No allocated nodes left";
+  }
+
+  hdr_.hdr.set_num_nodes(hdr_.hdr.num_nodes() + 1);
+  return GetMutableNode(hdr_.hdr.num_nodes() - 1);
+}
+
+// Allocates a child ("next") array of at least `size` entries, rounded up to
+// the next power of two. Reuses an entry from the per-size free list when
+// available, otherwise appends to the nexts region.
+IcingDynamicTrie::Next *
+IcingDynamicTrie::IcingDynamicTrieStorage::AllocNextArray(int size) {
+  if (size > kMaxNextArraySize) {
+    ICING_LOG(FATAL) << "Array size exceeds the max 'next' array size";
+  }
+
+  if (nexts_left() < static_cast<uint32_t>(kMaxNextArraySize)) {
+    ICING_LOG(FATAL) << "'next' buffer not enough";
+  }
+
+  // Compute ceil(log2(size)).
+  int log2_size = 0;
+  while ((1 << log2_size) < size) log2_size++;
+  // Note: size <= aligned_size <= kMaxNextArraySize
+  int aligned_size = 1 << log2_size;
+
+  // Look in free list.
+  Next *ret;
+  if (hdr_.hdr.free_lists(log2_size) != kInvalidNextIndex) {
+    // The first entry of a free array doubles as the link to the next free
+    // array of the same size (see FreeNextArray).
+    ret = GetMutableNextArray(hdr_.hdr.free_lists(log2_size), aligned_size);
+    uint32_t next_link = ret->next_index();
+    if (next_link != kInvalidNextIndex && next_link >= hdr_.hdr.max_nexts()) {
+      ICING_LOG(FATAL) << "'next' index is out of range";
+    }
+    hdr_.hdr.set_free_lists(log2_size, next_link);
+  } else {
+    // Allocate a new one.
+    ret = GetMutableNextArray(hdr_.hdr.num_nexts(), aligned_size);
+    hdr_.hdr.set_num_nexts(hdr_.hdr.num_nexts() + aligned_size);
+  }
+
+  // Fill with char 0xff so we are sorted properly.
+  for (int i = 0; i < aligned_size; i++) {
+    ret[i].set_val(0xff);
+    ret[i].set_node_index(kInvalidNodeIndex);
+  }
+  return ret;
+}
+
+// Returns a next array to the free list for its size bucket. The array's
+// first entry is overwritten to link to the previous free-list head.
+void IcingDynamicTrie::IcingDynamicTrieStorage::FreeNextArray(Next *next,
+                                                              int log2_size) {
+  if (GetNextArrayIndex(next) + (1 << log2_size) > hdr_.hdr.max_nexts()) {
+    ICING_LOG(FATAL) << "'next' array is out of range";
+  }
+
+  // Put it in free list.
+  next->set_next_index(hdr_.hdr.free_lists(log2_size));
+  hdr_.hdr.set_free_lists(log2_size, GetNextArrayIndex(next));
+}
+
+// Appends a NUL-terminated suffix followed by its value bytes to the suffix
+// region. Returns the index of the suffix; if value_index is non-null it
+// receives the index of the value (just past the terminating NUL).
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::MakeSuffix(
+    const char *suffix, const void *value, uint32_t *value_index) {
+  int suffix_len = strlen(suffix);
+  if (suffixes_left() < suffix_len + 1 + value_size()) {
+    ICING_LOG(FATAL) << "'suffix' buffer not enough";
+  }
+
+  char *start =
+      GetMutableSuffix(hdr_.hdr.suffixes_size(), suffix_len + 1 + value_size());
+  memcpy(start, suffix, suffix_len + 1);
+  memcpy(start + suffix_len + 1, value, value_size());
+  if (value_index) *value_index = GetSuffixIndex(start + suffix_len + 1);
+  hdr_.hdr.set_suffixes_size(hdr_.hdr.suffixes_size() + suffix_len + 1 +
+                             value_size());
+
+  return GetSuffixIndex(start);
+}
+
+// Crc over the serialized-header region only (excludes the trailing Crcs
+// struct at the end of the page).
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::GetHeaderCrc() const {
+  return IcingStringUtil::UpdateCrc32(
+      0, reinterpret_cast<const char *>(hdr_mmapper_.address()),
+      serialized_header_max());
+}
+
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::GetAllCrc() const {
+  // Append array crcs to header crc.
+  return IcingStringUtil::UpdateCrc32(
+      crcs_->header_crc, reinterpret_cast<const char *>(crcs_->array_crcs),
+      sizeof(crcs_->array_crcs));
+}
+
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrc() {
+  return UpdateCrcInternal(true);
+}
+
+// Recomputes header, per-array, and combined crcs. write_hdr controls
+// whether the header is re-serialized to the mmap first (Sync() passes false
+// because it has already written the header).
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrcInternal(
+    bool write_hdr) {
+  if (write_hdr && !WriteHeader()) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Flushing trie header failed: %s", strerror(errno));
+  }
+
+  crcs_->header_crc = GetHeaderCrc();
+
+  for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
+    array_storage_[i].UpdateCrc();
+  }
+
+  crcs_->all_crc = GetAllCrc();
+
+  return crcs_->all_crc;
+}
+
+// Serializes hdr_ into the mmapped header page (not yet synced to disk).
+bool IcingDynamicTrie::IcingDynamicTrieStorage::WriteHeader() {
+  return hdr_.SerializeToArray(hdr_mmapper_.address(), serialized_header_max());
+}
+
+// Mutable accessors. Each delegates to IcingArrayStorage::GetMutableMem,
+// which (per the class comment) marks the touched pages dirty so they are
+// written out on the next Sync().
+IcingDynamicTrie::Node *
+IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableNode(uint32_t idx) {
+  return array_storage_[NODE].GetMutableMem<Node>(idx, 1);
+}
+
+IcingDynamicTrie::Next *
+IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableNextArray(uint32_t idx,
+                                                               uint32_t len) {
+  return array_storage_[NEXT].GetMutableMem<Next>(idx, len);
+}
+
+char *IcingDynamicTrie::IcingDynamicTrieStorage::GetMutableSuffix(
+    uint32_t idx, uint32_t len) {
+  return array_storage_[SUFFIX].GetMutableMem<char>(idx, len);
+}
+
+// Header functions.
+const uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::Header::kMagic =
+    0x6dfba6ae;
+// For future revisions, this should be synced with global index version.
+// See comments on Upgrade() in native-index-impl.h for versioning.
+const uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::Header::kCurVersion =
+    4;
+
+// Builds a brand-new in-memory header from options: zero counts, current
+// version, and all free lists empty (kInvalidNextIndex).
+void IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
+    const IcingDynamicTrie::Options &options) {
+  hdr.Clear();
+
+  hdr.set_version(kCurVersion);
+  hdr.set_max_nodes(options.max_nodes);
+  hdr.set_max_nexts(options.max_nexts);
+  hdr.set_max_suffixes_size(options.max_suffixes_size);
+  hdr.set_value_size(options.value_size);
+
+  for (int i = 0; i < kNumNextAllocationBuckets; i++) {
+    hdr.add_free_lists(kInvalidNextIndex);
+  }
+}
+
+// Deserializes the header from the on-disk layout
+// magic(4)|size(4)|serialized hdr(size), validating magic and bounds.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
+    const uint8_t *buf, uint32_t buf_size) {
+  // Check magic and length.
+  if (buf_size <= sizeof(kMagic) + sizeof(uint32_t)) {
+    ICING_LOG(ERROR) << "Trie header too short";
+    return false;
+  }
+
+  uint32_t magic;
+  memcpy(&magic, buf, sizeof(magic));
+  if (magic != kMagic) {
+    ICING_LOG(ERROR) << "Trie header magic mismatch";
+    return false;
+  }
+  uint32_t len;
+  memcpy(&len, buf + sizeof(magic), sizeof(len));
+  // Safe subtraction: buf_size > sizeof(magic) + sizeof(len) was checked
+  // above, so this cannot underflow.
+  if (len > buf_size - sizeof(magic) - sizeof(len)) {
+    ICING_LOG(ERROR) << "Trie header too short";
+    return false;
+  }
+
+  return hdr.ParseFromArray(buf + sizeof(magic) + sizeof(len), len);
+}
+
+// Serializes in the same magic|size|payload layout. Fails (returns false)
+// if the buffer cannot hold the full serialization.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
+    uint8_t *buf, uint32_t buf_size) const {
+  uint32_t size = hdr.ByteSize();
+  if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false;
+  memcpy(buf, &kMagic, sizeof(kMagic));
+  memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t));
+  hdr.SerializeWithCachedSizesToArray(buf + sizeof(kMagic) + sizeof(uint32_t));
+  return true;
+}
+
+// Sanity-checks a deserialized header: version match, counts within their
+// declared maxima, and well-formed free lists.
+bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Verify() {
+  // Check version.
+  if (hdr.version() != kCurVersion) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Trie version %u mismatch", hdr.version());
+    return false;
+  }
+
+  // Check that indices in hdr are within bounds. Note that this is
+  // not a comprehensive integrity check for the entire trie.
+  if (hdr.num_nodes() > hdr.max_nodes() || hdr.num_nexts() > hdr.max_nexts() ||
+      hdr.suffixes_size() > hdr.max_suffixes_size() ||
+      hdr.value_size() >= hdr.max_suffixes_size()) {
+    ICING_LOG(ERROR) << "Trie header array size out of bounds";
+    return false;
+  }
+
+  if (hdr.free_lists_size() != kNumNextAllocationBuckets) {
+    ICING_LOG(ERROR) << "Bad number of free lists";
+    return false;
+  }
+
+  for (int i = 0; i < kNumNextAllocationBuckets; i++) {
+    if (hdr.free_lists(i) != kInvalidNextIndex &&
+        hdr.free_lists(i) >= hdr.max_nexts()) {
+      ICING_LOG(ERROR) << "Free list index out of bounds";
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Remaining capacity in each region (max declared in header minus current
+// usage). Header Verify() guarantees num <= max, so no underflow here.
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::nodes_left() const {
+  return hdr_.hdr.max_nodes() - hdr_.hdr.num_nodes();
+}
+
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::nexts_left() const {
+  return hdr_.hdr.max_nexts() - hdr_.hdr.num_nexts();
+}
+
+uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::suffixes_left() const {
+  return hdr_.hdr.max_suffixes_size() - hdr_.hdr.suffixes_size();
+}
+
+// Copies the dirty-page counters of each region into *stats.
+void IcingDynamicTrie::IcingDynamicTrieStorage::FillDirtyPageStats(
+    Stats *stats) const {
+  stats->dirty_pages_nodes = array_storage_[NODE].num_dirty_pages();
+  stats->dirty_pages_nexts = array_storage_[NEXT].num_dirty_pages();
+  stats->dirty_pages_suffixes = array_storage_[SUFFIX].num_dirty_pages();
+}
+
+// Dumper.
+// Debug helper: walks the whole trie, pretty-printing its structure and
+// collecting every stored key.
+class IcingDynamicTrie::Dumper {
+ public:
+  explicit Dumper(const IcingDynamicTrie &trie)
+      : all_props_(trie), del_prop_(trie), storage_(trie.storage_.get()) {}
+
+  void Dump(std::ostream *pretty_print, vector<std::string> *keys) const {
+    if (storage_->empty()) {
+      *pretty_print << "(empty)\n";
+    } else {
+      DumpNodeRecursive("", *storage_->GetRootNode(), 0, pretty_print, keys);
+    }
+  }
+
+ private:
+  // Renders the value bytes that follow a suffix's NUL terminator as hex,
+  // plus deletion status and the set of property ids holding the value.
+  std::string SuffixToValueAsString(const char *suffix) const {
+    int suffix_len = strlen(suffix);
+    std::string ret;
+    ret.reserve(storage_->value_size() * 2);
+    for (uint32_t i = 0; i < storage_->value_size(); i++) {
+      IcingStringUtil::SStringAppendF(&ret, 10, "%02x",
+                                      suffix[suffix_len + 1 + i]);
+    }
+
+    // Now dump set properties.
+    uint32_t value_index = storage_->GetSuffixIndex(suffix + suffix_len + 1);
+    if (del_prop_.HasProperty(value_index)) {
+      ret += " (deleted)";
+    }
+    ret += " [";
+    for (size_t i = 0; i < all_props_.size(); i++) {
+      if (all_props_.HasProperty(i, value_index)) {
+        IcingStringUtil::SStringAppendF(&ret, 10, "%zu", i);
+      }
+    }
+    ret += ']';
+
+    return ret;
+  }
+
+  // Inputs:
+  //   prefix - the key prefix of the current node (so we can rebuild the key)
+  //   node - the node we're at
+  //   level - how many levels deep we are in the trie
+  //   ret - the stream to pretty print to
+  //   keys - the keys encountered are appended to this
+  void DumpNodeRecursive(const std::string &prefix, const Node &node, int level,
+                         std::ostream *ret, vector<std::string> *keys) const {
+    if (node.is_leaf()) {
+      // Dump suffix and value.
+      for (int i = 0; i < level; i++) {
+        *ret << ' ';
+      }
+      const char *suffix = storage_->GetSuffix(node.next_index());
+      *ret << suffix;
+      *ret << ' ';
+      *ret << SuffixToValueAsString(suffix);
+      *ret << '\n';
+      keys->push_back(prefix + suffix);
+    } else {
+      // Go through each child (next) node. Print char and recursively
+      // print trie underneath.
+      for (uint32_t i = 0; i < (1U << node.log2_num_children()); i++) {
+        const Next &next = *storage_->GetNext(node.next_index(), i);
+        // Children are sorted with invalid entries (0xff-filled) at the end,
+        // so the first invalid node index terminates the scan.
+        if (next.node_index() == kInvalidNodeIndex) break;
+        for (int j = 0; j < level; j++) {
+          *ret << ' ';
+        }
+        std::string new_prefix = prefix;
+        if (next.val()) {
+          *ret << static_cast<char>(next.val());
+          new_prefix += next.val();
+        } else {
+          // val == 0 marks end-of-key; print a placeholder instead.
+          *ret << "null";
+        }
+        *ret << '\n';
+        DumpNodeRecursive(new_prefix, *storage_->GetNode(next.node_index()),
+                          level + 1, ret, keys);
+      }
+    }
+  }
+
+  PropertyReadersAll all_props_;
+  PropertyDeletedReader del_prop_;
+  const IcingDynamicTrie::IcingDynamicTrieStorage *storage_;
+};
+
+// IcingDynamicTrie.
+//
+// Constructor only records configuration and derives the filenames for the
+// property bitmaps and the deleted bitmap; no files are touched until
+// Init()/CreateIfNotExist().
+IcingDynamicTrie::IcingDynamicTrie(const std::string &filename_base,
+                                   const RuntimeOptions &runtime_options,
+                                   const IcingFilesystem *filesystem)
+    : IIcingStorage(),
+      filename_base_(filename_base),
+      is_initialized_(false),
+      runtime_options_(runtime_options),
+      storage_(nullptr),
+      property_bitmaps_prefix_(filename_base_ + ".prop."),
+      deleted_bitmap_filename_(filename_base_ + ".deleted"),
+      deleted_bitmap_(nullptr),
+      filesystem_(filesystem) {}
+
+// Close() flushes crcs and releases storage; safe if never initialized.
+IcingDynamicTrie::~IcingDynamicTrie() { Close(); }
+
+// Maps the trie storage and loads the property/deleted bitmaps. Idempotent:
+// returns true immediately if already initialized. On any failure the
+// storage is released and the trie stays uninitialized.
+bool IcingDynamicTrie::Init() {
+  if (is_initialized_) return true;
+
+  if (storage_ != nullptr) {
+    ICING_LOG(FATAL) << "Storage is not null before initialization";
+  }
+
+  storage_ = std::make_unique<IcingDynamicTrieStorage>(
+      filename_base_, runtime_options_, filesystem_);
+  if (!storage_->Init() || !InitPropertyBitmaps()) {
+    storage_.reset();
+    return false;
+  }
+  is_initialized_ = true;
+  return true;
+}
+
+// Creates the backing storage files with the given options if they do not
+// already exist. Does NOT initialize this instance; callers still need
+// Init(). Returns false if options are invalid or creation fails.
+bool IcingDynamicTrie::CreateIfNotExist(const Options &options) {
+  // Initialized means exists.
+  if (is_initialized_) return true;
+
+  if (!options.is_valid()) {
+    ICING_LOG(ERROR) << "Trie options invalid";
+    return false;
+  }
+
+  // Use a temporary storage object: we only want the side effect of file
+  // creation here, not a mapped instance.
+  auto storage = std::make_unique<IcingDynamicTrieStorage>(
+      filename_base_, runtime_options_, filesystem_);
+  return storage->CreateIfNotExist(options);
+}
+
+// Updates crcs, then releases the storage and all bitmaps. No-op when not
+// initialized, so it is safe to call repeatedly (e.g. from the destructor).
+void IcingDynamicTrie::Close() {
+  if (!is_initialized_) return;
+
+  // Persist crcs so a later Init() can verify the files.
+  UpdateCrc();
+
+  storage_.reset();
+  property_bitmaps_.clear();
+  deleted_bitmap_.reset();
+  is_initialized_ = false;
+}
+
+// Deletes every file backing this trie: the storage files, all property
+// bitmap files matching the property prefix, and the deleted bitmap.
+// Closes the trie first if needed. Returns false if any deletion failed.
+bool IcingDynamicTrie::Remove() {
+  if (is_initialized()) {
+    Close();
+  }
+
+  bool success = true;
+
+  // Remove storage files.
+  if (!IcingDynamicTrieStorage::Remove(filename_base_, *filesystem_)) {
+    success = false;
+  }
+
+  // Also remove property bitmaps.
+  vector<std::string> files;
+  if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
+                                     &files)) {
+    // Can't even enumerate the bitmap files; give up early.
+    return false;
+  }
+  for (size_t i = 0; i < files.size(); i++) {
+    if (!filesystem_->DeleteFile(files[i].c_str())) success = false;
+  }
+  // And deleted bitmap.
+  if (!filesystem_->DeleteFile(deleted_bitmap_filename_.c_str()))
+    success = false;
+
+  return success;
+}
+
+// Flushes all mmapped state (property bitmaps, deleted bitmap, storage) to
+// disk, then re-warms the mapping. Returns false if any component failed
+// to sync, but always attempts all of them.
+bool IcingDynamicTrie::Sync() {
+  if (!is_initialized_) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  bool success = true;
+  IcingTimer timer;
+
+  // Sync property bitmaps. Slots can be null for property ids that were
+  // never set.
+  for (size_t i = 0; i < property_bitmaps_.size(); i++) {
+    if (property_bitmaps_[i]) {
+      if (!property_bitmaps_[i]->Sync()) success = false;
+    }
+  }
+  if (!deleted_bitmap_->Sync()) success = false;
+
+  // Sync storage.
+  if (!storage_->Sync()) success = false;
+
+  Warm();
+
+  ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+      "Syncing dynamic trie %s took %.3fms", filename_base_.c_str(),
+      timer.Elapsed() * 1000.);
+
+  return success;
+}
+
+// Sums the on-disk footprint of the deleted bitmap, all property bitmaps
+// and the trie storage. IncrementByOrSetInvalid propagates an invalid
+// (error) size from any component into the total.
+uint64_t IcingDynamicTrie::GetDiskUsage() const {
+  uint64_t total = 0;
+  // Property bitmaps.
+  IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(),
+                                           &total);
+
+  for (auto &bitmap : property_bitmaps_) {
+    // Skip property ids that never had a bitmap created.
+    if (bitmap == nullptr) continue;
+    IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total);
+  }
+
+  // Storage.
+  IcingFilesystem::IncrementByOrSetInvalid(storage_->GetDiskUsage(), &total);
+  return total;
+}
+
+// Opens and initializes a flash bitmap file, optionally verifying its crc.
+// Returns nullptr (after logging) if init or verification fails.
+std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap(
+    const std::string &filename, bool verify,
+    const IcingFilesystem *filesystem) {
+  auto bitmap = std::make_unique<IcingFlashBitmap>(filename, filesystem);
+  if (!bitmap->Init() || (verify && !bitmap->Verify())) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Init of %s failed",
+                                                      filename.c_str());
+    return nullptr;
+  }
+  return bitmap;
+}
+
+// Discovers and opens all property bitmap files plus the deleted bitmap,
+// truncating each past the last valid value index so that stale bits from
+// a previous, longer trie state are discarded. Only called from Init().
+// On failure all partially-opened bitmaps are released.
+bool IcingDynamicTrie::InitPropertyBitmaps() {
+  // Only called on init.
+  if (!property_bitmaps_.empty()) {
+    ICING_LOG(FATAL) << "Property bitmaps not empty before initialization";
+  }
+
+  if (deleted_bitmap_ != nullptr) {
+    ICING_LOG(FATAL) << "Deleted bitmap not null before initialization";
+  }
+
+  // Truncate property bitmap files at current value index. Last value
+  // is at suffixes_size - value_size(). We want to clear everything
+  // after that.
+  uint64_t truncate_idx =
+      storage_->hdr().suffixes_size() > 0
+          ? ValueIndexToPropertyBitmapIndex(storage_->hdr().suffixes_size() -
+                                            value_size()) +
+                1
+          : 0;
+
+  // Discover property bitmaps by scanning the dir.
+  vector<std::string> files;
+  if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
+                                     &files)) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Could not get files at prefix %s", property_bitmaps_prefix_.c_str());
+    goto failed;
+  }
+  for (size_t i = 0; i < files.size(); i++) {
+    // Decode property id from filename ("<prefix>.<id>"); skip files that
+    // don't parse cleanly.
+    size_t property_id_start_idx = files[i].rfind('.');
+    if (property_id_start_idx == std::string::npos) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Malformed filename %s",
+                                                        files[i].c_str());
+      continue;
+    }
+    property_id_start_idx++;  // skip dot
+    char *end;
+    uint32_t property_id =
+        strtol(files[i].c_str() + property_id_start_idx, &end, 10);  // NOLINT
+    // The id must consume the remainder of the filename.
+    if (!end || end != (files[i].c_str() + files[i].size())) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Malformed filename %s",
+                                                        files[i].c_str());
+      continue;
+    }
+    // Only verify crcs when running under the crc-checked storage policy.
+    std::unique_ptr<IcingFlashBitmap> bitmap = OpenAndInitBitmap(
+        files[i],
+        runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
+        filesystem_);
+    if (!bitmap) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Open prop bitmap failed: %s", files[i].c_str());
+      goto failed;
+    }
+    bitmap->Truncate(truncate_idx);
+    // Grow the slot vector on demand; unseen ids stay nullptr.
+    if (property_id >= property_bitmaps_.size()) {
+      property_bitmaps_.resize(property_id + 1);
+    }
+    property_bitmaps_[property_id] = std::move(bitmap);
+  }
+
+  deleted_bitmap_ = OpenAndInitBitmap(
+      deleted_bitmap_filename_,
+      runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
+      filesystem_);
+  if (!deleted_bitmap_) {
+    goto failed;
+  }
+  deleted_bitmap_->Truncate(truncate_idx);
+
+  return true;
+
+failed:
+  property_bitmaps_.clear();
+  deleted_bitmap_.reset();
+  return false;
+}
+
+// Pre-touches the storage mapping so subsequent lookups avoid page faults.
+void IcingDynamicTrie::Warm() const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  return storage_->Warm();
+}
+
+// Called when the device is about to sleep; flushes crcs so state can be
+// verified on wake.
+void IcingDynamicTrie::OnSleep() {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  // Update crcs so we can verify when we come back.
+  UpdateCrc();
+}
+
+IcingDynamicTrie::NewValueMap::~NewValueMap() {}
+
+// Copies every key whose value index old_tvi_to_new_value maps to a
+// non-null new value into *out (which must already be initialized), along
+// with its set properties, and records the old->new value-index mapping in
+// *old_to_new_tvi. Returns false if any insert or property copy fails.
+bool IcingDynamicTrie::Compact(
+    const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out,
+    std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const {
+  // old_to_new_tvi is dereferenced unconditionally below, so a null
+  // argument is a caller bug; fail fast like the other invariant checks
+  // in this file instead of logging and then crashing on the deref.
+  if (old_to_new_tvi == nullptr) {
+    ICING_LOG(FATAL) << "TVI is null";
+  }
+
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  PropertyReadersAll prop_readers(*this);
+
+  old_to_new_tvi->clear();
+  old_to_new_tvi->rehash(size() * 2);
+
+  // Iterate every key in this trie (empty prefix matches all).
+  for (Iterator it_all(*this, ""); it_all.IsValid(); it_all.Advance()) {
+    uint32_t value_index = it_all.GetValueIndex();
+    const void *new_value = old_tvi_to_new_value.GetNewValue(value_index);
+    // A null new value means this key is dropped by the compaction.
+    if (!new_value) continue;
+
+    uint32_t new_value_index;
+    if (!out->Insert(it_all.GetKey(), new_value, &new_value_index, false)) {
+      return false;
+    }
+
+    old_to_new_tvi->insert({value_index, new_value_index});
+
+    // Copy properties.
+    for (size_t i = 0; i < prop_readers.size(); i++) {
+      if (prop_readers.HasProperty(i, value_index)) {
+        if (!out->SetProperty(new_value_index, i)) {
+          // Ouch. We need to bail.
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+// Returns the number of keys stored in the trie (from the storage header).
+uint32_t IcingDynamicTrie::size() const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+  return storage_->hdr().num_keys();
+}
+
+// Walks the sub-trie rooted at "node" accumulating leaf/intermediate
+// counts, suffix usage, per-fanout child counts and wasted next-array
+// slots into *stats.
+void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
+                                             Stats *stats) const {
+  if (node.is_leaf()) {
+    stats->num_leaves++;
+    const char *suffix = storage_->GetSuffix(node.next_index());
+    // +1 for the suffix terminator, plus the inline value bytes.
+    stats->suffixes_used += strlen(suffix) + 1 + value_size();
+    if (!suffix[0]) {
+      stats->null_suffixes++;
+    }
+  } else {
+    stats->num_intermediates++;
+    uint32_t i = 0;
+    for (; i < (1U << node.log2_num_children()); i++) {
+      const Next &next = *storage_->GetNext(node.next_index(), i);
+      // Packed array: first invalid entry ends the child list.
+      if (next.node_index() == kInvalidNodeIndex) break;
+      CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats);
+    }
+
+    // At least one valid node in each next array
+    if (i == 0) {
+      ICING_LOG(FATAL) << "No valid node in 'next' array";
+    }
+
+    stats->child_counts[i - 1]++;
+    // Allocated-but-unused next slots in this array.
+    stats->wasted[node.log2_num_children()] +=
+        (1 << node.log2_num_children()) - i;
+    stats->total_wasted += (1 << node.log2_num_children()) - i;
+  }
+}
+
+// Fills *stats with header counters, a full trie traversal, free-list
+// occupancy and dirty-page counts. Overwrites all previous contents.
+void IcingDynamicTrie::CollectStats(Stats *stats) const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  // Stats is a POD aggregate; start from all-zeros.
+  memset(stats, 0, sizeof(*stats));
+
+  stats->num_keys = storage_->hdr().num_keys();
+  stats->num_nodes = storage_->hdr().num_nodes();
+  stats->max_nodes = storage_->hdr().max_nodes();
+  stats->num_nexts = storage_->hdr().num_nexts();
+  stats->max_nexts = storage_->hdr().max_nexts();
+  stats->suffixes_size = storage_->hdr().suffixes_size();
+  stats->max_suffixes_size = storage_->hdr().max_suffixes_size();
+
+  // Stats collected from traversing the trie.
+  if (!storage_->empty()) {
+    CollectStatsRecursive(*storage_->GetRootNode(), stats);
+  }
+
+  // Free-list stats. Each bucket i holds freed next arrays of size 2^i,
+  // chained through the first entry's node_index.
+  for (int i = 0; i < kNumNextAllocationBuckets; i++) {
+    for (uint32_t cur = storage_->hdr().free_lists(i); cur != kInvalidNextIndex;
+         cur = storage_->GetNext(cur, 0)->next_index()) {
+      stats->num_free[i]++;
+    }
+    stats->total_free += stats->num_free[i] * (1 << i);
+  }
+
+  // Dirty page counts.
+  storage_->FillDirtyPageStats(stats);
+}
+
+// Renders the collected stats as human-readable text. verbosity > 0 adds
+// free-list, child-count, and fragmentation detail.
+std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
+  std::string ret;
+  IcingStringUtil::SStringAppendF(
+      &ret, 0,
+      "Keys %u "
+      "Nodes (%u/%u) %.3f%% "
+      "Nexts (%u/%u) %.3f%% "
+      "Suffixes (%u/%u) %.3f%%\n",
+      num_keys, num_nodes, max_nodes,
+      100. * math_util::SafeDivide(num_nodes, max_nodes), num_nexts, max_nexts,
+      100. * math_util::SafeDivide(num_nexts, max_nexts), suffixes_size,
+      max_suffixes_size,
+      100. * math_util::SafeDivide(suffixes_size, max_suffixes_size));
+
+  if (verbosity > 0) {
+    // Free-list occupancy per power-of-two bucket.
+    for (int i = 0; i < kNumNextAllocationBuckets; i++) {
+      if (num_free[i] > 0) {
+        IcingStringUtil::SStringAppendF(&ret, 0, "Freelist@%d: %u\n", 1 << i,
+                                        num_free[i]);
+      }
+    }
+    IcingStringUtil::SStringAppendF(
+        &ret, 0, "Freelist total: %u/%u %.3f%%\n", total_free, num_nexts,
+        100. * math_util::SafeDivide(total_free, num_nexts));
+
+    // Histogram of intermediate-node fanout.
+    for (int i = 0; i < 256; i++) {
+      if (child_counts[i] > 0) {
+        IcingStringUtil::SStringAppendF(&ret, 0, "Child count@%d: %u\n", i + 1,
+                                        child_counts[i]);
+      }
+    }
+    for (int i = 0; i < kNumNextAllocationBuckets; i++) {
+      IcingStringUtil::SStringAppendF(&ret, 0, "Wasted@%d: %u\n", 1 << i,
+                                      wasted[i]);
+    }
+    IcingStringUtil::SStringAppendF(
+        &ret, 0,
+        "Wasted total: %u\n"
+        "Num intermediates %u num leaves %u "
+        "suffixes used %u null %u\n"
+        "Total next frag: %.3f%%\n",
+        total_wasted, num_intermediates, num_leaves, suffixes_used,
+        null_suffixes,
+        100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
+  }
+  IcingStringUtil::SStringAppendF(
+      &ret, 0, "Memory usage: %zu/%zu bytes\n",
+      num_nodes * sizeof(Node) + num_nexts * sizeof(Next) + suffixes_size,
+      max_nodes * sizeof(Node) + max_nexts * sizeof(Next) + max_suffixes_size);
+
+  IcingStringUtil::SStringAppendF(
+      &ret, 0, "Dirty pages: nodes %u/%.0f nexts %u/%.0f suffixes %u/%.0f\n",
+      dirty_pages_nodes,
+      math_util::SafeDivide(num_nodes * sizeof(Node) + getpagesize() - 1,
+                            getpagesize()),
+      dirty_pages_nexts,
+      math_util::SafeDivide(num_nexts * sizeof(Next) + getpagesize() - 1,
+                            getpagesize()),
+      dirty_pages_suffixes,
+      math_util::SafeDivide(suffixes_size + getpagesize() - 1, getpagesize()));
+
+  return ret;
+}
+
+// Pretty-prints the whole trie to *pretty_print and appends every key to
+// *keys, via the Dumper helper.
+void IcingDynamicTrie::DumpTrie(std::ostream *pretty_print,
+                                vector<std::string> *keys) const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  Dumper dumper(*this);
+  dumper.Dump(pretty_print, keys);
+}
+
+// Removes all keys: clears the storage, deletes every property bitmap
+// file, and empties the deleted bitmap. The trie stays initialized.
+void IcingDynamicTrie::Clear() {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  storage_->Clear();
+  for (auto &bitmap : property_bitmaps_) {
+    if (bitmap) {
+      // Delete() removes the backing file; reset() drops the mapping.
+      bitmap->Delete();
+      bitmap.reset();
+    }
+  }
+  deleted_bitmap_->Truncate(0);
+}
+
+// Inserts key with the given value.
+//
+// Returns false when the storage arrays cannot conservatively fit the
+// insertion (callers are expected to grow/rotate). On success:
+//   - *value_index (optional) receives the index of the stored value;
+//   - if the key already existed, the value is overwritten only when
+//     replace == true;
+//   - *pnew_key (optional) is set to whether a brand-new key was added.
+//
+// Three structural cases: (1) empty trie — allocate the root leaf;
+// (2) best match is a leaf — split the shared prefix into a chain of
+// single-child nodes ending in a 2-way branch; (3) best match is an
+// intermediate node — append a new leaf to its (possibly re-allocated)
+// next array.
+bool IcingDynamicTrie::Insert(const char *key, const void *value,
+                              uint32_t *value_index, bool replace,
+                              bool *pnew_key) {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  if (pnew_key) *pnew_key = false;
+
+  // Find out ahead of time whether things will fit. A conservative
+  // check based on allocations made below.
+  //
+  // IMPORTANT: This needs to be updated if the alloc patterns below
+  // change.
+  size_t key_len = strlen(key);
+  if (!(storage_->nodes_left() >= 2 + key_len + 1 &&
+        storage_->nexts_left() >= 2 + key_len + 1 + kMaxNextArraySize &&
+        storage_->suffixes_left() >= key_len + 1 + value_size())) {
+    // No more space left.
+    return false;
+  }
+
+  uint32_t best_node_index;
+  int key_offset;
+  FindBestNode(key, &best_node_index, &key_offset, false);
+
+  // A negative key_offset indicates that storage_ is empty
+  if (key_offset < 0) {
+    // First key.
+    if (!storage_->empty()) {
+      ICING_LOG(FATAL) << "Key offset is negative but storage is not empty, "
+                          "there're inconsistencies in dynamic trie.";
+    }
+    Node *node = storage_->AllocNode();
+    node->set_next_index(storage_->MakeSuffix(key, value, value_index));
+    node->set_is_leaf(true);
+    node->set_log2_num_children(0);
+  } else if (storage_->GetNode(best_node_index)->is_leaf()) {
+    // Prefix in the trie. Split at leaf.
+    Node *split_node = storage_->GetMutableNode(best_node_index);
+    const char *prev_suffix = storage_->GetSuffix(split_node->next_index());
+
+    // Find the common prefix length.
+    const char *prev_suffix_cur = prev_suffix;
+    const char *key_cur = key + key_offset;
+    while (*prev_suffix_cur && *prev_suffix_cur == *key_cur) {
+      prev_suffix_cur++;
+      key_cur++;
+    }
+
+    // Equal strings?
+    if (*prev_suffix_cur == 0 && *key_cur == 0) {
+      // Update value if replace == true and return.
+      if (value_index) {
+        // Value bytes live immediately after the suffix terminator.
+        *value_index = storage_->GetSuffixIndex(prev_suffix_cur + 1);
+      }
+      if (replace) {
+        char *mutable_prev_suffix_cur = storage_->GetMutableSuffix(
+            storage_->GetSuffixIndex(prev_suffix_cur + 1), value_size());
+        memcpy(mutable_prev_suffix_cur, value, value_size());
+      }
+      return true;
+    }
+
+    if (*prev_suffix_cur == *key_cur) {
+      ICING_LOG(FATAL) << "The suffix cursor and key cursor should diverge "
+                          "after finding the common prefix.";
+    }
+
+    // Create single-branch children for the common prefix
+    // length. After the loop, split_node points to the node that
+    // will have more than 1 char.
+    int common_len = prev_suffix_cur - prev_suffix;
+    for (int i = 0; i < common_len; i++) {
+      // Create a single-branch child node.
+      Next *split_next = storage_->AllocNextArray(1);
+      split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
+      split_node->set_is_leaf(false);
+      split_node->set_log2_num_children(0);
+      Node *child_node = storage_->AllocNode();
+      split_next[0].set_val(*(prev_suffix + i));
+      split_next[0].set_node_index(storage_->GetNodeIndex(child_node));
+
+      split_node = child_node;
+    }
+
+    // Fill a split.
+    Next *split_next = storage_->AllocNextArray(2);
+    split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
+    split_node->set_is_leaf(false);
+    split_node->set_log2_num_children(1);
+    Node *prev_suffix_node = storage_->AllocNode();
+    Node *key_node = storage_->AllocNode();
+    split_next[0].set_val(*(prev_suffix + common_len));
+    split_next[0].set_node_index(storage_->GetNodeIndex(prev_suffix_node));
+    // The old leaf reuses its existing suffix storage: skip past the
+    // branch char unless the old suffix ends exactly at the split.
+    if (*(prev_suffix + common_len)) {
+      uint32_t next_index =
+          storage_->GetSuffixIndex(prev_suffix + common_len) + 1;
+      prev_suffix_node->set_next_index(next_index);
+    } else {
+      uint32_t next_index = storage_->GetSuffixIndex(prev_suffix + common_len);
+      prev_suffix_node->set_next_index(next_index);
+    }
+    prev_suffix_node->set_is_leaf(true);
+    prev_suffix_node->set_log2_num_children(0);
+    split_next[1].set_val(*(key + key_offset + common_len));
+    split_next[1].set_node_index(storage_->GetNodeIndex(key_node));
+    // The new leaf gets a freshly-made suffix, again skipping the branch
+    // char unless the key ends exactly at the split.
+    if (*(key + key_offset + common_len)) {
+      uint32_t next_index = storage_->MakeSuffix(
+          key + key_offset + common_len + 1, value, value_index);
+      key_node->set_next_index(next_index);
+    } else {
+      uint32_t next_index = storage_->MakeSuffix(key + key_offset + common_len,
+                                                 value, value_index);
+      key_node->set_next_index(next_index);
+    }
+    key_node->set_is_leaf(true);
+    key_node->set_log2_num_children(0);
+
+    // Next arrays must stay sorted by char value for LowerBound().
+    std::sort(split_next, split_next + 2);
+  } else {
+    // Insert into intermediate node.
+    const Node *best_node = storage_->GetNode(best_node_index);
+
+    // Add our value as a node + suffix.
+    Node *new_leaf_node = storage_->AllocNode();
+    if (*(key + key_offset)) {
+      uint32_t next_index =
+          storage_->MakeSuffix(key + key_offset + 1, value, value_index);
+      new_leaf_node->set_next_index(next_index);
+    } else {
+      uint32_t next_index =
+          storage_->MakeSuffix(key + key_offset, value, value_index);
+      new_leaf_node->set_next_index(next_index);
+    }
+    new_leaf_node->set_is_leaf(true);
+    new_leaf_node->set_log2_num_children(0);
+
+    // Figure out the real length of the existing next array.
+    Next *cur_next = storage_->GetMutableNextArray(
+        best_node->next_index(), 1 << best_node->log2_num_children());
+    int next_len = 0;
+    for (; next_len < (1 << best_node->log2_num_children()) &&
+           cur_next[next_len].node_index() != kInvalidNodeIndex;
+         next_len++) {
+    }
+    Next *new_next = cur_next;
+    if (next_len == (1 << best_node->log2_num_children())) {
+      // Allocate a new, larger, array.
+      new_next = storage_->AllocNextArray(next_len + 1);
+      memcpy(new_next, cur_next, sizeof(Next) * next_len);
+    }
+
+    // Write a link to our new leaf node and sort.
+    new_next[next_len].set_val(*(key + key_offset));
+    new_next[next_len].set_node_index(storage_->GetNodeIndex(new_leaf_node));
+    // The first next_len entries are already sorted; merge in the new one.
+    inplace_merge(new_next, new_next + next_len, new_next + next_len + 1);
+    next_len++;
+
+    // If this was new, update the parent node and free the old next
+    // array.
+    if (new_next != cur_next) {
+      Node *mutable_best_node =
+          storage_->GetMutableNode(storage_->GetNodeIndex(best_node));
+      mutable_best_node->set_next_index(storage_->GetNextArrayIndex(new_next));
+      mutable_best_node->set_is_leaf(false);
+      uint8_t log2_num_children = mutable_best_node->log2_num_children();
+
+      // 8 == log2(256)
+      if (log2_num_children >= 8) {
+        ICING_LOG(FATAL) << "Number of children exceeds the max allowed size";
+      }
+
+      mutable_best_node->set_log2_num_children(log2_num_children + 1);
+
+      storage_->FreeNextArray(cur_next,
+                              mutable_best_node->log2_num_children() - 1);
+    }
+  }
+
+  // We added a new key.
+  storage_->inc_num_keys();
+
+  if (pnew_key) *pnew_key = true;
+  return true;
+}
+
+// Returns a pointer to the value bytes stored at value_index (which must
+// have come from Insert/Find/an iterator).
+const void *IcingDynamicTrie::GetValueAtIndex(uint32_t value_index) const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  return static_cast<const void *>(storage_->GetSuffix(value_index));
+}
+
+// Overwrites the value_size() bytes stored at value_index with *value.
+void IcingDynamicTrie::SetValueAtIndex(uint32_t value_index,
+                                       const void *value) {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  // A full value must fit within the suffix region.
+  if (value_index > storage_->hdr().max_suffixes_size() - value_size()) {
+    ICING_LOG(FATAL) << "Value index is out of range";
+  }
+
+  memcpy(storage_->GetMutableSuffix(value_index, value_size()), value,
+         value_size());
+}
+
+// Looks up an exact key. On a hit, optionally copies the value into *value
+// and/or reports its index via *value_index, then returns true.
+bool IcingDynamicTrie::Find(const char *key, void *value,
+                            uint32_t *value_index) const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  uint32_t best_node_index;
+  int key_offset;
+  FindBestNode(key, &best_node_index, &key_offset, false);
+
+  // It is a hit only if we landed on a leaf whose remaining suffix matches
+  // the unmatched tail of the key exactly.
+  const Node *best_node = storage_->GetNode(best_node_index);
+  if (key_offset >= 0 && best_node->is_leaf() &&
+      !strcmp(key + key_offset, storage_->GetSuffix(best_node->next_index()))) {
+    // Value bytes follow the suffix's NUL terminator.
+    uint32_t vidx = best_node->next_index() +
+                    strlen(storage_->GetSuffix(best_node->next_index())) + 1;
+    if (value_index) *value_index = vidx;
+    if (value) memcpy(value, storage_->GetSuffix(vidx), value_size());
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Iterates all keys under the given prefix. Positions on the first match
+// immediately (via Reset()); check IsValid() before use.
+IcingDynamicTrie::Iterator::Iterator(const IcingDynamicTrie &trie,
+                                     const char *prefix)
+    : cur_key_(prefix),
+      cur_suffix_(nullptr),
+      cur_suffix_len_(0),
+      single_leaf_match_(false),
+      trie_(trie) {
+  if (!trie.is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  Reset();
+}
+
+// Descends from node_index to the left-most (lexicographically smallest)
+// leaf beneath it, pushing one Branch per intermediate node and appending
+// the traversed chars (then the leaf suffix) to cur_key_.
+void IcingDynamicTrie::Iterator::LeftBranchToLeaf(uint32_t node_index) {
+  // Go down the trie, following the left-most child until we hit a
+  // leaf. Push to stack and cur_key nodes and chars as we go.
+  for (; !trie_.storage_->GetNode(node_index)->is_leaf();
+       node_index =
+           trie_.storage_
+               ->GetNext(trie_.storage_->GetNode(node_index)->next_index(), 0)
+               ->node_index()) {
+    branch_stack_.push_back(Branch(node_index));
+    cur_key_.push_back(
+        trie_.storage_
+            ->GetNext(trie_.storage_->GetNode(node_index)->next_index(), 0)
+            ->val());
+  }
+
+  // We're at a leaf.
+  cur_suffix_ = trie_.storage_->GetSuffix(
+      trie_.storage_->GetNode(node_index)->next_index());
+  cur_suffix_len_ = strlen(cur_suffix_);
+  cur_key_.append(cur_suffix_, cur_suffix_len_);
+}
+
+// Rewinds the iterator to the first key matching the original prefix.
+// Trims any traversal state appended to cur_key_ by previous iteration,
+// then re-seeks from the trie root.
+void IcingDynamicTrie::Iterator::Reset() {
+  size_t strip_len = branch_stack_.size() + cur_suffix_len_;
+
+  if (cur_key_.size() < strip_len) {
+    ICING_LOG(FATAL) << "Key size < visited trie depth + remaining suffix "
+                        "size, there're inconsistencies in dynamic trie";
+  }
+
+  // Trim back cur_key_ to original prefix.
+  cur_key_.resize(cur_key_.size() - strip_len);
+  cur_suffix_ = nullptr;
+  cur_suffix_len_ = 0;
+  single_leaf_match_ = false;
+  branch_stack_.clear();
+
+  // Nothing to do with an empty trie.
+  if (trie_.storage_->empty()) return;
+
+  // Find node matching prefix.
+  uint32_t node_index;
+  int key_offset;
+  trie_.FindBestNode(cur_key_.c_str(), &node_index, &key_offset, true);
+
+  // Two cases/states:
+  //
+  // - Found an intermediate node. If we matched all of prefix
+  // (cur_key_), LeftBranchToLeaf.
+  //
+  // - Found a leaf node, which is the ONLY matching key for this
+  // prefix. Check that suffix matches the prefix. Then we set
+  // single_leaf_match_ = true and apply different logic for
+  // Advance.
+  if (key_offset < 0) {
+    // A negative key_offset indicates that trie_.storage_ is empty
+    ICING_LOG(FATAL) << "Trie storage is empty";
+  }
+
+  const Node *best_node = trie_.storage_->GetNode(node_index);
+  if (best_node->is_leaf() &&
+      !strncmp(cur_key_.c_str() + key_offset,
+               trie_.storage_->GetSuffix(best_node->next_index()),
+               cur_key_.size() - key_offset)) {
+    // Copy the entire suffix into the current key.
+    cur_key_.resize(key_offset);
+    cur_key_.append(trie_.storage_->GetSuffix(best_node->next_index()));
+    cur_suffix_ = trie_.storage_->GetSuffix(best_node->next_index());
+    cur_suffix_len_ = strlen(cur_suffix_);
+    single_leaf_match_ = true;
+  } else if (static_cast<size_t>(key_offset) == cur_key_.size()) {
+    LeftBranchToLeaf(node_index);
+  }
+  // Otherwise the prefix matches nothing: iterator stays invalid.
+}
+
+// Moves to the next key in lexicographic order. Returns false (and
+// invalidates the iterator) when iteration is exhausted.
+bool IcingDynamicTrie::Iterator::Advance() {
+  if (!IsValid()) return false;
+  if (single_leaf_match_) {
+    // If we only have an exact match, the Advance logic does not
+    // apply. Invalidate the iterator and return.
+    cur_suffix_ = nullptr;
+    cur_suffix_len_ = 0;
+    return false;
+  }
+
+  if (cur_key_.size() < (branch_stack_.size() + cur_suffix_len_)) {
+    ICING_LOG(FATAL) << "Key size < visited trie depth + remaining suffix "
+                        "size, there're inconsistencies in dynamic trie";
+  }
+
+  // Move up from the current leaf.
+  cur_key_.resize(cur_key_.size() - cur_suffix_len_);
+  cur_suffix_ = nullptr;
+  cur_suffix_len_ = 0;
+
+  // Backtrack: pop branches until one has an unvisited sibling child.
+  while (!branch_stack_.empty()) {
+    Branch *branch = &branch_stack_.back();
+    const Node *node = trie_.storage_->GetNode(branch->node_idx);
+    branch->child_idx++;
+    if (branch->child_idx < (1 << node->log2_num_children()) &&
+        trie_.storage_->GetNext(node->next_index(), branch->child_idx)
+                ->node_index() != kInvalidNodeIndex) {
+      // Successfully incremented to the next child. Update the char
+      // value at this depth.
+      cur_key_[cur_key_.size() - 1] =
+          trie_.storage_->GetNext(node->next_index(), branch->child_idx)->val();
+      // We successfully found a sub-trie to explore.
+      LeftBranchToLeaf(
+          trie_.storage_->GetNext(node->next_index(), branch->child_idx)
+              ->node_index());
+      return true;
+    }
+    branch_stack_.pop_back();
+    cur_key_.resize(cur_key_.size() - 1);
+  }
+
+  // Un-wound the entire stack. We are done.
+  return false;
+}
+
+// Valid whenever we are positioned on a leaf suffix.
+bool IcingDynamicTrie::Iterator::IsValid() const {
+  return cur_suffix_ != nullptr;
+}
+
+const char *IcingDynamicTrie::Iterator::GetKey() const {
+  // cur_key_ can have a NULL in it so cur_key_ can be wrong but
+  // cur_key_.c_str() is always right.
+  return IsValid() ? cur_key_.c_str() : nullptr;
+}
+
+// Value bytes live immediately after the suffix's NUL terminator.
+const void *IcingDynamicTrie::Iterator::GetValue() const {
+  if (!IsValid()) return nullptr;
+
+  return static_cast<const void *>(cur_suffix_ + cur_suffix_len_ + 1);
+}
+
+uint32_t IcingDynamicTrie::Iterator::GetValueIndex() const {
+  if (!IsValid()) return kInvalidSuffixIndex;
+
+  return trie_.storage_->GetSuffixIndex(cur_suffix_ + cur_suffix_len_ + 1);
+}
+
+// From the branch at the top of the stack, extends cur_ with left-most
+// continuation bytes until the current UTF-8 character is complete (or we
+// hit UTFmax / a leaf). Leaves cur_ NUL-terminated and records where the
+// logical character ends (node + suffix offset) in cur_logical_node_.
+void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() {
+  if (cur_len_ <= 0) {
+    ICING_LOG(FATAL) << "Invalid UTF-8 character length";
+  }
+
+  if (branch_end_ - branch_stack_ != cur_len_) {
+    ICING_LOG(FATAL) << "Depth from first visited node to last visited node "
+                        "doesn't match the current UTF-8 character length";
+  }
+
+  // Use branch at top of stack to determine where to follow.
+  const Branch &branch = *(branch_end_ - 1);
+  const Node *node = trie_.storage_->GetNode(branch.child->node_index());
+
+  // If we start with non-ascii, take all left branches while there is
+  // a continuation byte.
+  if (!IcingStringUtil::IsAsciiChar(cur_[cur_len_ - 1])) {
+    while (!node->is_leaf()) {
+      if (ABSL_PREDICT_FALSE(cur_len_ >= UTFmax)) break;
+
+      InitBranch(branch_end_, node, 0);
+      // When we are looking to complete a utf8 char, skip 0s.
+      if (branch_end_->child->val() == 0) {
+        // Check if we already have a valid cur_.
+        cur_[cur_len_] = 0;
+        Rune rune;
+        chartorune(&rune, cur_);
+        if (rune == Runeerror && node->log2_num_children() > 0) {
+          // Partial char so far; skip the terminator child and keep going.
+          branch_end_->child++;
+        } else {
+          // Good termination. Just break.
+          break;
+        }
+      }
+
+      if (!IcingStringUtil::IsContinuationByte(branch_end_->child->val()))
+        break;
+
+      cur_[cur_len_++] = branch_end_->child->val();
+      node = trie_.storage_->GetNode(branch_end_->child->node_index());
+      branch_end_++;
+    }
+
+    cur_logical_node_.node = node;
+
+    // Maybe go into suffixes and set suffix_offset.
+    if (node->is_leaf()) {
+      GoIntoSuffix(node);
+    } else {
+      cur_logical_node_.suffix_offset = 0;
+    }
+  } else {  // ascii
+    cur_logical_node_.node = node;
+    cur_logical_node_.suffix_offset = 0;
+  }
+
+  // NULL-terminate.
+  cur_[cur_len_] = 0;
+}
+
+// Consumes continuation bytes from the leaf's suffix into cur_ and records
+// how far into the suffix the current character extends.
+void IcingDynamicTrie::Utf8Iterator::GoIntoSuffix(const Node *node) {
+  const char *suffix = trie_.storage_->GetSuffix(node->next_index());
+  const char *cur_suffix;
+  for (cur_suffix = suffix; ABSL_PREDICT_TRUE(cur_len_ < UTFmax) &&
+                            IcingStringUtil::IsContinuationByte(*cur_suffix);
+       cur_suffix++) {
+    cur_[cur_len_++] = *cur_suffix;
+  }
+  cur_logical_node_.suffix_offset = cur_suffix - suffix;
+}
+
+// Rewinds to the first UTF-8 character under start_node_, or invalidates
+// the iterator when there is no start node.
+void IcingDynamicTrie::Utf8Iterator::Reset() {
+  cur_[0] = 0;
+  cur_len_ = 0;
+  branch_end_ = branch_stack_;
+
+  if (start_node_) {
+    // Take the first char node's children.
+    const Next *next = trie_.storage_->GetNext(start_node_->next_index(), 0);
+    branch_end_->node = start_node_;
+    branch_end_->child_end = next + (1 << start_node_->log2_num_children());
+    if (next->val() == 0) {
+      // Skip any nulls at this position. We don't return empty string
+      // as an iteration.
+      next++;
+    }
+    branch_end_->child = next;
+    cur_[cur_len_++] = next->val();
+    branch_end_++;
+
+    // Will NULL-terminate cur_.
+    LeftBranchToUtf8End();
+  } else {
+    // Nothing to return.
+    cur_logical_node_.node = nullptr;
+    cur_logical_node_.suffix_offset = 0;
+  }
+}
+
+// Moves to the next UTF-8 character. Returns false when the branch stack
+// is exhausted (iteration done).
+bool IcingDynamicTrie::Utf8Iterator::Advance() {
+  if (!IsValid()) return false;
+
+  // Clip to branch.
+  cur_len_ = branch_end_ - branch_stack_;
+
+  // Backtrack until a branch still has an unfinished sibling child.
+  while (branch_end_ > branch_stack_) {
+    Branch *branch = branch_end_ - 1;
+    branch->child++;
+    if (!branch->IsFinished()) {
+      // Successfully incremented to the next child. Update the char
+      // value at this depth.
+      cur_[cur_len_ - 1] = branch->child->val();
+
+      // We successfully found a sub-trie to explore.
+      LeftBranchToUtf8End();
+      return true;
+    }
+    cur_len_--;
+    branch_end_--;
+  }
+
+  // Un-wound the entire stack. We are done.
+  return false;
+}
+
+// Points *branch at start's next array; when key_char is non-zero, skips
+// ahead to the first child whose char is >= key_char.
+void IcingDynamicTrie::Utf8Iterator::InitBranch(Branch *branch,
+                                                const Node *start,
+                                                char key_char) {
+  branch->node = start;
+  branch->child = trie_.storage_->GetNext(start->next_index(), 0);
+  branch->child_end = branch->child + (1 << start->log2_num_children());
+  if (key_char) {
+    branch->child =
+        trie_.LowerBound(branch->child, branch->child_end, key_char);
+  }
+}
+
+// A branch is exhausted past its last slot or at the first invalid entry.
+bool IcingDynamicTrie::Utf8Iterator::Branch::IsFinished() {
+  return child >= child_end || child->node_index() == kInvalidNodeIndex;
+}
+
+// Valid while cur_ holds at least one byte of the current character.
+bool IcingDynamicTrie::Utf8Iterator::IsValid() const { return cur_len_ > 0; }
+
+// Returns the child entry of "node" whose char equals key_char, or nullptr
+// when no such child exists.
+const IcingDynamicTrie::Next *IcingDynamicTrie::GetNextByChar(
+    const Node *node, uint8_t key_char) const {
+  const Next *next_start = storage_->GetNext(node->next_index(), 0);
+  const Next *next_end = next_start + (1 << node->log2_num_children());
+
+  const Next *found = LowerBound(next_start, next_end, key_char);
+  // LowerBound returns the first entry >= key_char; it is only a hit if it
+  // matches exactly and points at a live node.
+  if (found >= next_end || found->val() != key_char ||
+      found->node_index() == kInvalidNodeIndex) {
+    return nullptr;
+  }
+
+  return found;
+}
+
+// Returns the first entry in [start, end) whose char is >= key_char (the
+// arrays are kept sorted by Insert).
+const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound(
+    const Next *start, const Next *end, uint8_t key_char) const {
+  // Above this value will use binary search instead of linear
+  // search. 16 was chosen from running some benchmarks with
+  // different values.
+  static const uint32_t kBinarySearchCutoff = 16;
+
+  if (end - start >= kBinarySearchCutoff) {
+    // Binary search.
+    Next key_next(key_char, 0);
+    return lower_bound(start, end, key_next);
+  } else {
+    // Linear search.
+    const Next *found;
+    for (found = start; found < end; found++) {
+      if (found->val() >= key_char) {
+        // Should have gotten match.
+        break;
+      }
+    }
+    return found;
+  }
+}
+
+// Descends the trie as far as "key" matches, reporting the deepest node
+// reached and how many key chars were consumed.
+void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
+                                    int *key_offset, bool prefix) const {
+  // Find the best node such that:
+  //
+  // - If key is NOT in the trie, key[0..key_offset) is a prefix to
+  // everything under best_node_index.
+  //
+  // - If key is in the trie, best_node_index is the leaf that points
+  // to the key suffix and key_offset == strlen(key).
+  //
+  // If prefix is true, when key is both in the trie AND a prefix
+  // (e.g. "ab" and "abc" are in the trie), we return the intermediate
+  // node with key as the prefix as opposed to the exactly matching
+  // leaf node.
+  if (storage_->empty()) {
+    // Empty trie is signalled by a negative key_offset.
+    *best_node_index = 0;
+    *key_offset = -1;
+    return;
+  }
+
+  const Node *cur_node = storage_->GetRootNode();
+  const char *cur_key = key;
+  while (!cur_node->is_leaf()) {
+    const Next *found = GetNextByChar(cur_node, *cur_key);
+    if (!found) break;
+
+    // In prefix mode, stop at the branch point instead of following the
+    // zero-char (exact-match) child.
+    if (prefix && found->val() == 0) {
+      break;
+    }
+
+    cur_node = storage_->GetNode(found->node_index());
+
+    // End of key.
+    if (*cur_key == 0) {
+      break;
+    }
+    cur_key++;
+  }
+
+  *best_node_index = storage_->GetNodeIndex(cur_node);
+  *key_offset = reinterpret_cast<const char *>(cur_key) - key;
+}
+
+// Appends human-readable trie stats plus the size of every backing bitmap
+// file to *out.
+void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
+  Stats stats;
+  CollectStats(&stats);
+  out->append(stats.DumpStats(verbosity));
+
+  // Property files.
+  vector<std::string> files;
+  if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
+                                     &files)) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Could not get files at prefix %s", property_bitmaps_prefix_.c_str());
+    return;
+  }
+  for (size_t i = 0; i < files.size(); i++) {
+    IcingStringUtil::SStringAppendF(
+        out, 1000, "Prop file %s size %" PRIu64 "\n",
+        filesystem_->GetBasename(files[i].c_str()).c_str(),
+        filesystem_->GetFileSize(files[i].c_str()));
+  }
+  IcingStringUtil::SStringAppendF(
+      out, 1000, "Deleted file %s size %" PRIu64 "\n",
+      filesystem_->GetBasename(deleted_bitmap_filename_.c_str()).c_str(),
+      filesystem_->GetFileSize(deleted_bitmap_filename_.c_str()));
+}
+
+// Fraction of free space remaining, determined by whichever of the three
+// fixed-size arrays (nodes, nexts, suffixes) is fullest.
+double IcingDynamicTrie::min_free_fraction() const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  double node_fill = static_cast<double>(storage_->hdr().num_nodes()) /
+                     storage_->hdr().max_nodes();
+  double next_fill = static_cast<double>(storage_->hdr().num_nexts()) /
+                     storage_->hdr().max_nexts();
+  double suffix_fill = static_cast<double>(storage_->hdr().suffixes_size()) /
+                       storage_->hdr().max_suffixes_size();
+  return 1.0 - max(node_fill, max(next_fill, suffix_fill));
+}
+
+// Size in bytes of the value stored with each key, as recorded in the
+// persisted header (fixed at creation time via Options::value_size).
+uint32_t IcingDynamicTrie::value_size() const {
+  return storage_->hdr().value_size();
+}
+
+// Upper bound on value indices; mirrors the static
+// max_value_index(const Options &) helper declared in the header.
+uint32_t IcingDynamicTrie::max_value_index() const {
+  return storage_->hdr().max_suffixes_size();
+}
+
+// Updates and returns the combined crc over the trie storage and all
+// bitmaps, or kNoCrc unless running under kMapSharedWithCrc. The order of
+// crc folding below (storage, then property bitmaps by index, then the
+// deleted bitmap) is part of the persisted format — do not reorder.
+uint32_t IcingDynamicTrie::UpdateCrc() {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  if (runtime_options_.storage_policy != RuntimeOptions::kMapSharedWithCrc) {
+    return kNoCrc;
+  }
+
+  // Combine storage crc with property bitmap crcs.
+  uint32_t crc = storage_->UpdateCrc();
+
+  // Update crcs on bitmaps.
+  for (size_t i = 0; i < property_bitmaps_.size(); ++i) {
+    if (property_bitmaps_[i]) {
+      // Combine property id with the bitmap crc so identical bitmaps under
+      // different ids yield different combined crcs.
+      uint64_t this_crc = property_bitmaps_[i]->UpdateCrc();
+      this_crc = (this_crc << 32) | i;
+      crc = IcingStringUtil::UpdateCrc32(
+          crc, reinterpret_cast<const char *>(&this_crc), sizeof(this_crc));
+    }
+  }
+  // The deleted bitmap is folded in last, with no id attached.
+  uint32_t this_crc = deleted_bitmap_->UpdateCrc();
+  crc = IcingStringUtil::UpdateCrc32(
+      crc, reinterpret_cast<const char *>(&this_crc), sizeof(this_crc));
+
+  return crc;
+}
+
+// Returns the bitmap for property_id, opening/creating and caching it on
+// first use. Returns nullptr if property_id exceeds kMaxPropertyId or the
+// bitmap file could not be opened. Returned pointer is owned by
+// property_bitmaps_ and stays valid while the trie is open.
+IcingFlashBitmap *IcingDynamicTrie::OpenOrCreatePropertyBitmap(
+    uint32_t property_id) {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  if (property_id > kMaxPropertyId) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Property id %u out of range", property_id);
+    return nullptr;
+  }
+
+  // Grow the cache so property_id is a valid slot.
+  if (property_id >= property_bitmaps_.size()) {
+    property_bitmaps_.resize(property_id + 1);
+  }
+  if (!property_bitmaps_[property_id]) {
+    // Bitmap filename is the shared prefix followed by the decimal id.
+    std::string filename;
+    IcingStringUtil::SStringAppendF(
+        &filename, property_bitmaps_prefix_.size() + 10, "%s%u",
+        property_bitmaps_prefix_.c_str(), property_id);
+    property_bitmaps_[property_id] =
+        OpenAndInitBitmap(filename, false, filesystem_);
+  }
+  return property_bitmaps_[property_id].get();
+}
+
+// Sets property_id on the value at value_index and clears its deleted bit.
+// Returns false if the property bitmap cannot be opened/created or either
+// bitmap write fails. Note the short-circuit below: if setting the
+// property bit fails, the deleted bit is left untouched.
+bool IcingDynamicTrie::SetProperty(uint32_t value_index, uint32_t property_id) {
+  IcingFlashBitmap *bitmap = OpenOrCreatePropertyBitmap(property_id);
+  if (!bitmap) {
+    return false;
+  }
+  uint64_t idx = ValueIndexToPropertyBitmapIndex(value_index);
+
+  // Also clear deleted bit.
+  return bitmap->SetBit(idx, true) && deleted_bitmap_->SetBit(idx, false);
+}
+
+// Clears property_id from the value at value_index. A property whose
+// bitmap was never materialized is trivially clear, so that succeeds.
+bool IcingDynamicTrie::ClearProperty(uint32_t value_index,
+                                     uint32_t property_id) {
+  bool have_bitmap = property_id < property_bitmaps_.size() &&
+                     property_bitmaps_[property_id] != nullptr;
+  if (!have_bitmap) {
+    // No bitmap is ok for clearing.
+    return true;
+  }
+
+  return property_bitmaps_[property_id]->SetBit(
+      ValueIndexToPropertyBitmapIndex(value_index), false);
+}
+
+// Marks the value at value_index as deleted in the deleted bitmap.
+bool IcingDynamicTrie::SetDeleted(uint32_t value_index) {
+  return deleted_bitmap_->SetBit(ValueIndexToPropertyBitmapIndex(value_index),
+                                 true);
+}
+
+// Clears the deleted mark for the value at value_index.
+bool IcingDynamicTrie::ClearDeleted(uint32_t value_index) {
+  return deleted_bitmap_->SetBit(ValueIndexToPropertyBitmapIndex(value_index),
+                                 false);
+}
+
+// Clears property_id from every value: values for which this was the only
+// set property are marked deleted, then the property's bitmap file itself
+// is deleted. Returns true on success (including when the property never
+// existed).
+bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  PropertyReadersAll readers(*this);
+  if (!readers.Exists(property_id)) {
+    ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+        "Properties for id %u don't exist", property_id);
+    return true;
+  }
+
+  // Mark values that have no other properties set as deleted.
+  uint64_t max_idx =
+      ValueIndexToPropertyBitmapIndex(storage_->hdr().suffixes_size());
+  // TODO(vishwajith) Inefficient to do this bit by bit, should be word by
+  // word. Removing a corpus is likely rare enough that this is low priority.
+  for (uint64_t i = 0; i < max_idx; ++i) {
+    // See if the bit is set in our property map.
+    if (readers.IsPropertyUnique(property_id, i)) {
+      deleted_bitmap_->SetBit(i, true);
+    }
+  }
+
+  // Now delete the bitmap file for this property. Moving out of the vector
+  // leaves a null slot, so later readers see the property as nonexistent.
+  std::unique_ptr<IcingFlashBitmap> bitmap(
+      std::move(property_bitmaps_[property_id]));
+  // bitmap cannot be null here, because then readers.Exists(property_id) would
+  // have returned false earlier, and we wouldn't get here.
+  if (bitmap == nullptr) {
+    ICING_LOG(ERROR) << "Property bitmap is null";
+    return false;
+  }
+
+  return bitmap->Delete();
+}
+
+// Whether the underlying bitmap was present when this reader was built;
+// when false, HasProperty() is false for every value.
+bool IcingDynamicTrie::PropertyReaderBase::Exists() const {
+  return bitmap_ != nullptr;
+}
+
+// Whether the value at value_index has this reader's property set.
+// Returns false for all values if the underlying bitmap is missing.
+bool IcingDynamicTrie::PropertyReaderBase::HasProperty(
+    uint32_t value_index) const {
+  if (bitmap_ == nullptr) {
+    return false;
+  }
+  return bitmap_->GetBit(trie_.ValueIndexToPropertyBitmapIndex(value_index));
+}
+
+// Binds this reader either to the trie's deleted bitmap (deleted == true)
+// or to the bitmap for property_id; an out-of-range property_id leaves the
+// reader with no bitmap (HasProperty() then always returns false).
+IcingDynamicTrie::PropertyReaderBase::PropertyReaderBase(
+    const IcingDynamicTrie &trie, bool deleted, uint32_t property_id)
+    : trie_(trie) {
+  if (!trie.is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  if (deleted) {
+    bitmap_ = trie.deleted_bitmap_.get();
+    return;
+  }
+  bitmap_ = property_id < trie.property_bitmaps_.size()
+                ? trie.property_bitmaps_[property_id].get()
+                : nullptr;
+}
+
+// Reader over all of the trie's property bitmaps; requires an initialized
+// trie and remains valid while the trie is open.
+IcingDynamicTrie::PropertyReadersAll::PropertyReadersAll(
+    const IcingDynamicTrie &trie)
+    : trie_(trie) {
+  if (!trie.is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+}
+
+// Whether a bitmap has been opened/created for property_id.
+bool IcingDynamicTrie::PropertyReadersAll::Exists(uint32_t property_id) const {
+  if (property_id >= trie_.property_bitmaps_.size()) {
+    return false;
+  }
+  return trie_.property_bitmaps_[property_id] != nullptr;
+}
+
+// Whether the value at value_index has property_id set. Returns false if
+// the property's bitmap does not exist.
+bool IcingDynamicTrie::PropertyReadersAll::HasProperty(
+    uint32_t property_id, uint32_t value_index) const {
+  if (!Exists(property_id)) {
+    return false;
+  }
+  return trie_.property_bitmaps_[property_id]->GetBit(
+      trie_.ValueIndexToPropertyBitmapIndex(value_index));
+}
+
+// Returns true iff the value at value_index has property_id set and no
+// other property set.
+bool IcingDynamicTrie::PropertyReadersAll::IsPropertyUnique(
+    uint32_t property_id, uint32_t value_index) const {
+  // ValueIndexToPropertyBitmapIndex returns uint64_t; keep the full width
+  // instead of silently narrowing to uint32_t like every other caller
+  // avoids doing.
+  uint64_t idx = trie_.ValueIndexToPropertyBitmapIndex(value_index);
+
+  // First check that value is set for the requested id.
+  if (property_id >= trie_.property_bitmaps_.size() ||
+      !trie_.property_bitmaps_[property_id] ||
+      !trie_.property_bitmaps_[property_id]->GetBit(idx)) {
+    return false;
+  }
+
+  // Now check that the value is not set for the rest.
+  for (size_t i = 0; i < trie_.property_bitmaps_.size(); ++i) {
+    if (i == property_id) {
+      continue;
+    }
+    if (trie_.property_bitmaps_[i] && trie_.property_bitmaps_[i]->GetBit(idx)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Number of property bitmap slots (some slots may be null); callers use
+// this as the iteration bound over property ids.
+size_t IcingDynamicTrie::PropertyReadersAll::size() const {
+  return trie_.property_bitmaps_.size();
+}
+
+// Maps a value index (a byte offset into the suffix region) to a dense bit
+// index usable with the property/deleted bitmaps.
+uint64_t IcingDynamicTrie::ValueIndexToPropertyBitmapIndex(
+    uint32_t value_index) const {
+  // We know that value indices are separated by at least 1 +
+  // value_size() bytes (for the null terminator and the value).
+  return value_index / (value_size() + 1);
+}
+
+// Testing hooks.
+// Copies the current header proto into *hdr (test hook).
+void IcingDynamicTrie::GetHeader(IcingDynamicTrieHeader *hdr) const {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  *hdr = storage_->hdr();
+}
+
+// Overwrites the in-memory header and writes it out immediately (test
+// hook; bypasses normal update paths).
+void IcingDynamicTrie::SetHeader(const IcingDynamicTrieHeader &new_hdr) {
+  if (!is_initialized()) {
+    ICING_LOG(FATAL) << "DynamicTrie not initialized";
+  }
+
+  storage_->hdr_.hdr = new_hdr;
+  storage_->WriteHeader();
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
new file mode 100644
index 0000000..2e93ef1
--- /dev/null
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -0,0 +1,616 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+//
+// Trie for word prefix lookups. Features:
+//
+// - Dynamic additions (but not deletions)
+// - Low memory usage
+// - Reasonable latency but not QPS
+// - Revive from persistence is a disk read
+// - Stores a 4-byte value associated with every key
+//
+// Associated with each value in the trie is a set of property ids. For
+// efficiency, property ids should start at 0 and be densely packed. A value
+// may have more than one id set. There is an additional deleted property
+// for each value, which is set only when all the property ids associated with a
+// value have been cleared. In the flash_index, property ids are used to track
+// corpus ids.
+//
+// Not thread-safe.
+
+#ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
+#define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/legacy/core/icing-compat.h"
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/legacy/index/icing-storage.h"
+#include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
+#include "utf.h"
+
+namespace icing {
+namespace lib {
+
+class IcingFlashBitmap;
+
+class IcingDynamicTrie : public IIcingStorage {
+  class Dumper;
+  class IcingDynamicTrieStorage;
+
+ public:
+  // Adjacent bit fields are usually packed automatically. However, that is
+  // implementation specific:
+  // http://en.cppreference.com/w/cpp/language/bit_field
+  // So we'll set packed to be explicit.
+  class Node {
+   public:
+    // This object is only ever used by an ArrayStorage, which allocates
+    // sizeof(Node) bytes, zeroes them out and then casts to a Node.
+    Node() = delete;
+
+    uint32_t next_index() const { return next_index_; }
+    void set_next_index(uint32_t next_index) { next_index_ = next_index; }
+
+    bool is_leaf() const { return is_leaf_; }
+    void set_is_leaf(bool is_leaf) { is_leaf_ = is_leaf; }
+
+    uint8_t log2_num_children() const { return log2_num_children_; }
+    void set_log2_num_children(uint8_t log2_num_children) {
+      log2_num_children_ = log2_num_children;
+    }
+
+   private:
+    // 27 bits => indices up to 2^27 - 1.
+    uint32_t next_index_ : 27;
+    uint32_t is_leaf_ : 1;
+    uint32_t log2_num_children_ : 4;
+  } __attribute__((packed));
+  static_assert(sizeof(Node) == 4, "");
+  static_assert(icing_is_packed_pod<Node>::value, "go/icing-ubsan");
+
+  // Adjacent bit fields are usually packed automatically. However, that is
+  // implementation specific:
+  // http://en.cppreference.com/w/cpp/language/bit_field.
+  // So we'll set packed to be explicit.
+  union Next {
+    Next(uint8_t val, uint32_t node_index) {
+      used.val = val;
+      used.node_index = node_index;
+    }
+
+    uint8_t val() const { return used.val; }
+    void set_val(uint8_t val) { used.val = val; }
+
+    uint32_t node_index() const { return used.node_index; }
+    void set_node_index(uint32_t node_index) { used.node_index = node_index; }
+
+    uint32_t next_index() const { return freelink.next_index; }
+    void set_next_index(uint32_t next_index) {
+      freelink.next_index = next_index;
+    }
+
+    bool operator<(const Next &next2) const {
+      if (val() == next2.val()) {
+        return node_index() < next2.node_index();
+      }
+      return val() < next2.val();
+    }
+
+   private:
+    // This object is only ever used by an ArrayStorage, which allocates
+    // sizeof(Next) bytes, zeroes them out and then casts to a Next.
+    Next() = default;
+
+    // Normal overlay: child character plus index of the child node.
+    struct {
+      uint32_t val : 8;
+      uint32_t node_index : 24;
+    } used;
+    // Alternate overlay for free-list chaining (see next_index()).
+    struct {
+      uint32_t next_index : 32;
+    } freelink;
+  } __attribute__((packed));
+  static_assert(sizeof(Next) == 4, "");
+  static_assert(sizeof(Next) % alignof(Next) == 0, "");
+  static_assert(icing_is_packed_pod<Next>::value, "go/icing-ubsan");
+
+  static const int kMaxNextArraySize = 256;
+  static const int kNumNextAllocationBuckets = 9;  // [log2(1), log2(256)]
+
+  static const uint32_t kMaxPropertyId = (1 << 16) - 1;
+
+  // Sentinel: never a valid value index (see OriginalMatch).
+  static const uint32_t kInvalidValueIndex = 0;
+
+  static const uint32_t kNoCrc = 0;
+
+  struct Stats {
+    uint32_t num_keys;
+
+    // Node stats
+
+    uint32_t num_nodes;
+    uint32_t max_nodes;
+    // Count of intermediate nodes.
+    uint32_t num_intermediates;
+    // Count of leaf nodes.
+    uint32_t num_leaves;
+
+    // Next stats
+
+    uint32_t num_nexts;
+    uint32_t max_nexts;
+    // Count of next arrays by size.
+    uint32_t child_counts[kMaxNextArraySize];
+    // Wasted next array space per allocation bucket (in Nexts, not
+    // bytes).
+    uint32_t wasted[kNumNextAllocationBuckets];
+    // Sum of wasted array.
+    uint32_t total_wasted;
+
+    // Suffix stats
+
+    uint32_t suffixes_size;
+    uint32_t max_suffixes_size;
+    // Bytes actually used by suffixes.
+    uint32_t suffixes_used;
+    // Number of suffixes that are just empty strings.
+    uint32_t null_suffixes;
+
+    // Next free-list stats
+    uint32_t num_free[kNumNextAllocationBuckets];
+    // Total Next nodes free (weighted sum of the above).
+    uint32_t total_free;
+
+    // Dirty pages.
+    uint32_t dirty_pages_nodes;
+    uint32_t dirty_pages_nexts;
+    uint32_t dirty_pages_suffixes;
+
+    std::string DumpStats(int verbosity) const;
+  };
+
+  // Options when creating the trie. Maximums for the node/next/suffix
+  // arrays must be specified in advance.
+  struct Options {
+    // Absolute maximums.
+    static const uint32_t kMaxNodes, kMaxNexts, kMaxSuffixesSize, kMaxValueSize;
+
+    // The default takes 13MB of memory and can take about 1M English
+    // words.
+    Options()
+        : max_nodes(1U << 20),
+          max_nexts(1U << 20),
+          max_suffixes_size(5U << 20),
+          value_size(sizeof(uint32_t)) {}
+    Options(uint32_t max_nodes_in, uint32_t max_nexts_in,
+            uint32_t max_suffixes_size_in, uint32_t value_size_in)
+        : max_nodes(max_nodes_in),
+          max_nexts(max_nexts_in),
+          max_suffixes_size(max_suffixes_size_in),
+          value_size(value_size_in) {}
+
+    uint32_t max_nodes;
+    uint32_t max_nexts;
+    uint32_t max_suffixes_size;
+    uint32_t value_size;
+
+    // True if options do not exceed absolute maximums.
+    bool is_valid() const;
+  };
+
+  // These can be supplied during runtime, as opposed to the persisted
+  // Options above.
+  struct RuntimeOptions {
+    enum StoragePolicy {
+      // Changes are reflected in the underlying file immediately but
+      // more vulnerable to corruption.
+      kMapSharedWithCrc,
+
+      // Changes only applied during Flush. Smaller window of
+      // vulnerability to corruption.
+      kExplicitFlush,
+    };
+
+    RuntimeOptions &set_storage_policy(StoragePolicy sp) {
+      storage_policy = sp;
+      return *this;
+    }
+
+    StoragePolicy storage_policy = kExplicitFlush;
+  };
+
+  static uint32_t max_value_index(const Options &options) {
+    return options.max_suffixes_size;
+  }
+
+  // Light-weight constructor. Real work happens in Create or Init.
+  IcingDynamicTrie(const std::string &filename_base,
+                   const RuntimeOptions &runtime_options,
+                   const IcingFilesystem *filesystem);
+  ~IcingDynamicTrie() override;
+
+  bool is_initialized() const { return is_initialized_; }
+
+  // Create, but do not Init, a new trie with options if the file does
+  // not already exist.
+  //
+  // Returns true if successfully created all files or files already
+  // exist. Does not do a complete sanity check for when files seem to
+  // exist. Cleans up files if creation fails midstream.
+  bool CreateIfNotExist(const Options &options);
+
+  bool UpgradeTo(int new_version) override { return true; }
+  bool Init() override;
+  void Close() override;
+  bool Remove() override;
+  uint64_t GetDiskUsage() const override;
+
+  // REQUIRED: For all functions below is_initialized() == true.
+
+  // Number of keys in trie.
+  uint32_t size() const;
+
+  // Collecting stats.
+  void CollectStats(Stats *stats) const;
+
+  // Gets all of the contents of the trie for debugging purposes. Note: this
+  // stores the entire set of terms in memory.
+  //   pretty_print - The tree structure of the trie will be written to this.
+  //   keys - All keys in the trie are appended to this vector.
+  void DumpTrie(std::ostream *pretty_print,
+                std::vector<std::string> *keys) const;
+
+  // Empty out the trie without closing or removing.
+  void Clear();
+
+  // Sync to disk.
+  bool Sync() override;
+
+  // Tell kernel we will access the memory shortly.
+  void Warm() const;
+
+  // Potentially about to get nuked.
+  void OnSleep() override;
+
+  // Compact trie into out for value indices present in old_tvi_to_new_value.
+  class NewValueMap {
+   public:
+    virtual ~NewValueMap();
+
+    // Returns the new value we want to assign to the entry at old
+    // value index. We don't take ownership of the pointer.
+    virtual const void *GetNewValue(uint32_t old_value_index) const = 0;
+  };
+  // Compacts this trie. This drops all deleted keys, drops all keys for which
+  // old_tvi_to_new_value returns nullptr, updates values to be the values
+  // returned by old_tvi_to_new_value, rewrites tvis, and saves the results into
+  // the trie given in 'out'. 'old_to_new_tvi' will be populated with a mapping
+  // of old value_index to new value_index.
+  bool Compact(const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out,
+               std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const;
+
+  // Insert value at key. If key already exists and replace == true,
+  // replaces old value with value. We take a copy of value.
+  //
+  // If value_index is not NULL, returns a pointer to value in
+  // value_index. This can then be used with SetValueAtIndex
+  // below. value_index is not valid past a Clear/Read/Write.
+  //
+  // Returns false if there is no space left in the trie.
+  //
+  // REQUIRES: value a buffer of size value_size()
+  bool Insert(const char *key, const void *value) {
+    return Insert(key, value, nullptr, true, nullptr);
+  }
+  bool Insert(const char *key, const void *value, uint32_t *value_index,
+              bool replace) {
+    return Insert(key, value, value_index, replace, nullptr);
+  }
+  bool Insert(const char *key, const void *value, uint32_t *value_index,
+              bool replace, bool *pnew_key);
+
+  // Get a value returned by Insert value_index. This points to the
+  // value in the trie. The pointer is immutable and always valid
+  // while the trie is alive.
+  const void *GetValueAtIndex(uint32_t value_index) const;
+
+  // Set a value returned by Insert value_index. We take a copy of
+  // value.
+  //
+  // REQUIRES: value a buffer of size value_size()
+  void SetValueAtIndex(uint32_t value_index, const void *value);
+
+  // Returns true if key is found and sets value. If value_index is
+  // not NULL, returns value_index (see Insert discussion above).
+  // If the key is not found, returns false and neither value nor
+  // value_index is modified.
+  //
+  // REQUIRES: value a buffer of size value_size()
+  bool Find(const char *key, void *value) const {
+    return Find(key, value, nullptr);
+  }
+  bool Find(const char *key, void *value, uint32_t *value_index) const;
+
+  // Find the input key and all keys that are a variant of the input
+  // key according to a variant map. Currently supports
+  // transliteration. For example "a" is a variant for "à" or "á" so
+  // an "a" in the input key can match those characters in the trie in
+  // addition to itself.
+  //
+  // If prefix is set, also returns any prefix matches (so value_index
+  // will be invalid).
+  //
+  // REQUIRES: all terms in the lexicon to be valid utf8.
+  struct OriginalMatch {
+    uint32_t value_index;
+    std::string orig;
+
+    OriginalMatch() : value_index(kInvalidValueIndex) {}
+
+    bool is_full_match() const { return value_index != kInvalidValueIndex; }
+  };
+
+  void GetDebugInfo(int verbosity, std::string *out) const override;
+
+  double min_free_fraction() const;
+
+  uint32_t value_size() const;
+
+  uint32_t max_value_index() const;
+
+  // If in kMapSharedWithCrc mode, update crcs and return the master
+  // crc, else return kNoCrc. This crc includes both the trie files
+  // and property bitmaps.
+  uint32_t UpdateCrc();
+
+  // Store dynamic properties for each value. When a property is added to
+  // a value, the deleted flag is cleared for it (if it was previously set).
+  bool SetProperty(uint32_t value_index, uint32_t property_id);
+  bool ClearProperty(uint32_t value_index, uint32_t property_id);
+
+  // Store deleted property for each value.
+  // This method is not the only way the deleted property can be set; the trie
+  // may set this property itself during other operations if it can determine a
+  // value becomes superfluous.
+  bool SetDeleted(uint32_t value_index);
+
+  // Clears the deleted property for each value.
+  bool ClearDeleted(uint32_t value_index);
+
+  // Clear a specific property id from all values. For each value that has this
+  // property cleared, also check to see if it was the only property set; if
+  // so, set the deleted property for the value to indicate it no longer has any
+  // properties associated with it.
+  bool ClearPropertyForAllValues(uint32_t property_id);
+
+  // Access properties. Usage:
+  //
+  //   IcingDynamicTrie::PropertyReader reader(trie, 10);
+  //   char value[SIZE];
+  //   uint32_t value_index;
+  //   if (trie.Find("abc", value, &value_index) &&
+  //       reader.HasProperty(value_index)) {
+  //     ...
+  //   }
+  //
+  // Readers are valid as long as the underlying trie is open.
+  class PropertyReaderBase {
+   public:
+    // Whether underlying file exists.
+    bool Exists() const;
+
+    // Returns false for all values if underlying file is missing.
+    bool HasProperty(uint32_t value_index) const;
+
+   protected:
+    PropertyReaderBase(const IcingDynamicTrie &trie, bool deleted,
+                       uint32_t property_id);
+
+    // Does not own.
+    const IcingFlashBitmap *bitmap_;
+    const IcingDynamicTrie &trie_;
+  };
+
+  // Reader for a given property. It is invalidated when the underlying property
+  // is deleted, or the trie is closed.
+  class PropertyReader : public PropertyReaderBase {
+   public:
+    PropertyReader(const IcingDynamicTrie &trie, uint32_t property_id)
+        : PropertyReaderBase(trie, false, property_id) {}
+  };
+
+  // Reader for the deleted property. It is invalidated when the trie is closed.
+  class PropertyDeletedReader : public PropertyReaderBase {
+   public:
+    explicit PropertyDeletedReader(const IcingDynamicTrie &trie)
+        : PropertyReaderBase(trie, true, 0) {}
+  };
+
+  // Reader for all properties (but not the deleted one). It is invalidated when
+  // the trie is closed.
+  class PropertyReadersAll {
+   public:
+    explicit PropertyReadersAll(const IcingDynamicTrie &trie);
+
+    // Whether underlying file for property_id exists.
+    bool Exists(uint32_t property_id) const;
+
+    // Returns false if underlying file or property doesn't exist.
+    bool HasProperty(uint32_t property_id, uint32_t value_index) const;
+
+    // Returns true if the value at value_index is set for only the supplied
+    // property_id, and none of the other properties.
+    bool IsPropertyUnique(uint32_t property_id, uint32_t value_index) const;
+
+    // For iterating.
+    size_t size() const;
+
+   private:
+    const IcingDynamicTrie &trie_;
+  };
+
+  // Iterate through trie in lexicographic order.
+  //
+  // Not thread-safe.
+  //
+  // Change in underlying trie invalidates iterator.
+  class Iterator {
+   public:
+    Iterator(const IcingDynamicTrie &trie, const char *prefix);
+    void Reset();
+    bool Advance();
+
+    // If !IsValid(), GetKey() will return NULL and GetValue() will
+    // return 0.
+    bool IsValid() const;
+    const char *GetKey() const;
+    // This points directly to the underlying data and is valid while
+    // the trie is alive. We keep ownership of the pointer.
+    const void *GetValue() const;
+    uint32_t GetValueIndex() const;
+
+   private:
+    Iterator();
+    // Copy is ok.
+
+    // Helper function that takes the left-most branch down
+    // intermediate nodes to a leaf.
+    void LeftBranchToLeaf(uint32_t node_index);
+
+    std::string cur_key_;
+    const char *cur_suffix_;
+    int cur_suffix_len_;
+    struct Branch {
+      uint32_t node_idx;
+      int child_idx;
+
+      explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {}
+    };
+    std::vector<Branch> branch_stack_;
+    bool single_leaf_match_;
+
+    const IcingDynamicTrie &trie_;
+  };
+
+  // Represents a non-leaf node or a "virtual" trie node in the suffix
+  // region.
+  struct LogicalNode {
+    const Node *node;
+    int suffix_offset;
+
+    LogicalNode() : node(nullptr), suffix_offset(0) {}
+    LogicalNode(const Node *node_in, int suffix_offset_in)
+        : node(node_in), suffix_offset(suffix_offset_in) {}
+  };
+
+  // Iterate over all utf8 chars in the trie anchored at prefix (or
+  // node). If trie has invalid utf8 chars, behavior is undefined (but
+  // won't crash).
+  class Utf8Iterator {
+   public:
+    void Reset();
+    bool Advance();
+
+    bool IsValid() const;
+
+   private:
+    struct Branch {
+      const Node *node;
+      const Next *child;
+      const Next *child_end;
+
+      bool IsFinished();
+    };
+
+    Utf8Iterator();
+    // Copy is ok.
+
+    void LeftBranchToUtf8End();
+    void InitBranch(Branch *branch, const Node *start, char key_char);
+    void GoIntoSuffix(const Node *node);
+
+    char cur_[UTFmax + 1];  // NULL-terminated
+    int cur_len_;
+    LogicalNode cur_logical_node_;
+
+    Branch branch_stack_[UTFmax];
+    Branch *branch_end_;
+
+    const IcingDynamicTrie &trie_;
+    const Node *start_node_;
+  };
+
+ private:
+  class CandidateSet;
+
+  // For testing only.
+  friend class IcingDynamicTrieTest_SyncErrorRecovery_Test;
+  friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test;
+  void GetHeader(IcingDynamicTrieHeader *hdr) const;
+  void SetHeader(const IcingDynamicTrieHeader &new_hdr);
+
+  // Sentinel indices into the node/next/suffix arrays.
+  static const uint32_t kInvalidNodeIndex;
+  static const uint32_t kInvalidNextIndex;
+  static const uint32_t kInvalidSuffixIndex;
+
+  // Stats helpers.
+  void CollectStatsRecursive(const Node &node, Stats *stats) const;
+
+  // Helpers for Find and Insert.
+  const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
+  const Next *LowerBound(const Next *start, const Next *end,
+                         uint8_t key_char) const;
+  void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
+                    bool prefix) const;
+
+  // For value properties. This truncates the data by clearing it, but leaving
+  // the storage intact.
+  bool InitPropertyBitmaps();
+
+  // Returns a pointer to a bitmap that is successfully opened.
+  static std::unique_ptr<IcingFlashBitmap> OpenAndInitBitmap(
+      const std::string &filename, bool verify,
+      const IcingFilesystem *filesystem);
+
+  // Returns a pointer to a writable bitmap, creating it if necessary. Returned
+  // pointer should not be freed, it will be maintained by property_bitmaps_.
+  // Returns null if bitmap failed to load.
+  IcingFlashBitmap *OpenOrCreatePropertyBitmap(uint32_t property_id);
+
+  uint64_t ValueIndexToPropertyBitmapIndex(uint32_t value_index) const;
+
+  const std::string filename_base_;
+  bool is_initialized_;
+  const RuntimeOptions runtime_options_;
+  std::unique_ptr<IcingDynamicTrieStorage> storage_;
+  const std::string property_bitmaps_prefix_;
+  std::vector<std::unique_ptr<IcingFlashBitmap>> property_bitmaps_;
+  const std::string deleted_bitmap_filename_;
+  std::unique_ptr<IcingFlashBitmap> deleted_bitmap_;
+  const IcingFilesystem *const filesystem_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
diff --git a/icing/legacy/index/icing-filesystem.cc b/icing/legacy/index/icing-filesystem.cc
new file mode 100644
index 0000000..b1e1193
--- /dev/null
+++ b/icing/legacy/index/icing-filesystem.cc
@@ -0,0 +1,638 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-filesystem.h"
+
+#include <dirent.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "icing/absl_ports/str_cat.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/legacy/portable/icing-zlib.h"
+#include "icing/util/logging.h"
+
+using std::vector;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// The size of the block for st_blksize returned by stat() and as a
+// consequence also the granularity of GetDiskUsage(). It seems that there is
+// no appropriate constant for this. See http://linux.die.net/man/2/stat
+constexpr int kStatBlockSize = 512;
+
+// Logs information about open file descriptors.
+//
+// This function uses getrlimit() to find the maximum number of file
+// descriptors, then calls readlink("/proc/self/fd/N") for each possible file
+// descriptor number to get a description of the open file from procfs.
+//
+// We don't use readdir() to list the contents of /proc/self/fd (which would be
+// the more obvious approach) because that would require a free file descriptor
+// to open the directory, while we call this function when all file descriptors
+// are in use.
+void LogOpenFileDescriptors() {
+  // Determine the limit on file descriptor numbers. RLIMIT_NOFILE should return
+  // the maximum file descriptor + 1, which is 1024 on Android by default. We
+  // restrict the limit to 4096 so we don't take too much time if the value
+  // turns out to be much higher for some reason.
+  constexpr int kMaxFileDescriptorsToStat = 4096;
+  struct rlimit rlim = {0, 0};
+  if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "getrlimit() failed (errno=%d)", errno);
+    return;
+  }
+  // rlim_cur is the soft limit: no fd at or above this value can be open.
+  int fd_lim = rlim.rlim_cur;
+  if (fd_lim > kMaxFileDescriptorsToStat) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Maximum number of file descriptors (%d) too large.", fd_lim);
+    fd_lim = kMaxFileDescriptorsToStat;
+  }
+  ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+      "Listing up to %d file descriptors.", fd_lim);
+
+  // Verify that /proc/self/fd is a directory. If not, procfs is not mounted or
+  // inaccessible for some other reason. In that case, there's no point trying
+  // to read from it.
+  struct stat statbuf;
+  if (stat("/proc/self/fd", &statbuf) != 0 || !S_ISDIR(statbuf.st_mode)) {
+    ICING_LOG(ERROR) << "/proc/self/fd not available. Giving up.";
+    return;
+  }
+
+  // Now read each link individually.
+  char path[1024];
+  char target[1024];
+  for (int fd = 0; fd < fd_lim; ++fd) {
+    snprintf(path, arraysize(path), "/proc/self/fd/%d", fd);
+    ssize_t len = readlink(path, target, arraysize(target));
+    if (len >= 0) {
+      // Zero-terminate the buffer, because readlink() won't.
+      target[len < arraysize(target) ? len : arraysize(target) - 1] = '\0';
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd,
+                                                        target);
+    } else if (errno != ENOENT) {
+      // ENOENT simply means the fd is not open; anything else is unexpected.
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)",
+                                                        fd, errno);
+    }
+  }
+  ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+      "File descriptor list complete.");
+}
+
+// Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum).
+//
+// If errnum == EMFILE (too many open files), then it also logs a list of open
+// file descriptors (see LogOpenFileDescriptors() above).
+void LogOpenError(const char *desc1, const char *file_name, const char *desc2,
+                  int errnum) {
+  ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+      "%s%s%s%s", desc1, file_name, desc2, strerror(errnum));
+  // EMFILE means the process ran out of file descriptors; dump the open set
+  // to help diagnose which component is leaking them.
+  if (errnum == EMFILE) {
+    LogOpenFileDescriptors();
+  }
+}
+
+// Recursive implementation of ListDirectory. Prefix is used to prepend the
+// directory name during recursion. Returns false if any directory in the tree
+// could not be opened or listed; *entries may be partially filled in that
+// case.
+// We cannot use scandir due to a bug in old platform versions. See b/7339844.
+bool ListDirectoryInternal(const char *dir_name,
+                           const std::unordered_set<std::string> &exclude,
+                           bool recursive, const char *prefix,
+                           std::vector<std::string> *entries) {
+  DIR *dir = opendir(dir_name);
+  if (!dir) {
+    LogOpenError("Unable to open directory ", dir_name, ": ", errno);
+    return false;
+  }
+
+  dirent *p;
+  // readdir's implementation seems to be thread safe.
+  while ((p = readdir(dir)) != nullptr) {
+    std::string file_name(p->d_name);
+    if (file_name == "." || file_name == ".." ||
+        exclude.find(file_name) != exclude.end()) {
+      continue;
+    }
+    std::string relative_path = absl_ports::StrCat(prefix, p->d_name);
+    entries->push_back(relative_path);
+    // Recurse down directories, if requested.
+    if (recursive && (p->d_type == DT_DIR)) {
+      std::string sub_dir_name = absl_ports::StrCat(dir_name, "/", p->d_name);
+      std::string relative_path_with_slash =
+          absl_ports::StrCat(relative_path, "/");
+      if (!ListDirectoryInternal(sub_dir_name.c_str(), exclude, recursive,
+                                 relative_path_with_slash.c_str(), entries)) {
+        // Close the handle before the early return too; the original code
+        // only reached closedir() on the success path and leaked `dir` here.
+        closedir(dir);
+        return false;
+      }
+    }
+  }
+  if (closedir(dir) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Error closing %s: %s", dir_name, strerror(errno));
+  }
+  return true;
+}
+
+} // namespace
+
+IcingScopedFd::~IcingScopedFd() {
+  // Only close valid descriptors; fd_ is -1 when the wrapper is empty.
+  if (fd_ >= 0) {
+    close(fd_);
+  }
+}
+
+void IcingScopedFd::reset(int fd) {
+  // Close the currently owned descriptor (if any) before taking ownership
+  // of the new one. reset(-1) simply releases the current descriptor.
+  if (fd_ >= 0) {
+    close(fd_);
+  }
+  fd_ = fd;
+}
+
+// Out-of-line definition for the class-scope constant (required when the
+// constant is odr-used, pre-C++17).
+const uint64_t IcingFilesystem::kBadFileSize;
+
+bool IcingFilesystem::DeleteFile(const char *file_name) const {
+  ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name);
+  int ret = unlink(file_name);
+  // Deleting a file that doesn't exist (ENOENT) counts as success.
+  bool success = (ret == 0) || (errno == ENOENT);
+  if (!success) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Deleting file %s failed: %s", file_name, strerror(errno));
+  }
+  return success;
+}
+
+bool IcingFilesystem::DeleteDirectory(const char *dir_name) const {
+  int ret = rmdir(dir_name);
+  // As with DeleteFile, a missing directory counts as success.
+  bool success = (ret == 0) || (errno == ENOENT);
+  if (!success) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Deleting directory %s failed: %s", dir_name, strerror(errno));
+  }
+  return success;
+}
+
+bool IcingFilesystem::DeleteDirectoryRecursively(const char *dir_name) const {
+  // Ensure the dir_name really is a directory and exists.
+  struct stat st;
+  if (stat(dir_name, &st) < 0) {
+    if (errno == ENOENT) {
+      return true;  // If directory didn't exist, this was successful.
+    }
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Stat %s failed: %s", dir_name, strerror(errno));
+    return false;
+  }
+  vector<std::string> entries;
+  if (!ListDirectory(dir_name, &entries)) {
+    return false;
+  }
+
+  // Delete all children first, continuing past individual failures so that
+  // as much as possible is cleaned up; `success` accumulates the result.
+  bool success = true;
+  for (vector<std::string>::iterator i = entries.begin(); i != entries.end();
+       ++i) {
+    std::string filename = std::string(dir_name) + '/' + *i;
+    if (stat(filename.c_str(), &st) < 0) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Stat %s failed: %s", filename.c_str(), strerror(errno));
+      success = false;
+    } else if (S_ISDIR(st.st_mode)) {
+      success = DeleteDirectoryRecursively(filename.c_str()) && success;
+    } else {
+      success = DeleteFile(filename.c_str()) && success;
+    }
+  }
+
+  // Only remove the (now hopefully empty) directory itself if every child
+  // was deleted; rmdir would fail on a non-empty directory anyway.
+  if (success) {
+    success = DeleteDirectory(dir_name);
+  }
+
+  return success;
+}
+
+bool IcingFilesystem::FileExists(const char *file_name) const {
+  bool exists = false;
+  struct stat st;
+  if (stat(file_name, &st) == 0) {
+    // Only regular files count; directories/sockets/etc. return false.
+    exists = S_ISREG(st.st_mode) != 0;
+  } else {
+    // ENOENT is the expected "doesn't exist" case; anything else is logged.
+    if (errno != ENOENT) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Unable to stat file %s: %s", file_name, strerror(errno));
+    }
+    exists = false;
+  }
+  return exists;
+}
+
+bool IcingFilesystem::DirectoryExists(const char *dir_name) const {
+  bool exists = false;
+  struct stat st;
+  if (stat(dir_name, &st) == 0) {
+    // Only directories count here.
+    exists = S_ISDIR(st.st_mode) != 0;
+  } else {
+    if (errno != ENOENT) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Unable to stat directory %s: %s", dir_name, strerror(errno));
+    }
+    exists = false;
+  }
+  return exists;
+}
+
+// Returns the index of the first character of the basename, i.e. the char
+// just after the final '/' (0 when there is no slash at all).
+int IcingFilesystem::GetBasenameIndex(const char *file_name) const {
+  // Find final slash.
+  const char *last_slash = strrchr(file_name, '/');
+  if (!last_slash) {
+    // file_name is just basename.
+    return 0;
+  }
+
+  // Skip slash.
+  return last_slash + 1 - file_name;
+}
+
+// Returns the portion of file_name after the final '/'.
+std::string IcingFilesystem::GetBasename(const char *file_name) const {
+  size_t len = strlen(file_name);
+  int idx = GetBasenameIndex(file_name);
+  return std::string(file_name + idx, len - idx);
+}
+
+// Returns the portion of file_name before the final '/', without that slash.
+// Returns "" when file_name has no directory component.
+std::string IcingFilesystem::GetDirname(const char *file_name) const {
+  int idx = GetBasenameIndex(file_name);
+  // Remove the trailing slash
+  if (idx > 0) {
+    idx -= 1;
+  }
+  return std::string(file_name, idx);
+}
+
+// Simple overload: non-recursive listing with no exclusions. Clears *entries
+// first; the recursive overload below only appends.
+bool IcingFilesystem::ListDirectory(const char *dir_name,
+                                    vector<std::string> *entries) const {
+  entries->clear();
+  return ListDirectory(dir_name, /*exclude=*/{}, /*recursive=*/false, entries);
+}
+
+bool IcingFilesystem::ListDirectory(
+    const char *dir_name, const std::unordered_set<std::string> &exclude,
+    bool recursive, std::vector<std::string> *entries) const {
+  return ListDirectoryInternal(dir_name, exclude, recursive, /*prefix=*/"",
+                               entries);
+}
+
+bool IcingFilesystem::GetMatchingFiles(const char *glob,
+                                       vector<std::string> *matches) const {
+  matches->clear();
+
+  // Split dirname/basename: everything up to the last slash must be literal,
+  // only the basename part is treated as a glob pattern.
+  int basename_idx = GetBasenameIndex(glob);
+  if (basename_idx == 0) {
+    // We need a directory.
+    ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+        "Expected directory, no matching files for: %s", glob);
+    return true;
+  }
+  const char *basename_glob = glob + basename_idx;
+  std::string dirname(glob, basename_idx);
+  vector<std::string> entries;
+  // A missing directory (ENOENT) yields an empty match list, not an error.
+  if (!ListDirectory(dirname.c_str(), &entries) && errno != ENOENT) {
+    return false;
+  }
+
+  for (vector<std::string>::iterator i = entries.begin(); i != entries.end();
+       ++i) {
+    // The filename needs to match glob following last_slash.
+    // fnmatch returns 0 on a match.
+    if (!fnmatch(basename_glob, i->c_str(), FNM_PATHNAME)) {
+      // Add it to the list.
+      matches->push_back(dirname + *i);
+    }
+  }
+  return true;
+}
+
+// Opens (creating if necessary, mode 0600) for read/write. Returns the fd,
+// or -1 on failure (with the error logged).
+int IcingFilesystem::OpenForWrite(const char *file_name) const {
+  int fd = open(file_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+  if (fd < 0) {
+    LogOpenError("Opening file ", file_name, " for write failed: ", errno);
+  }
+  return fd;
+}
+
+// Opens (creating if necessary, mode 0600) for read/write and positions the
+// file offset at the end. Returns the fd, or -1 on failure (logged).
+int IcingFilesystem::OpenForAppend(const char *file_name) const {
+  // Don't use the O_APPEND flag because, although it opens for
+  // append, it doesn't set the file cursor to at the end until
+  // first write occurs. This can be confusing if you expect
+  // the file position at the end. Instead, explicitly
+  // seek to end after opening.
+  int fd = open(file_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+  if (fd < 0) {
+    // Say "append" (the original message said "write"), so failures here are
+    // distinguishable from OpenForWrite failures in the logs.
+    LogOpenError("Opening file ", file_name, " for append failed: ", errno);
+  } else {
+    lseek(fd, 0, SEEK_END);
+  }
+  return fd;
+}
+
+// Opens an existing file read-only. Returns the fd, or -1 on failure (with
+// the error logged).
+int IcingFilesystem::OpenForRead(const char *file_name) const {
+  int fd = open(file_name, O_RDONLY);
+  if (fd < 0) {
+    LogOpenError("Opening file ", file_name, " for read failed: ", errno);
+  }
+  return fd;
+}
+
+// Returns the size in bytes of the file behind fd, or kBadFileSize if fstat
+// fails (error is logged).
+uint64_t IcingFilesystem::GetFileSize(int fd) const {
+  struct stat st;
+  uint64_t size = kBadFileSize;
+  if (fstat(fd, &st) < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
+                                                      strerror(errno));
+  } else {
+    size = st.st_size;
+  }
+  return size;
+}
+
+// Path-based variant; same semantics as the fd overload above.
+uint64_t IcingFilesystem::GetFileSize(const char *filename) const {
+  struct stat st;
+  uint64_t size = kBadFileSize;
+  if (stat(filename, &st) < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Unable to stat file %s: %s", filename, strerror(errno));
+  } else {
+    size = st.st_size;
+  }
+  return size;
+}
+
+bool IcingFilesystem::Truncate(int fd, uint64_t new_size) const {
+  int ret = ftruncate(fd, new_size);
+  if (ret == 0) {
+    // Keep the file offset consistent with the new end of file.
+    lseek(fd, new_size, SEEK_SET);
+  } else {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Unable to truncate file: %s", strerror(errno));
+  }
+  return (ret == 0);
+}
+
+bool IcingFilesystem::Truncate(const char *filename, uint64_t new_size) const {
+  // OpenForAppend (rather than OpenForWrite) so existing content/position
+  // semantics match; the fd is closed again before returning.
+  int fd = OpenForAppend(filename);
+  if (fd == -1) {
+    return false;
+  }
+  bool success = Truncate(fd, new_size);
+  close(fd);
+  return success;
+}
+
+// Grows via ftruncate; unlike Truncate(int, ...) this intentionally leaves
+// the file offset unchanged.
+bool IcingFilesystem::Grow(int fd, uint64_t new_size) const {
+  int ret = ftruncate(fd, new_size);
+  if (ret != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s",
+                                                      strerror(errno));
+  }
+  return (ret == 0);
+}
+
+// Writes data_size bytes at the current offset, looping over short writes
+// (64KB chunks). Returns false on the first write error.
+bool IcingFilesystem::Write(int fd, const void *data, size_t data_size) const {
+  size_t write_len = data_size;
+  do {
+    // Don't try to write too much at once.
+    size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
+    ssize_t wrote = write(fd, data, chunk_size);
+    if (wrote < 0) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
+                                                        strerror(errno));
+      return false;
+    }
+    // Advance past whatever the kernel accepted (may be < chunk_size).
+    data = static_cast<const uint8_t *>(data) + wrote;
+    write_len -= wrote;
+  } while (write_len > 0);
+  return true;
+}
+
+// Positional variant of Write: same chunked loop, but via pwrite at an
+// explicit offset, leaving the fd's own offset untouched.
+bool IcingFilesystem::PWrite(int fd, off_t offset, const void *data,
+                             size_t data_size) const {
+  size_t write_len = data_size;
+  do {
+    // Don't try to write too much at once.
+    size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
+    ssize_t wrote = pwrite(fd, data, chunk_size, offset);
+    if (wrote < 0) {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
+                                                        strerror(errno));
+      return false;
+    }
+    data = static_cast<const uint8_t *>(data) + wrote;
+    write_len -= wrote;
+    offset += wrote;
+  } while (write_len > 0);
+  return true;
+}
+
+bool IcingFilesystem::DataSync(int fd) const {
+#ifdef __APPLE__  // iOS has no fdatasync(), only fsync()
+  int result = fsync(fd);
+#else
+  int result = fdatasync(fd);
+#endif
+
+  if (result < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s",
+                                                      strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+bool IcingFilesystem::RenameFile(const char *old_name,
+                                 const char *new_name) const {
+  if (rename(old_name, new_name) < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Unable to rename file %s to %s: %s", old_name, new_name,
+        strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+// Swaps via a third "<one>.tmp" name: one -> tmp, two -> one, tmp -> two.
+// NOTE(review): not atomic — a failure mid-way can leave one of the paths
+// under the .tmp name.
+bool IcingFilesystem::SwapFiles(const char *one, const char *two) const {
+  std::string tmp_name = absl_ports::StrCat(one, ".tmp");
+  const char *tmp_cstr = tmp_name.c_str();
+
+  // Blow away a tmp file if it already exists
+  if (FileExists(tmp_cstr) && !DeleteFile(tmp_cstr)) {
+    return false;
+  }
+  if (DirectoryExists(tmp_cstr) && !DeleteDirectoryRecursively(tmp_cstr)) {
+    return false;
+  }
+
+  // Perform the swap
+  if (!RenameFile(one, tmp_cstr)) {
+    return false;
+  }
+  if (!RenameFile(two, one)) {
+    return false;
+  }
+  if (!RenameFile(tmp_cstr, two)) {
+    return false;
+  }
+
+  return true;
+}
+
+// Creates dir_name (mode 0700) if it doesn't already exist. Returns true if
+// the directory exists on return.
+bool IcingFilesystem::CreateDirectory(const char *dir_name) const {
+  bool success = DirectoryExists(dir_name);
+  if (!success) {
+    if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) {
+      success = true;
+    } else {
+      ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+          "Creating directory %s failed: %s", dir_name, strerror(errno));
+    }
+  }
+  return success;
+}
+
+// Recursively creates each missing parent (via GetDirname) and then the
+// directory itself. The empty-string base case terminates the recursion at
+// the filesystem root / a relative path's first component.
+bool IcingFilesystem::CreateDirectoryRecursively(const char *dir_name) const {
+  if ((strlen(dir_name) == 0) || DirectoryExists(dir_name)) {
+    return true;
+  }
+  std::string path_before = GetDirname(dir_name);
+  if (!CreateDirectoryRecursively(path_before.c_str())) {
+    return false;
+  }
+  return CreateDirectory(dir_name);
+}
+
+// Copies src to dst by mmapping the source (MAP_PRIVATE) and writing the
+// mapped bytes to dst. Returns false (and logs) on any failure.
+bool IcingFilesystem::CopyFile(const char *src, const char *dst) const {
+  bool success = false;
+
+  int src_fd = -1;
+  int dst_fd = -1;
+  uint64_t size = 0;
+  IcingMMapper mapper(true, MAP_PRIVATE);
+
+  if ((src_fd = OpenForRead(src)) < 0) {
+    goto end;
+  }
+  if ((dst_fd = OpenForWrite(dst)) < 0) {
+    goto end;
+  }
+  size = GetFileSize(src_fd);
+  // Don't attempt to mmap kBadFileSize (= UINT64_MAX) bytes when the stat
+  // failed; the original code passed the sentinel straight into Remap.
+  if (size == kBadFileSize) {
+    goto end;
+  }
+  mapper.Remap(src_fd, 0, size);
+  if (!mapper.is_valid()) {
+    goto end;
+  }
+  success = Write(dst_fd, mapper.address(), mapper.len());
+
+end:
+  // fd 0 is a valid descriptor; test >= 0 (the original `> 0` would leak a
+  // descriptor whenever fd 0 happened to be free).
+  if (src_fd >= 0) close(src_fd);
+  if (dst_fd >= 0) close(dst_fd);
+  if (!success) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+        "Couldn't copy file %s to %s", src, dst);
+  }
+  return success;
+}
+
+// Folds the [offset, offset+length) span of fd into *checksum with zlib's
+// adler32. *checksum is both input (seed) and output, so calls can be
+// chained; a zero length is a no-op that leaves the seed untouched.
+bool IcingFilesystem::ComputeChecksum(int fd, uint32_t *checksum,
+                                      uint64_t offset, uint64_t length) const {
+  if (length == 0) {
+    return true;
+  }
+  IcingMMapper mapper(fd, true, offset, length, MAP_PRIVATE);
+  if (!mapper.is_valid()) {
+    return false;
+  }
+  *checksum = adler32(*checksum, mapper.address(), mapper.len());
+  return true;
+}
+
+// Disk usage of an open file: allocated blocks x 512, so sparse files can
+// report less than their logical size. kBadFileSize on error.
+uint64_t IcingFilesystem::GetDiskUsage(int fd) const {
+  struct stat st;
+  if (fstat(fd, &st) < 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
+                                                      strerror(errno));
+    return kBadFileSize;
+  }
+  return st.st_blocks * kStatBlockSize;
+}
+
+// Non-recursive, path-based variant of the above.
+uint64_t IcingFilesystem::GetFileDiskUsage(const char *path) const {
+  struct stat st;
+  if (stat(path, &st) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
+                                                      path, strerror(errno));
+    return kBadFileSize;
+  }
+  return st.st_blocks * kStatBlockSize;
+}
+
+// Recursive variant: for directories, sums the usage of all entries on top
+// of the directory's own blocks. Unreadable children are skipped, but a
+// directory that can't be listed makes the whole result kBadFileSize.
+uint64_t IcingFilesystem::GetDiskUsage(const char *path) const {
+  struct stat st;
+  if (stat(path, &st) != 0) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
+                                                      path, strerror(errno));
+    return kBadFileSize;
+  }
+  uint64_t result = st.st_blocks * kStatBlockSize;
+  if (S_ISDIR(st.st_mode)) {
+    vector<std::string> list;
+    if (!ListDirectory(path, &list)) {
+      return kBadFileSize;
+    }
+    for (vector<std::string>::iterator i = list.begin(); i != list.end(); ++i) {
+      std::string sub_path = std::string(path) + '/' + *i;
+      uint64_t sub_usage = GetDiskUsage(sub_path.c_str());
+      if (sub_usage != kBadFileSize) {
+        result += sub_usage;
+      }  // Else just ignore the failing entry.
+    }
+  }
+  return result;
+}
+
+// Saturating accumulate: kBadFileSize is treated as "unknown" and is sticky —
+// once either operand is invalid, *to_increment stays kBadFileSize.
+void IcingFilesystem::IncrementByOrSetInvalid(uint64_t size,
+                                              uint64_t *to_increment) {
+  if (*to_increment == kBadFileSize) {
+    return;
+  }
+  if (size == kBadFileSize) {
+    *to_increment = kBadFileSize;
+    return;
+  }
+  *to_increment += size;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
new file mode 100644
index 0000000..2b10c1c
--- /dev/null
+++ b/icing/legacy/index/icing-filesystem.h
@@ -0,0 +1,230 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Methods for interacting with the filesystem.
+
+#ifndef ICING_LEGACY_INDEX_ICING_FILESYSTEM_H_
+#define ICING_LEGACY_INDEX_ICING_FILESYSTEM_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace icing {
+namespace lib {
+
+// Closes fd when it goes out of scope, if fd >= 0.
+// Move-only RAII owner of a POSIX file descriptor (-1 means "empty").
+class IcingScopedFd {
+ public:
+  explicit IcingScopedFd(int fd = -1) : fd_(fd) {}
+  IcingScopedFd(const IcingScopedFd &) = delete;
+  IcingScopedFd(IcingScopedFd &&other) : IcingScopedFd() {
+    *this = std::move(other);
+  }
+
+  IcingScopedFd &operator=(const IcingScopedFd &) = delete;
+  // Move-assignment swaps, so the moved-from object ends up owning (and
+  // eventually closing) this object's previous descriptor.
+  IcingScopedFd &operator=(IcingScopedFd &&other) {
+    std::swap(fd_, other.fd_);
+    return *this;
+  }
+  ~IcingScopedFd();
+
+  bool is_valid() const { return fd_ >= 0; }
+  int operator*() const { return fd_; }
+  int get() const { return fd_; }
+  // Closes the current descriptor (if any) and takes ownership of fd.
+  void reset(int fd = -1);
+
+ private:
+  int fd_;
+};
+
+// Deleter that fcloses a C stdio stream, for use with std::unique_ptr.
+struct IcingFILEDeleter {
+  void operator()(FILE *fp) const {
+    if (fp) {
+      fclose(fp);
+    }
+  }
+};
+// RAII owner of a FILE*; closes the stream on destruction.
+typedef std::unique_ptr<FILE, IcingFILEDeleter> IcingScopedFILE;
+
+// Class containing file operation methods.
+// All methods are virtual so the class can be mocked/faked in tests.
+// If you change methods in this class, don't forget to update the mock:
+// java/com/google/android/gmscore/integ/modules/icing/jni/index/mock-filesystem.h
+class IcingFilesystem {
+ public:
+  // Sentinel returned by the size/usage queries below on error.
+  static const uint64_t kBadFileSize = static_cast<uint64_t>(-1);
+
+  constexpr IcingFilesystem() {}
+  virtual ~IcingFilesystem() {}
+
+  // Deletes a file, returns true on success or if the file did
+  // not yet exist.
+  virtual bool DeleteFile(const char *file_name) const;
+
+  // Deletes a directory, returns true on success or if the directory did
+  // not yet exist.
+  virtual bool DeleteDirectory(const char *dir_name) const;
+
+  // Deletes a directory, including any contents, and returns true on
+  // success or if the directory did not yet exist.
+  virtual bool DeleteDirectoryRecursively(const char *dir_name) const;
+
+  // Returns true if a file exists. False if the file doesn't exist.
+  // If there is an error getting stat on the file, it logs the error and
+  // asserts.
+  virtual bool FileExists(const char *file_name) const;
+
+  // Returns true if a directory exists. False if the directory doesn't exist.
+  // If there is an error getting stat on the directory, it logs the error and
+  // asserts.
+  virtual bool DirectoryExists(const char *dir_name) const;
+
+  // Return index to start of basename in file_name. Anything before
+  // basename is the dirname (including the final slash).
+  virtual int GetBasenameIndex(const char *file_name) const;
+
+  // Return a string containing the basename.
+  virtual std::string GetBasename(const char *file_name) const;
+
+  // Return a string containing the dirname.
+  virtual std::string GetDirname(const char *file_name) const;
+
+  // Gets the names of the entries of a given directory. Does not include "."
+  // and "..". Returns false on error.
+  virtual bool ListDirectory(const char *dir_name,
+                             std::vector<std::string> *entries) const;
+
+  // Adds the names of the entries of a given directory -- recursively if
+  // specified, and excluding files/directories named in exclude -- to entries.
+  // Regardless of exclude, does not include "." and "..". Excluded files are
+  // excluded at every level. Returns false on error.
+  //
+  // Example use case: list all files & directories in fooDir/, recursively,
+  // excluding anything named "tmp" or "cache" (presumed directories) and the
+  // files within them.
+  virtual bool ListDirectory(const char *dir_name,
+                             const std::unordered_set<std::string> &exclude,
+                             bool recursive,
+                             std::vector<std::string> *entries) const;
+
+  // Use glob to return matched files into "matches". Returns false if
+  // glob had an error.
+  //
+  // Cannot match multiple directories so everything up the last slash
+  // must be literal.
+  virtual bool GetMatchingFiles(const char *glob,
+                                std::vector<std::string> *matches) const;
+
+  // Opens the file for read/write. Creates if not existing. Returns
+  // -1 on fail or an open file descriptor on success.
+  virtual int OpenForWrite(const char *file_name) const;
+
+  // Opens the file for read/write, and positions the file at the
+  // end for appending. Creates if not existing. Returns -1 on fail
+  // or an open file descriptor on success.
+  virtual int OpenForAppend(const char *file_name) const;
+
+  // Opens a file for read only. Fails if file does not exist. Returns
+  // file descriptor or -1 on fail. Set quiet to true to suppress
+  // log warnings.
+  virtual int OpenForRead(const char *file_name) const;
+
+  // Gets the size of a file, given an open file descriptor.
+  // Returns kBadFileSize on error.
+  virtual uint64_t GetFileSize(int fd) const;
+
+  // Gets the size of a file, given a filename.
+  virtual uint64_t GetFileSize(const char *filename) const;
+
+  // Truncates the file to the requested size. Seeks to the
+  // end position of the file after truncate. Returns false
+  // if fails.
+  virtual bool Truncate(int fd, uint64_t new_size) const;
+
+  // Truncates the file to the requested size.
+  // Returns false if fails.
+  virtual bool Truncate(const char *filename, uint64_t new_size) const;
+
+  // Grows the file to the requested size. Does not change the
+  // position pointer.
+  virtual bool Grow(int fd, uint64_t new_size) const;
+
+  // Writes to a file. Returns true if all the data was successfully
+  // written. Handles interrupted writes.
+  virtual bool Write(int fd, const void *data, size_t data_size) const;
+  virtual bool PWrite(int fd, off_t offset, const void *data,
+                      size_t data_size) const;
+
+  // Syncs the file to disk (fdatasync). Returns true on success.
+  virtual bool DataSync(int fd) const;
+
+  // Renames a file. A file with new_name must not already exist.
+  virtual bool RenameFile(const char *old_name, const char *new_name) const;
+
+  // Renames two files or directories so their names are swapped.
+  // Both names must already exist.
+  virtual bool SwapFiles(const char *one, const char *two) const;
+
+  // Creates a directory if it does not yet exist.
+  virtual bool CreateDirectory(const char *dir_name) const;
+
+  // Creates a directory if it does not yet exist, building the entire path
+  // if it does not yet exist.
+  virtual bool CreateDirectoryRecursively(const char *dir_name) const;
+
+  // Copy a file.
+  virtual bool CopyFile(const char *src, const char *dst) const;
+
+  // Compute an adler32 checksum over the [offset, offset+length) span
+  // of an open file. Returns false if the file could not be read.
+  // The checksum is an input/output variable (whatever value is
+  // stored there will prime the checksum computation). If length is
+  // 0, can be used to prime a checksum for future appends.
+  virtual bool ComputeChecksum(int fd, uint32_t *checksum, uint64_t offset,
+                               uint64_t length) const;
+
+  // Compute the disk usage of the given file. Similarly to the
+  // 'du' command, it attempts to estimate the actual disk usage, so for
+  // sparse files it may return less than their length.
+  // Returns kBadFileSize on error.
+  virtual uint64_t GetDiskUsage(int fd) const;
+
+  // Compute the disk usage of the given file or directory. Similarly to the
+  // 'du' command, it attempts to estimate the actual disk usage, so for
+  // sparse files it may return less than their length. Returns kBadFileSize on
+  // error.
+  // Does not recurse on directories.
+  virtual uint64_t GetFileDiskUsage(const char *path) const;
+
+  // Compute the disk usage of the given file or directory. Similarly to the
+  // 'du' command, it attempts to estimate the actual disk usage, so for
+  // sparse files it may return less than their length. Returns kBadFileSize on
+  // error.
+  // Recurses on directories.
+  virtual uint64_t GetDiskUsage(const char *path) const;
+
+  // Increments to_increment by size if size is valid, or sets to_increment
+  // to kBadFileSize if either size or to_increment is kBadFileSize.
+  static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_FILESYSTEM_H_
diff --git a/icing/legacy/index/icing-flash-bitmap.cc b/icing/legacy/index/icing-flash-bitmap.cc
new file mode 100644
index 0000000..56dec00
--- /dev/null
+++ b/icing/legacy/index/icing-flash-bitmap.cc
@@ -0,0 +1,421 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-flash-bitmap.h"
+
+#include <sys/mman.h>
+
+#include <memory>
+
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/core/icing-timer.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
// TODO(b/77482303) : Remove version from the IcingFlashBitmap header - magic
// makes it unnecessary.
// On-disk layout of the file header; it is accessed directly through the
// mmapped region (see Accessor below).
struct IcingFlashBitmap::Header {
  uint32_t magic;    // Must equal kMagic; identifies the file format.
  uint32_t version;  // Must equal kCurVersion.
  uint32_t crc;      // Crc32 over the data region only (not this header).
  uint32_t dirty;    // Nonzero when data changed since crc was computed.
};
+
// Helper class used to access the data and the header regions
// of the shared memory. The header appears first, followed by the
// bitmap memory.
//
// Holds only a pointer to the IcingMMapper, so an Accessor stays usable
// across IcingMMapper::Remap() calls: every accessor re-reads address()
// and len() on each use.
class IcingFlashBitmap::Accessor {
 public:
  explicit Accessor(IcingMMapper *mmapper) : mmapper_(mmapper) {}
  // Header occupies the first sizeof(Header) bytes of the mapping.
  IcingFlashBitmap::Header *header() {
    return reinterpret_cast<IcingFlashBitmap::Header *>(mmapper_->address());
  }
  const IcingFlashBitmap::Header *header() const {
    return reinterpret_cast<const IcingFlashBitmap::Header *>(
        mmapper_->address());
  }
  // Bitmap data starts immediately after the header.
  const char *data() const {
    return reinterpret_cast<const char *>(mmapper_->address() +
                                          sizeof(IcingFlashBitmap::Header));
  }
  // Size of the data region in bytes (mapping length minus the header).
  size_t data_size() const {
    return mmapper_->len() - sizeof(IcingFlashBitmap::Header);
  }
  size_t num_words() const { return data_size() / sizeof(Word); }
  // Data region viewed as 32-bit words.
  Word *data32() {
    return reinterpret_cast<Word *>(mmapper_->address() +
                                    sizeof(IcingFlashBitmap::Header));
  }
  const Word *data32() const { return reinterpret_cast<const Word *>(data()); }
  // One past the last word of the mapping.
  const Word *end32() const {
    return reinterpret_cast<const Word *>(mmapper_->address() +
                                          mmapper_->len());
  }

 private:
  IcingMMapper *const mmapper_;
};
+
// Validates the mmapped header fields and the data crc.
// Logs and returns false on the first mismatch found.
bool IcingFlashBitmap::Verify() const {
  if (!is_initialized()) {
    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
        "Can't verify unopened flash bitmap %s", filename_.c_str());
    return false;
  }
  if (mmapper_ == nullptr) {
    // Opened for read and file doesn't exist.
    return true;
  }
  Accessor accessor(mmapper_.get());
  // Identification fields must match the current format exactly.
  if (accessor.header()->magic != kMagic) {
    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
        "Flash bitmap %s has incorrect magic header", filename_.c_str());
    return false;
  }
  if (accessor.header()->version != kCurVersion) {
    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
        "Flash bitmap %s has incorrect version", filename_.c_str());
    return false;
  }
  // A dirty header means the stored crc may not cover the latest data.
  if (accessor.header()->dirty) {
    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
        "Flash bitmap %s is dirty", filename_.c_str());
    return false;
  }
  // Recompute the crc over the data region and compare to the stored one.
  uint32_t crc =
      IcingStringUtil::UpdateCrc32(0, accessor.data(), accessor.data_size());
  if (accessor.header()->crc != crc) {
    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
        "Flash bitmap %s has incorrect CRC32 %u %u", filename_.c_str(),
        accessor.header()->crc, crc);
    return false;
  }
  return true;
}
+
// Opens (creating if necessary) the bitmap file for read/write and maps it
// MAP_SHARED. Uses goto-based cleanup so every failure path resets the
// object to the UNOPENED state. Returns true on success.
bool IcingFlashBitmap::Init() {
  Close();

  // Ensure the storage directory exists
  std::string storage_dir = filesystem_->GetDirname(filename_.c_str());
  if (!filesystem_->CreateDirectoryRecursively(storage_dir.c_str())) {
    return false;
  }

  IcingScopedFd fd(filesystem_->OpenForWrite(filename_.c_str()));
  if (!fd.is_valid()) {
    return false;
  }

  // Figure out our file size for mmap.
  uint64_t orig_file_size = filesystem_->GetFileSize(fd.get());
  uint64_t file_size = orig_file_size;
  if (orig_file_size == IcingFilesystem::kBadFileSize) {
    goto error;
  }

  // Make sure we have something to mmap.
  // Files smaller than one grow unit are extended to kGrowSize.
  if (orig_file_size < kGrowSize) {
    if (!filesystem_->Grow(fd.get(), kGrowSize)) {
      goto error;
    }
    file_size = kGrowSize;
  }

  // Mmap for write.
  mmapper_ =
      std::make_unique<IcingMMapper>(fd.get(), false, 0, file_size, MAP_SHARED);
  if (!mmapper_->is_valid()) {
    goto error;
  }

  // Set open_type_ before the possible flush on create.
  open_type_ = READ_WRITE;

  if (orig_file_size == 0) {
    Accessor accessor(mmapper_.get());
    // Original file didn't yet exist, create header.
    accessor.header()->magic = kMagic;
    accessor.header()->version = kCurVersion;
    accessor.header()->dirty = true;  // Forces crc update at sync.
    // Sync file so we know it's supposed to exist.
    if (!Sync()) {
      goto error;
    }
  }
  return true;

error:
  open_type_ = UNOPENED;
  mmapper_.reset();
  return false;
}
+
// Opens the bitmap file read-only. A missing or empty file is not an error:
// the bitmap is simply treated as empty (no mapping is created).
bool IcingFlashBitmap::InitForRead() {
  IcingTimer open_timer;
  Close();

  // Cannot mmap non-existing or zero-size files.
  // It's not an error in this case, it just means the
  // bitmap is empty, so proceed without mapping it.
  if (!filesystem_->FileExists(filename_.c_str()) ||
      filesystem_->GetFileSize(filename_.c_str()) == 0) {
    open_type_ = READ_ONLY;
    return true;
  }

  IcingScopedFd fd(filesystem_->OpenForRead(filename_.c_str()));
  if (!fd.is_valid()) {
    return false;
  }

#ifdef __APPLE__
  // No MAP_POPULATE in iOS (so no pre-page-faulting. See man mmap)
  // On Apple we need MAP_SHARED even for sharing the state within the same
  // process (which gets optimized in the linux-implementation).
  // Usages of flash-bitmap are expected to flush the content (delayed for
  // performance reasons). That implies that the copy-on-write behavior of
  // MAP_PRIVATE is a performance optimization, and MAP_SHARED as alternative
  // behavior is acceptable.
  int flags = MAP_SHARED;
#else
  int flags = MAP_PRIVATE | MAP_POPULATE;
#endif

  // Figure out our file size for mmap.
  uint64_t file_size = filesystem_->GetFileSize(fd.get());
  if (file_size == IcingFilesystem::kBadFileSize) {
    goto error;
  }

  // Slurp the bitmap in one go with MAP_POPULATE
  mmapper_ =
      std::make_unique<IcingMMapper>(fd.get(), true, 0, file_size, flags);
  if (!mmapper_->is_valid()) {
    goto error;
  }

  open_type_ = READ_ONLY;
  return true;

error:
  open_type_ = UNOPENED;
  mmapper_.reset();
  return false;
}
+
+void IcingFlashBitmap::Close() {
+ if (is_initialized()) {
+ UpdateCrc();
+ mmapper_.reset();
+ open_type_ = UNOPENED;
+ }
+}
+
+bool IcingFlashBitmap::Delete() {
+ Close();
+ return filesystem_->DeleteFile(filename_.c_str());
+}
+
+bool IcingFlashBitmap::Sync() const {
+ if (!is_initialized()) {
+ ICING_LOG(FATAL) << "Bitmap not initialized";
+ }
+
+ UpdateCrc();
+ return (mmapper_ == nullptr) ? true : mmapper_->Sync();
+}
+
+uint64_t IcingFlashBitmap::GetDiskUsage() const {
+ // For non-existing files, size is 0.
+ if (mmapper_ == nullptr) {
+ return 0;
+ }
+ return filesystem_->GetFileDiskUsage(filename_.c_str());
+}
+
// If the bitmap is dirty, recompute the data crc, store it in the header,
// and clear the dirty flag. Returns the current crc (kEmptyCrc when the
// file does not exist).
uint32_t IcingFlashBitmap::UpdateCrc() const {
  // NOTE: the accessor may wrap a null mmapper here; header() is only
  // dereferenced when open_type_ is READ_WRITE, which implies a valid
  // mapping (see Init()).
  Accessor accessor(mmapper_.get());
  if (open_type_ == READ_WRITE && accessor.header()->dirty) {
    accessor.header()->crc = IcingStringUtil::UpdateCrc32(
        kEmptyCrc, accessor.data(), accessor.data_size());
    accessor.header()->dirty = false;
  }

  // Non-existent mmapper means file does not exist. An empty file has
  // a crc of kEmptyCrc, so just return that.
  return mmapper_.get() ? accessor.header()->crc : kEmptyCrc;
}
+
// Extends the backing file to new_file_size and remaps it. Marks the
// header dirty since the newly visible region changes the data crc.
bool IcingFlashBitmap::Grow(size_t new_file_size) {
  IcingScopedFd fd(filesystem_->OpenForWrite(filename_.c_str()));
  if (!filesystem_->Grow(fd.get(), new_file_size)) {
    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
        "Grow %s to new size %zu failed", filename_.c_str(), new_file_size);
    return false;
  }
  // Remap so the enlarged file is visible through mmapper_.
  if (!mmapper_->Remap(fd.get(), 0, new_file_size)) {
    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
        "Remap of %s after grow failed", filename_.c_str());
    return false;
  }
  ICING_VLOG(1) << IcingStringUtil::StringPrintf(
      "Grew %s new size %zu", filename_.c_str(), new_file_size);
  // Accessor reads through mmapper_, so it sees the remapped region.
  Accessor accessor(mmapper_.get());
  accessor.header()->dirty = true;
  return true;
}
+
// Sets or clears the bit at idx, growing the file (in kGrowSize steps) as
// needed. Only marks the header dirty when the stored word actually changes.
// Requires READ_WRITE open type.
bool IcingFlashBitmap::SetBit(uint64_t idx, bool value) {
  if (open_type_ != READ_WRITE) {
    ICING_LOG(FATAL) << "Bitmap not opened with type READ_WRITE";
  }

  Accessor accessor(mmapper_.get());

  // Figure out which word needs to be modified.
  uint64_t word_offset = idx / kWordBits;

  // Grow file (and mmap) if word_offset >= file size / sizeof(Word).
  if (word_offset >= accessor.num_words()) {
    if (!value) {
      // Values beyond the end of file are false by default, don't write it.
      return true;
    }
    // Grow enough to fit word_offset (including the header).
    size_t file_size = sizeof(Header) + (word_offset + 1) * sizeof(Word);
    // Align to kGrowSize.
    file_size = ALIGN_UP(file_size, kGrowSize);
    // accessor remains valid after Grow: it re-reads through mmapper_.
    if (!Grow(file_size)) {
      return false;
    }
  }

  // Set the word in the mmapped region.
  Word *words = accessor.data32();
  Word mask = GetWordBitmask(idx);
  Word old_word = words[word_offset];
  Word new_word = value ? old_word | mask : old_word & ~mask;
  if (new_word != old_word) {
    words[word_offset] = new_word;
    accessor.header()->dirty = true;
  }
  return true;
}
+
+bool IcingFlashBitmap::GetBit(uint64_t idx) const {
+ return GetWord(idx / kWordBits) & GetWordBitmask(idx);
+}
+
+IcingFlashBitmap::Word IcingFlashBitmap::GetWord(uint64_t idx) const {
+ if (!is_initialized()) {
+ ICING_LOG(FATAL) << "Bitmap not initialized";
+ }
+
+ // For non-existing files, always return false.
+ if (mmapper_ == nullptr) {
+ return 0;
+ }
+
+ Accessor accessor(mmapper_.get());
+ // Check that we are within limits.
+ if (idx >= accessor.num_words()) {
+ return 0;
+ }
+ return accessor.data32()[idx];
+}
+
// Returns the number of 32-bit words in the mapped data region.
size_t IcingFlashBitmap::NumWords() const {
  if (!is_initialized()) {
    ICING_LOG(FATAL) << "Bitmap not initialized";
  }

  // For non-existing files, always return 0 words.
  if (mmapper_ == nullptr) {
    return 0;
  }

  return Accessor(mmapper_.get()).num_words();
}
+
+IcingFlashBitmap::Word IcingFlashBitmap::GetWordBitmask(uint64_t idx) {
+ return 1u << (idx % kWordBits);
+}
+
// Clears every bit at position >= idx (the file itself is not shrunk)
// and refreshes the crc so the file stays verifiable.
void IcingFlashBitmap::Truncate(uint64_t idx) {
  if (!is_initialized()) {
    ICING_LOG(FATAL) << "Bitmap not initialized";
  }

  Accessor accessor(mmapper_.get());
  size_t num_words = accessor.num_words();

  uint64_t word_offset = idx / kWordBits;
  if (word_offset >= num_words) {
    // Truncation offset beyond actual file. We're done.
    return;
  }

  Word *words = accessor.data32();

  // Keep only bits < idx in the last word.
  // GetWordBitmask(idx) - 1 has ones exactly at the positions below idx.
  words[word_offset] &= (GetWordBitmask(idx) - 1);

  // Clear everything starting at word_offset + 1
  uint64_t last_word_offset = word_offset + 1;
  if (last_word_offset < num_words) {
    memset(words + last_word_offset, 0,
           (num_words - last_word_offset) * sizeof(Word));
  }
  accessor.header()->dirty = true;
  UpdateCrc();
}
+
// Ors every set bit of the source bitmap into this one, growing this
// bitmap first if the source is larger. Refreshes the crc on success.
bool IcingFlashBitmap::OrBitmap(const IcingFlashBitmap &bitmap) {
  if (!is_initialized()) {
    ICING_LOG(FATAL) << "Bitmap not initialized";
  }

  if (mmapper_ == nullptr || bitmap.mmapper_ == nullptr) {
    // TODO(b/32125196): Figure out how we can get into this state.
    return false;
  }

  // If this bitmap is smaller than the source, then grow the
  // size to match.
  if (mmapper_->len() < bitmap.mmapper_->len()) {
    if (!Grow(bitmap.mmapper_->len())) {
      return false;
    }
  }
  Accessor src_accessor(bitmap.mmapper_.get());
  const Word *src = src_accessor.data32();
  const Word *end = src_accessor.end32();

  // After the Grow above, dst is at least as long as src, so iterating
  // src's words stays within dst's bounds.
  Accessor dst_accessor(mmapper_.get());
  Word *dst = dst_accessor.data32();
  while (src < end) {
    *dst++ |= *src++;
  }
  dst_accessor.header()->dirty = true;
  UpdateCrc();
  return true;
}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
new file mode 100644
index 0000000..9abd369
--- /dev/null
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -0,0 +1,154 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+//
+// A disk-backed bitmap.
+//
+// For writing:
+//
+// Init();
+// SetBit(bit_idx, <true|false>) // Automatically grows with SetBit.
+// ...
+// Sync(); // SetBit takes effect immediately but Sync persists to disk.
+//
+// For reading:
+//
+// InitForRead();
+// GetBit(bit_idx);
+// ...
+// Close();
+//
+// InitForRead uses mmap MAP_POPULATE to fault the entire file to
+// memory. Subsequent random GetBits are very fast (nanoseconds).
+//
+// It's ok to call Init after InitForRead. The last "Init" call takes
+// effect.
+
+#ifndef ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_
+#define ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mmapper.h"
+
+namespace icing {
+namespace lib {
+
class IcingFlashBitmap {
 public:
  // Storage unit of the bitmap.
  using Word = uint32_t;

  // Crc value of an empty (or absent) bitmap file.
  static constexpr uint32_t kEmptyCrc = 0;
  // File growth granularity and minimum mapped size.
  static constexpr size_t kGrowSize = (1u << 12);  // 4KB;
  static constexpr size_t kWordBits = 8 * sizeof(Word);

  // Does not touch the file; call Init() or InitForRead() to open it.
  IcingFlashBitmap(const std::string &filename,
                   const IcingFilesystem *filesystem)
      : filesystem_(filesystem), filename_(filename), open_type_(UNOPENED) {}
  ~IcingFlashBitmap() { Close(); }

  // Init for read and write. Returns true on success. Does not verify
  // the data checksum. To do so call Verify explicitly.
  bool Init();

  // Init for read only. Also faults the entire file into memory with
  // MAP_POPULATE. Does not verify the data checksum. To do so call Verify
  // explicitly.
  bool InitForRead();

  // Verifies the integrity of the data by checking the header values
  // and data checksum. Returns true if opened with InitForRead() and
  // file does not exist.
  bool Verify() const;

  // If either of the init functions was called successfully.
  bool is_initialized() const { return open_type_ != UNOPENED; }

  // Close file and release resources. Leaves the bitmap in uninitialized state.
  void Close();

  // The following functions require is_initialized() with Init()
  // EXCEPT GetBit() which requires Init() or InitForRead().

  // Close and delete the underlying file. Leaves the bitmap in uninitialized
  // state (even if deletion failed).
  bool Delete();

  // Delete the underlying file, and reinitialize it. If successful, the bitmap
  // is initialized.
  bool Clear() { return Delete() && Init(); }

  // Sync the changes to disk.
  bool Sync() const;

  // On-disk footprint of the backing file; 0 when the file does not exist.
  uint64_t GetDiskUsage() const;

  // Set or clear a bit at idx. Automatically resizes the file to fit
  // idx. Returns true on success.
  bool SetBit(uint64_t idx, bool value);

  // Get the value of bit at idx. If idx is out of range, returns false.
  // Can be called with InitForRead().
  bool GetBit(uint64_t idx) const;

  // Get the idx'th word in the bitmap. If idx is out of range, returns zero.
  // Can be called with InitForRead().
  Word GetWord(uint64_t idx) const;
  size_t NumWords() const;

  // Clear all bits starting at idx.
  void Truncate(uint64_t idx);

  // Ors all the set bits from a given bitmap into this bitmap.
  bool OrBitmap(const IcingFlashBitmap &bitmap);

  const std::string &filename() const { return filename_; }

  // If the bitmap is dirty, update the crc and mark it clean.
  uint32_t UpdateCrc() const;

 private:
  class Accessor;
  struct Header;

  // On-disk format identification (see Header in the .cc file).
  static const uint32_t kMagic = 0x394b0698;
  static const uint32_t kCurVersion = 18;

  enum OpenType { UNOPENED, READ_ONLY, READ_WRITE };

  // Returns a word with only the bit corresponding to idx set.
  static Word GetWordBitmask(uint64_t idx);

  // Increase the size of the bitmap file to the new size. Return true
  // on success.
  bool Grow(size_t new_file_size);

  // Upgrade for version 18.
  bool UpgradeTo18();

  const IcingFilesystem *const filesystem_;  // Not owned.
  std::string filename_;
  OpenType open_type_;
  // Null when unopened, or opened for read on a missing/empty file.
  std::unique_ptr<IcingMMapper> mmapper_;
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_
diff --git a/icing/legacy/index/icing-lite-index-header.h b/icing/legacy/index/icing-lite-index-header.h
new file mode 100644
index 0000000..ac2d3c0
--- /dev/null
+++ b/icing/legacy/index/icing-lite-index-header.h
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_
+#define ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_
+
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-common-types.h"
+
+namespace icing {
+namespace lib {
+
// A wrapper around the actual mmapped header data.
// Pure interface; IcingLiteIndex_HeaderImpl below provides the concrete
// layout-backed implementation.
class IcingLiteIndex_Header {
 public:
  virtual ~IcingLiteIndex_Header() = default;

  // Returns true if the magic of the header matches the hard-coded magic
  // value associated with this header format.
  virtual bool check_magic() const = 0;

  // Crc over the lite index contents.
  virtual uint32_t lite_index_crc() const = 0;
  virtual void set_lite_index_crc(uint32_t crc) = 0;

  virtual uint32_t last_added_docid() const = 0;
  virtual void set_last_added_docid(uint32_t last_added_docid) = 0;

  virtual uint32_t cur_size() const = 0;
  virtual void set_cur_size(uint32_t cur_size) = 0;

  virtual uint32_t searchable_end() const = 0;
  virtual void set_searchable_end(uint32_t searchable_end) = 0;

  // Crc over the header fields themselves (excluding the stored crcs).
  virtual uint32_t CalculateHeaderCrc() const = 0;

  // Restores all fields to their initial (empty-index) values.
  virtual void Reset() = 0;
};
+
// Header implementation backed directly by the mmapped HeaderData struct.
// Does not own hdr_; the caller keeps the mapping alive.
class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
 public:
  // On-disk header layout; must stay exactly 24 bytes (see static_assert
  // below).
  struct HeaderData {
    static const uint32_t kMagic = 0x6dfba6a0;

    uint32_t lite_index_crc;
    uint32_t magic;
    // This field is available to be reclaimed for another purpose without
    // forcing a change in header size. NOTE: claiming this fields doesn't
    // guarantee that the newly claimed field will have the proper value. If you
    // are depending on the value of this field then you will have to have a
    // migration - either a one-time event during Upgrade() or Init() or
    // determined by a flag change in Init().
    uint32_t padding;
    uint32_t last_added_docid;
    uint32_t cur_size;
    uint32_t searchable_end;
  };

  explicit IcingLiteIndex_HeaderImpl(HeaderData *hdr) : hdr_(hdr) {}

  bool check_magic() const override {
    return hdr_->magic == HeaderData::kMagic;
  }

  uint32_t lite_index_crc() const override { return hdr_->lite_index_crc; }
  void set_lite_index_crc(uint32_t crc) override { hdr_->lite_index_crc = crc; }

  uint32_t last_added_docid() const override { return hdr_->last_added_docid; }
  void set_last_added_docid(uint32_t last_added_docid) override {
    hdr_->last_added_docid = last_added_docid;
  }

  uint32_t cur_size() const override { return hdr_->cur_size; }
  void set_cur_size(uint32_t cur_size) override { hdr_->cur_size = cur_size; }

  uint32_t searchable_end() const override { return hdr_->searchable_end; }
  void set_searchable_end(uint32_t searchable_end) override {
    hdr_->searchable_end = searchable_end;
  }

  uint32_t CalculateHeaderCrc() const override {
    // The crc covers everything from 'magic' onward; lite_index_crc itself
    // is intentionally excluded.
    return IcingStringUtil::UpdateCrc32(
        0, reinterpret_cast<const char *>(hdr_) + offsetof(HeaderData, magic),
        sizeof(HeaderData) - offsetof(HeaderData, magic));
  }

  void Reset() override {
    // 'padding' is intentionally left untouched (see HeaderData comment).
    hdr_->lite_index_crc = 0;
    hdr_->magic = HeaderData::kMagic;
    hdr_->last_added_docid = kIcingInvalidDocId;
    hdr_->cur_size = 0;
    hdr_->searchable_end = 0;
  }

 private:
  HeaderData *hdr_;  // Not owned; points into the mmapped region.
};
static_assert(24 == sizeof(IcingLiteIndex_HeaderImpl::HeaderData),
              "sizeof(HeaderData) != 24");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_
diff --git a/icing/legacy/index/icing-lite-index-options.cc b/icing/legacy/index/icing-lite-index-options.cc
new file mode 100644
index 0000000..4bf0d38
--- /dev/null
+++ b/icing/legacy/index/icing-lite-index-options.cc
@@ -0,0 +1,64 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-lite-index-options.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+uint32_t CalculateHitBufferSize(uint32_t hit_buffer_want_merge_bytes) {
+ constexpr uint32_t kHitBufferSlopMult = 2;
+
+ // Add a 2x slop for the hit buffer. We need to make sure we can at
+ // least fit one document with index variants.
+ // TODO(b/111690435) Move LiteIndex::Element to a separate file so that this
+ // can use sizeof(LiteIndex::Element)
+ uint32_t hit_capacity_elts_with_slop =
+ hit_buffer_want_merge_bytes / sizeof(uint64_t);
+ // Add some slop for index variants on top of max num tokens.
+ hit_capacity_elts_with_slop += kIcingMaxNumHitsPerDocument;
+ hit_capacity_elts_with_slop *= kHitBufferSlopMult;
+
+ return hit_capacity_elts_with_slop;
+}
+
// Sizes the lexicon trie proportionally to the hit buffer capacity.
// NOTE(review): uses std::max — relies on a transitive #include of
// <algorithm>; confirm it is pulled in via icing-dynamic-trie.h.
IcingDynamicTrie::Options CalculateTrieOptions(uint32_t hit_buffer_size) {
  // The default min is 1/5th of the main index lexicon, which can
  // hold >1M terms. We don't need values so value size is 0. We
  // conservatively scale from there.
  //
  // We can give this a lot of headroom because overestimating the
  // requirement has minimal resource impact.
  double scaling_factor =
      std::max(1.0, static_cast<double>(hit_buffer_size) / (100u << 10));
  return IcingDynamicTrie::Options((200u << 10) * scaling_factor,
                                   (200u << 10) * scaling_factor,
                                   (1u << 20) * scaling_factor, 0);
}
+
+} // namespace
+
+IcingLiteIndexOptions::IcingLiteIndexOptions(
+ const std::string& filename_base, uint32_t hit_buffer_want_merge_bytes)
+ : filename_base(filename_base),
+ hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes) {
+ hit_buffer_size = CalculateHitBufferSize(hit_buffer_want_merge_bytes);
+ lexicon_options = CalculateTrieOptions(hit_buffer_size);
+ display_mappings_options = CalculateTrieOptions(hit_buffer_size);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-lite-index-options.h b/icing/legacy/index/icing-lite-index-options.h
new file mode 100644
index 0000000..2922621
--- /dev/null
+++ b/icing/legacy/index/icing-lite-index-options.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_
+#define ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_
+
+#include "icing/legacy/index/icing-common-types.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+
+namespace icing {
+namespace lib {
+
struct IcingLiteIndexOptions {
  IcingLiteIndexOptions() = default;
  // Creates IcingLiteIndexOptions based off of the specified parameters. All
  // other fields are calculated based on the value of
  // hit_buffer_want_merge_bytes and the logic in CalculateHitBufferSize and
  // CalculateTrieOptions.
  IcingLiteIndexOptions(const std::string& filename_base,
                        uint32_t hit_buffer_want_merge_bytes);

  // Sizing for the term lexicon trie.
  IcingDynamicTrie::Options lexicon_options;
  // Sizing for the display-mappings trie (same values as lexicon_options
  // when built via the parameterized constructor).
  IcingDynamicTrie::Options display_mappings_options;

  // Prefix for all files belonging to this lite index.
  std::string filename_base;
  uint32_t hit_buffer_want_merge_bytes = 0;
  // Hit buffer capacity in elements, derived from the merge threshold.
  uint32_t hit_buffer_size = 0;
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_
diff --git a/icing/legacy/index/icing-mmapper.cc b/icing/legacy/index/icing-mmapper.cc
new file mode 100644
index 0000000..737335c
--- /dev/null
+++ b/icing/legacy/index/icing-mmapper.cc
@@ -0,0 +1,106 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: sbanacho@google.com (Scott Banachowski)
+//
+#include "icing/legacy/index/icing-mmapper.h"
+
+#include <errno.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
// Deferred-mapping constructor: the object stays invalid (is_valid() is
// false) until Remap() establishes a mapping.
IcingMMapper::IcingMMapper(bool read_only, int flags)
    : address_(nullptr),
      len_(0),
      flags_(flags),
      location_(0),
      mmap_len_(0),
      mmap_result_(nullptr),
      read_only_(read_only) {}
+
// Maps [location, location + size) of fd immediately. Constructors cannot
// report failure, so callers must check is_valid() afterwards.
IcingMMapper::IcingMMapper(int fd, bool read_only, uint64_t location,
                           size_t size, int flags)
    : address_(nullptr),
      len_(0),
      flags_(flags),
      location_(0),
      mmap_len_(0),
      mmap_result_(nullptr),
      read_only_(read_only) {
  DoMapping(fd, location, size);
}
+
+void IcingMMapper::DoMapping(int fd, uint64_t location, size_t size) {
+ uint64_t aligned_offset =
+ (location / system_page_size()) * system_page_size();
+ size_t alignment_adjustment = location - aligned_offset;
+ size_t mmap_len = alignment_adjustment + size;
+
+ int prot = read_only_ ? (PROT_READ) : (PROT_READ | PROT_WRITE);
+
+ mmap_result_ = mmap(nullptr, mmap_len, prot, flags_, fd, aligned_offset);
+
+ if (mmap_result_ != MAP_FAILED) {
+ len_ = size;
+ location_ = location;
+ mmap_len_ = mmap_len;
+ address_ = reinterpret_cast<uint8_t *>(mmap_result_) + alignment_adjustment;
+ } else {
+ const char *errstr = strerror(errno);
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Could not mmap file for reading: %s", errstr);
+ mmap_result_ = nullptr;
+ }
+}
+
// Moves the mapping to a new fd/location/size. The previous mapping is
// released even if the new one fails; returns is_valid() of the result.
bool IcingMMapper::Remap(int fd, uint64_t location, size_t size) {
  Unmap();
  DoMapping(fd, location, size);
  return is_valid();
}
+
+void IcingMMapper::Unmap() {
+ if (mmap_result_ != nullptr) {
+ munmap(mmap_result_, mmap_len_);
+ }
+ address_ = nullptr;
+ len_ = 0;
+ location_ = 0;
+ mmap_len_ = 0;
+ mmap_result_ = nullptr;
+}
+
// Releases the mapping only; the file descriptor is owned by the caller.
IcingMMapper::~IcingMMapper() { Unmap(); }
+
+bool IcingMMapper::Sync() {
+ if (is_valid() && !read_only_) {
+ if (msync(mmap_result_, mmap_len_, MS_SYNC) != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("msync failed: %s",
+ strerror(errno));
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-mmapper.h b/icing/legacy/index/icing-mmapper.h
new file mode 100644
index 0000000..bf62aa5
--- /dev/null
+++ b/icing/legacy/index/icing-mmapper.h
@@ -0,0 +1,94 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: sbanacho@google.com (Scott Banachowski)
+//
+// This class is a helper for mmapping a file.
+// Use as a scoped allocator, the memory is mapped
+// on construction and released on destruction.
+
+#ifndef ICING_LEGACY_INDEX_ICING_MMAPPER_H_
+#define ICING_LEGACY_INDEX_ICING_MMAPPER_H_
+
+#include <stdint.h>
+#include <unistd.h>
+
+namespace icing {
+namespace lib {
+
// RAII wrapper around mmap: maps a file region on construction (or on
// Remap) and unmaps on destruction. Does not own the file descriptor.
class IcingMMapper {
 public:
  // Provide a valid, open file description (with matching permissions
  // for read or write). The location into the file you wish to map,
  // and the size. "flags" are passed in directly to mmap.
  IcingMMapper(int fd, bool read_only, uint64_t location, size_t size,
               int flags);

  // Set up Mmapper, but delay mapping until Remap is called.
  IcingMMapper(bool read_only, int flags);

  // Will unmap the region on delete. Does not close the file.
  ~IcingMMapper();

  // Move the location of the mapping to a new location. Returns
  // true if valid.
  bool Remap(int fd, uint64_t location, size_t size);

  // Close the mapping and become invalid.
  void Unmap();

  // Sync the mapped region to the filesystem.
  bool Sync();

  // Check to see if the file was successfully mapped.
  bool is_valid() const { return (address_ != nullptr); }

  // The address in memory of the mapped file, returns NULL if the
  // mapping of the region was unsuccesful.
  const uint8_t *address() const { return address_; }

  uint8_t *address() { return address_; }

  // Length of the mapped region as requested (may be less than the
  // actual mmap length when the location needed page alignment).
  size_t len() const { return len_; }

  uint64_t location() const { return location_; }

  // Cached result of sysconf(_SC_PAGE_SIZE).
  static size_t __attribute__((const)) system_page_size() {
    static const size_t page_size = sysconf(_SC_PAGE_SIZE);
    return page_size;
  }

  // Rounds `size` up to a multiple of the system page size.
  static size_t page_aligned_size(uint32_t size) {
    return ((size + system_page_size() - 1) / system_page_size()) *
           system_page_size();
  }

 private:
  // Shared implementation of both the mapping constructor and Remap().
  void DoMapping(int fd, uint64_t location, size_t size);

  uint8_t *address_;
  size_t len_;         // the requested mapping length
  const int flags_;    // flags passed in to mmap
  uint64_t location_;  // the requested mapping file location
  size_t mmap_len_;    // the actual mapping length
  void *mmap_result_;
  const bool read_only_;
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_MMAPPER_H_
diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h
new file mode 100644
index 0000000..31e012a
--- /dev/null
+++ b/icing/legacy/index/icing-mock-filesystem.h
@@ -0,0 +1,98 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_
+#define ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/legacy/index/icing-filesystem.h"
+#include "gmock/gmock.h"
+
+namespace icing {
+namespace lib {
+
+// gMock test double for IcingFilesystem. Each entry mocks the
+// correspondingly named const method; overloads (GetFileSize, Truncate)
+// are distinguished by their argument lists.
+// NOTE(review): MOCK_CONST_METHODn is the legacy gMock macro family; if
+// the vendored gMock is >= 1.10, consider migrating to
+// MOCK_METHOD(ret, Name, (args), (const)) — confirm version first.
+class IcingMockFilesystem : public IcingFilesystem {
+ public:
+  MOCK_CONST_METHOD1(DeleteFile, bool(const char *file_name));
+
+  MOCK_CONST_METHOD1(DeleteDirectory, bool(const char *dir_name));
+
+  MOCK_CONST_METHOD1(DeleteDirectoryRecursively, bool(const char *dir_name));
+
+  MOCK_CONST_METHOD1(FileExists, bool(const char *file_name));
+
+  MOCK_CONST_METHOD1(DirectoryExists, bool(const char *dir_name));
+
+  MOCK_CONST_METHOD1(GetBasenameIndex, int(const char *file_name));
+
+  MOCK_CONST_METHOD1(GetBasename, std::string(const char *file_name));
+
+  MOCK_CONST_METHOD1(GetDirname, std::string(const char *file_name));
+
+  MOCK_CONST_METHOD2(ListDirectory, bool(const char *dir_name,
+                                         std::vector<std::string> *entries));
+
+  MOCK_CONST_METHOD2(GetMatchingFiles,
+                     bool(const char *glob, std::vector<std::string> *matches));
+
+  MOCK_CONST_METHOD1(OpenForWrite, int(const char *file_name));
+
+  MOCK_CONST_METHOD1(OpenForAppend, int(const char *file_name));
+
+  MOCK_CONST_METHOD1(OpenForRead, int(const char *file_name));
+
+  // Overloaded: by file descriptor or by path.
+  MOCK_CONST_METHOD1(GetFileSize, uint64_t(int fd));
+
+  MOCK_CONST_METHOD1(GetFileSize, uint64_t(const char *filename));
+
+  // Overloaded: by file descriptor or by path.
+  MOCK_CONST_METHOD2(Truncate, bool(int fd, uint64_t new_size));
+
+  MOCK_CONST_METHOD2(Truncate, bool(const char *filename, uint64_t new_size));
+
+  MOCK_CONST_METHOD2(Grow, bool(int fd, uint64_t new_size));
+
+  MOCK_CONST_METHOD3(Write, bool(int fd, const void *data, size_t data_size));
+  MOCK_CONST_METHOD4(PWrite, bool(int fd, off_t offset, const void *data,
+                                  size_t data_size));
+
+  MOCK_CONST_METHOD1(DataSync, bool(int fd));
+
+  MOCK_CONST_METHOD2(RenameFile,
+                     bool(const char *old_name, const char *new_name));
+
+  MOCK_CONST_METHOD2(SwapFiles, bool(const char *one, const char *two));
+
+  MOCK_CONST_METHOD1(CreateDirectory, bool(const char *dir_name));
+
+  MOCK_CONST_METHOD1(CreateDirectoryRecursively, bool(const char *dir_name));
+
+  MOCK_CONST_METHOD2(CopyFile, bool(const char *src, const char *dst));
+
+  MOCK_CONST_METHOD4(ComputeChecksum, bool(int fd, uint32_t *checksum,
+                                           uint64_t offset, uint64_t length));
+
+  MOCK_CONST_METHOD1(GetDiskUsage, uint64_t(const char *path));
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_
diff --git a/icing/legacy/index/icing-storage-collection.cc b/icing/legacy/index/icing-storage-collection.cc
new file mode 100644
index 0000000..d31f892
--- /dev/null
+++ b/icing/legacy/index/icing-storage-collection.cc
@@ -0,0 +1,120 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-storage-collection.h"
+
+#include "icing/legacy/core/icing-compat.h"
+#include "icing/legacy/index/icing-filesystem.h"
+
+namespace icing {
+namespace lib {
+
+IcingStorageCollection::~IcingStorageCollection() {
+  // The collection owns every IIcingStorage handed to Add()/Swap().
+  for (FileInfo &info : files_) {
+    delete info.file;
+  }
+}
+
+void IcingStorageCollection::Add(IIcingStorage *file,
+                                 bool remove_if_corrupted) {
+  // Construct the FileInfo in place; ownership of |file| is taken.
+  files_.emplace_back(file, remove_if_corrupted);
+}
+
+void IcingStorageCollection::Swap(const IIcingStorage *current_file,
+                                  IIcingStorage *new_file) {
+  // Delete and replace every entry matching |current_file|, keeping
+  // that entry's original remove_if_corrupted policy.
+  for (FileInfo &info : files_) {
+    if (info.file == current_file) {
+      delete info.file;
+      info = FileInfo(new_file, info.remove_if_corrupted);
+    }
+  }
+}
+
+bool IcingStorageCollection::UpgradeTo(int new_version) {
+  // Attempt the upgrade on every file even after a failure, so each
+  // file gets its chance to migrate; succeed only if all succeeded.
+  bool all_ok = true;
+  for (FileInfo &info : files_) {
+    all_ok = info.file->UpgradeTo(new_version) && all_ok;
+  }
+  return all_ok;
+}
+
+// Initializes every nested storage, continuing past failures so that
+// each one is attempted. Returns true only when all of them succeed.
+bool IcingStorageCollection::Init() {
+  bool all_ok = true;
+  for (FileInfo &info : files_) {
+    // remove_if_corrupted selects InitWithRetry, which clears the
+    // underlying data and retries once when the first Init fails.
+    const bool ok = info.remove_if_corrupted
+                        ? IIcingStorage::InitWithRetry(info.file)
+                        : info.file->Init();
+    all_ok = ok && all_ok;
+  }
+  return all_ok;
+}
+
+void IcingStorageCollection::Close() {
+  // Close every nested storage; ownership is retained.
+  for (FileInfo &info : files_) {
+    info.file->Close();
+  }
+}
+
+bool IcingStorageCollection::Remove() {
+  // Every file is attempted even if an earlier removal fails;
+  // report success only when all removals succeeded.
+  bool all_ok = true;
+  for (FileInfo &info : files_) {
+    all_ok = info.file->Remove() && all_ok;
+  }
+  return all_ok;
+}
+
+bool IcingStorageCollection::Sync() {
+  // Sync every file regardless of earlier failures; report success
+  // only when every Sync succeeded.
+  bool all_ok = true;
+  for (FileInfo &info : files_) {
+    all_ok = info.file->Sync() && all_ok;
+  }
+  return all_ok;
+}
+
+uint64_t IcingStorageCollection::GetDiskUsage() const {
+  // Accumulate disk usage over every nested storage (see
+  // IcingFilesystem::IncrementByOrSetInvalid for error propagation).
+  uint64_t total = 0;
+  for (const FileInfo &info : files_) {
+    IcingFilesystem::IncrementByOrSetInvalid(info.file->GetDiskUsage(),
+                                             &total);
+  }
+  return total;
+}
+
+void IcingStorageCollection::OnSleep() {
+  // Forward the sleep notification to every nested storage.
+  for (FileInfo &info : files_) {
+    info.file->OnSleep();
+  }
+}
+
+void IcingStorageCollection::GetDebugInfo(int verbosity,
+                                          std::string *out) const {
+  // Append each nested storage's debug info to |out| in Add() order.
+  for (const FileInfo &info : files_) {
+    info.file->GetDebugInfo(verbosity, out);
+  }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-storage-collection.h b/icing/legacy/index/icing-storage-collection.h
new file mode 100644
index 0000000..dedfe33
--- /dev/null
+++ b/icing/legacy/index/icing-storage-collection.h
@@ -0,0 +1,64 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: sbanacho@google.com (Scott Banachowski)
+// vmarko@google.com (Vladimir Marko)
+//
+// An implementation of IIcingStorage that holds an arbitrary number of nested
+// IIcingStorage items.
+
+#ifndef ICING_LEGACY_INDEX_ICING_STORAGE_COLLECTION_H_
+#define ICING_LEGACY_INDEX_ICING_STORAGE_COLLECTION_H_
+
+#include <string>
+#include <vector>
+
+#include "icing/legacy/index/icing-storage.h"
+
+namespace icing {
+namespace lib {
+
+// Class that owns a list of IDocumentStores.
+// All operations fan out to every nested IIcingStorage; bool-returning
+// operations run on every entry and succeed only if all entries succeed.
+class IcingStorageCollection : public IIcingStorage {
+ public:
+  // Deletes every IIcingStorage previously handed to Add()/Swap().
+  ~IcingStorageCollection() override;
+  // Takes ownership of file.
+  // remove_if_corrupted specifies Init behavior when backing file is
+  // corrupted: if false, Init will fail, else Init will clear the
+  // underlying file and succeed.
+  void Add(IIcingStorage *file, bool remove_if_corrupted);
+  // Deletes current_file and replaces it with new_file (whose ownership
+  // is taken), keeping the entry's remove_if_corrupted setting.
+  void Swap(const IIcingStorage *current_file, IIcingStorage *new_file);
+  bool UpgradeTo(int new_version) override;
+  bool Init() override;
+  void Close() override;
+  bool Remove() override;
+  bool Sync() override;
+  uint64_t GetDiskUsage() const override;
+  void OnSleep() override;
+  void GetDebugInfo(int verbosity, std::string *out) const override;
+
+ private:
+  // Pairs an owned storage pointer with its corruption-recovery policy.
+  struct FileInfo {
+    FileInfo(IIcingStorage *file_in, bool remove_if_corrupted_in)
+        : file(file_in), remove_if_corrupted(remove_if_corrupted_in) {}
+
+    IIcingStorage *file;
+    bool remove_if_corrupted;
+  };
+  std::vector<FileInfo> files_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_STORAGE_COLLECTION_H_
diff --git a/icing/legacy/index/icing-storage-file.cc b/icing/legacy/index/icing-storage-file.cc
new file mode 100644
index 0000000..b27ec67
--- /dev/null
+++ b/icing/legacy/index/icing-storage-file.cc
@@ -0,0 +1,118 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-storage-file.h"
+
+#include <inttypes.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "icing/legacy/core/icing-compat.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/core/icing-timer.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+// Records the backing filename and filesystem; nothing is opened or
+// created until Init(). |filesystem| must outlive this object, since it
+// is used by Init/Sync/Remove later.
+IcingStorageFile::IcingStorageFile(const std::string &filename,
+                                   const IcingFilesystem *filesystem)
+    : IIcingStorage(), filesystem_(filesystem), filename_(filename) {}
+
+bool IcingStorageFile::Init() {
+  // Idempotent: a second call on an initialized file is a no-op.
+  if (is_initialized_) {
+    return true;
+  }
+  // The parent directory must exist before OnInit can open the file.
+  std::string storage_dir = filesystem_->GetDirname(filename_.c_str());
+  if (!filesystem_->CreateDirectoryRecursively(storage_dir.c_str())) {
+    return false;
+  }
+  is_initialized_ = OnInit();
+  // A successful OnInit is required to have opened fd_.
+  if (is_initialized_ && fd_.get() < 0) {
+    ICING_LOG(FATAL)
+        << "Storage file descriptor not set after initialization";
+  }
+  return is_initialized_;
+}
+
+void IcingStorageFile::Close() {
+  if (!is_initialized_) {
+    return;
+  }
+  // Let the subclass release its resources before the fd is dropped.
+  OnClose();
+  fd_.reset();
+  is_initialized_ = false;
+}
+
+bool IcingStorageFile::Remove() {
+  // Close first so fd_ is released before the file is unlinked.
+  Close();
+  return filesystem_->DeleteFile(filename_.c_str());
+}
+
+// Flush sequence: subclass PreSync, kernel-level DataSync on fd_, then
+// subclass PostSync. Stops and returns false at the first failure.
+// REQUIRES: Init() succeeded (fatal log otherwise).
+bool IcingStorageFile::Sync() {
+  if (!is_initialized_) {
+    ICING_LOG(FATAL) << "Storage file not initialized";
+  }
+
+  IcingTimer timer;
+  // Give the subclass a chance to write buffered state into the file.
+  if (!PreSync()) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Pre-sync %s failed",
+                                                      filename_.c_str());
+    return false;
+  }
+  // Push the file's data down to stable storage.
+  if (!filesystem_->DataSync(fd_.get())) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Sync %s failed",
+                                                      filename_.c_str());
+    return false;
+  }
+  if (!PostSync()) {
+    ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Post-sync %s failed",
+                                                      filename_.c_str());
+    return false;
+  }
+  // Timer covers the whole PreSync/DataSync/PostSync sequence.
+  ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+      "Syncing %s took %.3fms", filename_.c_str(), timer.Elapsed() * 1000.);
+  return true;
+}
+
+// Disk usage of the single backing file, as reported for fd_ by the
+// filesystem (kBadFileSize on error, per the header contract).
+uint64_t IcingStorageFile::GetDiskUsage() const {
+  return filesystem_->GetDiskUsage(fd_.get());
+}
+
+bool IcingStorageFile::PreSync() {
+  // Default implementation is a no-op; subclasses override to flush
+  // buffered state into the file before the fsync.
+  return true;
+}
+
+bool IcingStorageFile::PostSync() {
+  // Default implementation is a no-op; subclasses override to do work
+  // that must follow a successful fsync.
+  return true;
+}
+
+// Appends the filename and current file size to |out|.
+// REQUIRES: Init() succeeded (fatal log otherwise).
+void IcingStorageFile::GetDebugInfo(int verbosity, std::string *out) const {
+  if (!is_initialized_) {
+    ICING_LOG(FATAL) << "Storage file not initialized";
+  }
+
+  if (verbosity >= 0) {  // Always
+    uint64_t size = filesystem_->GetFileSize(fd_.get());
+    IcingStringUtil::SStringAppendF(
+        out, 1000, "Filename: %s Size: %" PRIu64 "\n", filename_.c_str(), size);
+  }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-storage-file.h b/icing/legacy/index/icing-storage-file.h
new file mode 100644
index 0000000..94cab21
--- /dev/null
+++ b/icing/legacy/index/icing-storage-file.h
@@ -0,0 +1,96 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Base class for single file-based IIcingStorage implementations.
+
+#ifndef ICING_LEGACY_INDEX_ICING_STORAGE_FILE_H_
+#define ICING_LEGACY_INDEX_ICING_STORAGE_FILE_H_
+
+#include <string>
+
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-storage.h"
+
+namespace icing {
+namespace lib {
+
+// Minimal implementation that is backed by a filename and file descriptor.
+// Subclasses implement OnInit/OnClose (and optionally PreSync/PostSync)
+// around the fd_ lifecycle managed here.
+class IcingStorageFile : virtual public IIcingStorage {
+ public:
+  // |filesystem| must outlive this object; nothing is opened until Init().
+  IcingStorageFile(const std::string &filename,
+                   const IcingFilesystem *filesystem);
+
+  // This must be called before the object is usable.
+  // Returns true if the storage is in a usable state.
+  bool Init() override;
+
+  // Default implementation is a no-op.
+  bool UpgradeTo(int new_version) override { return true; }
+
+  // Closes all files and system resources.
+  // Init() must be called before the object is used again.
+  void Close() override;
+
+  // Closes all system resources, then removes the backing file.
+  // Init() is required before the object is used again.
+  // Returns true on success.
+  bool Remove() override;
+
+  // Syncs any unwritten data to disk.
+  // REQUIRES: is_initialized() == true
+  bool Sync() override;
+
+  // Gets the file size of the underlying file.
+  // Returns kBadFileSize on error.
+  uint64_t GetDiskUsage() const override;
+
+  // True between a successful Init() and the next Close()/Remove().
+  bool is_initialized() const { return is_initialized_; }
+
+  const std::string &filename() const { return filename_; }
+
+  void GetDebugInfo(int verbosity, std::string *out) const override;
+
+ protected:
+  // Implements any initialization, returning true if successful.
+  // The child is responsible for calling open on the fd_ file descriptor,
+  // before returning from OnInit().
+  virtual bool OnInit() = 0;
+
+  // OnClose should remove any resources, other than the file, created
+  // during the Init.
+  // The file itself will be closed after the OnClose.
+  virtual void OnClose() = 0;
+
+  // Called before the file is synced. The child should write
+  // anything it hasn't yet written to the file so that it can be
+  // stored. Default implementation is to do nothing. Return true if
+  // successful.
+  virtual bool PreSync();
+
+  // Called after the file is synced. Default implementation is to do
+  // nothing. Return true if successful.
+  virtual bool PostSync();
+
+  // Non-owning; supplied at construction.
+  const IcingFilesystem *const filesystem_;
+  // Owned descriptor for the backing file; opened by the subclass in OnInit.
+  IcingScopedFd fd_;
+
+ private:
+  const std::string filename_;
+  bool is_initialized_ = false;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_STORAGE_FILE_H_
diff --git a/icing/legacy/index/icing-storage.h b/icing/legacy/index/icing-storage.h
new file mode 100644
index 0000000..cc06c54
--- /dev/null
+++ b/icing/legacy/index/icing-storage.h
@@ -0,0 +1,91 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: sbanacho@google.com (Scott Banachowski)
+// vmarko@google.com (Vladimir Marko)
+//
+// Interface class for disk-backed storage.
+
+#ifndef ICING_LEGACY_INDEX_ICING_STORAGE_H_
+#define ICING_LEGACY_INDEX_ICING_STORAGE_H_
+
+#include <cstdint>
+#include <string>
+
+namespace icing {
+namespace lib {
+
+// Abstract base class for interface.
+class IIcingStorage {
+ public:
+  // Any resource that is not removed in the Close() function should
+  // be removed in the child's destructor.
+  virtual ~IIcingStorage() = default;
+
+  // Storage objects are non-copyable. Deleting the copy operations
+  // (instead of privately declaring them without a definition, as
+  // before) turns accidental copies into compile-time errors rather
+  // than link-time errors.
+  IIcingStorage(const IIcingStorage&) = delete;
+  IIcingStorage& operator=(const IIcingStorage&) = delete;
+
+  // This is called to upgrade to a new version.
+  // Returns true if the data store can be upgraded successfully.
+  virtual bool UpgradeTo(int new_version) = 0;
+
+  // This must be called before the object is usable.
+  // Returns true if the storage is in a usable state.
+  virtual bool Init() = 0;
+
+  // Attempts to init the given IIcingStorage. On failure, clears the
+  // underlying data and tries again. Returns false if the second init is
+  // also a failure.
+  static bool InitWithRetry(IIcingStorage* file_in) {
+    if (file_in->Init()) {
+      return true;
+    }
+    return file_in->Remove() && file_in->Init();
+  }
+
+  // Closes all files and system resources.
+  // Init() must be called before the object is used again.
+  virtual void Close() = 0;
+
+  // Closes all system resources, then removes the backing file.
+  // Init() is required before the object is used again.
+  // Returns true on success.
+  virtual bool Remove() = 0;
+
+  // Syncs any unwritten data to disk.
+  virtual bool Sync() = 0;
+
+  // Gets the total amount of disk usage for the object (i.e. the sum of the
+  // bytes of all underlying files).
+  // Note: reported values are estimated via the number of blocks the file
+  // takes up on disk. Sparse files are reported as their physical disk
+  // usage, as opposed to the logical size when read.
+  // Returns kBadFileSize on error.
+  virtual uint64_t GetDiskUsage() const = 0;
+
+  // Optional handler for when our process is entering a vulnerable
+  // state (highly likely to get killed). Default implementation does
+  // nothing.
+  virtual void OnSleep() {}
+
+  virtual void GetDebugInfo(int verbosity, std::string* out) const = 0;
+
+ protected:
+  IIcingStorage() = default;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_LEGACY_INDEX_ICING_STORAGE_H_
diff --git a/icing/legacy/index/icing-variant-map.h b/icing/legacy/index/icing-variant-map.h
new file mode 100644
index 0000000..dc55305
--- /dev/null
+++ b/icing/legacy/index/icing-variant-map.h
@@ -0,0 +1,805 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2013 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+//
+// Generated from translit-table.cpp. Maps the first character of
+// ascii to possible utf8 variants. Also a set of utf8 characters are
+// nullable (e.g., may be skipped in the key). These are listed as
+// "[0]".
+//
+// Then, each mapping can be an exact mapping (a -> à) or must/can
+// match a longer prefix in the key (d or dj -> đ). These are encoded
+// after the mapping and are called restricts.
+//
+// Only includes lowercase and isletter characters.
+
+#ifndef ICING_LEGACY_INDEX_ICING_VARIANT_MAP_H_
+#define ICING_LEGACY_INDEX_ICING_VARIANT_MAP_H_
+
+const char kVariantMapElts[] =
+ "ʰ" // [0] -> [ʰ] \312\260
+ "ʱ" // [0] -> [ʱ] \312\261
+ "ʲ" // [0] -> [ʲ] \312\262
+ "ʳ" // [0] -> [ʳ] \312\263
+ "ʴ" // [0] -> [ʴ] \312\264
+ "ʵ" // [0] -> [ʵ] \312\265
+ "ʶ" // [0] -> [ʶ] \312\266
+ "ʷ" // [0] -> [ʷ] \312\267
+ "ʸ" // [0] -> [ʸ] \312\270
+ "ʹ" // [0] -> [ʹ] \312\271
+ "ʺ" // [0] -> [ʺ] \312\272
+ "ʻ" // [0] -> [ʻ] \312\273
+ "ʼ" // [0] -> [ʼ] \312\274
+ "ʽ" // [0] -> [ʽ] \312\275
+ "ʾ" // [0] -> [ʾ] \312\276
+ "ʿ" // [0] -> [ʿ] \312\277
+ "ˀ" // [0] -> [ˀ] \313\200
+ "ˁ" // [0] -> [ˁ] \313\201
+ "ˆ" // [0] -> [ˆ] \313\206
+ "ˇ" // [0] -> [ˇ] \313\207
+ "ˈ" // [0] -> [ˈ] \313\210
+ "ˉ" // [0] -> [ˉ] \313\211
+ "ˊ" // [0] -> [ˊ] \313\212
+ "ˋ" // [0] -> [ˋ] \313\213
+ "ˌ" // [0] -> [ˌ] \313\214
+ "ˍ" // [0] -> [ˍ] \313\215
+ "ˎ" // [0] -> [ˎ] \313\216
+ "ˏ" // [0] -> [ˏ] \313\217
+ "ː" // [0] -> [ː] \313\220
+ "ˑ" // [0] -> [ˑ] \313\221
+ "ˠ" // [0] -> [ˠ] \313\240
+ "ˡ" // [0] -> [ˡ] \313\241
+ "ˢ" // [0] -> [ˢ] \313\242
+ "ˣ" // [0] -> [ˣ] \313\243
+ "ˤ" // [0] -> [ˤ] \313\244
+ "ˬ" // [0] -> [ˬ] \313\254
+ "ˮ" // [0] -> [ˮ] \313\256
+ "̀" // [0] -> [̀] \314\200
+ "́" // [0] -> [́] \314\201
+ "̂" // [0] -> [̂] \314\202
+ "̃" // [0] -> [̃] \314\203
+ "̄" // [0] -> [̄] \314\204
+ "̅" // [0] -> [̅] \314\205
+ "̆" // [0] -> [̆] \314\206
+ "̇" // [0] -> [̇] \314\207
+ "̈" // [0] -> [̈] \314\210
+ "̉" // [0] -> [̉] \314\211
+ "̊" // [0] -> [̊] \314\212
+ "̋" // [0] -> [̋] \314\213
+ "̌" // [0] -> [̌] \314\214
+ "̍" // [0] -> [̍] \314\215
+ "̎" // [0] -> [̎] \314\216
+ "̏" // [0] -> [̏] \314\217
+ "̐" // [0] -> [̐] \314\220
+ "̑" // [0] -> [̑] \314\221
+ "̒" // [0] -> [̒] \314\222
+ "̓" // [0] -> [̓] \314\223
+ "̔" // [0] -> [̔] \314\224
+ "̕" // [0] -> [̕] \314\225
+ "̖" // [0] -> [̖] \314\226
+ "̗" // [0] -> [̗] \314\227
+ "̘" // [0] -> [̘] \314\230
+ "̙" // [0] -> [̙] \314\231
+ "̚" // [0] -> [̚] \314\232
+ "̛" // [0] -> [̛] \314\233
+ "̜" // [0] -> [̜] \314\234
+ "̝" // [0] -> [̝] \314\235
+ "̞" // [0] -> [̞] \314\236
+ "̟" // [0] -> [̟] \314\237
+ "̠" // [0] -> [̠] \314\240
+ "̡" // [0] -> [̡] \314\241
+ "̢" // [0] -> [̢] \314\242
+ "̣" // [0] -> [̣] \314\243
+ "̤" // [0] -> [̤] \314\244
+ "̥" // [0] -> [̥] \314\245
+ "̦" // [0] -> [̦] \314\246
+ "̧" // [0] -> [̧] \314\247
+ "̨" // [0] -> [̨] \314\250
+ "̩" // [0] -> [̩] \314\251
+ "̪" // [0] -> [̪] \314\252
+ "̫" // [0] -> [̫] \314\253
+ "̬" // [0] -> [̬] \314\254
+ "̭" // [0] -> [̭] \314\255
+ "̮" // [0] -> [̮] \314\256
+ "̯" // [0] -> [̯] \314\257
+ "̰" // [0] -> [̰] \314\260
+ "̱" // [0] -> [̱] \314\261
+ "̲" // [0] -> [̲] \314\262
+ "̳" // [0] -> [̳] \314\263
+ "̴" // [0] -> [̴] \314\264
+ "̵" // [0] -> [̵] \314\265
+ "̶" // [0] -> [̶] \314\266
+ "̷" // [0] -> [̷] \314\267
+ "̸" // [0] -> [̸] \314\270
+ "̹" // [0] -> [̹] \314\271
+ "̺" // [0] -> [̺] \314\272
+ "̻" // [0] -> [̻] \314\273
+ "̼" // [0] -> [̼] \314\274
+ "̽" // [0] -> [̽] \314\275
+ "̾" // [0] -> [̾] \314\276
+ "̿" // [0] -> [̿] \314\277
+ "̀" // [0] -> [̀] \315\200
+ "́" // [0] -> [́] \315\201
+ "͂" // [0] -> [͂] \315\202
+ "̓" // [0] -> [̓] \315\203
+ "̈́" // [0] -> [̈́] \315\204
+ "ͅ" // [0] -> [ͅ] \315\205
+ "͆" // [0] -> [͆] \315\206
+ "͇" // [0] -> [͇] \315\207
+ "͈" // [0] -> [͈] \315\210
+ "͉" // [0] -> [͉] \315\211
+ "͊" // [0] -> [͊] \315\212
+ "͋" // [0] -> [͋] \315\213
+ "͌" // [0] -> [͌] \315\214
+ "͍" // [0] -> [͍] \315\215
+ "͎" // [0] -> [͎] \315\216
+ "͏" // [0] -> [͏] \315\217
+ "͐" // [0] -> [͐] \315\220
+ "͑" // [0] -> [͑] \315\221
+ "͒" // [0] -> [͒] \315\222
+ "͓" // [0] -> [͓] \315\223
+ "͔" // [0] -> [͔] \315\224
+ "͕" // [0] -> [͕] \315\225
+ "͖" // [0] -> [͖] \315\226
+ "͗" // [0] -> [͗] \315\227
+ "͘" // [0] -> [͘] \315\230
+ "͙" // [0] -> [͙] \315\231
+ "͚" // [0] -> [͚] \315\232
+ "͛" // [0] -> [͛] \315\233
+ "͜" // [0] -> [͜] \315\234
+ "͝" // [0] -> [͝] \315\235
+ "͞" // [0] -> [͞] \315\236
+ "͟" // [0] -> [͟] \315\237
+ "͠" // [0] -> [͠] \315\240
+ "͡" // [0] -> [͡] \315\241
+ "͢" // [0] -> [͢] \315\242
+ "ͣ" // [0] -> [ͣ] \315\243
+ "ͤ" // [0] -> [ͤ] \315\244
+ "ͥ" // [0] -> [ͥ] \315\245
+ "ͦ" // [0] -> [ͦ] \315\246
+ "ͧ" // [0] -> [ͧ] \315\247
+ "ͨ" // [0] -> [ͨ] \315\250
+ "ͩ" // [0] -> [ͩ] \315\251
+ "ͪ" // [0] -> [ͪ] \315\252
+ "ͫ" // [0] -> [ͫ] \315\253
+ "ͬ" // [0] -> [ͬ] \315\254
+ "ͭ" // [0] -> [ͭ] \315\255
+ "ͮ" // [0] -> [ͮ] \315\256
+ "ͯ" // [0] -> [ͯ] \315\257
+ "ъ" // [0] -> [ъ] \321\212
+ "ь" // [0] -> [ь] \321\214
+ "֑" // [0] -> [֑] \326\221
+ "֒" // [0] -> [֒] \326\222
+ "֓" // [0] -> [֓] \326\223
+ "֔" // [0] -> [֔] \326\224
+ "֕" // [0] -> [֕] \326\225
+ "֖" // [0] -> [֖] \326\226
+ "֗" // [0] -> [֗] \326\227
+ "֘" // [0] -> [֘] \326\230
+ "֙" // [0] -> [֙] \326\231
+ "֚" // [0] -> [֚] \326\232
+ "֛" // [0] -> [֛] \326\233
+ "֜" // [0] -> [֜] \326\234
+ "֝" // [0] -> [֝] \326\235
+ "֞" // [0] -> [֞] \326\236
+ "֟" // [0] -> [֟] \326\237
+ "֠" // [0] -> [֠] \326\240
+ "֡" // [0] -> [֡] \326\241
+ "֢" // [0] -> [֢] \326\242
+ "֣" // [0] -> [֣] \326\243
+ "֤" // [0] -> [֤] \326\244
+ "֥" // [0] -> [֥] \326\245
+ "֦" // [0] -> [֦] \326\246
+ "֧" // [0] -> [֧] \326\247
+ "֨" // [0] -> [֨] \326\250
+ "֩" // [0] -> [֩] \326\251
+ "֪" // [0] -> [֪] \326\252
+ "֫" // [0] -> [֫] \326\253
+ "֬" // [0] -> [֬] \326\254
+ "֭" // [0] -> [֭] \326\255
+ "֮" // [0] -> [֮] \326\256
+ "֯" // [0] -> [֯] \326\257
+ "ְ" // [0] -> [ְ] \326\260
+ "ֱ" // [0] -> [ֱ] \326\261
+ "ֲ" // [0] -> [ֲ] \326\262
+ "ֳ" // [0] -> [ֳ] \326\263
+ "ִ" // [0] -> [ִ] \326\264
+ "ֵ" // [0] -> [ֵ] \326\265
+ "ֶ" // [0] -> [ֶ] \326\266
+ "ַ" // [0] -> [ַ] \326\267
+ "ָ" // [0] -> [ָ] \326\270
+ "ֹ" // [0] -> [ֹ] \326\271
+ "ֺ" // [0] -> [ֺ] \326\272
+ "ֻ" // [0] -> [ֻ] \326\273
+ "ּ" // [0] -> [ּ] \326\274
+ "ֽ" // [0] -> [ֽ] \326\275
+ "ֿ" // [0] -> [ֿ] \326\277
+ "ׁ" // [0] -> [ׁ] \327\201
+ "ׂ" // [0] -> [ׂ] \327\202
+ "ׄ" // [0] -> [ׄ] \327\204
+ "ׅ" // [0] -> [ׅ] \327\205
+ "ׇ" // [0] -> [ׇ] \327\207
+ "ؐ" // [0] -> [ؐ] \330\220
+ "ؑ" // [0] -> [ؑ] \330\221
+ "ؒ" // [0] -> [ؒ] \330\222
+ "ؓ" // [0] -> [ؓ] \330\223
+ "ؔ" // [0] -> [ؔ] \330\224
+ "ؕ" // [0] -> [ؕ] \330\225
+ "ً" // [0] -> [ً] \331\213
+ "ٌ" // [0] -> [ٌ] \331\214
+ "ٍ" // [0] -> [ٍ] \331\215
+ "َ" // [0] -> [َ] \331\216
+ "ُ" // [0] -> [ُ] \331\217
+ "ِ" // [0] -> [ِ] \331\220
+ "ّ" // [0] -> [ّ] \331\221
+ "ْ" // [0] -> [ْ] \331\222
+ "ٓ" // [0] -> [ٓ] \331\223
+ "ٔ" // [0] -> [ٔ] \331\224
+ "ٕ" // [0] -> [ٕ] \331\225
+ "ٖ" // [0] -> [ٖ] \331\226
+ "ٗ" // [0] -> [ٗ] \331\227
+ "٘" // [0] -> [٘] \331\230
+ "ٙ" // [0] -> [ٙ] \331\231
+ "ٚ" // [0] -> [ٚ] \331\232
+ "ٛ" // [0] -> [ٛ] \331\233
+ "ٜ" // [0] -> [ٜ] \331\234
+ "ٝ" // [0] -> [ٝ] \331\235
+ "ٞ" // [0] -> [ٞ] \331\236
+ "ٰ" // [0] -> [ٰ] \331\260
+ "ۖ" // [0] -> [ۖ] \333\226
+ "ۗ" // [0] -> [ۗ] \333\227
+ "ۘ" // [0] -> [ۘ] \333\230
+ "ۙ" // [0] -> [ۙ] \333\231
+ "ۚ" // [0] -> [ۚ] \333\232
+ "ۛ" // [0] -> [ۛ] \333\233
+ "ۜ" // [0] -> [ۜ] \333\234
+ "۟" // [0] -> [۟] \333\237
+ "۠" // [0] -> [۠] \333\240
+ "ۡ" // [0] -> [ۡ] \333\241
+ "ۢ" // [0] -> [ۢ] \333\242
+ "ۣ" // [0] -> [ۣ] \333\243
+ "ۤ" // [0] -> [ۤ] \333\244
+ "ۧ" // [0] -> [ۧ] \333\247
+ "ۨ" // [0] -> [ۨ] \333\250
+ "۪" // [0] -> [۪] \333\252
+ "۫" // [0] -> [۫] \333\253
+ "۬" // [0] -> [۬] \333\254
+ "ۭ" // [0] -> [ۭ] \333\255
+ "ം" // [0] -> [ം] \340\264\202
+ "ഃ" // [0] -> [ഃ] \340\264\203
+ "്" // [0] -> [്] \340\265\215
+ "่" // [0] -> [่] \340\271\210
+ "้" // [0] -> [้] \340\271\211
+ "๊" // [0] -> [๊] \340\271\212
+ "๋" // [0] -> [๋] \340\271\213
+ "່" // [0] -> [່] \340\273\210
+ "້" // [0] -> [້] \340\273\211
+ "໊" // [0] -> [໊] \340\273\212
+ "໋" // [0] -> [໋] \340\273\213
+ "" // [0] -> [] \342\200\214
+ "" // [0] -> [] \342\200\215
+ "\0" // [0] end
+ "à" // [a] -> [à] \303\240
+ "á" // [a] -> [á] \303\241
+ "â" // [a] -> [â] \303\242
+ "ã" // [a] -> [ã] \303\243
+ "äa\0ae\0" // [a] -> [ä] \303\244
+ "åa\0aa\0" // [a] -> [å] \303\245
+ "æae\0" // [a] -> [æ] \303\246
+ "ā" // [a] -> [ā] \304\201
+ "ă" // [a] -> [ă] \304\203
+ "ą" // [a] -> [ą] \304\205
+ "ά" // [a] -> [ά] \316\254
+ "α" // [a] -> [α] \316\261
+ "а" // [a] -> [а] \320\260
+ "ъ" // [a] -> [ъ] \321\212
+ "ա" // [a] -> [ա] \325\241
+ "ა" // [a] -> [ა] \341\203\220
+ "ḁ" // [a] -> [ḁ] \341\270\201
+ "ẚ" // [a] -> [ẚ] \341\272\232
+ "ạ" // [a] -> [ạ] \341\272\241
+ "ả" // [a] -> [ả] \341\272\243
+ "ấ" // [a] -> [ấ] \341\272\245
+ "ầ" // [a] -> [ầ] \341\272\247
+ "ẩ" // [a] -> [ẩ] \341\272\251
+ "ẫ" // [a] -> [ẫ] \341\272\253
+ "ậ" // [a] -> [ậ] \341\272\255
+ "ắ" // [a] -> [ắ] \341\272\257
+ "ằ" // [a] -> [ằ] \341\272\261
+ "ẳ" // [a] -> [ẳ] \341\272\263
+ "ẵ" // [a] -> [ẵ] \341\272\265
+ "ặ" // [a] -> [ặ] \341\272\267
+ "ἀ" // [a] -> [ἀ] \341\274\200
+ "ἁ" // [a] -> [ἁ] \341\274\201
+ "ἂ" // [a] -> [ἂ] \341\274\202
+ "ἃ" // [a] -> [ἃ] \341\274\203
+ "ἄ" // [a] -> [ἄ] \341\274\204
+ "ἅ" // [a] -> [ἅ] \341\274\205
+ "ἆ" // [a] -> [ἆ] \341\274\206
+ "ἇ" // [a] -> [ἇ] \341\274\207
+ "\0" // [a] end
+ "б" // [b] -> [б] \320\261
+ "բ" // [b] -> [բ] \325\242
+ "ბ" // [b] -> [ბ] \341\203\221
+ "ḃ" // [b] -> [ḃ] \341\270\203
+ "ḅ" // [b] -> [ḅ] \341\270\205
+ "ḇ" // [b] -> [ḇ] \341\270\207
+ "\0" // [b] end
+ "ç" // [c] -> [ç] \303\247
+ "ć" // [c] -> [ć] \304\207
+ "ĉ" // [c] -> [ĉ] \304\211
+ "ċ" // [c] -> [ċ] \304\213
+ "č" // [c] -> [č] \304\215
+ "ц" // [c] -> [ц] \321\206
+ "чc\0ch\0" // [c] -> [ч] \321\207
+ "ћ" // [c] -> [ћ] \321\233
+ "ҹ" // [c] -> [ҹ] \322\271
+ "չch\0" // [c] -> [չ] \325\271
+ "ჩch\0" // [c] -> [ჩ] \341\203\251
+ "ჭch\0" // [c] -> [ჭ] \341\203\255
+ "ḉ" // [c] -> [ḉ] \341\270\211
+ "\0" // [c] end
+ "ð" // [d] -> [ð] \303\260
+ "ď" // [d] -> [ď] \304\217
+ "đd\0dj\0" // [d] -> [đ] \304\221
+ "δ" // [d] -> [δ] \316\264
+ "д" // [d] -> [д] \320\264
+ "ђdj\0" // [d] -> [ђ] \321\222
+ "ѕdz\0" // [d] -> [ѕ] \321\225
+ "џdj\0dz\0" // [d] -> [џ] \321\237
+ "դ" // [d] -> [դ] \325\244
+ "დ" // [d] -> [დ] \341\203\223
+ "ძdz\0" // [d] -> [ძ] \341\203\253
+ "ḋ" // [d] -> [ḋ] \341\270\213
+ "ḍ" // [d] -> [ḍ] \341\270\215
+ "ḏ" // [d] -> [ḏ] \341\270\217
+ "ḑ" // [d] -> [ḑ] \341\270\221
+ "ḓ" // [d] -> [ḓ] \341\270\223
+ "\0" // [d] end
+ "è" // [e] -> [è] \303\250
+ "é" // [e] -> [é] \303\251
+ "ê" // [e] -> [ê] \303\252
+ "ë" // [e] -> [ë] \303\253
+ "ē" // [e] -> [ē] \304\223
+ "ĕ" // [e] -> [ĕ] \304\225
+ "ė" // [e] -> [ė] \304\227
+ "ę" // [e] -> [ę] \304\231
+ "ě" // [e] -> [ě] \304\233
+ "έ" // [e] -> [έ] \316\255
+ "ε" // [e] -> [ε] \316\265
+ "ϵ" // [e] -> [ϵ] \317\265
+ "е" // [e] -> [е] \320\265
+ "э" // [e] -> [э] \321\215
+ "ѐ" // [e] -> [ѐ] \321\220
+ "ё" // [e] -> [ё] \321\221
+ "є" // [e] -> [є] \321\224
+ "ә" // [e] -> [ә] \323\231
+ "ե" // [e] -> [ե] \325\245
+ "է" // [e] -> [է] \325\247
+ "ըeh\0" // [e] -> [ը] \325\250
+ "ևev\0" // [e] -> [և] \326\207
+ "ე" // [e] -> [ე] \341\203\224
+ "ვ" // [e] -> [ვ] \341\203\225
+ "ḕ" // [e] -> [ḕ] \341\270\225
+ "ḗ" // [e] -> [ḗ] \341\270\227
+ "ḙ" // [e] -> [ḙ] \341\270\231
+ "ḛ" // [e] -> [ḛ] \341\270\233
+ "ḝ" // [e] -> [ḝ] \341\270\235
+ "ẹ" // [e] -> [ẹ] \341\272\271
+ "ẻ" // [e] -> [ẻ] \341\272\273
+ "ẽ" // [e] -> [ẽ] \341\272\275
+ "ế" // [e] -> [ế] \341\272\277
+ "ề" // [e] -> [ề] \341\273\201
+ "ể" // [e] -> [ể] \341\273\203
+ "ễ" // [e] -> [ễ] \341\273\205
+ "ệ" // [e] -> [ệ] \341\273\207
+ "ἐ" // [e] -> [ἐ] \341\274\220
+ "ἑ" // [e] -> [ἑ] \341\274\221
+ "ἒ" // [e] -> [ἒ] \341\274\222
+ "ἓ" // [e] -> [ἓ] \341\274\223
+ "ἔ" // [e] -> [ἔ] \341\274\224
+ "ἕ" // [e] -> [ἕ] \341\274\225
+ "\0" // [e] end
+ "φ" // [f] -> [φ] \317\206
+ "ϕ" // [f] -> [ϕ] \317\225
+ "ф" // [f] -> [ф] \321\204
+ "ֆ" // [f] -> [ֆ] \326\206
+ "ḟ" // [f] -> [ḟ] \341\270\237
+ "ẛ" // [f] -> [ẛ] \341\272\233
+ "\0" // [f] end
+ "ĝ" // [g] -> [ĝ] \304\235
+ "ğ" // [g] -> [ğ] \304\237
+ "ġ" // [g] -> [ġ] \304\241
+ "ģ" // [g] -> [ģ] \304\243
+ "γ" // [g] -> [γ] \316\263
+ "г" // [g] -> [г] \320\263
+ "ѓ" // [g] -> [ѓ] \321\223
+ "ґ" // [g] -> [ґ] \322\221
+ "ғ" // [g] -> [ғ] \322\223
+ "ҝ" // [g] -> [ҝ] \322\235
+ "գ" // [g] -> [գ] \325\243
+ "ղgh\0" // [g] -> [ղ] \325\262
+ "ջ" // [g] -> [ջ] \325\273
+ "გ" // [g] -> [გ] \341\203\222
+ "ღgh\0" // [g] -> [ღ] \341\203\246
+ "ḡ" // [g] -> [ḡ] \341\270\241
+ "\0" // [g] end
+ "ĥ" // [h] -> [ĥ] \304\245
+ "ħ" // [h] -> [ħ] \304\247
+ "х" // [h] -> [х] \321\205
+ "һ" // [h] -> [һ] \322\273
+ "հ" // [h] -> [հ] \325\260
+ "ჰ" // [h] -> [ჰ] \341\203\260
+ "ḣ" // [h] -> [ḣ] \341\270\243
+ "ḥ" // [h] -> [ḥ] \341\270\245
+ "ḧ" // [h] -> [ḧ] \341\270\247
+ "ḩ" // [h] -> [ḩ] \341\270\251
+ "ḫ" // [h] -> [ḫ] \341\270\253
+ "ẖ" // [h] -> [ẖ] \341\272\226
+ "\0" // [h] end
+ "ì" // [i] -> [ì] \303\254
+ "í" // [i] -> [í] \303\255
+ "î" // [i] -> [î] \303\256
+ "ï" // [i] -> [ï] \303\257
+ "ý" // [i] -> [ý] \303\275
+ "ĩ" // [i] -> [ĩ] \304\251
+ "ī" // [i] -> [ī] \304\253
+ "ĭ" // [i] -> [ĭ] \304\255
+ "į" // [i] -> [į] \304\257
+ "ı" // [i] -> [ı] \304\261
+ "ΐ" // [i] -> [ΐ] \316\220
+ "ή" // [i] -> [ή] \316\256
+ "ί" // [i] -> [ί] \316\257
+ "ΰ" // [i] -> [ΰ] \316\260
+ "η" // [i] -> [η] \316\267
+ "ι" // [i] -> [ι] \316\271
+ "υ" // [i] -> [υ] \317\205
+ "ϊ" // [i] -> [ϊ] \317\212
+ "ϋ" // [i] -> [ϋ] \317\213
+ "ύ" // [i] -> [ύ] \317\215
+ "и" // [i] -> [и] \320\270
+ "й" // [i] -> [й] \320\271
+ "і" // [i] -> [і] \321\226
+ "ї" // [i] -> [ї] \321\227
+ "ի" // [i] -> [ի] \325\253
+ "ი" // [i] -> [ი] \341\203\230
+ "ḭ" // [i] -> [ḭ] \341\270\255
+ "ḯ" // [i] -> [ḯ] \341\270\257
+ "ỉ" // [i] -> [ỉ] \341\273\211
+ "ị" // [i] -> [ị] \341\273\213
+ "ἠ" // [i] -> [ἠ] \341\274\240
+ "ἡ" // [i] -> [ἡ] \341\274\241
+ "ἢ" // [i] -> [ἢ] \341\274\242
+ "ἣ" // [i] -> [ἣ] \341\274\243
+ "ἤ" // [i] -> [ἤ] \341\274\244
+ "ἥ" // [i] -> [ἥ] \341\274\245
+ "ἦ" // [i] -> [ἦ] \341\274\246
+ "ἧ" // [i] -> [ἧ] \341\274\247
+ "ἰ" // [i] -> [ἰ] \341\274\260
+ "ἱ" // [i] -> [ἱ] \341\274\261
+ "ἲ" // [i] -> [ἲ] \341\274\262
+ "ἳ" // [i] -> [ἳ] \341\274\263
+ "ἴ" // [i] -> [ἴ] \341\274\264
+ "ἵ" // [i] -> [ἵ] \341\274\265
+ "ἶ" // [i] -> [ἶ] \341\274\266
+ "ἷ" // [i] -> [ἷ] \341\274\267
+ "ὐ" // [i] -> [ὐ] \341\275\220
+ "ὑ" // [i] -> [ὑ] \341\275\221
+ "ὒ" // [i] -> [ὒ] \341\275\222
+ "ὓ" // [i] -> [ὓ] \341\275\223
+ "ὔ" // [i] -> [ὔ] \341\275\224
+ "ὕ" // [i] -> [ὕ] \341\275\225
+ "ὖ" // [i] -> [ὖ] \341\275\226
+ "ὗ" // [i] -> [ὗ] \341\275\227
+ "\0" // [i] end
+ "ĵ" // [j] -> [ĵ] \304\265
+ "ж" // [j] -> [ж] \320\266
+ "ј" // [j] -> [ј] \321\230
+ "ժ" // [j] -> [ժ] \325\252
+ "ճ" // [j] -> [ճ] \325\263
+ "ჯ" // [j] -> [ჯ] \341\203\257
+ "\0" // [j] end
+ "ķ" // [k] -> [ķ] \304\267
+ "κ" // [k] -> [κ] \316\272
+ "ϰ" // [k] -> [ϰ] \317\260
+ "к" // [k] -> [к] \320\272
+ "хkh\0" // [k] -> [х] \321\205
+ "ќ" // [k] -> [ќ] \321\234
+ "կ" // [k] -> [կ] \325\257
+ "ք" // [k] -> [ք] \326\204
+ "კ" // [k] -> [კ] \341\203\231
+ "ქkh\0" // [k] -> [ქ] \341\203\245
+ "ḱ" // [k] -> [ḱ] \341\270\261
+ "ḳ" // [k] -> [ḳ] \341\270\263
+ "ḵ" // [k] -> [ḵ] \341\270\265
+ "\0" // [k] end
+ "ĺ" // [l] -> [ĺ] \304\272
+ "ļ" // [l] -> [ļ] \304\274
+ "ľ" // [l] -> [ľ] \304\276
+ "ŀ" // [l] -> [ŀ] \305\200
+ "ł" // [l] -> [ł] \305\202
+ "λ" // [l] -> [λ] \316\273
+ "л" // [l] -> [л] \320\273
+ "љlj\0" // [l] -> [љ] \321\231
+ "լ" // [l] -> [լ] \325\254
+ "ლ" // [l] -> [ლ] \341\203\232
+ "ḷ" // [l] -> [ḷ] \341\270\267
+ "ḹ" // [l] -> [ḹ] \341\270\271
+ "ḻ" // [l] -> [ḻ] \341\270\273
+ "ḽ" // [l] -> [ḽ] \341\270\275
+ "\0" // [l] end
+ "μ" // [m] -> [μ] \316\274
+ "м" // [m] -> [м] \320\274
+ "մ" // [m] -> [մ] \325\264
+ "მ" // [m] -> [მ] \341\203\233
+ "ḿ" // [m] -> [ḿ] \341\270\277
+ "ṁ" // [m] -> [ṁ] \341\271\201
+ "ṃ" // [m] -> [ṃ] \341\271\203
+ "\0" // [m] end
+ "ñ" // [n] -> [ñ] \303\261
+ "ń" // [n] -> [ń] \305\204
+ "ņ" // [n] -> [ņ] \305\206
+ "ň" // [n] -> [ň] \305\210
+ "ŋ" // [n] -> [ŋ] \305\213
+ "ν" // [n] -> [ν] \316\275
+ "н" // [n] -> [н] \320\275
+ "њnj\0" // [n] -> [њ] \321\232
+ "ն" // [n] -> [ն] \325\266
+ "ნ" // [n] -> [ნ] \341\203\234
+ "ṅ" // [n] -> [ṅ] \341\271\205
+ "ṇ" // [n] -> [ṇ] \341\271\207
+ "ṉ" // [n] -> [ṉ] \341\271\211
+ "ṋ" // [n] -> [ṋ] \341\271\213
+ "\0" // [n] end
+ "ò" // [o] -> [ò] \303\262
+ "ó" // [o] -> [ó] \303\263
+ "ô" // [o] -> [ô] \303\264
+ "õ" // [o] -> [õ] \303\265
+ "öo\0oe\0" // [o] -> [ö] \303\266
+ "øo\0oe\0" // [o] -> [ø] \303\270
+ "ō" // [o] -> [ō] \305\215
+ "ŏ" // [o] -> [ŏ] \305\217
+ "ő" // [o] -> [ő] \305\221
+ "œoe\0" // [o] -> [œ] \305\223
+ "ơ" // [o] -> [ơ] \306\241
+ "ο" // [o] -> [ο] \316\277
+ "ω" // [o] -> [ω] \317\211
+ "ό" // [o] -> [ό] \317\214
+ "ώ" // [o] -> [ώ] \317\216
+ "о" // [o] -> [о] \320\276
+ "ө" // [o] -> [ө] \323\251
+ "ո" // [o] -> [ո] \325\270
+ "օ" // [o] -> [օ] \326\205
+ "ო" // [o] -> [ო] \341\203\235
+ "ṍ" // [o] -> [ṍ] \341\271\215
+ "ṏ" // [o] -> [ṏ] \341\271\217
+ "ṑ" // [o] -> [ṑ] \341\271\221
+ "ṓ" // [o] -> [ṓ] \341\271\223
+ "ọ" // [o] -> [ọ] \341\273\215
+ "ỏ" // [o] -> [ỏ] \341\273\217
+ "ố" // [o] -> [ố] \341\273\221
+ "ồ" // [o] -> [ồ] \341\273\223
+ "ổ" // [o] -> [ổ] \341\273\225
+ "ỗ" // [o] -> [ỗ] \341\273\227
+ "ộ" // [o] -> [ộ] \341\273\231
+ "ớ" // [o] -> [ớ] \341\273\233
+ "ờ" // [o] -> [ờ] \341\273\235
+ "ở" // [o] -> [ở] \341\273\237
+ "ỡ" // [o] -> [ỡ] \341\273\241
+ "ợ" // [o] -> [ợ] \341\273\243
+ "ὀ" // [o] -> [ὀ] \341\275\200
+ "ὁ" // [o] -> [ὁ] \341\275\201
+ "ὂ" // [o] -> [ὂ] \341\275\202
+ "ὃ" // [o] -> [ὃ] \341\275\203
+ "ὄ" // [o] -> [ὄ] \341\275\204
+ "ὅ" // [o] -> [ὅ] \341\275\205
+ "ὠ" // [o] -> [ὠ] \341\275\240
+ "ὡ" // [o] -> [ὡ] \341\275\241
+ "ὢ" // [o] -> [ὢ] \341\275\242
+ "\0" // [o] end
+ "π" // [p] -> [π] \317\200
+ "ψps\0" // [p] -> [ψ] \317\210
+ "ϖ" // [p] -> [ϖ] \317\226
+ "п" // [p] -> [п] \320\277
+ "պ" // [p] -> [պ] \325\272
+ "փ" // [p] -> [փ] \326\203
+ "პ" // [p] -> [პ] \341\203\236
+ "ფph\0" // [p] -> [ფ] \341\203\244
+ "ṕ" // [p] -> [ṕ] \341\271\225
+ "ṗ" // [p] -> [ṗ] \341\271\227
+ "\0" // [p] end
+ "г" // [q] -> [г] \320\263
+ "ყ" // [q] -> [ყ] \341\203\247
+ "\0" // [q] end
+ "ŕ" // [r] -> [ŕ] \305\225
+ "ŗ" // [r] -> [ŗ] \305\227
+ "ř" // [r] -> [ř] \305\231
+ "ρ" // [r] -> [ρ] \317\201
+ "ϱ" // [r] -> [ϱ] \317\261
+ "р" // [r] -> [р] \321\200
+ "ռ" // [r] -> [ռ] \325\274
+ "ր" // [r] -> [ր] \326\200
+ "რ" // [r] -> [რ] \341\203\240
+ "ṙ" // [r] -> [ṙ] \341\271\231
+ "ṛ" // [r] -> [ṛ] \341\271\233
+ "ṝ" // [r] -> [ṝ] \341\271\235
+ "ṟ" // [r] -> [ṟ] \341\271\237
+ "\0" // [r] end
+ "ßss\0" // [s] -> [ß] \303\237
+ "ś" // [s] -> [ś] \305\233
+ "ŝ" // [s] -> [ŝ] \305\235
+ "ş" // [s] -> [ş] \305\237
+ "š" // [s] -> [š] \305\241
+ "ș" // [s] -> [ș] \310\231
+ "ς" // [s] -> [ς] \317\202
+ "σ" // [s] -> [σ] \317\203
+ "ϲ" // [s] -> [ϲ] \317\262
+ "с" // [s] -> [с] \321\201
+ "шs\0sh\0" // [s] -> [ш] \321\210
+ "щshch\0sht\0" // [s] -> [щ] \321\211
+ "շsh\0" // [s] -> [շ] \325\267
+ "ս" // [s] -> [ս] \325\275
+ "ს" // [s] -> [ს] \341\203\241
+ "შsh\0" // [s] -> [შ] \341\203\250
+ "ṡ" // [s] -> [ṡ] \341\271\241
+ "ṣ" // [s] -> [ṣ] \341\271\243
+ "ṥ" // [s] -> [ṥ] \341\271\245
+ "ṧ" // [s] -> [ṧ] \341\271\247
+ "ṩ" // [s] -> [ṩ] \341\271\251
+ "\0" // [s] end
+ "þth\0" // [t] -> [þ] \303\276
+ "ţ" // [t] -> [ţ] \305\243
+ "ť" // [t] -> [ť] \305\245
+ "ŧ" // [t] -> [ŧ] \305\247
+ "ț" // [t] -> [ț] \310\233
+ "θth\0" // [t] -> [θ] \316\270
+ "τ" // [t] -> [τ] \317\204
+ "ϑth\0" // [t] -> [ϑ] \317\221
+ "т" // [t] -> [т] \321\202
+ "цts\0" // [t] -> [ц] \321\206
+ "թ" // [t] -> [թ] \325\251
+ "ծts\0" // [t] -> [ծ] \325\256
+ "տ" // [t] -> [տ] \325\277
+ "ցts\0" // [t] -> [ց] \326\201
+ "თ" // [t] -> [თ] \341\203\227
+ "ტ" // [t] -> [ტ] \341\203\242
+ "ცts\0" // [t] -> [ც] \341\203\252
+ "წts\0" // [t] -> [წ] \341\203\254
+ "ṫ" // [t] -> [ṫ] \341\271\253
+ "ṭ" // [t] -> [ṭ] \341\271\255
+ "ṯ" // [t] -> [ṯ] \341\271\257
+ "ṱ" // [t] -> [ṱ] \341\271\261
+ "ẗ" // [t] -> [ẗ] \341\272\227
+ "\0" // [t] end
+ "ù" // [u] -> [ù] \303\271
+ "ú" // [u] -> [ú] \303\272
+ "û" // [u] -> [û] \303\273
+ "üu\0ue\0" // [u] -> [ü] \303\274
+ "ũ" // [u] -> [ũ] \305\251
+ "ū" // [u] -> [ū] \305\253
+ "ŭ" // [u] -> [ŭ] \305\255
+ "ů" // [u] -> [ů] \305\257
+ "ű" // [u] -> [ű] \305\261
+ "ų" // [u] -> [ų] \305\263
+ "ư" // [u] -> [ư] \306\260
+ "у" // [u] -> [у] \321\203
+ "ў" // [u] -> [ў] \321\236
+ "ү" // [u] -> [ү] \322\257
+ "უ" // [u] -> [უ] \341\203\243
+ "ṳ" // [u] -> [ṳ] \341\271\263
+ "ṵ" // [u] -> [ṵ] \341\271\265
+ "ṷ" // [u] -> [ṷ] \341\271\267
+ "ṹ" // [u] -> [ṹ] \341\271\271
+ "ṻ" // [u] -> [ṻ] \341\271\273
+ "ụ" // [u] -> [ụ] \341\273\245
+ "ủ" // [u] -> [ủ] \341\273\247
+ "ứ" // [u] -> [ứ] \341\273\251
+ "ừ" // [u] -> [ừ] \341\273\253
+ "ử" // [u] -> [ử] \341\273\255
+ "ữ" // [u] -> [ữ] \341\273\257
+ "ự" // [u] -> [ự] \341\273\261
+ "\0" // [u] end
+ "β" // [v] -> [β] \316\262
+ "ϐ" // [v] -> [ϐ] \317\220
+ "в" // [v] -> [в] \320\262
+ "վ" // [v] -> [վ] \325\276
+ "ւ" // [v] -> [ւ] \326\202
+ "ṽ" // [v] -> [ṽ] \341\271\275
+ "ṿ" // [v] -> [ṿ] \341\271\277
+ "\0" // [v] end
+ "ŵ" // [w] -> [ŵ] \305\265
+ "ẁ" // [w] -> [ẁ] \341\272\201
+ "ẃ" // [w] -> [ẃ] \341\272\203
+ "ẅ" // [w] -> [ẅ] \341\272\205
+ "ẇ" // [w] -> [ẇ] \341\272\207
+ "ẉ" // [w] -> [ẉ] \341\272\211
+ "ẘ" // [w] -> [ẘ] \341\272\230
+ "\0" // [w] end
+ "ξ" // [x] -> [ξ] \316\276
+ "χ" // [x] -> [χ] \317\207
+ "х" // [x] -> [х] \321\205
+ "խ" // [x] -> [խ] \325\255
+ "ხ" // [x] -> [ხ] \341\203\256
+ "ẋ" // [x] -> [ẋ] \341\272\213
+ "ẍ" // [x] -> [ẍ] \341\272\215
+ "\0" // [x] end
+ "ý" // [y] -> [ý] \303\275
+ "ÿ" // [y] -> [ÿ] \303\277
+ "ŷ" // [y] -> [ŷ] \305\267
+ "й" // [y] -> [й] \320\271
+ "ы" // [y] -> [ы] \321\213
+ "ь" // [y] -> [ь] \321\214
+ "юyu\0" // [y] -> [ю] \321\216
+ "яya\0" // [y] -> [я] \321\217
+ "ј" // [y] -> [ј] \321\230
+ "յ" // [y] -> [յ] \325\265
+ "ẏ" // [y] -> [ẏ] \341\272\217
+ "ẙ" // [y] -> [ẙ] \341\272\231
+ "ỳ" // [y] -> [ỳ] \341\273\263
+ "ỵ" // [y] -> [ỵ] \341\273\265
+ "ỷ" // [y] -> [ỷ] \341\273\267
+ "ỹ" // [y] -> [ỹ] \341\273\271
+ "\0" // [y] end
+ "ź" // [z] -> [ź] \305\272
+ "ż" // [z] -> [ż] \305\274
+ "ž" // [z] -> [ž] \305\276
+ "ƶ" // [z] -> [ƶ] \306\266
+ "ζ" // [z] -> [ζ] \316\266
+ "жz\0zh\0" // [z] -> [ж] \320\266
+ "з" // [z] -> [з] \320\267
+ "զ" // [z] -> [զ] \325\246
+ "ձ" // [z] -> [ձ] \325\261
+ "ზ" // [z] -> [ზ] \341\203\226
+ "ჟzh\0" // [z] -> [ჟ] \341\203\237
+ "ẑ" // [z] -> [ẑ] \341\272\221
+ "ẓ" // [z] -> [ẓ] \341\272\223
+ "ẕ" // [z] -> [ẕ] \341\272\225
+ "\0" // [z] end
+ ;
+
+// Currently only lowercase ascii characters are mapped from. Each
+// entry is the index into the kVariantMapElts string above for the
+// corresponding lowercase ascii key character.
+const int kVariantMapIndex[] = {
+ 536, // a
+ 649, // b
+ 666, // c
+ 710, // d
+ 770, // e
+ 884, // f
+ 899, // g
+ 941, // h
+ 973, // i
+ 1111, // j
+ 1125, // k
+ 1163, // l
+ 1200, // m
+ 1219, // n
+ 1256, // o
+ 1386, // p
+ 1417, // q
+ 1423, // r
+ 1455, // s
+ 1528, // t
+ 1608, // u
+ 1681, // v
+ 1698, // w
+ 1719, // x
+ 1737, // y
+ 1782, // z
+};
+
+#endif // ICING_LEGACY_INDEX_ICING_VARIANT_MAP_H_
diff --git a/icing/legacy/index/proto/icing-dynamic-trie-header.proto b/icing/legacy/index/proto/icing-dynamic-trie-header.proto
new file mode 100644
index 0000000..0e59832
--- /dev/null
+++ b/icing/legacy/index/proto/icing-dynamic-trie-header.proto
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains protos that are only used in native code.
+
+syntax = "proto2";
+
+package icing;
+
+message IcingDynamicTrieHeader {
+ optional uint32 version = 1;
+
+ optional uint32 value_size = 2;
+
+ // Sizes of buffers.
+ optional uint32 max_nodes = 3;
+ optional uint32 max_nexts = 4;
+ optional uint32 max_suffixes_size = 5;
+
+ // Tail indices for buffers.
+ optional uint32 num_nodes = 6;
+ optional uint32 num_nexts = 7;
+ optional uint32 suffixes_size = 8;
+
+ // Next free list.
+ repeated uint32 free_lists = 9;
+
+ // Number of unique keys.
+ optional uint32 num_keys = 10;
+
+ // Flag used to indicate a flush is in progress.
+ optional bool deprecated_is_flushing = 11 [deprecated = true];
+}
diff --git a/icing/legacy/portable/icing-zlib.h b/icing/legacy/portable/icing-zlib.h
new file mode 100644
index 0000000..ed5e0e2
--- /dev/null
+++ b/icing/legacy/portable/icing-zlib.h
@@ -0,0 +1,23 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_LEGACY_PORTABLE_ICING_ZLIB_H_
+#define ICING_LEGACY_PORTABLE_ICING_ZLIB_H_
+
+// Though we use the same zlib header on all platforms, the implementation used
+// is from NDK on android and from third_party/zlib on iOS/linux. See BUILD
+// rule.
+#include <zlib.h> // IWYU pragma: export
+
+#endif // ICING_LEGACY_PORTABLE_ICING_ZLIB_H_
diff --git a/icing/portable/zlib.h b/icing/portable/zlib.h
new file mode 100644
index 0000000..b575427
--- /dev/null
+++ b/icing/portable/zlib.h
@@ -0,0 +1,23 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_PORTABLE_ZLIB_H_
+#define ICING_PORTABLE_ZLIB_H_
+
+// Though we use the same zlib header on all platforms, the implementation used
+// is from NDK on android and from third_party/zlib on iOS/linux. See BUILD
+// rule.
+#include <zlib.h> // IWYU pragma: export
+
+#endif // ICING_PORTABLE_ZLIB_H_
diff --git a/icing/proto/document.proto b/icing/proto/document.proto
index 50c0f7e..07bce0e 100644
--- a/icing/proto/document.proto
+++ b/icing/proto/document.proto
@@ -20,7 +20,7 @@
option java_multiple_files = true;
// Defines a unit of data understood by the IcingSearchEngine.
-// Next tag: 8
+// Next tag: 9
message DocumentProto {
// REQUIRED: Namespace that this Document resides in.
// Namespaces can affect read/write permissions.
@@ -37,10 +37,9 @@
optional string schema = 3;
// OPTIONAL: Seconds since epoch at which the Document was created.
- // Negative values are invalid for this field and may cause unexpected
- // behaviors. If not specified, it will default to when the Icing receives the
- // Document.
- optional fixed64 creation_timestamp_secs = 4;
+ // Negative values will lead to validation errors. If not specified, it will
+ // default to when the Icing receives the Document.
+ optional int64 creation_timestamp_secs = 4;
// REQUIRED: Properties that will be validated against the provided schema.
// The names of these properties should map to one of the properties
@@ -56,6 +55,16 @@
// ranking. Negative values will lead to validation errors. The default is the
// lowest score 0.
optional int32 score = 7 [default = 0];
+
+ // The time-to-live that should be enforced on this Document. Documents get
+ // garbage-collected once the current time exceeds the ttl_secs after the
+ // creation_timestamp_secs. Negative values will lead to validation errors.
+ //
+ // Default value of 0 keeps the Documents till they're explicitly deleted.
+ //
+ // TODO(cassiewang): Benchmark if fixed64 or some other proto type is better
+ // in terms of space/time efficiency. Both for ttl_secs and timestamp fields
+ optional int64 ttl_secs = 8 [default = 0];
}
// Holds a property field of the Document.
diff --git a/icing/proto/schema.proto b/icing/proto/schema.proto
index dbfbfc4..9a66617 100644
--- a/icing/proto/schema.proto
+++ b/icing/proto/schema.proto
@@ -41,13 +41,6 @@
// in http://schema.org. Eg: DigitalDocument, Message, Person, etc.
optional string schema_type = 1;
- // The time-to-live that should be enforced on every Document of this type.
- // Documents get garbage-collected based on their creation-timestamp and the
- // TTL of the schema they belong to.
- //
- // Default value of 0 keeps the Documents till they're explicitly deleted.
- optional int64 ttl_secs = 3;
-
// List of all properties that are supported by Documents of this type.
// An Document should never have properties that are not listed here.
//
@@ -56,7 +49,7 @@
// easier.
repeated PropertyConfigProto properties = 4;
- reserved 2;
+ reserved 2, 3;
}
// Describes how a single property should be indexed.
@@ -65,26 +58,26 @@
// Indicates how the content of this property should be matched in the index.
//
// TermMatchType.Code=UNKNOWN
- // Content in this section will not be tokenized or indexed. Useful if the
+ // Content in this property will not be tokenized or indexed. Useful if the
// data type is not made up of terms (e.g. DOCUMENT or BYTES type). All the
// properties inside the nested property won't be indexed regardless of the
// value of the term_match_type field for the nested properties.
//
// TermMatchType.Code=EXACT_ONLY
- // Content in this section should only be returned for queries matching the
- // exact tokens appearing in this section.
- // Ex. A section with "fool" should NOT match a query for "foo".
+ // Content in this property should only be returned for queries matching the
+ // exact tokens appearing in this property.
+ // Ex. A property with "fool" should NOT match a query for "foo".
//
// TermMatchType.Code=PREFIX
- // Content in this section should be returned for queries that are either
- // exact matches or query matches of the tokens appearing in this section.
- // Ex. A section with "fool" *should* match a query for "foo".
+ // Content in this property should be returned for queries that are either
+ // exact matches or query matches of the tokens appearing in this property.
+ // Ex. A property with "fool" *should* match a query for "foo".
optional TermMatchType.Code term_match_type = 1;
message TokenizerType {
enum Code {
- // It is only valid for tokenizer_type to be 'NONE' if either indexed is
- // also 'NONE' or the data type is DOCUMENT.
+ // It is only valid for tokenizer_type to be 'NONE' if the data type is
+ // DOCUMENT.
NONE = 0;
// Tokenization for plain text.
diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc
new file mode 100644
index 0000000..4983c33
--- /dev/null
+++ b/icing/query/query-processor.cc
@@ -0,0 +1,351 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/query-processor.h"
+
+#include <deque>
+#include <memory>
+#include <stack>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
+#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
+#include "icing/index/iterator/doc-hit-info-iterator-not.h"
+#include "icing/index/iterator/doc-hit-info-iterator-or.h"
+#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/search.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/raw-query-tokenizer.h"
+#include "icing/tokenization/token.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// State frame for the query parser. This is specific to how the raw
+// query is parsed/stored.
+struct ParserStateFrame {
+ std::vector<std::unique_ptr<DocHitInfoIterator>> and_iterators;
+ std::vector<std::unique_ptr<DocHitInfoIterator>> or_iterators;
+
+ // If the last independent token was an OR, then we need to treat the next
+ // resulting iterator as part of an or_iterator
+ bool saw_or = false;
+
+ // If the last independent token was an exclusion, then we need to treat the
+ // next resulting iterator as being excluded.
+ bool saw_exclude = false;
+
+ // If the last independent token was a property/section filter, then we need
+ // to save the section name so we can create a section filter iterator.
+ std::string_view section_restrict = "";
+};
+
+// Combines any OR and AND iterators together into one iterator.
+std::unique_ptr<DocHitInfoIterator> ProcessParserStateFrame(
+ ParserStateFrame parser_state_frame,
+ const DocumentId last_added_document_id) {
+ if (parser_state_frame.and_iterators.empty() &&
+ parser_state_frame.or_iterators.empty()) {
+ // No terms specified, treat an empty query as retrieving all documents.
+ //
+ // We don't use the index_.last_added_document_id here because it's possible
+ // that documents exist in the DocumentStore, but were not successfully
+ // indexed. So to return *all* documents and not just *all indexed*
+ // documents, we use the DocumentStore's last_added_document_id
+ return std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ last_added_document_id);
+ }
+
+ if (!parser_state_frame.or_iterators.empty()) {
+ // Combine all the ORs first since they have higher priority, then add it to
+ // the ANDs.
+ parser_state_frame.and_iterators.push_back(
+ CreateOrIterator(std::move(parser_state_frame.or_iterators)));
+ }
+ return CreateAndIterator(std::move(parser_state_frame.and_iterators));
+}
+
+} // namespace
+
+QueryProcessor::QueryProcessor(Index* index,
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer,
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store,
+ const Clock* clock)
+ : index_(*index),
+ language_segmenter_(*language_segmenter),
+ normalizer_(*normalizer),
+ document_store_(*document_store),
+ schema_store_(*schema_store),
+ clock_(*clock) {}
+
+libtextclassifier3::StatusOr<QueryProcessor::QueryResults>
+QueryProcessor::ParseSearch(const SearchSpecProto& search_spec) {
+ ICING_ASSIGN_OR_RETURN(QueryResults results, ParseRawQuery(search_spec));
+
+ DocHitInfoIteratorFilter::Options options;
+
+ if (search_spec.namespace_filters_size() > 0) {
+ options.namespaces =
+ std::vector<std::string_view>(search_spec.namespace_filters().begin(),
+ search_spec.namespace_filters().end());
+ }
+
+ if (search_spec.schema_type_filters_size() > 0) {
+ options.schema_types =
+ std::vector<std::string_view>(search_spec.schema_type_filters().begin(),
+ search_spec.schema_type_filters().end());
+ }
+
+ results.root_iterator = std::make_unique<DocHitInfoIteratorFilter>(
+ std::move(results.root_iterator), &document_store_, &schema_store_,
+ &clock_, options);
+ return results;
+}
+
+// TODO(cassiewang): Collect query stats to populate the SearchResultsProto
+libtextclassifier3::StatusOr<QueryProcessor::QueryResults>
+QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) {
+ // Tokenize the incoming raw query
+ //
+ // TODO(cassiewang): Consider caching/creating a tokenizer factory that will
+ // cache the n most recently used tokenizers. So we don't have to recreate
+ // this on every new query, if they'll all be raw queries.
+ std::unique_ptr<Tokenizer> raw_query_tokenizer =
+ std::make_unique<RawQueryTokenizer>(&language_segmenter_);
+ ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
+ raw_query_tokenizer->TokenizeAll(search_spec.query()));
+
+ std::stack<ParserStateFrame> frames;
+ frames.emplace();
+
+ QueryResults results;
+ // Process all the tokens
+ for (int i = 0; i < tokens.size(); i++) {
+ const Token& token = tokens.at(i);
+ std::unique_ptr<DocHitInfoIterator> result_iterator;
+
+ // TODO(cassiewang): Handle negation tokens
+ switch (token.type) {
+ case Token::Type::QUERY_LEFT_PARENTHESES: {
+ frames.emplace(ParserStateFrame());
+ break;
+ }
+ case Token::Type::QUERY_RIGHT_PARENTHESES: {
+ if (frames.empty()) {
+ return absl_ports::InternalError(
+ "Encountered empty stack of ParserStateFrames");
+ }
+ result_iterator = ProcessParserStateFrame(
+ std::move(frames.top()), document_store_.last_added_document_id());
+ frames.pop();
+ break;
+ }
+ case Token::Type::QUERY_EXCLUSION: {
+ if (frames.empty()) {
+ return absl_ports::InternalError(
+ "Encountered empty stack of ParserStateFrames");
+ }
+ frames.top().saw_exclude = true;
+ break;
+ }
+ case Token::Type::QUERY_OR: {
+ if (frames.empty()) {
+ return absl_ports::InternalError(
+ "Encountered empty stack of ParserStateFrames");
+ }
+ frames.top().saw_or = true;
+ break;
+ }
+ case Token::Type::QUERY_PROPERTY: {
+ if (frames.empty()) {
+ return absl_ports::InternalError(
+ "Encountered empty stack of ParserStateFrames");
+ }
+
+ frames.top().section_restrict = token.text;
+ break;
+ }
+ case Token::Type::REGULAR: {
+ if (frames.empty()) {
+ return absl_ports::InternalError(
+ "Encountered empty stack of ParserStateFrames");
+ }
+
+ std::string normalized_text = normalizer_.NormalizeTerm(token.text);
+
+ // TODO(cassiewang): Consider removing the use of a section mask in the
+ // term iterator, or constructing a best-effort SectionIdMask based on
+ // the section filter. For some combination of schema type filters and
+ // section filters, we can't encapsulate the perfect
+ // SchemaTypeId-SectionId sets with just a SectionIdMask. So we
+ // over-retrieve hits and have to do a post-filter anyways. With a
+ // SectionIdMask, we might be able to narrow down our SectionIds, but
+ // we'll still over-retrieve hits a bit. So at that point, it's a
+ // tradeoff of
+ //
+ // 1.1 Go to SchemaStore and iterate over the schema to calculate a
+ // SectionIdMask
+ // 1.2 Use SectionIdMask and save some hit buffer memory
+ // 1.3 Do a post-filter to double check SchemaTypeId-SectionId combo
+ //
+ // vs
+ //
+ // 2.1 Use SectionIdMaskAll and use more hit buffer memory
+ // 2.2 Do a post-filter to double check SchemaTypeId-SectionId combo
+ //
+ // We do the same amount of disk reads, so it may be dependent on how
+ // big the schema is and/or how popular schema type filtering and
+ // section filtering is.
+
+ ICING_ASSIGN_OR_RETURN(
+ result_iterator,
+ index_.GetIterator(normalized_text, kSectionIdMaskAll,
+ search_spec.term_match_type()));
+
+ // Add terms to match if this is not a negation term.
+ // WARNING: setting query terms at this point is not compatible with
+ // group-level excludes, group-level sections restricts or excluded
+ // section restricts. Those are not currently supported. If they became
+ // supported, this handling for query terms would need to be altered.
+ if (!frames.top().saw_exclude) {
+ results.query_terms[frames.top().section_restrict].insert(
+ std::move(normalized_text));
+ }
+ break;
+ }
+ case Token::Type::INVALID:
+ U_FALLTHROUGH;
+ default:
+ // This wouldn't happen if tokenizer and query processor both work
+ // correctly. An unknown token indicates inconsistency between tokenizer
+ // and query processor, so we return an internal error here.
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Encountered unknown token while processing query: ", token.text));
+ }
+
+ // Did we get an iterator out of this token?
+ if (result_iterator) {
+ if (frames.empty()) {
+ return absl_ports::InternalError(
+ "Encountered empty stack of ParserStateFrames");
+ }
+
+ // NOTE: Order matters!! We must apply the section restrict first, then
+ // the NOT operator.
+ //
+ // Imagine a query like [-subject:foo] which means we
+ // want to get documents that don't have the term 'foo' in section
+ // 'subject'.
+ //
+ // Assume some Document_0:
+ // { "subject": "foo" }
+ //
+ // And assume some Document_1:
+ // { "subject": "bar" }
+ //
+ // If we use the IteratorNot first, then we'll get DocHitInfos that
+ // represent DocumentIds without any section hits like
+ // DocHitInfo(document_id_1, kSectionIdMaskNone). Then, when we try to
+ // apply the IteratorSectionRestrict, no SectionIds in the mask will match
+ // the SectionId of 'subject' and we won't return any results.
+ //
+ // If we use the IteratorSectionRestrict first, then we'll get a
+ // DocHitInfo for Document_0. Then with the IteratorNot, we can get the
+ // rest of the Documents excluding Document_0, and get Document_1 as a
+ // correct result.
+ //
+ // TODO(cassiewang): The point is a bit moot right now since we don't even
+ // support this functionality. But add tests for this once we do support
+ // more advanced section restricts with grouping, negation, etc.
+ if (!frames.top().section_restrict.empty()) {
+ // We saw a section restrict earlier, wrap the result iterator in
+ // the section restrict
+ result_iterator = std::make_unique<DocHitInfoIteratorSectionRestrict>(
+ std::move(result_iterator), &document_store_, &schema_store_,
+ frames.top().section_restrict);
+
+ frames.top().section_restrict = "";
+ }
+
+ // Check if we need to NOT/exclude this iterator
+ if (frames.top().saw_exclude) {
+ result_iterator = std::make_unique<DocHitInfoIteratorNot>(
+ std::move(result_iterator),
+ document_store_.last_added_document_id());
+ frames.top().saw_exclude = false;
+ }
+
+ if (i < tokens.size() - 1 &&
+ tokens.at(i + 1).type == Token::Type::QUERY_OR) {
+ // This isn't the last token, and the next token is an OR. Then we
+ // should OR this iterator with the next iterator, (e.g. if the query
+ // was "A OR B", we would be processing "A" right now)
+ frames.top().or_iterators.push_back(std::move(result_iterator));
+ } else if (frames.top().saw_or) {
+ // This isn't the first token, and the previous token was an OR. Then
+ // we should OR this iterator with the previous iterator (e.g. if the
+ // query was "A OR (B C)", we would be processing the iterator for "(B
+ // C)" right now)
+ frames.top().or_iterators.push_back(std::move(result_iterator));
+ frames.top().saw_or = false;
+ } else {
+ // If we're not trying to OR this iterator, we AND everything else.
+ if (!frames.top().or_iterators.empty()) {
+ // Accumulate the previous OR iterators if there were any.
+ frames.top().and_iterators.push_back(
+ CreateOrIterator(std::move(frames.top().or_iterators)));
+ frames.top().or_iterators =
+ std::vector<std::unique_ptr<DocHitInfoIterator>>();
+ }
+ frames.top().and_iterators.push_back(std::move(result_iterator));
+ }
+ }
+ }
+
+ // Guaranteed that we have some iterators to return. Need to do one last
+ // combining since we could have ORs and ANDs.
+ if (frames.size() != 1) {
+ return absl_ports::InternalError(
+ "Encountered invalid state of ParserStateFrames stack");
+ }
+ results.root_iterator = ProcessParserStateFrame(
+ std::move(frames.top()), document_store_.last_added_document_id());
+ return results;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/query-processor.h b/icing/query/query-processor.h
new file mode 100644
index 0000000..9d7e3d9
--- /dev/null
+++ b/icing/query/query-processor.h
@@ -0,0 +1,92 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_QUERY_PROCESSOR_H_
+#define ICING_QUERY_QUERY_PROCESSOR_H_
+
+#include <memory>
+
+#include "utils/base/statusor.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/search.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+
+// Processes SearchSpecProtos and retrieves the specified DocHitInfos that
+// satisfy the query and its restrictions. This does not perform any scoring,
+// and returns matched documents in descending DocumentId order.
+class QueryProcessor {
+ public:
+ // Does not take any ownership, and all pointers must refer to valid objects
+ // that outlive the one constructed.
+ // TODO(b/141180665): Add nullptr checks for the raw pointers
+ explicit QueryProcessor(Index* index,
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer,
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store, const Clock* clock);
+
+ struct QueryResults {
+ std::unique_ptr<DocHitInfoIterator> root_iterator;
+ // A map from section names to sets of terms restricted to those sections.
+ // Query terms that are not restricted are found at the entry with key "".
+ SectionRestrictQueryTermsMap query_terms;
+ };
+ // Parse the search configurations (including the query, any additional
+ // filters, etc.) in the SearchSpecProto into one DocHitInfoIterator.
+ //
+ // Returns:
+ // On success,
+ // - One iterator that represents the entire query
+ // - A map representing the query terms and any section restrictions
+ // INVALID_ARGUMENT if query syntax is incorrect and cannot be tokenized
+ // INTERNAL_ERROR on all other errors
+ libtextclassifier3::StatusOr<QueryResults> ParseSearch(
+ const SearchSpecProto& search_spec);
+
+ private:
+ // Parse the query into one DocHitInfoIterator that represents the root of a
+ // query tree.
+ //
+ // Returns:
+ // On success,
+ // - One iterator that represents the entire query
+ // - A map representing the query terms and any section restrictions
+ // INVALID_ARGUMENT if query syntax is incorrect and cannot be tokenized
+ // INTERNAL_ERROR on all other errors
+ libtextclassifier3::StatusOr<QueryResults> ParseRawQuery(
+ const SearchSpecProto& search_spec);
+
+ // Not const because we could modify/sort the hit buffer in the lite index at
+ // query time.
+ Index& index_;
+ const LanguageSegmenter& language_segmenter_;
+ const Normalizer& normalizer_;
+ const DocumentStore& document_store_;
+ const SchemaStore& schema_store_;
+ const Clock& clock_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_QUERY_PROCESSOR_H_
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
new file mode 100644
index 0000000..40df462
--- /dev/null
+++ b/icing/query/query-processor_benchmark.cc
@@ -0,0 +1,469 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "third_party/absl/flags/flag.h"
+#include "icing/document-builder.h"
+#include "icing/index/index.h"
+#include "icing/proto/term.pb.h"
+#include "icing/query/query-processor.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/logging.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/query:query-processor_benchmark
+//
+// $ blaze-bin/icing/query/query-processor_benchmark
+// --benchmarks=all
+//
+// Run on an Android device:
+// Make target //icing/tokenization:language-segmenter depend on
+// //third_party/icu
+//
+// Make target //icing/transform:normalizer depend on
+// //third_party/icu
+//
+// Download LangId model file from
+// //nlp/saft/components/lang_id/mobile/fb_model:models/latest_model.smfb and
+// put it into your device:
+// $ adb push [your model path] /data/local/tmp/
+//
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/query:query-processor_benchmark
+//
+// $ adb push blaze-bin/icing/query/query-processor_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/query-processor_benchmark --benchmarks=all
+// --adb
+
+// Flag to tell the benchmark that it'll be run on an Android device via adb,
+// the benchmark will set up data files accordingly.
+ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+void AddTokenToIndex(Index* index, DocumentId document_id, SectionId section_id,
+ TermMatchType::Code term_match_type,
+ const std::string& token) {
+ Index::Editor editor = index->Edit(document_id, section_id, term_match_type);
+ ICING_ASSERT_OK(editor.AddHit(token.c_str()));
+}
+
+std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem,
+ const std::string& index_dir) {
+ Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10);
+ return Index::Create(options, &filesystem).ValueOrDie();
+}
+
+std::unique_ptr<LanguageSegmenter> CreateLanguageSegmenter() {
+ if (absl::GetFlag(FLAGS_adb)) {
+ return LanguageSegmenter::Create("/data/local/tmp/latest_model.smfb")
+ .ValueOrDie();
+ } else {
+ return LanguageSegmenter::Create(GetLangIdModelPath()).ValueOrDie();
+ }
+}
+
+std::unique_ptr<Normalizer> CreateNormalizer() {
+ return Normalizer::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max())
+ .ValueOrDie();
+}
+
+void CleanUp(const Filesystem& filesystem, const std::string& base_dir) {
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+}
+
+void BM_QueryOneTerm(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string index_dir = base_dir + "/index";
+ const std::string schema_dir = base_dir + "/schema";
+ const std::string doc_store_dir = base_dir + "/store";
+
+ CleanUp(filesystem, base_dir);
+ if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
+ ICING_LOG(ERROR) << "Failed to create test directories";
+ }
+
+ std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ FakeClock fake_clock;
+
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("type1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir));
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ std::unique_ptr<DocumentStore> document_store =
+ DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
+ schema_store.get())
+ .ValueOrDie();
+
+ DocumentId document_id = document_store
+ ->Put(DocumentBuilder()
+ .SetKey("icing", "type1")
+ .SetSchema("type1")
+ .Build())
+ .ValueOrDie();
+
+ const std::string input_string(state.range(0), 'A');
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, input_string);
+
+ QueryProcessor query_processor(index.get(), language_segmenter.get(),
+ normalizer.get(), document_store.get(),
+ schema_store.get(), &fake_clock);
+ SearchSpecProto search_spec;
+ search_spec.set_query(input_string);
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ for (auto _ : state) {
+ QueryProcessor::QueryResults results =
+ query_processor.ParseSearch(search_spec).ValueOrDie();
+ while (results.root_iterator->Advance().ok()) {
+ results.root_iterator->doc_hit_info();
+ }
+ }
+
+ // Destroy document store before the whole directory is removed because it
+ // persists data in destructor.
+ document_store.reset();
+ CleanUp(filesystem, base_dir);
+}
+BENCHMARK(BM_QueryOneTerm)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_QueryFiveTerms(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string index_dir = base_dir + "/index";
+ const std::string schema_dir = base_dir + "/schema";
+ const std::string doc_store_dir = base_dir + "/store";
+
+ CleanUp(filesystem, base_dir);
+ if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
+ ICING_LOG(ERROR) << "Failed to create test directories";
+ }
+
+ std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ FakeClock fake_clock;
+
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("type1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir));
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ std::unique_ptr<DocumentStore> document_store =
+ DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
+ schema_store.get())
+ .ValueOrDie();
+
+ DocumentId document_id = document_store
+ ->Put(DocumentBuilder()
+ .SetKey("icing", "type1")
+ .SetSchema("type1")
+ .Build())
+ .ValueOrDie();
+
+ int term_length = state.range(0) / 5;
+
+ const std::string input_string_a(term_length, 'A');
+ const std::string input_string_b(term_length, 'B');
+ const std::string input_string_c(term_length, 'C');
+ const std::string input_string_d(term_length, 'D');
+ const std::string input_string_e(term_length, 'E');
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, input_string_a);
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/1,
+ TermMatchType::EXACT_ONLY, input_string_b);
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/2,
+ TermMatchType::EXACT_ONLY, input_string_c);
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/3,
+ TermMatchType::EXACT_ONLY, input_string_d);
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/4,
+ TermMatchType::EXACT_ONLY, input_string_e);
+
+ QueryProcessor query_processor(index.get(), language_segmenter.get(),
+ normalizer.get(), document_store.get(),
+ schema_store.get(), &fake_clock);
+
+ const std::string query_string = absl_ports::StrCat(
+ input_string_a, " ", input_string_b, " ", input_string_c, " ",
+ input_string_d, " ", input_string_e);
+
+ SearchSpecProto search_spec;
+ search_spec.set_query(query_string);
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ for (auto _ : state) {
+ QueryProcessor::QueryResults results =
+ query_processor.ParseSearch(search_spec).ValueOrDie();
+ while (results.root_iterator->Advance().ok()) {
+ results.root_iterator->doc_hit_info();
+ }
+ }
+
+ // Destroy document store before the whole directory is removed because it
+ // persists data in destructor.
+ document_store.reset();
+ CleanUp(filesystem, base_dir);
+}
+BENCHMARK(BM_QueryFiveTerms)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_QueryDiacriticTerm(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string index_dir = base_dir + "/index";
+ const std::string schema_dir = base_dir + "/schema";
+ const std::string doc_store_dir = base_dir + "/store";
+
+ CleanUp(filesystem, base_dir);
+ if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
+ ICING_LOG(ERROR) << "Failed to create test directories";
+ }
+
+ std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ FakeClock fake_clock;
+
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("type1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir));
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ std::unique_ptr<DocumentStore> document_store =
+ DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
+ schema_store.get())
+ .ValueOrDie();
+
+ DocumentId document_id = document_store
+ ->Put(DocumentBuilder()
+ .SetKey("icing", "type1")
+ .SetSchema("type1")
+ .Build())
+ .ValueOrDie();
+
+ std::string input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("àáâãā");
+ }
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, input_string);
+
+ QueryProcessor query_processor(index.get(), language_segmenter.get(),
+ normalizer.get(), document_store.get(),
+ schema_store.get(), &fake_clock);
+ SearchSpecProto search_spec;
+ search_spec.set_query(input_string);
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ for (auto _ : state) {
+ QueryProcessor::QueryResults results =
+ query_processor.ParseSearch(search_spec).ValueOrDie();
+ while (results.root_iterator->Advance().ok()) {
+ results.root_iterator->doc_hit_info();
+ }
+ }
+
+ // Destroy document store before the whole directory is removed because it
+ // persists data in destructor.
+ document_store.reset();
+ CleanUp(filesystem, base_dir);
+}
+BENCHMARK(BM_QueryDiacriticTerm)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_QueryHiragana(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string index_dir = base_dir + "/index";
+ const std::string schema_dir = base_dir + "/schema";
+ const std::string doc_store_dir = base_dir + "/store";
+
+ CleanUp(filesystem, base_dir);
+ if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
+ !filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
+ ICING_LOG(ERROR) << "Failed to create test directories";
+ }
+
+ std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ CreateLanguageSegmenter();
+ std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
+ FakeClock fake_clock;
+
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("type1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir));
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ std::unique_ptr<DocumentStore> document_store =
+ DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
+ schema_store.get())
+ .ValueOrDie();
+
+ DocumentId document_id = document_store
+ ->Put(DocumentBuilder()
+ .SetKey("icing", "type1")
+ .SetSchema("type1")
+ .Build())
+ .ValueOrDie();
+
+ std::string input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("あいうえお");
+ }
+ AddTokenToIndex(index.get(), document_id, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, input_string);
+
+ QueryProcessor query_processor(index.get(), language_segmenter.get(),
+ normalizer.get(), document_store.get(),
+ schema_store.get(), &fake_clock);
+
+ SearchSpecProto search_spec;
+ search_spec.set_query(input_string);
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ for (auto _ : state) {
+ QueryProcessor::QueryResults results =
+ query_processor.ParseSearch(search_spec).ValueOrDie();
+ while (results.root_iterator->Advance().ok()) {
+ results.root_iterator->doc_hit_info();
+ }
+ }
+
+ // Destroy document store before the whole directory is removed because it
+ // persists data in destructor.
+ document_store.reset();
+ CleanUp(filesystem, base_dir);
+}
+BENCHMARK(BM_QueryHiragana)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
new file mode 100644
index 0000000..fbc83e2
--- /dev/null
+++ b/icing/query/query-processor_test.cc
@@ -0,0 +1,1668 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/query-processor.h"
+
+#include <memory>
+#include <string>
+
+#include "utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+using ::testing::Test;
+using ::testing::UnorderedElementsAre;
+
+class QueryProcessorTest : public Test {
+ protected:
+ QueryProcessorTest()
+ : test_dir_(GetTestTempDir() + "/icing"),
+ index_dir_(test_dir_ + "/index"),
+ store_dir_(test_dir_ + "/store") {}
+
+ void SetUp() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
+
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ SetUpICUDataFile("icing/icu.dat"));
+
+ Index::Options options(index_dir_,
+ /*index_merge_size=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(index_,
+ Index::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
+ LanguageSegmenter::Create(GetLangIdModelPath()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_,
+ Normalizer::Create(/*max_term_byte_size=*/1000));
+
+ SchemaProto schema;
+
+ // Message schema
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ // Add an indexed property so we generate section metadata on it
+ auto property = type_config->add_properties();
+ property->set_property_name("foo");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ // Add another indexed property so we generate section metadata on it
+ property = type_config->add_properties();
+ property->set_property_name(indexed_property_);
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ // Since we order indexed properties alphabetically, "foo" gets section id
+ // 0, and "subject" gets section id 1 for messages
+ indexed_message_section_id_ = 1;
+
+ // Email schema
+ type_config = schema.add_types();
+ type_config->set_schema_type("email");
+
+ // Add an indexed property so we generate section metadata on it
+ property = type_config->add_properties();
+ property->set_property_name(indexed_property_);
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ // First and only indexed property, so it gets the first id of 0
+ indexed_email_section_id_ = 0;
+
+ // Add an unindexed property
+ property = type_config->add_properties();
+ property->set_property_name(unindexed_property_);
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
+ }
+
+ libtextclassifier3::Status AddTokenToIndex(
+ DocumentId document_id, SectionId section_id,
+ TermMatchType::Code term_match_type, const std::string& token) {
+ Index::Editor editor =
+ index_->Edit(document_id, section_id, term_match_type);
+ return editor.AddHit(token.c_str());
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ std::unique_ptr<Index> index_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+ FakeClock fake_clock_;
+ const std::string indexed_property_ = "subject";
+ const std::string unindexed_property_ = "to";
+ int indexed_email_section_id_;
+ int indexed_message_section_id_;
+
+ private:
+ IcingFilesystem icing_filesystem_;
+ Filesystem filesystem_;
+ const std::string test_dir_;
+ const std::string index_dir_;
+ const std::string store_dir_;
+};
+
+TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) {
+ // We don't need to insert anything in the index since the empty query will
+ // match all DocumentIds from the DocumentStore
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("email")
+ .Build()));
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ search_spec.set_query("()");
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocumentIds(results.root_iterator.get()),
+ ElementsAre(document_id2, document_id1));
+ EXPECT_THAT(results.query_terms, IsEmpty());
+}
+
+TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) {
+ // We don't need to insert anything in the index since the empty query will
+ // match all DocumentIds from the DocumentStore
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("email")
+ .Build()));
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ search_spec.set_query("");
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocumentIds(results.root_iterator.get()),
+ ElementsAre(document_id2, document_id1));
+ EXPECT_THAT(results.query_terms, IsEmpty());
+}
+
+TEST_F(QueryProcessorTest, QueryTermNormalized) {
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "world"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ search_spec.set_query("hElLo WORLD");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "world"));
+}
+
+TEST_F(QueryProcessorTest, OneTermPrefixMatch) {
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
+ // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ search_spec.set_query("he");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he"));
+}
+
+TEST_F(QueryProcessorTest, OneTermExactMatch) {
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ search_spec.set_query("hello");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello"));
+}
+
+TEST_F(QueryProcessorTest, AndTwoTermExactMatch) {
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match the tokens in the index. We're just
+ // inserting the documents so that the DocHitInfoIterators will see that the
+ // document exists and not filter out the DocumentId as deleted.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "world"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ search_spec.set_query("hello world");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "world"));
+}
+
+// Verifies that two implicitly-ANDed PREFIX terms ("he", "wo") both match a
+// document indexed with the longer tokens "hello" and "world", and that the
+// raw (un-expanded) query terms are what get reported in query_terms.
+TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  EXPECT_THAT(
+      AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+      IsOk());
+  EXPECT_THAT(
+      AddTokenToIndex(document_id, section_id, term_match_type, "world"),
+      IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("he wo");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he", "wo"));
+}
+
+// Verifies an ANDed query mixing an exact term ("hello") with a prefix term
+// ("wo") when the search spec requests PREFIX matching: the document indexed
+// with "hello" (exact) and "world" (prefix) matches both.
+TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  EXPECT_THAT(AddTokenToIndex(document_id, section_id,
+                              TermMatchType::EXACT_ONLY, "hello"),
+              IsOk());
+  EXPECT_THAT(
+      AddTokenToIndex(document_id, section_id, TermMatchType::PREFIX, "world"),
+      IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("hello wo");
+  search_spec.set_term_match_type(TermMatchType::PREFIX);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "wo"));
+}
+
+// Verifies the explicit OR operator with exact terms: each of two documents
+// contains only one of the query terms, and both are returned (in descending
+// DocumentId order).
+TEST_F(QueryProcessorTest, OrTwoTermExactMatch) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
+      IsOk());
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
+      IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("hello OR world");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, section_id_mask),
+                          DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "world"));
+}
+
+// Verifies the OR operator with PREFIX terms: "he OR wo" matches one document
+// indexed with "hello" and another indexed with "world"; both are returned.
+TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
+      IsOk());
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
+      IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("he OR wo");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, section_id_mask),
+                          DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he", "wo"));
+}
+
+// Verifies the OR operator mixing an exact term ("hello") and a prefix term
+// ("wo") under a PREFIX search spec: each document matches one branch of the
+// OR and both are returned.
+TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  EXPECT_THAT(AddTokenToIndex(document_id1, section_id,
+                              TermMatchType::EXACT_ONLY, "hello"),
+              IsOk());
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, TermMatchType::PREFIX, "world"),
+      IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("hello OR wo");
+  search_spec.set_term_match_type(TermMatchType::PREFIX);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, section_id_mask),
+                          DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "wo"));
+}
+
+// Verifies operator precedence when AND (implicit, space) and OR are combined
+// in a single query: OR binds tighter than the implicit AND.
+TEST_F(QueryProcessorTest, CombinedAndOrTerms) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "animal puppy dog"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
+      IsOk());
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "animal kitten cat"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
+      IsOk());
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "kitten"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+
+  {
+    // OR gets precedence over AND, this is parsed as ((puppy OR kitten) AND
+    // dog)
+    SearchSpecProto search_spec;
+    search_spec.set_query("puppy OR kitten dog");
+    search_spec.set_term_match_type(term_match_type);
+
+    ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                               query_processor.ParseSearch(search_spec));
+
+    // Only Document 1 matches since it has puppy AND dog
+    EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+                ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+    EXPECT_THAT(results.query_terms, SizeIs(1));
+    EXPECT_THAT(results.query_terms[""],
+                UnorderedElementsAre("puppy", "kitten", "dog"));
+  }
+
+  {
+    // OR gets precedence over AND, this is parsed as (animal AND (puppy OR
+    // kitten))
+    SearchSpecProto search_spec;
+    search_spec.set_query("animal puppy OR kitten");
+    search_spec.set_term_match_type(term_match_type);
+
+    ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                               query_processor.ParseSearch(search_spec));
+
+    // Both Document 1 and 2 match since Document 1 has animal AND puppy, and
+    // Document 2 has animal AND kitten
+    // Descending order of valid DocumentIds
+    EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+                ElementsAre(DocHitInfo(document_id2, section_id_mask),
+                            DocHitInfo(document_id1, section_id_mask)));
+    EXPECT_THAT(results.query_terms, SizeIs(1));
+    EXPECT_THAT(results.query_terms[""],
+                UnorderedElementsAre("animal", "puppy", "kitten"));
+  }
+
+  {
+    // OR gets precedence over AND, this is parsed as (kitten AND ((foo OR bar)
+    // OR cat))
+    SearchSpecProto search_spec;
+    search_spec.set_query("kitten foo OR bar OR cat");
+    search_spec.set_term_match_type(term_match_type);
+
+    ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                               query_processor.ParseSearch(search_spec));
+
+    // Only Document 2 matches since it has both kitten and cat
+    EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+                ElementsAre(DocHitInfo(document_id2, section_id_mask)));
+    EXPECT_THAT(results.query_terms, SizeIs(1));
+    EXPECT_THAT(results.query_terms[""],
+                UnorderedElementsAre("kitten", "foo", "bar", "cat"));
+  }
+}
+
+// Verifies that parentheses override the default OR-over-AND precedence:
+// "puppy OR (kitten foo)" keeps "kitten foo" together as one AND group.
+TEST_F(QueryProcessorTest, OneGroup) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "puppy dog"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "kitten cat"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "kitten"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+
+  // Without grouping, this would be parsed as ((puppy OR kitten) AND foo) and
+  // no documents would match. But with grouping, Document 1 matches puppy
+  SearchSpecProto search_spec;
+  search_spec.set_query("puppy OR (kitten foo)");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""],
+              UnorderedElementsAre("puppy", "kitten", "foo"));
+}
+
+// Verifies that two parenthesized AND groups joined by OR are each evaluated
+// as a unit: "(puppy dog) OR (kitten cat)" matches each document on exactly
+// one group.
+TEST_F(QueryProcessorTest, TwoGroups) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "puppy dog"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "kitten cat"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "kitten"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+
+  // Without grouping, this would be parsed as (puppy AND (dog OR kitten) AND
+  // cat) and wouldn't match any documents. But with grouping, Document 1
+  // matches (puppy AND dog) and Document 2 matches (kitten and cat).
+  SearchSpecProto search_spec;
+  search_spec.set_query("(puppy dog) OR (kitten cat)");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, section_id_mask),
+                          DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""],
+              UnorderedElementsAre("puppy", "dog", "kitten", "cat"));
+}
+
+// Verifies that redundant levels of nested parentheses collapse to the same
+// parse as a single group: "puppy OR ((((kitten foo))))" behaves like
+// "puppy OR (kitten foo)".
+TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "puppy dog"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "kitten cat"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "kitten"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+
+  // Without grouping, this would be parsed as ((puppy OR kitten) AND foo) and
+  // no documents would match. But with grouping, Document 1 matches puppy
+  SearchSpecProto search_spec;
+  search_spec.set_query("puppy OR ((((kitten foo))))");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""],
+              UnorderedElementsAre("puppy", "kitten", "foo"));
+}
+
+// Verifies a group nested inside another group, with no whitespace before the
+// inner parenthesis: "puppy OR (kitten(cat))" parses as
+// (puppy OR (kitten AND (cat))).
+TEST_F(QueryProcessorTest, OneLevelNestedGrouping) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that the DocHitInfoIterators will see that the
+  // document exists and not filter out the DocumentId as deleted.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "puppy dog"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "kitten cat"
+  EXPECT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "kitten"),
+      IsOk());
+  EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  // Document 1 will match puppy and Document 2 matches (kitten AND (cat))
+  SearchSpecProto search_spec;
+  search_spec.set_query("puppy OR (kitten(cat))");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, section_id_mask),
+                          DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""],
+              UnorderedElementsAre("puppy", "kitten", "cat"));
+}
+
+// Verifies the exclusion operator "-term": only documents NOT containing the
+// excluded term are returned, with an empty section mask (exclusion hits are
+// not tied to a section), and excluded terms are not reported in query_terms.
+TEST_F(QueryProcessorTest, ExcludeTerm) {
+  SectionId section_id = 0;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that they'll bump the last_added_document_id,
+  // which will give us the proper exclusion results
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  ASSERT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
+      IsOk());
+  ASSERT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
+      IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("-hello");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // We don't have the section mask to indicate which section "world" came
+  // from. It doesn't matter which section it was in since the query doesn't
+  // care. It just wanted documents that didn't have "hello"
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, kSectionIdMaskNone)));
+  EXPECT_THAT(results.query_terms, IsEmpty());
+}
+
+// Verifies that excluding a term absent from the index ("-foo") matches every
+// document, since no document contains the excluded term.
+TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) {
+  SectionId section_id = 0;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that they'll bump the last_added_document_id,
+  // which will give us the proper exclusion results
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  ASSERT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
+      IsOk());
+  ASSERT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
+      IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("-foo");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, kSectionIdMaskNone),
+                          DocHitInfo(document_id1, kSectionIdMaskNone)));
+  EXPECT_THAT(results.query_terms, IsEmpty());
+}
+
+// Verifies exclusion terms combined with implicit AND, both with another
+// exclusion ("-dog -cat") and with an inclusion ("-animal cat").
+TEST_F(QueryProcessorTest, ExcludeAnd) {
+  SectionId section_id = 0;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that they'll bump the last_added_document_id,
+  // which will give us the proper exclusion results
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "animal dog"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "animal cat"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  {
+    SearchSpecProto search_spec;
+    search_spec.set_query("-dog -cat");
+    search_spec.set_term_match_type(term_match_type);
+
+    ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                               query_processor.ParseSearch(search_spec));
+
+    // The query is interpreted as "exclude all documents that have dog, and
+    // exclude all documents that have cat". Since Document 1 contains dog and
+    // Document 2 contains cat, there are no results.
+    EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+    EXPECT_THAT(results.query_terms, IsEmpty());
+  }
+
+  {
+    SearchSpecProto search_spec;
+    search_spec.set_query("-animal cat");
+    search_spec.set_term_match_type(term_match_type);
+
+    ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                               query_processor.ParseSearch(search_spec));
+
+    // The query is interpreted as "exclude all documents that have animal, and
+    // include all documents that have cat". Since both documents contain
+    // animal, there are no results.
+    EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+    EXPECT_THAT(results.query_terms, SizeIs(1));
+    EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("cat"));
+  }
+}
+
+// Verifies exclusion terms combined with OR, both with another exclusion
+// ("-animal OR -cat") and with an inclusion ("animal OR -cat").
+TEST_F(QueryProcessorTest, ExcludeOr) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're just
+  // inserting the documents so that they'll bump the last_added_document_id,
+  // which will give us the proper exclusion results
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "animal dog"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "animal cat"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  {
+    SearchSpecProto search_spec;
+    search_spec.set_query("-animal OR -cat");
+    search_spec.set_term_match_type(term_match_type);
+
+    ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                               query_processor.ParseSearch(search_spec));
+
+    // Only Document 1 qualifies: it lacks cat. Document 2 has both animal and
+    // cat, so neither exclusion branch admits it.
+    // We don't have a section mask indicating which sections in this document
+    // matched the query since it's not based on section-term matching. It's
+    // more based on the fact that the query excluded all the other documents.
+    EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+                ElementsAre(DocHitInfo(document_id1, kSectionIdMaskNone)));
+    EXPECT_THAT(results.query_terms, IsEmpty());
+  }
+
+  {
+    SearchSpecProto search_spec;
+    search_spec.set_query("animal OR -cat");
+    search_spec.set_term_match_type(term_match_type);
+
+    ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                               query_processor.ParseSearch(search_spec));
+
+    // Both documents contain "animal", so both match the inclusion branch and
+    // carry that section's mask.
+    // Descending order of valid DocumentIds
+    EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+                ElementsAre(DocHitInfo(document_id2, section_id_mask),
+                            DocHitInfo(document_id1, section_id_mask)));
+    EXPECT_THAT(results.query_terms, SizeIs(1));
+    EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
+  }
+}
+
+// Verifies that hits belonging to a deleted document are filtered out of
+// query results even though the document's tokens remain in the index.
+TEST_F(QueryProcessorTest, DeletedFilter) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're
+  // inserting the documents to get the appropriate number of documents and
+  // namespaces populated.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  // Delete Document 1; its hits should no longer surface in results.
+  EXPECT_THAT(document_store_->Delete("namespace", "1"), IsOk());
+
+  // Document 1 has content "animal dog"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "animal cat"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("animal");
+  search_spec.set_term_match_type(term_match_type);
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id2, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
+}
+
+// Verifies that a namespace filter in the SearchSpec restricts results to
+// documents stored under the requested namespace, even when documents in
+// other namespaces also match the query term.
+TEST_F(QueryProcessorTest, NamespaceFilter) {
+  SectionId section_id = 0;
+  SectionIdMask section_id_mask = 1U << section_id;
+  TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+  // These documents don't actually match to the tokens in the index. We're
+  // inserting the documents to get the appropriate number of documents and
+  // namespaces populated.
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace1", "1")
+                                                      .SetSchema("email")
+                                                      .Build()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store_->Put(DocumentBuilder()
+                                                      .SetKey("namespace2", "2")
+                                                      .SetSchema("email")
+                                                      .Build()));
+
+  // Document 1 has content "animal dog"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+              IsOk());
+
+  // Document 2 has content "animal cat"
+  ASSERT_THAT(
+      AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
+      IsOk());
+  ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+              IsOk());
+
+  QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+                                 normalizer_.get(), document_store_.get(),
+                                 schema_store_.get(), &fake_clock_);
+  SearchSpecProto search_spec;
+  search_spec.set_query("animal");
+  search_spec.set_term_match_type(term_match_type);
+  // Restrict results to "namespace1" only; Document 2 lives in "namespace2".
+  search_spec.add_namespace_filters("namespace1");
+
+  ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+                             query_processor.ParseSearch(search_spec));
+
+  // Descending order of valid DocumentIds
+  EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+              ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+  EXPECT_THAT(results.query_terms, SizeIs(1));
+  EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, SchemaTypeFilter) {
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
+ .Build()));
+
+ // Document 1 has content "animal dog"
+ ASSERT_THAT(
+ AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
+ IsOk());
+
+ // Document 2 has content "animal cat"
+ ASSERT_THAT(
+ AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ search_spec.set_query("animal");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.add_schema_type_filters("email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, SectionFilterForOneDocument) {
+ SectionIdMask section_id_mask = 1U << indexed_email_section_id_;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+
+ // Document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ // Create a section filter '<section name>:<query term>'
+ search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[indexed_property_],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) {
+ SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
+ SectionIdMask message_section_id_mask = 1U << indexed_message_section_id_;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
+ .Build()));
+
+ // Email document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ // Message document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_message_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ // Create a section filter '<section name>:<query term>'
+ search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Ordered by descending DocumentId, so message comes first since it was
+ // inserted last
+ EXPECT_THAT(
+ GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(message_document_id, message_section_id_mask),
+ DocHitInfo(email_document_id, email_section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[indexed_property_],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) {
+ SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
+ .Build()));
+
+ // Email document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ // Message document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_message_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ // Create a section filter '<section name>:<query term>', but only look within
+ // documents of email schema
+ search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.add_schema_type_filters("email");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Shouldn't include the message document since we're only looking at email
+ // types
+ EXPECT_THAT(
+ GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(email_document_id, email_section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[indexed_property_],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) {
+ SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
+ .Build()));
+
+ // Email document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+  // Message document has content "animal", but put it in the same section id
+  // as the indexed email section id, the same id as indexed property "foo" in
+  // the message type
+ ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+  // Create a section filter '<section name>:<query term>'; no schema type
+  // filter is applied, so both document types are candidates
+ search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Even though the section id is the same, we should be able to tell that it
+ // doesn't match to the name of the section filter
+ EXPECT_THAT(
+ GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(email_document_id, email_section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[indexed_property_],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) {
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+
+ // Email document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+  // Create a section filter '<section name>:<query term>' using a section
+  // name that doesn't exist in any schema type
+ search_spec.set_query("nonexistent.section:animal");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+  // Since the section in the filter doesn't exist in any schema type, no
+  // documents should be returned
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms["nonexistent.section"],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) {
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+
+ // Email document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+  // Create a section filter '<section name>:<query term>' using a property
+  // that exists in the schema but is not indexed
+ search_spec.set_query(unindexed_property_ + ":animal");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+  // Since the filtered section is not indexed, there are no hits for it and
+  // no documents should be returned
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[unindexed_property_],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) {
+ SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
+ SectionIdMask message_section_id_mask = 1U << indexed_message_section_id_;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
+ .Build()));
+
+ // Email document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ term_match_type, "cat"),
+ IsOk());
+
+ // Message document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_message_section_id_,
+ term_match_type, "animal"),
+ IsOk());
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock_);
+ SearchSpecProto search_spec;
+ // Create a section filter '<section name>:<query term>'
+ search_spec.set_query("cat OR " + indexed_property_ + ":animal");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ // Ordered by descending DocumentId, so message comes first since it was
+ // inserted last
+ EXPECT_THAT(
+ GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(message_document_id, message_section_id_mask),
+ DocHitInfo(email_document_id, email_section_id_mask)));
+ EXPECT_THAT(results.query_terms, SizeIs(2));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("cat"));
+ EXPECT_THAT(results.query_terms[indexed_property_],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) {
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(0)
+ .SetTtlSecs(100)
+ .Build()));
+
+ EXPECT_THAT(AddTokenToIndex(document_id, indexed_email_section_id_,
+ term_match_type, "hello"),
+ IsOk());
+
+ // Arbitrary value, just has to be less than the document's creation
+ // timestamp + ttl
+ FakeClock fake_clock;
+ fake_clock.SetSeconds(50);
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock);
+ SearchSpecProto search_spec;
+ search_spec.set_query("hello");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ SectionIdMask section_id_mask = 1U << indexed_email_section_id_;
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(DocHitInfo(document_id, section_id_mask)));
+}
+
+TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) {
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(0)
+ .SetTtlSecs(100)
+ .Build()));
+
+ EXPECT_THAT(AddTokenToIndex(document_id, indexed_email_section_id_,
+ term_match_type, "hello"),
+ IsOk());
+
+ // Arbitrary value, just has to be greater than the document's creation
+ // timestamp + ttl
+ FakeClock fake_clock;
+ fake_clock.SetSeconds(200);
+
+ QueryProcessor query_processor(index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(),
+ schema_store_.get(), &fake_clock);
+ SearchSpecProto search_spec;
+ search_spec.set_query("hello");
+ search_spec.set_term_match_type(term_match_type);
+
+ ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
+ query_processor.ParseSearch(search_spec));
+
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/query-terms.h b/icing/query/query-terms.h
new file mode 100644
index 0000000..1c5ce02
--- /dev/null
+++ b/icing/query/query-terms.h
@@ -0,0 +1,34 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_QUERY_TERMS_H_
+#define ICING_QUERY_QUERY_TERMS_H_
+
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace icing {
+namespace lib {
+
+// A map from section names to sets of terms restricted to those sections.
+// Query terms that are not restricted are found at the entry with key "".
+using SectionRestrictQueryTermsMap =
+ std::unordered_map<std::string_view, std::unordered_set<std::string>>;
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_QUERY_TERMS_H_
diff --git a/icing/result-retriever.cc b/icing/result-retriever.cc
new file mode 100644
index 0000000..a80cf96
--- /dev/null
+++ b/icing/result-retriever.cc
@@ -0,0 +1,73 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result-retriever.h"
+
+#include "utils/base/statusor.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
+ResultRetriever::RetrieveResults(
+ const ResultSpecProto& result_spec,
+ const SectionRestrictQueryTermsMap& query_terms,
+ TermMatchType::Code match_type,
+ const std::vector<ScoredDocumentHit>& scored_document_hits) const {
+ const int num_results_returned =
+ std::min(static_cast<int>(scored_document_hits.size()),
+ result_spec.num_to_retrieve());
+ std::vector<SearchResultProto::ResultProto> search_results;
+ search_results.reserve(num_results_returned);
+ for (const auto& scored_document_hit : scored_document_hits) {
+ if (search_results.size() >= result_spec.num_to_retrieve()) {
+ break;
+ }
+ libtextclassifier3::StatusOr<DocumentProto> document_or =
+ doc_store_.Get(scored_document_hit.document_id());
+
+ if (!document_or.ok()) {
+ // Internal errors from document store are IO errors, return directly.
+ if (absl_ports::IsInternal(document_or.status())) {
+ return document_or.status();
+ }
+
+ if (ignore_bad_document_ids_) {
+ continue;
+ } else {
+ return document_or.status();
+ }
+ }
+
+ SearchResultProto::ResultProto result;
+ // Add the snippet if requested.
+ if (result_spec.snippet_spec().num_matches_per_property() > 0 &&
+ result_spec.snippet_spec().num_to_snippet() > search_results.size()) {
+ SnippetProto snippet_proto = snippet_retriever_.RetrieveSnippet(
+ query_terms, match_type, result_spec.snippet_spec(),
+ document_or.ValueOrDie(), scored_document_hit.hit_section_id_mask());
+ *result.mutable_snippet() = std::move(snippet_proto);
+ }
+
+    // Add the document itself.
+ *result.mutable_document() = std::move(document_or).ValueOrDie();
+ search_results.push_back(std::move(result));
+ }
+ return search_results;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result-retriever.h b/icing/result-retriever.h
new file mode 100644
index 0000000..48ff5c7
--- /dev/null
+++ b/icing/result-retriever.h
@@ -0,0 +1,82 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_RETRIEVER_H_
+#define ICING_RESULT_RETRIEVER_H_
+
+#include <utility>
+#include <vector>
+
+#include "utils/base/statusor.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/snippet-retriever.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+class ResultRetriever {
+ public:
+ // Does not take any ownership, and all pointers must refer to valid objects
+ // that outlive the one constructed.
+ explicit ResultRetriever(const DocumentStore* doc_store,
+ const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter,
+ bool ignore_bad_document_ids = true)
+ : doc_store_(*doc_store),
+ snippet_retriever_(schema_store, language_segmenter),
+ ignore_bad_document_ids_(ignore_bad_document_ids) {}
+
+  // Gets results (pairs of DocumentProtos and SnippetProtos) for the given
+  // scored document hits from the document store. The order of documents
+  // returned matches the order of scored_document_hits.
+ //
+ // Parameter "ignore_bad_document_ids" indicates whether to ignore invalid and
+ // non-existing document_ids. If it's true, errors on some document_ids will
+ // be ignored and valid documents will be returned, otherwise any error will
+ // be returned immediately. Note that IO errors will always be returned.
+ //
+ // Returns when ignore_bad_document_ids is true:
+ // A list of valid documents on success
+ // INTERNAL_ERROR on IO error
+ //
+ // Returns when ignore_bad_document_ids is false:
+ // A list of documents on success
+ // INVALID_ARGUMENT if any document_id < 0
+ // NOT_FOUND if any doc doesn't exist or has been deleted
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
+ RetrieveResults(
+ const ResultSpecProto& result_spec,
+ const SectionRestrictQueryTermsMap& query_terms,
+ TermMatchType::Code match_type,
+ const std::vector<ScoredDocumentHit>& scored_document_hits) const;
+
+ private:
+ const DocumentStore& doc_store_;
+ const SnippetRetriever snippet_retriever_;
+ const bool ignore_bad_document_ids_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_RETRIEVER_H_
diff --git a/icing/result-retriever_test.cc b/icing/result-retriever_test.cc
new file mode 100644
index 0000000..5e22041
--- /dev/null
+++ b/icing/result-retriever_test.cc
@@ -0,0 +1,438 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result-retriever.h"
+
+#include <limits>
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/snippet-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Return;
+using ::testing::SizeIs;
+
+class ResultRetrieverTest : public testing::Test {
+ protected:
+ ResultRetrieverTest() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ test_document1_ = DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetCreationTimestampSecs(1574365086)
+ .Build();
+ test_document2_ = DocumentBuilder()
+ .SetKey("icing", "email/2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo 2")
+ .AddStringProperty("body", "body bar 2")
+ .SetCreationTimestampSecs(1574365087)
+ .Build();
+ test_document3_ = DocumentBuilder()
+ .SetKey("icing", "email/3")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo 3")
+ .AddStringProperty("body", "body bar 3")
+ .SetCreationTimestampSecs(1574365088)
+ .Build();
+ }
+
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
+ LanguageSegmenter::Create(GetLangIdModelPath()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ PropertyConfigProto* prop_config = type_config->add_properties();
+ prop_config->set_property_name("subject");
+ prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
+ prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ prop_config->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ prop_config->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+ prop_config = type_config->add_properties();
+ prop_config->set_property_name("body");
+ prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
+ prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ prop_config->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ prop_config->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ result_spec_no_snippet_ = ResultSpecProto::default_instance();
+
+ result_spec_snippet_.mutable_snippet_spec()->set_num_to_snippet(
+ std::numeric_limits<int>::max());
+ result_spec_snippet_.mutable_snippet_spec()->set_num_matches_per_property(
+ std::numeric_limits<int>::max());
+ result_spec_snippet_.mutable_snippet_spec()->set_max_window_bytes(1024);
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ const Filesystem filesystem_;
+ ResultSpecProto result_spec_no_snippet_;
+ ResultSpecProto result_spec_snippet_;
+ const std::string test_dir_;
+ DocumentProto test_document1_;
+ DocumentProto test_document2_;
+ DocumentProto test_document3_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ FakeClock fake_clock_;
+};
+
+TEST_F(ResultRetrieverTest, Simple) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(test_document1_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(test_document2_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ doc_store->Put(test_document3_));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+ auto result_retriever = std::make_unique<ResultRetriever>(
+ doc_store.get(), schema_store_.get(), language_segmenter_.get());
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = test_document1_;
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = test_document2_;
+ SearchResultProto::ResultProto result3;
+ *result3.mutable_document() = test_document3_;
+
+ SectionRestrictQueryTermsMap query_terms{};
+ EXPECT_THAT(
+ result_retriever->RetrieveResults(result_spec_no_snippet_, query_terms,
+ TermMatchType::EXACT_ONLY,
+ scored_document_hits),
+ IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2),
+ EqualsProto(result3))));
+}
+
+// Verifies that RetrieveResults honors ResultSpec.num_to_retrieve: with three
+// scored hits but num_to_retrieve=1, only the first hit's document is
+// returned.
+TEST_F(ResultRetrieverTest, OnlyOneResultRequested) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                            schema_store_.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(test_document2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             doc_store->Put(test_document3_));
+
+  // Restrict retrieval to a single result.
+  result_spec_no_snippet_.set_num_to_retrieve(1);
+
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+  auto result_retriever = std::make_unique<ResultRetriever>(
+      doc_store.get(), schema_store_.get(), language_segmenter_.get());
+
+  SearchResultProto::ResultProto result1;
+  *result1.mutable_document() = test_document1_;
+
+  SectionRestrictQueryTermsMap query_terms{};
+  EXPECT_THAT(result_retriever->RetrieveResults(
+                  result_spec_no_snippet_, query_terms,
+                  TermMatchType::EXACT_ONLY, scored_document_hits),
+              IsOkAndHolds(ElementsAre(EqualsProto(result1))));
+}
+
+// With ignore_bad_document_ids=true, a hit referencing an invalid DocumentId
+// is silently dropped and the remaining valid documents are still returned.
+TEST_F(ResultRetrieverTest, IgnoreErrors) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                            schema_store_.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(test_document2_));
+
+  // -1 is never a valid DocumentId.
+  DocumentId invalid_document_id = -1;
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {invalid_document_id, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+  auto result_retriever = std::make_unique<ResultRetriever>(
+      doc_store.get(), schema_store_.get(), language_segmenter_.get(),
+      /*ignore_bad_document_ids=*/true);
+
+  SearchResultProto::ResultProto result1;
+  *result1.mutable_document() = test_document1_;
+  SearchResultProto::ResultProto result2;
+  *result2.mutable_document() = test_document2_;
+
+  SectionRestrictQueryTermsMap query_terms{};
+  // Only the two valid documents come back; the bad hit is skipped.
+  EXPECT_THAT(
+      result_retriever->RetrieveResults(result_spec_no_snippet_, query_terms,
+                                        TermMatchType::EXACT_ONLY,
+                                        scored_document_hits),
+      IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2))));
+}
+
+// With ignore_bad_document_ids=false, bad hits surface as errors: an invalid
+// DocumentId yields INVALID_ARGUMENT, and an id that was never stored yields
+// NOT_FOUND.
+TEST_F(ResultRetrieverTest, NotIgnoreErrors) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                            schema_store_.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(test_document2_));
+
+  // -1 is never a valid DocumentId.
+  DocumentId invalid_document_id = -1;
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {invalid_document_id, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+  auto result_retriever = std::make_unique<ResultRetriever>(
+      doc_store.get(), schema_store_.get(), language_segmenter_.get(),
+      /*ignore_bad_document_ids=*/false);
+
+  SectionRestrictQueryTermsMap query_terms{};
+  EXPECT_THAT(result_retriever->RetrieveResults(
+                  result_spec_no_snippet_, query_terms,
+                  TermMatchType::EXACT_ONLY, scored_document_hits),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  // A structurally valid id that was never Put() into the store.
+  DocumentId non_existing_document_id = 4;
+  scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {non_existing_document_id, /*hit_section_id_mask=*/0b00000011,
+       /*score=*/0}};
+  EXPECT_THAT(result_retriever->RetrieveResults(
+                  result_spec_no_snippet_, query_terms,
+                  TermMatchType::EXACT_ONLY, scored_document_hits),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// Filesystem read failures propagate as INTERNAL, even when
+// ignore_bad_document_ids=true — I/O errors are not "bad document id" errors.
+TEST_F(ResultRetrieverTest, IOError) {
+  MockFilesystem mock_filesystem;
+  // Force every document read attempt to fail.
+  ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_,
+                            schema_store_.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(test_document2_));
+
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+
+  SectionRestrictQueryTermsMap query_terms{};
+  auto result_retriever = std::make_unique<ResultRetriever>(
+      doc_store.get(), schema_store_.get(), language_segmenter_.get(),
+      /*ignore_bad_document_ids=*/true);
+  EXPECT_THAT(result_retriever->RetrieveResults(
+                  result_spec_no_snippet_, query_terms,
+                  TermMatchType::EXACT_ONLY, scored_document_hits),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+// When the ResultSpec requests no snippets, every returned result carries a
+// default (empty) SnippetProto.
+TEST_F(ResultRetrieverTest, SnippetingDisabled) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                            schema_store_.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(test_document2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             doc_store->Put(test_document3_));
+
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+  auto result_retriever = std::make_unique<ResultRetriever>(
+      doc_store.get(), schema_store_.get(), language_segmenter_.get());
+
+  SectionRestrictQueryTermsMap query_terms{};
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::vector<SearchResultProto::ResultProto> results,
+      result_retriever->RetrieveResults(result_spec_no_snippet_, query_terms,
+                                        TermMatchType::EXACT_ONLY,
+                                        scored_document_hits));
+  ASSERT_THAT(results, SizeIs(3));
+  // No snippeting requested, so snippets stay at their default instance.
+  EXPECT_THAT(results.at(0).snippet(),
+              EqualsProto(SnippetProto::default_instance()));
+  EXPECT_THAT(results.at(1).snippet(),
+              EqualsProto(SnippetProto::default_instance()));
+  EXPECT_THAT(results.at(2).snippet(),
+              EqualsProto(SnippetProto::default_instance()));
+}
+
+// With snippeting enabled and query terms {"foo", "bar"}, each result carries
+// a snippet whose window/match for "subject" and "body" reflect the matched
+// terms in that document.
+TEST_F(ResultRetrieverTest, SimpleSnippeted) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                            schema_store_.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(test_document2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             doc_store->Put(test_document3_));
+
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+  // Use std::make_unique for consistency with the rest of this file (the
+  // original line used absl::make_unique, which this file doesn't include).
+  auto result_retriever = std::make_unique<ResultRetriever>(
+      doc_store.get(), schema_store_.get(), language_segmenter_.get());
+
+  // Empty section restrict ("") means the terms apply to all sections.
+  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::vector<SearchResultProto::ResultProto> result,
+      result_retriever->RetrieveResults(result_spec_snippet_, query_terms,
+                                        TermMatchType::EXACT_ONLY,
+                                        scored_document_hits));
+  EXPECT_THAT(result, SizeIs(3));
+  EXPECT_THAT(result[0].document(), EqualsProto(test_document1_));
+  EXPECT_THAT(
+      GetWindow(result[0].document(), result[0].snippet(), "subject", 0),
+      Eq("subject foo"));
+  EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "subject", 0),
+              Eq("foo"));
+  EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0),
+              Eq("body bar"));
+  EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0),
+              Eq("bar"));
+
+  EXPECT_THAT(result[1].document(), EqualsProto(test_document2_));
+  EXPECT_THAT(
+      GetWindow(result[1].document(), result[1].snippet(), "subject", 0),
+      Eq("subject foo 2"));
+  EXPECT_THAT(GetMatch(result[1].document(), result[1].snippet(), "subject", 0),
+              Eq("foo"));
+  EXPECT_THAT(GetWindow(result[1].document(), result[1].snippet(), "body", 0),
+              Eq("body bar 2"));
+  EXPECT_THAT(GetMatch(result[1].document(), result[1].snippet(), "body", 0),
+              Eq("bar"));
+
+  EXPECT_THAT(result[2].document(), EqualsProto(test_document3_));
+  EXPECT_THAT(
+      GetWindow(result[2].document(), result[2].snippet(), "subject", 0),
+      Eq("subject foo 3"));
+  EXPECT_THAT(GetMatch(result[2].document(), result[2].snippet(), "subject", 0),
+              Eq("foo"));
+  EXPECT_THAT(GetWindow(result[2].document(), result[2].snippet(), "body", 0),
+              Eq("body bar 3"));
+  EXPECT_THAT(GetMatch(result[2].document(), result[2].snippet(), "body", 0),
+              Eq("bar"));
+}
+
+// Verifies SnippetSpec.num_to_snippet: with num_to_snippet=1, only the first
+// result gets a populated snippet; the rest carry the default SnippetProto.
+TEST_F(ResultRetrieverTest, OnlyOneDocumentSnippeted) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+                            schema_store_.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(test_document2_));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+                             doc_store->Put(test_document3_));
+
+  // Only the first retrieved document should be snippeted.
+  result_spec_snippet_.mutable_snippet_spec()->set_num_to_snippet(1);
+
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+      {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+  // Use std::make_unique for consistency with the rest of this file (the
+  // original line used absl::make_unique, which this file doesn't include).
+  auto result_retriever = std::make_unique<ResultRetriever>(
+      doc_store.get(), schema_store_.get(), language_segmenter_.get());
+
+  // Empty section restrict ("") means the terms apply to all sections.
+  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::vector<SearchResultProto::ResultProto> result,
+      result_retriever->RetrieveResults(result_spec_snippet_, query_terms,
+                                        TermMatchType::EXACT_ONLY,
+                                        scored_document_hits));
+  EXPECT_THAT(result, SizeIs(3));
+  EXPECT_THAT(result[0].document(), EqualsProto(test_document1_));
+  EXPECT_THAT(
+      GetWindow(result[0].document(), result[0].snippet(), "subject", 0),
+      Eq("subject foo"));
+  EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "subject", 0),
+              Eq("foo"));
+  EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0),
+              Eq("body bar"));
+  EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0),
+              Eq("bar"));
+
+  // Results beyond num_to_snippet come back un-snippeted.
+  EXPECT_THAT(result[1].document(), EqualsProto(test_document2_));
+  EXPECT_THAT(result[1].snippet(),
+              EqualsProto(SnippetProto::default_instance()));
+
+  EXPECT_THAT(result[2].document(), EqualsProto(test_document3_));
+  EXPECT_THAT(result[2].snippet(),
+              EqualsProto(SnippetProto::default_instance()));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
new file mode 100644
index 0000000..37cfb3f
--- /dev/null
+++ b/icing/schema/schema-store.cc
@@ -0,0 +1,434 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-store.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-proto.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/key-mapper.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
+constexpr char kSchemaFilename[] = "schema.pb";
+constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";
+
+// A KeyMapper stores its data across 3 arrays internally. Giving each array
+// 128KiB for storage means the entire KeyMapper requires 384KiB.
+constexpr int32_t kSchemaTypeMapperMaxSize = 3 * 128 * 1024; // 384 KiB
+
+// Path of the schema store header file directly under `base_dir`.
+const std::string MakeHeaderFilename(const std::string& base_dir) {
+  return base_dir + "/" + kSchemaStoreHeaderFilename;
+}
+
+// Path of the serialized SchemaProto file directly under `base_dir`.
+const std::string MakeSchemaFilename(const std::string& base_dir) {
+  return base_dir + "/" + kSchemaFilename;
+}
+
+// Path of the schema-type KeyMapper's storage directly under `base_dir`.
+const std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
+  return base_dir + "/" + kSchemaTypeMapperFilename;
+}
+
+// Assuming that SchemaTypeIds are assigned to schema types based on their order
+// in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
+// change with the new schema.
+// Assuming that SchemaTypeIds are assigned to schema types based on their
+// order in the SchemaProto, returns the set of OLD SchemaTypeIds whose type
+// moved to a different position (i.e. would get a different id) in the new
+// schema.
+std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
+    const SchemaProto& old_schema, const SchemaProto& new_schema) {
+  std::unordered_set<SchemaTypeId> changed_old_ids;
+
+  // Record each type name's position (== its SchemaTypeId) in both schemas.
+  std::unordered_map<std::string, int> old_positions;
+  for (int i = 0; i < old_schema.types_size(); ++i) {
+    old_positions.emplace(old_schema.types(i).schema_type(), i);
+  }
+
+  std::unordered_map<std::string, int> new_positions;
+  for (int i = 0; i < new_schema.types_size(); ++i) {
+    new_positions.emplace(new_schema.types(i).schema_type(), i);
+  }
+
+  // Only types present in both schemas can have a changed id: deleted types
+  // are reported via SetSchemaResult.schema_types_deleted*, and newly added
+  // types require no updates to existing data.
+  for (const auto& [type_name, old_index] : old_positions) {
+    const auto new_iter = new_positions.find(type_name);
+    if (new_iter != new_positions.end() && new_iter->second != old_index) {
+      // The type's position — and therefore its SchemaTypeId — moved.
+      changed_old_ids.emplace(old_index);
+    }
+  }
+
+  return changed_old_ids;
+}
+
+} // namespace
+
+// Factory: builds a SchemaStore and runs Initialize() before handing it out,
+// so callers never see a partially-initialized instance. Constructed via raw
+// `new` because the constructor is presumably private — confirm in the header.
+libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
+    const Filesystem* filesystem, const std::string& base_dir) {
+  std::unique_ptr<SchemaStore> schema_store =
+      std::unique_ptr<SchemaStore>(new SchemaStore(filesystem, base_dir));
+  ICING_RETURN_IF_ERROR(schema_store->Initialize());
+  return schema_store;
+}
+
+// NOTE(review): schema_file_ is initialized from base_dir_ (not the `base_dir`
+// parameter, which has been moved from). This is only safe if base_dir_ is
+// declared before schema_file_ in the class, since member init follows
+// declaration order — confirm against schema-store.h.
+SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir)
+    : filesystem_(*filesystem),
+      base_dir_(std::move(base_dir)),
+      schema_file_(*filesystem, MakeSchemaFilename(base_dir_)) {}
+
+// Best-effort flush of in-memory state on destruction. Failures are only
+// logged: destructors cannot return a status, and throwing is not an option.
+SchemaStore::~SchemaStore() {
+  if (initialized_) {
+    if (!PersistToDisk().ok()) {
+      ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
+    }
+  }
+}
+
+// Loads the existing schema (if any) and brings the derived files
+// (schema_type_mapper_, section_manager_, type_config_map_) up to date,
+// regenerating them from the ground-truth schema when they are missing or
+// inconsistent.
+libtextclassifier3::Status SchemaStore::Initialize() {
+  auto schema_proto_or = GetSchema();
+  if (absl_ports::IsNotFound(schema_proto_or.status())) {
+    // Don't have an existing schema proto, that's fine
+    // NOTE(review): this early return leaves initialized_ == false, so the
+    // destructor will skip PersistToDisk() for a store that never had a
+    // schema — confirm this is intended.
+    return libtextclassifier3::Status::OK;
+  } else if (!schema_proto_or.ok()) {
+    // Real error when trying to read the existing schema
+    return schema_proto_or.status();
+  }
+
+  // Derived files exist on disk but may be stale/corrupt; fall back to a full
+  // rebuild from the schema proto if they can't be loaded and verified.
+  if (!InitializeDerivedFiles().ok()) {
+    ICING_VLOG(3)
+        << "Couldn't find derived files or failed to initialize them, "
+           "regenerating derived files for SchemaStore.";
+    ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
+  }
+
+  initialized_ = true;
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Loads the derived files already on disk and validates them against the
+// header's stored checksum. Returns an INTERNAL error on any inconsistency so
+// the caller can regenerate everything from the ground-truth schema.
+libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
+  if (!HeaderExists()) {
+    // Without a header, we don't know if things are consistent between each
+    // other so the caller should just regenerate everything from ground truth.
+    return absl_ports::InternalError("SchemaStore header doesn't exist");
+  }
+
+  SchemaStore::Header header;
+  if (!filesystem_.Read(MakeHeaderFilename(base_dir_).c_str(), &header,
+                        sizeof(header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
+  }
+
+  // Magic mismatch means the file is corrupt or belongs to something else.
+  if (header.magic != SchemaStore::Header::kMagic) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
+  }
+
+  ICING_ASSIGN_OR_RETURN(
+      schema_type_mapper_,
+      KeyMapper<SchemaTypeId>::Create(filesystem_,
+                                      MakeSchemaTypeMapperFilename(base_dir_),
+                                      kSchemaTypeMapperMaxSize));
+
+  // Verify that the schema + mapper on disk match the checksum recorded in
+  // the header at the last successful persist.
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  if (checksum.Get() != header.checksum) {
+    return absl_ports::InternalError(
+        "Combined checksum of SchemaStore was inconsistent");
+  }
+
+  // Update our in-memory data structures
+  type_config_map_.clear();
+  ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
+  for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
+    // Update our type_config_map_
+    type_config_map_.emplace(type_config.schema_type(), type_config);
+  }
+  ICING_ASSIGN_OR_RETURN(
+      section_manager_,
+      SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Rebuilds all derived state (type_config_map_, schema_type_mapper_,
+// section_manager_) from the ground-truth schema proto, then records a fresh
+// checksum in the header.
+libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles() {
+  ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
+
+  ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
+  type_config_map_.clear();
+
+  for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
+    // Update our type_config_map_
+    type_config_map_.emplace(type_config.schema_type(), type_config);
+
+    // Assign a SchemaTypeId to the type. Ids are handed out in proto order:
+    // num_keys() before insertion is exactly the type's index in the schema.
+    ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
+        type_config.schema_type(), schema_type_mapper_->num_keys()));
+  }
+
+  ICING_ASSIGN_OR_RETURN(
+      section_manager_,
+      SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+  // Write the header
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Returns true iff a usable header file is present. A file truncated to zero
+// bytes (or whose size can't be read) is treated the same as a missing one.
+bool SchemaStore::HeaderExists() {
+  const std::string header_filename = MakeHeaderFilename(base_dir_);
+  if (!filesystem_.FileExists(header_filename.c_str())) {
+    return false;
+  }
+
+  const int64_t file_size = filesystem_.GetFileSize(header_filename.c_str());
+  return file_size != 0 && file_size != Filesystem::kBadFileSize;
+}
+
+// Overwrites the on-disk header with the latest combined checksum. The magic
+// value lets InitializeDerivedFiles() detect corrupt/foreign files on reload.
+libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
+  // Write the header
+  SchemaStore::Header header;
+  header.magic = SchemaStore::Header::kMagic;
+  header.checksum = checksum.Get();
+
+  // This should overwrite the header.
+  if (!filesystem_.Write(MakeHeaderFilename(base_dir_).c_str(), &header,
+                         sizeof(header))) {
+    // Fixed copy-paste in the error text: this is the SchemaStore header, not
+    // the DocStore header.
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Failed to write SchemaStore header: ", MakeHeaderFilename(base_dir_)));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Deletes the schema-type KeyMapper's on-disk state and recreates it empty.
+// The unique_ptr must be released before Delete so no open handles remain.
+libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
+  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+  schema_type_mapper_.reset();
+  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // that can support error logging.
+  libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete(
+      filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete old schema_type mapper";
+    return status;
+  }
+  ICING_ASSIGN_OR_RETURN(
+      schema_type_mapper_,
+      KeyMapper<SchemaTypeId>::Create(filesystem_,
+                                      MakeSchemaTypeMapperFilename(base_dir_),
+                                      kSchemaTypeMapperMaxSize));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Combined checksum over the schema proto and the schema-type mapper, used to
+// detect inconsistency between the ground truth and the derived files. A
+// store with no schema yet checksums to the default Crc32.
+libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
+  Crc32 total_checksum;
+
+  auto schema_proto_or = GetSchema();
+  if (absl_ports::IsNotFound(schema_proto_or.status())) {
+    // Nothing to checksum
+    return total_checksum;
+  } else if (!schema_proto_or.ok()) {
+    // Some real error. Pass it up
+    return schema_proto_or.status();
+  }
+
+  // Guaranteed to have a schema proto now
+  const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
+  Crc32 schema_checksum;
+  schema_checksum.Append(schema_proto->SerializeAsString());
+
+  Crc32 schema_type_mapper_checksum = schema_type_mapper_->ComputeChecksum();
+
+  // Fold the sub-checksums (as decimal strings) into one combined value.
+  total_checksum.Append(std::to_string(schema_checksum.Get()));
+  total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
+
+  return total_checksum;
+}
+
+// Returns the ground-truth schema from the file-backed proto (pointer owned
+// by schema_file_; do not hold it across a schema update).
+libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
+    const {
+  return schema_file_.Read();
+}
+
+// TODO(cassiewang): Consider removing this definition of SetSchema if it's not
+// needed by production code. It's currently being used by our tests, but maybe
+// it's trivial to change our test code to also use the
+// SetSchema(SchemaProto&& new_schema)
+//
+// Copying overload: delegates to the rvalue overload with an explicit copy so
+// the caller's proto is left untouched.
+libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
+SchemaStore::SetSchema(const SchemaProto& new_schema,
+                       bool ignore_errors_and_delete_documents) {
+  return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
+}
+
+// Validates `new_schema` against the existing schema (if any) and, when the
+// change is acceptable (or errors are explicitly ignored), persists it and
+// regenerates all derived files. The returned SetSchemaResult reports
+// deleted/incompatible types and any SchemaTypeIds that moved.
+libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
+SchemaStore::SetSchema(SchemaProto&& new_schema,
+                       bool ignore_errors_and_delete_documents) {
+  SetSchemaResult result;
+
+  auto schema_proto_or = GetSchema();
+  if (absl_ports::IsNotFound(schema_proto_or.status())) {
+    // We don't have a pre-existing schema, so anything is valid.
+    result.success = true;
+  } else if (!schema_proto_or.ok()) {
+    // Real error
+    return schema_proto_or.status();
+  } else {
+    // At this point, we're guaranteed that we have a schema.
+    // Copied (not referenced) defensively: schema_file_ may be overwritten
+    // below before the last use of old_schema would otherwise occur.
+    const SchemaProto old_schema = *schema_proto_or.ValueOrDie();
+
+    // Assume we can set the schema unless proven otherwise.
+    result.success = true;
+
+    // Serialized-bytes comparison is a cheap exact-equality check for protos.
+    if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
+      // Same schema as before. No need to update anything
+      return result;
+    }
+
+    // Different schema, track the differences and see if we can still write it
+    SchemaUtil::SchemaDelta schema_delta =
+        SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema);
+
+    // An incompatible index is fine, we can just reindex
+    result.index_incompatible = schema_delta.index_incompatible;
+
+    for (const auto& schema_type : schema_delta.schema_types_deleted) {
+      // We currently don't support deletions, so mark this as not possible.
+      // This will change once we allow force-set schemas.
+      result.success = false;
+
+      result.schema_types_deleted_by_name.emplace(schema_type);
+
+      ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+                             GetSchemaTypeId(schema_type));
+      result.schema_types_deleted_by_id.emplace(schema_type_id);
+    }
+
+    for (const auto& schema_type : schema_delta.schema_types_incompatible) {
+      // We currently don't support incompatible schemas, so mark this as
+      // not possible. This will change once we allow force-set schemas.
+      result.success = false;
+
+      result.schema_types_incompatible_by_name.emplace(schema_type);
+
+      ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+                             GetSchemaTypeId(schema_type));
+      result.schema_types_incompatible_by_id.emplace(schema_type_id);
+    }
+
+    // SchemaTypeIds changing is fine, we can update the DocumentStore
+    result.old_schema_type_ids_changed =
+        SchemaTypeIdsChanged(old_schema, new_schema);
+  }
+
+  // We can force set the schema if the caller has told us to ignore any errors
+  result.success = result.success || ignore_errors_and_delete_documents;
+
+  if (result.success) {
+    // Write the schema (and potentially overwrite a previous schema).
+    // new_schema is not used after this point, so move it instead of copying;
+    // previously this rvalue-taking overload still copied the whole proto.
+    ICING_RETURN_IF_ERROR(
+        schema_file_.Write(std::make_unique<SchemaProto>(std::move(new_schema))));
+
+    // Rebuild derived files from the freshly-written ground truth.
+    ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
+  }
+
+  return result;
+}
+
+// Returns a pointer into type_config_map_ for `schema_type`, or NOT_FOUND.
+// The map is rebuilt on every schema change, so entries stay current; callers
+// should not cache the pointer across updates.
+libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
+SchemaStore::GetSchemaTypeConfig(const std::string& schema_type) const {
+  auto iter = type_config_map_.find(schema_type);
+  if (iter != type_config_map_.end()) {
+    return &iter->second;
+  }
+  return absl_ports::NotFoundError(
+      absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
+}
+
+// Resolves a schema type name to its SchemaTypeId via the persistent mapper;
+// propagates the mapper's status (e.g. not-found) unchanged.
+libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
+    std::string_view schema_type) const {
+  return schema_type_mapper_->Get(schema_type);
+}
+
+// Thin delegate to SectionManager: content of the section at `section_path`
+// within `document`.
+libtextclassifier3::StatusOr<std::vector<std::string>>
+SchemaStore::GetSectionContent(const DocumentProto& document,
+                               std::string_view section_path) const {
+  return section_manager_->GetSectionContent(document, section_path);
+}
+
+// Thin delegate to SectionManager: content of the section with `section_id`
+// within `document`.
+libtextclassifier3::StatusOr<std::vector<std::string>>
+SchemaStore::GetSectionContent(const DocumentProto& document,
+                               SectionId section_id) const {
+  return section_manager_->GetSectionContent(document, section_id);
+}
+
+// Thin delegate to SectionManager: metadata for (schema_type_id, section_id).
+libtextclassifier3::StatusOr<const SectionMetadata*>
+SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
+                                SectionId section_id) const {
+  return section_manager_->GetSectionMetadata(schema_type_id, section_id);
+}
+
+// Thin delegate to SectionManager: all indexable sections of `document`.
+libtextclassifier3::StatusOr<std::vector<Section>> SchemaStore::ExtractSections(
+    const DocumentProto& document) const {
+  return section_manager_->ExtractSections(document);
+}
+
+// Flushes the schema-type mapper (when present) and rewrites the header with
+// a fresh checksum so a later InitializeDerivedFiles() passes validation.
+libtextclassifier3::Status SchemaStore::PersistToDisk() {
+  if (schema_type_mapper_ != nullptr) {
+    // It's possible we haven't had a schema set yet, so SchemaTypeMapper hasn't
+    // been initialized and is still a nullptr
+    ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
+  }
+
+  // Write the header
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
+
+  return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
new file mode 100644
index 0000000..cc65a32
--- /dev/null
+++ b/icing/schema/schema-store.h
@@ -0,0 +1,285 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_SCHEMA_STORE_H_
+#define ICING_SCHEMA_SCHEMA_STORE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/file/file-backed-proto.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/key-mapper.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// Holds the ground truth schema proto. Tracks compatible changes to the schema
+// and will update any derived data based on the schema proto, such as Sections,
+// SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
+// the most up-to-date data, callers should not save instances themselves and
+// should always call Get* from the SchemaStore.
+class SchemaStore {
+ public:
+  // On-disk header for the SchemaStore. It is read and written as raw bytes
+  // (see UpdateHeader), so any change to this struct's layout invalidates
+  // headers already on disk.
+  struct Header {
+    static constexpr int32_t kMagic = 0x72650d0a;
+
+    // Holds the magic as a quick sanity check against file corruption.
+    int32_t magic;
+
+    // Checksum of the SchemaStore's sub-component's checksums.
+    uint32_t checksum;
+  };
+
+  // Holds information on what may have been affected by the new schema. This is
+  // generally data that other classes may depend on from the SchemaStore,
+  // so that we can know if we should go update those classes as well.
+  struct SetSchemaResult {
+    // Whether we are able to write the schema as determined by SetSchema's
+    // arguments. This boolean reflects SetSchema's logic, and does not reflect
+    // any system level IO errors that may prevent the schema from being written
+    // to file.
+    bool success = false;
+
+    // Whether the new schema changes invalidate the index.
+    bool index_incompatible = false;
+
+    // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
+    //   1. Schema types are added in the middle of the SchemaProto
+    //   2. Schema types are removed from the middle of the SchemaProto
+    //   3. Schema types are reordered in the SchemaProto
+    //
+    // SchemaTypeIds are not changed if schema types are added/removed to the
+    // end of the SchemaProto.
+    std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
+
+    // Schema types that have been removed from the new schema. Represented by
+    // the `schema_type` field in the SchemaTypeConfigProto.
+    std::unordered_set<std::string> schema_types_deleted_by_name;
+
+    // Schema types that have been removed from the new schema. Represented by
+    // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old*
+    // schema.
+    std::unordered_set<SchemaTypeId> schema_types_deleted_by_id;
+
+    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
+    // manner in the new schema. Compatibility determined in
+    // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type`
+    // field in the SchemaTypeConfigProto.
+    std::unordered_set<std::string> schema_types_incompatible_by_name;
+
+    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
+    // manner in the new schema. Compatibility determined in
+    // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
+    // assigned to this SchemaTypeConfigProto in the *old* schema.
+    std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;
+  };
+
+  // Create a SchemaStore instance. The base_dir must already exist. There does
+  // not need to be an existing schema already.
+  //
+  // Returns:
+  //   unique_ptr to SchemaStore on success
+  //   INTERNAL_ERROR on any IO errors
+  static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
+      const Filesystem* filesystem, const std::string& base_dir);
+
+  // Not copyable
+  SchemaStore(const SchemaStore&) = delete;
+  SchemaStore& operator=(const SchemaStore&) = delete;
+
+  // Persists and updates checksum of subcomponents.
+  ~SchemaStore();
+
+  // Retrieve the current schema if it exists. Caller does not get ownership of
+  // the schema proto and modifying the returned pointer does not affect the
+  // underlying schema proto.
+  //
+  // Returns:
+  //   SchemaProto* if exists
+  //   INTERNAL_ERROR on any IO errors
+  //   NOT_FOUND_ERROR if a schema hasn't been set before
+  libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const;
+
+  // Update our current schema if it's compatible. Does not accept incompatible
+  // schema. Compatibility rules defined by
+  // SchemaUtil::ComputeCompatibilityDelta.
+  //
+  // If ignore_errors_and_delete_documents is set to true, then incompatible
+  // schema are allowed and we'll force set the schema, meaning
+  // SetSchemaResult.success will always be true.
+  //
+  // Returns:
+  //   SetSchemaResult that encapsulates the differences between the old and new
+  //   schema, as well as if the new schema can be set.
+  //   INTERNAL_ERROR on any IO errors
+  //
+  // NOTE(review): StatusOr<const SetSchemaResult> wraps a const value type,
+  // which is unusual and blocks moving the result out; confirm the StatusOr
+  // implementation supports this or drop the const.
+  libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
+      const SchemaProto& new_schema,
+      bool ignore_errors_and_delete_documents = false);
+  libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
+      SchemaProto&& new_schema,
+      bool ignore_errors_and_delete_documents = false);
+
+  // Get the SchemaTypeConfigProto of schema_type name.
+  //
+  // Returns:
+  //   SchemaTypeConfigProto on success
+  //   NOT_FOUND if schema type name doesn't exist
+  libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
+  GetSchemaTypeConfig(const std::string& schema_type) const;
+
+  // Returns the SchemaTypeId of the passed in schema type
+  //
+  // Returns:
+  //   SchemaTypeId on success
+  //   NOT_FOUND_ERROR if we don't know about the schema type
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId(
+      std::string_view schema_type) const;
+
+  // Finds content of a section by section path (e.g. property1.property2)
+  //
+  // Returns:
+  //   A string of content on success
+  //   NOT_FOUND if:
+  //     1. Property is optional and not found in the document
+  //     2. section_path is invalid
+  //     3. Content is empty
+  libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
+      const DocumentProto& document, std::string_view section_path) const;
+
+  // Finds content of a section by id
+  //
+  // Returns:
+  //   A string of content on success
+  //   INVALID_ARGUMENT if section id is invalid
+  //   NOT_FOUND if type config name of document not found
+  libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
+      const DocumentProto& document, SectionId section_id) const;
+
+  // Returns the SectionMetadata associated with the SectionId that's in the
+  // SchemaTypeId.
+  //
+  // Returns:
+  //   pointer to SectionMetadata on success
+  //   INVALID_ARGUMENT if schema type id or section is invalid
+  libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
+      SchemaTypeId schema_type_id, SectionId section_id) const;
+
+  // Extracts all sections from the given document, sections are sorted by
+  // section id in increasing order. Section ids start from 0. Sections with
+  // empty content won't be returned.
+  //
+  // Returns:
+  //   A list of sections on success
+  //   NOT_FOUND if type config name of document not found
+  libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections(
+      const DocumentProto& document) const;
+
+  // Syncs all the data changes to disk.
+  //
+  // Returns any encountered IO errors.
+  libtextclassifier3::Status PersistToDisk();
+
+  // Computes the combined checksum of the schema store - includes the ground
+  // truth and all derived files.
+  //
+  // Returns:
+  //   Combined checksum on success
+  //   INTERNAL_ERROR on compute error
+  libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
+
+ private:
+  // Use SchemaStore::Create instead.
+  // base_dir is taken by value so callers can move a string in.
+  explicit SchemaStore(const Filesystem* filesystem, std::string base_dir);
+
+  // Handles initializing the SchemaStore and regenerating any data if needed.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status Initialize();
+
+  // Creates sub-components and verifies the integrity of each sub-component.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status InitializeDerivedFiles();
+
+  // Populates any derived data structures off of the schema.
+  //
+  // Returns:
+  //   OK on success
+  //   NOT_FOUND_ERROR if a schema proto has not been set
+  //   INTERNAL_ERROR on any IO errors
+  libtextclassifier3::Status RegenerateDerivedFiles();
+
+  // Checks if the header exists already. This does not create the header file
+  // if it doesn't exist.
+  // NOTE(review): does not appear to modify state — could likely be const.
+  bool HeaderExists();
+
+  // Update and replace the header file. Creates the header file if it doesn't
+  // exist.
+  libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
+
+  // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying
+  // file, and re-creates a new instance of the schema_type_mapper_. Does not
+  // populate the schema_type_mapper_.
+  //
+  // Returns any IO errors.
+  libtextclassifier3::Status ResetSchemaTypeMapper();
+
+  const Filesystem& filesystem_;
+  const std::string base_dir_;
+
+  // Used internally to indicate whether the class has been initialized. This is
+  // to guard against cases where the object has been created, but Initialize
+  // fails in the constructor. If we have successfully exited the constructor,
+  // then this field can be ignored. Clients of SchemaStore should not need to
+  // worry about this field.
+  bool initialized_ = false;
+
+  // Cached schema
+  FileBackedProto<SchemaProto> schema_file_;
+
+  // A hash map of (type config name -> type config), allows faster lookup of
+  // type config in schema. The O(1) type config access makes schema-related and
+  // section-related operations faster.
+  SchemaUtil::TypeConfigMap type_config_map_;
+
+  // Maps schema types to a densely-assigned unique id.
+  std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
+
+  // Manager of indexed section related metadata.
+  std::unique_ptr<const SectionManager> section_manager_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_SCHEMA_STORE_H_
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
new file mode 100644
index 0000000..410a681
--- /dev/null
+++ b/icing/schema/schema-store_test.cc
@@ -0,0 +1,647 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-store.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Not;
+using ::testing::Pointee;
+
+// Fixture providing a fresh per-test temp directory plus a default one-type
+// ("email") schema whose single "subject" property is indexed, so section
+// metadata gets generated for it.
+class SchemaStoreTest : public ::testing::Test {
+ protected:
+  SchemaStoreTest() : test_dir_(GetTestTempDir() + "/icing") {
+    filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+
+    auto type = schema_.add_types();
+    type->set_schema_type("email");
+
+    // Add an indexed property so we generate section metadata on it
+    auto property = type->add_properties();
+    property->set_property_name("subject");
+    property->set_data_type(PropertyConfigProto::DataType::STRING);
+    property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+    property->mutable_indexing_config()->set_term_match_type(
+        TermMatchType::EXACT_ONLY);
+    property->mutable_indexing_config()->set_tokenizer_type(
+        IndexingConfig::TokenizerType::PLAIN);
+  }
+
+  // Removes the temp directory so state never leaks between tests.
+  void TearDown() override {
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  const Filesystem filesystem_;
+  const std::string test_dir_;
+  SchemaProto schema_;
+};
+
+// The next three tests corrupt on-disk state in different ways to exercise
+// SchemaStore's initialization-time integrity checks and recovery paths.
+TEST_F(SchemaStoreTest, CorruptSchemaError) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+
+    // Set it for the first time
+    SchemaStore::SetSchemaResult result;
+    result.success = true;
+    EXPECT_THAT(schema_store->SetSchema(schema_),
+                IsOkAndHolds(EqualsSetSchemaResult(result)));
+    ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                               schema_store->GetSchema());
+    EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+  }
+
+  // "Corrupt" the ground truth schema by adding new data to it. This will mess
+  // up the checksum of the schema store
+
+  SchemaProto corrupt_schema;
+  auto type = corrupt_schema.add_types();
+  type->set_schema_type("corrupted");
+
+  // NOTE: "/schema.pb" must stay in sync with SchemaStore's internal file name.
+  const std::string schema_file = absl_ports::StrCat(test_dir_, "/schema.pb");
+  const std::string serialized_schema = corrupt_schema.SerializeAsString();
+
+  filesystem_.Write(schema_file.c_str(), serialized_schema.data(),
+                    serialized_schema.size());
+
+  // If ground truth was corrupted, we won't know what to do
+  EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+
+    // Set it for the first time
+    SchemaStore::SetSchemaResult result;
+    result.success = true;
+    EXPECT_THAT(schema_store->SetSchema(schema_),
+                IsOkAndHolds(EqualsSetSchemaResult(result)));
+    ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                               schema_store->GetSchema());
+    EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+
+    EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+  }
+
+  // "Corrupt" the derived SchemaTypeIds by deleting the entire directory. This
+  // will mess up the initialization of schema store, causing everything to be
+  // regenerated from ground truth
+
+  const std::string schema_type_mapper_dir =
+      absl_ports::StrCat(test_dir_, "/schema_type_mapper");
+  filesystem_.DeleteDirectoryRecursively(schema_type_mapper_dir.c_str());
+
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Everything looks fine, ground truth and derived data
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+  EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+}
+
+TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
+  {
+    ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                               SchemaStore::Create(&filesystem_, test_dir_));
+
+    // Set it for the first time
+    SchemaStore::SetSchemaResult result;
+    result.success = true;
+    EXPECT_THAT(schema_store->SetSchema(schema_),
+                IsOkAndHolds(EqualsSetSchemaResult(result)));
+    ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                               schema_store->GetSchema());
+    EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+
+    EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+  }
+
+  // Change the SchemaStore's header combined checksum so that it won't match
+  // the recalculated checksum on initialization. This will force a regeneration
+  // of derived files from ground truth.
+  const std::string header_file =
+      absl_ports::StrCat(test_dir_, "/schema_store_header");
+  SchemaStore::Header header;
+  header.magic = SchemaStore::Header::kMagic;
+  header.checksum = 10;  // Arbitrary garbage checksum
+  filesystem_.DeleteFile(header_file.c_str());
+  // Header is persisted as raw bytes, so we can forge one the same way.
+  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Everything looks fine, ground truth and derived data
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+  EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+}
+
+// Lifecycle tests: creating a store with and without pre-existing state, and
+// recreating it over the same directory.
+TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) {
+  EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_), IsOk());
+}
+
+TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+
+  schema_store.reset();
+  EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_), IsOk());
+}
+
+TEST_F(SchemaStoreTest, MultipleCreateOk) {
+  DocumentProto document;
+  document.set_schema("email");
+  auto properties = document.add_properties();
+  properties->set_name("subject");
+  properties->add_string_values("subject_content");
+
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+
+  // Verify that our in-memory structures are ok
+  EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"),
+              IsOkAndHolds(Pointee(EqualsProto(schema_.types(0)))));
+  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+                             schema_store->ExtractSections(document));
+  EXPECT_THAT(sections[0].content, ElementsAre("subject_content"));
+
+  // Verify that our persisted data is ok
+  EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+
+  // Recreate over the same directory: everything must survive the round trip.
+  schema_store.reset();
+  ICING_ASSERT_OK_AND_ASSIGN(schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Verify that our in-memory structures are ok
+  EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"),
+              IsOkAndHolds(Pointee(EqualsProto(schema_.types(0)))));
+
+  ICING_ASSERT_OK_AND_ASSIGN(sections, schema_store->ExtractSections(document));
+  EXPECT_THAT(sections[0].content, ElementsAre("subject_content"));
+
+  // Verify that our persisted data is ok
+  EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+}
+
+// Basic SetSchema behavior: first set, idempotent re-set, incompatible set,
+// and a purely additive (compatible) change.
+TEST_F(SchemaStoreTest, SetNewSchemaOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+}
+
+TEST_F(SchemaStoreTest, SetSameSchemaOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+
+  // And one more for fun
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+}
+
+TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+
+  // Make the schema incompatible by removing a type.
+  schema_.clear_types();
+
+  // Set the incompatible schema: call still returns OK, but the result reports
+  // failure plus what was deleted.
+  result.success = false;
+  result.schema_types_deleted_by_name.emplace("email");
+  result.schema_types_deleted_by_id.emplace(0);
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto schema;
+  auto type = schema.add_types();
+  type->set_schema_type("email");
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+
+  // Add a type, shouldn't affect the index or cached SchemaTypeIds
+  type = schema.add_types();
+  type->set_schema_type("new_type");
+
+  // Set the compatible schema
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+}
+
+// SchemaTypeId stability tests: deleting or reordering types shifts ids, which
+// must be reported via old_schema_type_ids_changed.
+TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto schema;
+  auto type = schema.add_types();
+  type->set_schema_type("email");
+  type = schema.add_types();
+  type->set_schema_type("message");
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_email_schema_type_id,
+                             schema_store->GetSchemaTypeId("email"));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_message_schema_type_id,
+                             schema_store->GetSchemaTypeId("message"));
+
+  // Remove "email" type, this also changes previous SchemaTypeIds
+  schema.Clear();
+  type = schema.add_types();
+  type->set_schema_type("message");
+
+  SchemaStore::SetSchemaResult incompatible_result;
+  incompatible_result.success = false;
+  incompatible_result.old_schema_type_ids_changed.emplace(
+      old_message_schema_type_id);
+  incompatible_result.schema_types_deleted_by_name.emplace("email");
+  incompatible_result.schema_types_deleted_by_id.emplace(
+      old_email_schema_type_id);
+
+  // Can't set the incompatible schema
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(incompatible_result)));
+
+  // Same delta, but success is forced to true by the override flag below.
+  SchemaStore::SetSchemaResult force_result;
+  force_result.success = true;
+  force_result.old_schema_type_ids_changed.emplace(old_message_schema_type_id);
+  force_result.schema_types_deleted_by_name.emplace("email");
+  force_result.schema_types_deleted_by_id.emplace(old_email_schema_type_id);
+
+  // Force set the incompatible schema
+  EXPECT_THAT(schema_store->SetSchema(
+                  schema, /*ignore_errors_and_delete_documents=*/true),
+              IsOkAndHolds(EqualsSetSchemaResult(force_result)));
+  ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto schema;
+  auto type = schema.add_types();
+  type->set_schema_type("email");
+  type = schema.add_types();
+  type->set_schema_type("message");
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+
+  // Reorder the types
+  schema.clear_types();
+  type = schema.add_types();
+  type->set_schema_type("message");
+  type = schema.add_types();
+  type->set_schema_type("email");
+
+  // Since we assign SchemaTypeIds based on order in the SchemaProto, this will
+  // cause SchemaTypeIds to change
+  result.old_schema_type_ids_changed.emplace(0);  // Old SchemaTypeId of "email"
+  result.old_schema_type_ids_changed.emplace(
+      1);  // Old SchemaTypeId of "message"
+
+  // Set the compatible schema
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+}
+
+// Property-level change tests: making a property indexed flips
+// index_incompatible; changing its data type makes the whole type incompatible.
+TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto schema;
+  auto type = schema.add_types();
+  type->set_schema_type("email");
+
+  // Add an unindexed property
+  auto property = type->add_properties();
+  property->set_property_name("subject");
+  property->set_data_type(PropertyConfigProto::DataType::STRING);
+  property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+
+  // Make a previously unindexed property indexed
+  property = schema.mutable_types(0)->mutable_properties(0);
+  property->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::EXACT_ONLY);
+  property->mutable_indexing_config()->set_tokenizer_type(
+      IndexingConfig::TokenizerType::PLAIN);
+
+  // With a new indexed property, we'll need to reindex
+  result.index_incompatible = true;
+
+  // Set the compatible schema
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto schema;
+  auto type = schema.add_types();
+  type->set_schema_type("email");
+
+  // Add a STRING property
+  auto property = type->add_properties();
+  property->set_property_name("subject");
+  property->set_data_type(PropertyConfigProto::DataType::STRING);
+  property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_email_schema_type_id,
+                             schema_store->GetSchemaTypeId("email"));
+
+  // Make a previously STRING property into DOUBLE
+  property = schema.mutable_types(0)->mutable_properties(0);
+  property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
+
+  SchemaStore::SetSchemaResult incompatible_result;
+  incompatible_result.success = false;
+  incompatible_result.schema_types_incompatible_by_name.emplace("email");
+  incompatible_result.schema_types_incompatible_by_id.emplace(
+      old_email_schema_type_id);
+
+  // Can't set the incompatible schema
+  EXPECT_THAT(schema_store->SetSchema(schema),
+              IsOkAndHolds(EqualsSetSchemaResult(incompatible_result)));
+
+  // Same delta, but success is forced to true by the override flag below.
+  SchemaStore::SetSchemaResult force_result;
+  force_result.success = true;
+  force_result.schema_types_incompatible_by_name.emplace("email");
+  force_result.schema_types_incompatible_by_id.emplace(
+      old_email_schema_type_id);
+
+  // Force set the incompatible schema
+  EXPECT_THAT(schema_store->SetSchema(
+                  schema, /*ignore_errors_and_delete_documents=*/true),
+              IsOkAndHolds(EqualsSetSchemaResult(force_result)));
+  ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+}
+
+// SchemaTypeIds are densely assigned in SchemaProto declaration order.
+TEST_F(SchemaStoreTest, GetSchemaTypeId) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Drop the fixture's default "email" type so ids start from a clean slate.
+  schema_.clear_types();
+
+  // Add a few schema types
+  const std::string first_type = "first";
+  auto type = schema_.add_types();
+  type->set_schema_type(first_type);
+
+  const std::string second_type = "second";
+  type = schema_.add_types();
+  type->set_schema_type(second_type);
+
+  // Set it for the first time
+  SchemaStore::SetSchemaResult result;
+  result.success = true;
+  EXPECT_THAT(schema_store->SetSchema(schema_),
+              IsOkAndHolds(EqualsSetSchemaResult(result)));
+
+  EXPECT_THAT(schema_store->GetSchemaTypeId(first_type), IsOkAndHolds(0));
+  EXPECT_THAT(schema_store->GetSchemaTypeId(second_type), IsOkAndHolds(1));
+}
+
+// Checksum contract: default on empty store, stable across calls and
+// instances, and changed by any schema modification.
+TEST_F(SchemaStoreTest, ComputeChecksumDefaultOnEmptySchemaStore) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  Crc32 default_checksum;
+  EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(default_checksum));
+}
+
+TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto foo_schema;
+  auto type_config = foo_schema.add_types();
+  type_config->set_schema_type("foo");
+
+  ICING_EXPECT_OK(schema_store->SetSchema(foo_schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, schema_store->ComputeChecksum());
+
+  // Calling it again doesn't change the checksum
+  EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(checksum));
+}
+
+TEST_F(SchemaStoreTest, ComputeChecksumSameAcrossInstances) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto foo_schema;
+  auto type_config = foo_schema.add_types();
+  type_config->set_schema_type("foo");
+
+  ICING_EXPECT_OK(schema_store->SetSchema(foo_schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, schema_store->ComputeChecksum());
+
+  // Destroy the previous instance and recreate SchemaStore
+  schema_store.reset();
+
+  ICING_ASSERT_OK_AND_ASSIGN(schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+  EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(checksum));
+}
+
+TEST_F(SchemaStoreTest, ComputeChecksumChangesOnModification) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto foo_schema;
+  auto type_config = foo_schema.add_types();
+  type_config->set_schema_type("foo");
+
+  ICING_EXPECT_OK(schema_store->SetSchema(foo_schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, schema_store->ComputeChecksum());
+
+  // Modifying the SchemaStore changes the checksum
+  SchemaProto foo_bar_schema;
+  type_config = foo_bar_schema.add_types();
+  type_config->set_schema_type("foo");
+  type_config = foo_bar_schema.add_types();
+  type_config->set_schema_type("bar");
+
+  ICING_EXPECT_OK(schema_store->SetSchema(foo_bar_schema));
+
+  EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
+}
+
+// PersistToDisk contract: safe on an empty store, and all data (including
+// destruction-time persistence) survives reinitialization.
+TEST_F(SchemaStoreTest, PersistToDiskFineForEmptySchemaStore) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  // Persisting is fine and shouldn't affect anything
+  ICING_EXPECT_OK(schema_store->PersistToDisk());
+}
+
+TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+
+  SchemaProto schema;
+  auto type_config = schema.add_types();
+  type_config->set_schema_type("foo");
+
+  ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  // Persisting shouldn't change anything
+  ICING_EXPECT_OK(schema_store->PersistToDisk());
+
+  ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+                             schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+
+  // Modify the schema so that something different is persisted next time
+  type_config = schema.add_types();
+  type_config->set_schema_type("bar");
+  ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  // Should also persist on destruction
+  schema_store.reset();
+
+  // And we get the same schema back on reinitialization
+  ICING_ASSERT_OK_AND_ASSIGN(schema_store,
+                             SchemaStore::Create(&filesystem_, test_dir_));
+  ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+  EXPECT_THAT(*actual_schema, EqualsProto(schema));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc
new file mode 100644
index 0000000..96d2575
--- /dev/null
+++ b/icing/schema/schema-util.cc
@@ -0,0 +1,392 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-util.h"
+
+#include <cctype>
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "base/logging.h"
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Returns true iff every character in `str` is an alphanumeric character
+// per std::isalnum. An empty string is vacuously alphanumeric.
+bool isAlphaNumeric(std::string_view str) {
+  for (char c : str) {
+    // Cast to unsigned char first: passing a negative char value (possible
+    // for non-ASCII bytes where char is signed) to std::isalnum is undefined
+    // behavior.
+    if (!std::isalnum(static_cast<unsigned char>(c))) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Returns true iff the new cardinality is equally or less restrictive than
+// the old one. A REQUIRED field may relax to OPTIONAL or REPEATED, but a
+// previously relaxed field may never become more restrictive.
+bool IsCardinalityCompatible(const PropertyConfigProto& old_property,
+                             const PropertyConfigProto& new_property) {
+  if (old_property.cardinality() >= new_property.cardinality()) {
+    return true;
+  }
+  ICING_VLOG(1) << absl_ports::StrCat(
+      "Cardinality is more restrictive than before ",
+      PropertyConfigProto::Cardinality::Code_Name(old_property.cardinality()),
+      "->",
+      PropertyConfigProto::Cardinality::Code_Name(new_property.cardinality()));
+  return false;
+}
+
+// Returns true iff the property's data type is unchanged between the old and
+// new configs. Any data type change invalidates previously stored values.
+bool IsDataTypeCompatible(const PropertyConfigProto& old_property,
+                          const PropertyConfigProto& new_property) {
+  if (old_property.data_type() != new_property.data_type()) {
+    // TODO(cassiewang): Maybe we can be a bit looser with this, e.g. we could
+    // just cast an int64_t to a string. But for now, we'll stick with
+    // simplicity.
+    ICING_VLOG(1) << absl_ports::StrCat(
+        "Data type ",
+        PropertyConfigProto::DataType::Code_Name(old_property.data_type()),
+        "->",
+        PropertyConfigProto::DataType::Code_Name(new_property.data_type()));
+    return false;
+  }
+  return true;
+}
+
+// Returns true iff the property's schema_type field is unchanged. This only
+// matters for DOCUMENT-typed properties, whose nested documents must keep
+// conforming to the same declared type.
+bool IsSchemaTypeCompatible(const PropertyConfigProto& old_property,
+                            const PropertyConfigProto& new_property) {
+  if (old_property.schema_type() == new_property.schema_type()) {
+    return true;
+  }
+  ICING_VLOG(1) << absl_ports::StrCat("Schema type ",
+                                      old_property.schema_type(), "->",
+                                      new_property.schema_type());
+  return false;
+}
+
+// Returns true iff the old and new property configs agree on data type,
+// schema type, and cardinality. Checks short-circuit, so later helpers (and
+// their logging) are skipped once one aspect is known to be incompatible —
+// matching the original &&-chain.
+bool IsPropertyCompatible(const PropertyConfigProto& old_property,
+                          const PropertyConfigProto& new_property) {
+  if (!IsDataTypeCompatible(old_property, new_property)) {
+    return false;
+  }
+  if (!IsSchemaTypeCompatible(old_property, new_property)) {
+    return false;
+  }
+  return IsCardinalityCompatible(old_property, new_property);
+}
+
+// Returns true iff both the term match type and the tokenizer type are
+// unchanged. A change to either means previously indexed content would no
+// longer be found, so the index must be rebuilt.
+bool IsTermMatchTypeCompatible(const IndexingConfig& old_indexed,
+                               const IndexingConfig& new_indexed) {
+  if (old_indexed.term_match_type() != new_indexed.term_match_type()) {
+    return false;
+  }
+  return old_indexed.tokenizer_type() == new_indexed.tokenizer_type();
+}
+
+} // namespace
+
+libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) {
+  // NOTE: the string_views stored in the sets below point into `schema`'s own
+  // storage, which outlives this function, so no copies are needed.
+
+  // Tracks SchemaTypeConfigs that we've validated already.
+  std::unordered_set<std::string_view> known_schema_types;
+
+  // Tracks SchemaTypeConfigs that have been mentioned (by other
+  // SchemaTypeConfigs), but we haven't validated yet.
+  std::unordered_set<std::string_view> unknown_schema_types;
+
+  // Tracks PropertyConfigs within a SchemaTypeConfig that we've validated
+  // already.
+  std::unordered_set<std::string_view> known_property_names;
+
+  for (const auto& type_config : schema.types()) {
+    std::string_view schema_type(type_config.schema_type());
+    ICING_RETURN_IF_ERROR(ValidateSchemaType(schema_type));
+
+    // We can't have duplicate schema_types
+    if (!known_schema_types.insert(schema_type).second) {
+      return absl_ports::AlreadyExistsError(absl_ports::StrCat(
+          "Field 'schema_type' '", schema_type, "' is already defined"));
+    }
+    unknown_schema_types.erase(schema_type);
+
+    // We only care about properties being unique within one type_config
+    known_property_names.clear();
+    for (const auto& property_config : type_config.properties()) {
+      std::string_view property_name(property_config.property_name());
+      ICING_RETURN_IF_ERROR(ValidatePropertyName(property_name, schema_type));
+
+      // Property names must be unique
+      if (!known_property_names.insert(property_name).second) {
+        return absl_ports::AlreadyExistsError(absl_ports::StrCat(
+            "Field 'property_name' '", property_name,
+            "' is already defined for schema '", schema_type, "'"));
+      }
+
+      auto data_type = property_config.data_type();
+      ICING_RETURN_IF_ERROR(
+          ValidateDataType(data_type, schema_type, property_name));
+
+      if (data_type == PropertyConfigProto::DataType::DOCUMENT) {
+        // Need to know what schema_type these Document properties should be
+        // validated against
+        std::string_view property_schema_type(property_config.schema_type());
+        ICING_RETURN_IF_ERROR(ValidatePropertySchemaType(
+            property_schema_type, schema_type, property_name));
+
+        // Need to make sure we eventually see/validate this schema_type
+        if (known_schema_types.count(property_schema_type) == 0) {
+          unknown_schema_types.insert(property_schema_type);
+        }
+      }
+
+      ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(),
+                                                schema_type, property_name));
+
+      ICING_RETURN_IF_ERROR(
+          ValidateIndexingConfig(property_config.indexing_config(), data_type));
+    }
+  }
+
+  // A Document property claimed to be of a schema_type that we never
+  // saw/validated
+  if (!unknown_schema_types.empty()) {
+    return absl_ports::UnknownError(
+        absl_ports::StrCat("Undefined 'schema_type's: ",
+                           absl_ports::StrJoin(unknown_schema_types, ",")));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status SchemaUtil::ValidateSchemaType(
+    std::string_view schema_type) {
+  // A schema_type is mandatory and restricted to alphanumeric characters.
+  if (schema_type.empty()) {
+    return absl_ports::InvalidArgumentError(
+        "Field 'schema_type' cannot be empty.");
+  }
+
+  if (isAlphaNumeric(schema_type)) {
+    return libtextclassifier3::Status::OK;
+  }
+
+  return absl_ports::InvalidArgumentError(
+      absl_ports::StrCat("Field 'schema_type' '", schema_type,
+                         "' can only contain "
+                         "alphanumeric characters."));
+}
+
+libtextclassifier3::Status SchemaUtil::ValidatePropertyName(
+    std::string_view property_name, std::string_view schema_type) {
+  // Require a property_name
+  if (property_name.empty()) {
+    return absl_ports::InvalidArgumentError(
+        absl_ports::StrCat("Field 'property_name' for schema '", schema_type,
+                           "' cannot be empty."));
+  }
+
+  // Only support alphanumeric values. Reuses the shared helper instead of an
+  // inline loop, consistent with ValidateSchemaType.
+  if (!isAlphaNumeric(property_name)) {
+    // Bug fix: report the offending property_name — the previous message
+    // printed the schema_type here, which made the error misleading.
+    return absl_ports::InvalidArgumentError(
+        absl_ports::StrCat("Field 'property_name' '", property_name,
+                           "' can only contain alphanumeric characters."));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status SchemaUtil::ValidateDataType(
+    PropertyConfigProto::DataType::Code data_type, std::string_view schema_type,
+    std::string_view property_name) {
+  // UNKNOWN is the proto default value, kept only for backwards
+  // compatibility; every property must declare an explicit data type.
+  if (data_type != PropertyConfigProto::DataType::UNKNOWN) {
+    return libtextclassifier3::Status::OK;
+  }
+
+  return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+      "Field 'data_type' cannot be UNKNOWN for schema property '",
+      schema_type, " ", property_name, "'"));
+}
+
+libtextclassifier3::Status SchemaUtil::ValidatePropertySchemaType(
+    std::string_view property_schema_type, std::string_view schema_type,
+    std::string_view property_name) {
+  // DOCUMENT-typed properties must say which schema type their nested
+  // documents conform to.
+  if (!property_schema_type.empty()) {
+    return libtextclassifier3::Status::OK;
+  }
+  return absl_ports::InvalidArgumentError(
+      absl_ports::StrCat("Field 'schema_type' is required for DOCUMENT "
+                         "data_types in schema property '",
+                         schema_type, " ", property_name, "'"));
+}
+
+libtextclassifier3::Status SchemaUtil::ValidateCardinality(
+    PropertyConfigProto::Cardinality::Code cardinality,
+    std::string_view schema_type, std::string_view property_name) {
+  // UNKNOWN is the proto default value, kept only for backwards
+  // compatibility; every property must declare an explicit cardinality.
+  if (cardinality != PropertyConfigProto::Cardinality::UNKNOWN) {
+    return libtextclassifier3::Status::OK;
+  }
+
+  return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+      "Field 'cardinality' cannot be UNKNOWN for schema property '",
+      schema_type, " ", property_name, "'"));
+}
+
+libtextclassifier3::Status SchemaUtil::ValidateIndexingConfig(
+    const IndexingConfig& config,
+    PropertyConfigProto::DataType::Code data_type) {
+  // DOCUMENT properties skip this check entirely; for all other data types,
+  // a property that declares a term match type must also declare a real
+  // tokenizer, otherwise its content could never be tokenized for indexing.
+  if (data_type != PropertyConfigProto::DataType::DOCUMENT &&
+      config.term_match_type() != TermMatchType::UNKNOWN &&
+      config.tokenizer_type() == IndexingConfig::TokenizerType::NONE) {
+    return absl_ports::InvalidArgumentError(
+        "TermMatchType properties cannot have a tokenizer type of NONE");
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+void SchemaUtil::BuildTypeConfigMap(
+    const SchemaProto& schema, SchemaUtil::TypeConfigMap* type_config_map) {
+  // Rebuild the map from scratch: schema_type name -> its full type config.
+  type_config_map->clear();
+  for (int i = 0; i < schema.types_size(); ++i) {
+    const SchemaTypeConfigProto& type_config = schema.types(i);
+    type_config_map->emplace(type_config.schema_type(), type_config);
+  }
+}
+
+void SchemaUtil::BuildPropertyConfigMap(
+    const SchemaTypeConfigProto& type_config,
+    std::unordered_map<std::string_view, const PropertyConfigProto*>*
+        property_config_map,
+    int32_t* num_required_properties) {
+  // TODO(samzheng): consider caching property_config_map for some properties,
+  // e.g. using LRU cache. Or changing schema.proto to use go/protomap.
+  property_config_map->clear();
+  int32_t required_count = 0;
+  for (const PropertyConfigProto& property_config : type_config.properties()) {
+    if (property_config.cardinality() ==
+        PropertyConfigProto::Cardinality::REQUIRED) {
+      ++required_count;
+    }
+    // The stored pointers alias `type_config`'s properties; the caller must
+    // keep `type_config` alive while using the map.
+    property_config_map->emplace(property_config.property_name(),
+                                 &property_config);
+  }
+  *num_required_properties = required_count;
+}
+
+// NOTE(review): returning `const SchemaDelta` by value inhibits move
+// semantics at call sites; consider dropping the const (needs a matching
+// header change).
+const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
+    const SchemaProto& old_schema, const SchemaProto& new_schema) {
+  SchemaDelta schema_delta;
+  schema_delta.index_incompatible = false;
+
+  TypeConfigMap new_type_config_map;
+  BuildTypeConfigMap(new_schema, &new_type_config_map);
+
+  // Iterate through and check each field of the old schema
+  for (const auto& old_type_config : old_schema.types()) {
+    auto new_schema_type_and_config =
+        new_type_config_map.find(old_type_config.schema_type());
+
+    if (new_schema_type_and_config == new_type_config_map.end()) {
+      // Didn't find the old schema type in the new schema, all the old
+      // documents of this schema type are invalid without the schema
+      ICING_VLOG(1) << absl_ports::StrCat("Previously defined schema type ",
+                                          old_type_config.schema_type(),
+                                          " was not defined in new schema");
+      schema_delta.schema_types_deleted.insert(old_type_config.schema_type());
+      continue;
+    }
+
+    std::unordered_map<std::string_view, const PropertyConfigProto*>
+        new_property_map;
+    int32_t new_required_properties = 0;
+    BuildPropertyConfigMap(new_schema_type_and_config->second,
+                           &new_property_map, &new_required_properties);
+
+    // We only need to check the old, existing properties to see if they're
+    // compatible since we'll have old data that may be invalidated or need to
+    // be reindexed. New properties don't have any data that would be
+    // invalidated or incompatible, so we blanket accept all new properties.
+    int32_t old_required_properties = 0;
+    for (const auto& old_property_config : old_type_config.properties()) {
+      auto new_property_name_and_config =
+          new_property_map.find(old_property_config.property_name());
+
+      if (new_property_name_and_config == new_property_map.end()) {
+        // Didn't find the old property. Note: such properties are not counted
+        // in old_required_properties below, but the type is already marked
+        // incompatible here, so the REQUIRED-count comparison is moot for it.
+        ICING_VLOG(1) << absl_ports::StrCat("Previously defined property type ",
+                                            old_type_config.schema_type(), ".",
+                                            old_property_config.property_name(),
+                                            " was not defined in new schema");
+        schema_delta.schema_types_incompatible.insert(
+            old_type_config.schema_type());
+        continue;
+      }
+
+      const PropertyConfigProto* new_property_config =
+          new_property_name_and_config->second;
+
+      if (!IsPropertyCompatible(old_property_config, *new_property_config)) {
+        ICING_VLOG(1) << absl_ports::StrCat(
+            "Property ", old_type_config.schema_type(), ".",
+            old_property_config.property_name(), " is incompatible.");
+        schema_delta.schema_types_incompatible.insert(
+            old_type_config.schema_type());
+      }
+
+      if (old_property_config.cardinality() ==
+          PropertyConfigProto::Cardinality::REQUIRED) {
+        ++old_required_properties;
+      }
+
+      // Any change in the indexed property requires a reindexing
+      if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(),
+                                     new_property_config->indexing_config())) {
+        schema_delta.index_incompatible = true;
+      }
+    }
+
+    // We can't have new properties that are REQUIRED since we won't know how
+    // to backfill the data, and the existing data will be invalid. We're
+    // guaranteed from our previous checks that all the old properties are also
+    // present in the new property config, so we can do a simple int comparison
+    // here to detect new required properties.
+    if (new_required_properties > old_required_properties) {
+      ICING_VLOG(1) << absl_ports::StrCat(
+          "New schema ", old_type_config.schema_type(),
+          " has REQUIRED properties that are not "
+          "present in the previously defined schema");
+      schema_delta.schema_types_incompatible.insert(
+          old_type_config.schema_type());
+    }
+  }
+
+  return schema_delta;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h
new file mode 100644
index 0000000..70a9ad2
--- /dev/null
+++ b/icing/schema/schema-util.h
@@ -0,0 +1,153 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_SCHEMA_UTIL_H_
+#define ICING_SCHEMA_SCHEMA_UTIL_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "utils/base/status.h"
+#include "icing/proto/schema.pb.h"
+
+namespace icing {
+namespace lib {
+
+// Stateless helpers for validating a SchemaProto and for diffing two schemas
+// to determine what existing data/index state a schema change invalidates.
+class SchemaUtil {
+ public:
+  // NOTE(review): the mapped type is const, which forbids operator[] and
+  // in-place mutation of entries; confirm this is intentional before relying
+  // on it elsewhere.
+  using TypeConfigMap =
+      std::unordered_map<std::string, const SchemaTypeConfigProto>;
+
+  struct SchemaDelta {
+    // Whether an indexing config has changed, requiring the index to be
+    // regenerated. We don't list out all the types that make the index
+    // incompatible because our index isn't optimized for that. It's much easier
+    // to reset the entire index and reindex every document.
+    bool index_incompatible = false;
+
+    // Which schema types were present in the old schema, but were deleted from
+    // the new schema.
+    std::unordered_set<std::string> schema_types_deleted;
+
+    // Which schema types had their SchemaTypeConfigProto changed in a way that
+    // could invalidate existing Documents of that schema type.
+    std::unordered_set<std::string> schema_types_incompatible;
+
+    bool operator==(const SchemaDelta& other) const {
+      return index_incompatible == other.index_incompatible &&
+             schema_types_deleted == other.schema_types_deleted &&
+             schema_types_incompatible == other.schema_types_incompatible;
+    }
+  };
+
+  // This function validates:
+  //   1. SchemaTypeConfigProto.schema_type's must be unique
+  //   2. Properties within one SchemaTypeConfigProto must be unique
+  //   3. SchemaTypeConfigProtos.schema_type must be non-empty
+  //   4. PropertyConfigProtos.property_name must be non-empty
+  //   5. PropertyConfigProtos.property_name's must be unique within one
+  //      SchemaTypeConfigProto
+  //   6. PropertyConfigProtos.data_type cannot be UNKNOWN
+  //   7. PropertyConfigProtos.data_type of DOCUMENT must also have a
+  //      schema_type
+  //   8. PropertyConfigProtos.cardinality cannot be UNKNOWN
+  //   9. PropertyConfigProtos.schema_type's must correspond to a
+  //      SchemaTypeConfigProto.schema_type
+  //  10. All string fields must be alphanumeric.
+  //
+  // Returns:
+  //   ALREADY_EXISTS for case 1 and 2
+  //   INVALID_ARGUMENT for 3-10
+  //   OK otherwise
+  static libtextclassifier3::Status Validate(const SchemaProto& schema);
+
+  // Creates a mapping of schema type -> schema type config proto. The
+  // type_config_map is cleared, and then each schema-type_config_proto pair is
+  // placed in the given type_config_map parameter.
+  static void BuildTypeConfigMap(const SchemaProto& schema,
+                                 TypeConfigMap* type_config_map);
+
+  // Calculate and return a hash map of (property name -> property config)
+  // from the given type config. The number of required properties will be
+  // assigned to output param num_required_properties. The pointers stored in
+  // the map alias `type_config`, which must outlive the map.
+  static void BuildPropertyConfigMap(
+      const SchemaTypeConfigProto& type_config,
+      std::unordered_map<std::string_view, const PropertyConfigProto*>*
+          property_config_map,
+      int32_t* num_required_properties);
+
+  // Computes the delta between the old and new schema. There are a few
+  // differences that'll be reported:
+  //   1. The derived index would be incompatible. This is held in
+  //      `SchemaDelta.index_incompatible`.
+  //   2. Some schema types existed in the old schema, but have been deleted
+  //      from the new schema. This is held in
+  //      `SchemaDelta.schema_types_deleted`
+  //   3. A schema type's new definition would mean any existing data of the old
+  //      definition is now incompatible.
+  //
+  // For case 1, the two schemas would result in an incompatible index if:
+  //   1.1. The new SchemaProto has a different set of indexed properties than
+  //        the old SchemaProto.
+  //
+  // For case 3, the two schemas would result in incompatible data if:
+  //   3.1. A SchemaTypeConfig exists in the old SchemaProto, but is not in the
+  //        new SchemaProto
+  //   3.2. A property exists in the old SchemaTypeConfig, but is not in the new
+  //        SchemaTypeConfig
+  //   3.3. A property is in the new SchemaTypeConfig and has a REQUIRED
+  //        PropertyConfigProto.cardinality, but is not in the old
+  //        SchemaTypeConfig
+  //   3.4. A property is in both the old and new SchemaTypeConfig, but its
+  //        PropertyConfigProto.data_type is different
+  //   3.5. A property is in both the old and new SchemaTypeConfig, but its
+  //        PropertyConfigProto.schema_type is different
+  //   3.6. A property is in both the old and new SchemaTypeConfig, but its new
+  //        PropertyConfigProto.cardinality is more restrictive. Restrictive
+  //        scale defined as:
+  //        LEAST <REPEATED - OPTIONAL - REQUIRED> MOST
+  //
+  // A property is defined by the combination of the
+  // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name.
+  //
+  // Returns a SchemaDelta that captures the aforementioned differences.
+  static const SchemaDelta ComputeCompatibilityDelta(
+      const SchemaProto& old_schema, const SchemaProto& new_schema);
+
+ private:
+  static libtextclassifier3::Status ValidateSchemaType(
+      std::string_view schema_type);
+  static libtextclassifier3::Status ValidatePropertyName(
+      std::string_view property_name, std::string_view schema_type);
+  static libtextclassifier3::Status ValidateDataType(
+      PropertyConfigProto::DataType::Code data_type,
+      std::string_view schema_type, std::string_view property_name);
+  static libtextclassifier3::Status ValidatePropertySchemaType(
+      std::string_view property_schema_type, std::string_view schema_type,
+      std::string_view property_name);
+  static libtextclassifier3::Status ValidateCardinality(
+      PropertyConfigProto::Cardinality::Code cardinality,
+      std::string_view schema_type, std::string_view property_name);
+  static libtextclassifier3::Status ValidateIndexingConfig(
+      const IndexingConfig& config,
+      PropertyConfigProto::DataType::Code data_type);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_SCHEMA_UTIL_H_
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
new file mode 100644
index 0000000..67cfb50
--- /dev/null
+++ b/icing/schema/schema-util_test.cc
@@ -0,0 +1,575 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-util.h"
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+// Schema type names used throughout these tests
+constexpr char kEmailType[] = "EmailMessage";
+constexpr char kPersonType[] = "Person";
+
+class SchemaUtilTest : public ::testing::Test {
+ protected:
+  // Schema under construction; each test adds types to it before validating.
+  SchemaProto schema_proto_;
+
+  // Builds a SchemaTypeConfigProto named `schema_type` with one property of
+  // each primitive data type (STRING, INT64, DOUBLE, BOOLEAN, BYTES). If
+  // `nested_schema_type` is non-empty, a repeated DOCUMENT property pointing
+  // at that type is appended as well.
+  static SchemaTypeConfigProto CreateSchemaTypeConfig(
+      const std::string_view schema_type,
+      const std::string_view nested_schema_type = "") {
+    SchemaTypeConfigProto type;
+    type.set_schema_type(std::string(schema_type));
+
+    auto string_property = type.add_properties();
+    string_property->set_property_name("string");
+    string_property->set_data_type(PropertyConfigProto::DataType::STRING);
+    string_property->set_cardinality(
+        PropertyConfigProto::Cardinality::REQUIRED);
+
+    auto int_property = type.add_properties();
+    int_property->set_property_name("int");
+    int_property->set_data_type(PropertyConfigProto::DataType::INT64);
+    int_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+    auto double_property = type.add_properties();
+    double_property->set_property_name("double");
+    double_property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
+    double_property->set_cardinality(
+        PropertyConfigProto::Cardinality::REPEATED);
+
+    auto bool_property = type.add_properties();
+    bool_property->set_property_name("boolean");
+    bool_property->set_data_type(PropertyConfigProto::DataType::BOOLEAN);
+    bool_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+
+    auto bytes_property = type.add_properties();
+    bytes_property->set_property_name("bytes");
+    bytes_property->set_data_type(PropertyConfigProto::DataType::BYTES);
+    bytes_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+
+    if (!nested_schema_type.empty()) {
+      auto document_property = type.add_properties();
+      document_property->set_property_name("document");
+      document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+      document_property->set_cardinality(
+          PropertyConfigProto::Cardinality::REPEATED);
+      document_property->set_schema_type(std::string(nested_schema_type));
+    }
+
+    return type;
+  }
+};
+
+TEST_F(SchemaUtilTest, Valid_Empty) {
+  // A schema with no types at all is trivially valid.
+  ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+}
+
+TEST_F(SchemaUtilTest, Valid_Nested) {
+  // A DOCUMENT property may reference a type defined later in the schema.
+  auto email_type = schema_proto_.add_types();
+  *email_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+
+  auto person_type = schema_proto_.add_types();
+  *person_type = CreateSchemaTypeConfig(kPersonType);
+
+  ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+}
+
+TEST_F(SchemaUtilTest, Valid_ClearedPropertyConfigs) {
+  // No property fields is technically ok, but probably not realistic.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+  type->clear_properties();
+
+  ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+}
+
+TEST_F(SchemaUtilTest, Invalid_ClearedSchemaType) {
+  // Clearing schema_type leaves it empty, which Validate rejects.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+  type->clear_schema_type();
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_EmptySchemaType) {
+  // An explicitly empty schema_type string is also rejected.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+  type->set_schema_type("");
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_NotAlphanumericSchemaType) {
+  // schema_type is restricted to alphanumeric characters; "_" fails.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+  type->set_schema_type("_");
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_ClearedPropertyName) {
+  // Clearing property_name leaves it empty, which Validate rejects.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->clear_property_name();
+  property->set_data_type(PropertyConfigProto::DataType::STRING);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_EmptyPropertyName) {
+  // An explicitly empty property_name string is also rejected.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("");
+  property->set_data_type(PropertyConfigProto::DataType::STRING);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_NotAlphanumericPropertyName) {
+  // property_name is restricted to alphanumeric characters; "_" fails.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("_");
+  property->set_data_type(PropertyConfigProto::DataType::STRING);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_DuplicatePropertyName) {
+  // Two properties with the same name within one type -> ALREADY_EXISTS.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto first_property = type->add_properties();
+  first_property->set_property_name("DuplicatedProperty");
+  first_property->set_data_type(PropertyConfigProto::DataType::STRING);
+  first_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+  auto second_property = type->add_properties();
+  second_property->set_property_name("DuplicatedProperty");
+  second_property->set_data_type(PropertyConfigProto::DataType::STRING);
+  second_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::ALREADY_EXISTS));
+}
+
+TEST_F(SchemaUtilTest, Invalid_ClearedDataType) {
+  // Clearing data_type resets it to UNKNOWN, which Validate rejects.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewProperty");
+  property->clear_data_type();
+  property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_UnknownDataType) {
+  // An explicitly UNKNOWN data_type is also rejected.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewProperty");
+  property->set_data_type(PropertyConfigProto::DataType::UNKNOWN);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_ClearedCardinality) {
+  // Clearing cardinality resets it to UNKNOWN, which Validate rejects.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewProperty");
+  property->set_data_type(PropertyConfigProto::DataType::STRING);
+  property->clear_cardinality();
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_UnknownCardinality) {
+  // An explicitly UNKNOWN cardinality is also rejected.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewProperty");
+  property->set_data_type(PropertyConfigProto::DataType::STRING);
+  property->set_cardinality(PropertyConfigProto::Cardinality::UNKNOWN);
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_ClearedPropertySchemaType) {
+  // A DOCUMENT property must name the schema_type of its nested documents.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewProperty");
+  property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+  property->clear_schema_type();
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_EmptyPropertySchemaType) {
+  // An explicitly empty schema_type on a DOCUMENT property is also rejected.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewProperty");
+  property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+  property->set_schema_type("");
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SchemaUtilTest, Invalid_NoMatchingSchemaType) {
+  // A DOCUMENT property referencing a schema_type never defined anywhere in
+  // the schema fails with UNKNOWN.
+  auto type = schema_proto_.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewProperty");
+  property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+  property->set_schema_type("NewSchemaType");
+
+  ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+              StatusIs(libtextclassifier3::StatusCode::UNKNOWN));
+}
+
+TEST_F(SchemaUtilTest, NewOptionalPropertyIsCompatible) {
+  // Configure old schema
+  SchemaProto old_schema;
+  auto type = old_schema.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  // Configure new schema with an optional field, not considered incompatible
+  // since it's fine if old data doesn't have this optional field
+  SchemaProto new_schema_with_optional;
+  type = new_schema_with_optional.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("NewOptional");
+  property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
+  property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+  // Expect a default (empty) delta: nothing deleted, nothing incompatible.
+  SchemaUtil::SchemaDelta schema_delta;
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema,
+                                                    new_schema_with_optional),
+              Eq(schema_delta));
+}
+
+TEST_F(SchemaUtilTest, NewRequiredPropertyIsIncompatible) {
+  // Old schema: a plain email type.
+  SchemaProto old_schema;
+  *old_schema.add_types() = CreateSchemaTypeConfig(kEmailType);
+
+  // New schema adds a REQUIRED field. Old documents cannot contain it, so
+  // the email type must be flagged as incompatible.
+  SchemaProto new_schema_with_required;
+  auto new_type = new_schema_with_required.add_types();
+  *new_type = CreateSchemaTypeConfig(kEmailType);
+
+  auto new_property = new_type->add_properties();
+  new_property->set_property_name("NewRequired");
+  new_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+  new_property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
+
+  SchemaUtil::SchemaDelta expected_delta;
+  expected_delta.schema_types_incompatible.emplace(kEmailType);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema,
+                                                    new_schema_with_required),
+              Eq(expected_delta));
+}
+
+TEST_F(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) {
+  // Old schema: email type with an optional int64 property.
+  SchemaProto old_schema;
+  auto old_type = old_schema.add_types();
+  *old_type = CreateSchemaTypeConfig(kEmailType);
+
+  auto old_property = old_type->add_properties();
+  old_property->set_property_name("OldOptional");
+  old_property->set_data_type(PropertyConfigProto::DataType::INT64);
+  old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+  // New schema drops that property entirely. A new schema must keep every
+  // previously defined property, so the type is incompatible.
+  SchemaProto new_schema;
+  *new_schema.add_types() = CreateSchemaTypeConfig(kEmailType);
+
+  SchemaUtil::SchemaDelta expected_delta;
+  expected_delta.schema_types_incompatible.emplace(kEmailType);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+              Eq(expected_delta));
+}
+
+TEST_F(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) {
+  // Configure less restrictive schema based on cardinality (REPEATED allows
+  // any number of values).
+  SchemaProto less_restrictive_schema;
+  auto type = less_restrictive_schema.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  auto property = type->add_properties();
+  property->set_property_name("Property");
+  property->set_data_type(PropertyConfigProto::DataType::INT64);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+
+  // Configure more restrictive schema based on cardinality (OPTIONAL allows
+  // at most one value).
+  SchemaProto more_restrictive_schema;
+  type = more_restrictive_schema.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+
+  property = type->add_properties();
+  property->set_property_name("Property");
+  property->set_data_type(PropertyConfigProto::DataType::INT64);
+  property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+  // The new schema can't be more restrictive than the old: REPEATED->OPTIONAL
+  // would invalidate old documents that stored multiple values.
+  SchemaUtil::SchemaDelta incompatible_schema_delta;
+  incompatible_schema_delta.schema_types_incompatible.emplace(kEmailType);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+                  /*old_schema=*/less_restrictive_schema,
+                  /*new_schema=*/more_restrictive_schema),
+              Eq(incompatible_schema_delta));
+
+  // The new schema may be less restrictive: OPTIONAL->REPEATED still accepts
+  // every old document.
+  SchemaUtil::SchemaDelta compatible_schema_delta;
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+                  /*old_schema=*/more_restrictive_schema,
+                  /*new_schema=*/less_restrictive_schema),
+              Eq(compatible_schema_delta));
+}
+
+TEST_F(SchemaUtilTest, DifferentDataTypeIsIncompatible) {
+  // Old schema: email type with a repeated int64 property.
+  SchemaProto old_schema;
+  auto old_type = old_schema.add_types();
+  *old_type = CreateSchemaTypeConfig(kEmailType);
+
+  auto old_property = old_type->add_properties();
+  old_property->set_property_name("Property");
+  old_property->set_data_type(PropertyConfigProto::DataType::INT64);
+  old_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+
+  // New schema: same property, but its data type changed to double. Existing
+  // int64 data no longer matches, so the type is incompatible.
+  SchemaProto new_schema;
+  auto new_type = new_schema.add_types();
+  *new_type = CreateSchemaTypeConfig(kEmailType);
+
+  auto new_property = new_type->add_properties();
+  new_property->set_property_name("Property");
+  new_property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
+  new_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+
+  SchemaUtil::SchemaDelta expected_delta;
+  expected_delta.schema_types_incompatible.emplace(kEmailType);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+              Eq(expected_delta));
+}
+
+TEST_F(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) {
+  // Configure old schema, where Property is supposed to be a Person type.
+  SchemaProto old_schema;
+  auto type = old_schema.add_types();
+  *type = CreateSchemaTypeConfig(kPersonType);
+
+  // BUG FIX: the original assigned the Email config over the just-added
+  // Person entry (missing add_types()), so the schema never contained the
+  // Person type that the property below references. Add Email as its own
+  // entry instead.
+  type = old_schema.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+  auto property = type->add_properties();
+  property->set_property_name("Property");
+  property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+  property->set_schema_type(kPersonType);
+
+  // Configure new schema, where Property is supposed to be an Email type.
+  SchemaProto new_schema;
+  type = new_schema.add_types();
+  *type = CreateSchemaTypeConfig(kPersonType);
+
+  type = new_schema.add_types();
+  *type = CreateSchemaTypeConfig(kEmailType);
+  property = type->add_properties();
+  property->set_property_name("Property");
+  property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+  property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+  property->set_schema_type(kEmailType);
+
+  // Only the Email type's nested document type changed, so only it is
+  // reported as incompatible.
+  SchemaUtil::SchemaDelta schema_delta;
+  schema_delta.schema_types_incompatible.emplace(kEmailType);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+              Eq(schema_delta));
+}
+
+TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) {
+  // Configure old schema
+  SchemaProto old_schema;
+  auto old_type = old_schema.add_types();
+  *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+
+  auto old_property = old_type->add_properties();
+  old_property->set_property_name("Property");
+  old_property->set_data_type(PropertyConfigProto::DataType::STRING);
+  old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+  // Configure new schema
+  SchemaProto new_schema;
+  auto new_type = new_schema.add_types();
+  *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+
+  auto new_property = new_type->add_properties();
+  new_property->set_property_name("Property");
+  new_property->set_data_type(PropertyConfigProto::DataType::STRING);
+  new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+  // Either direction of change (gaining or losing an indexed property) must
+  // set index_incompatible; both assertions reuse this same expected delta.
+  SchemaUtil::SchemaDelta schema_delta;
+  schema_delta.index_incompatible = true;
+
+  // New schema gained a new indexed property (UNKNOWN means not indexed).
+  old_property->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::UNKNOWN);
+  new_property->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+              Eq(schema_delta));
+
+  // New schema lost an indexed property. Note this mutates the same protos
+  // in place, so order of these two scenarios matters.
+  old_property->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::EXACT_ONLY);
+  new_property->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::UNKNOWN);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+              Eq(schema_delta));
+}
+
+TEST_F(SchemaUtilTest, AddingTypeIsCompatible) {
+  // Adding a brand-new type can't invalidate existing data: no stored
+  // document belongs to the new type yet.
+  SchemaProto old_schema;
+  *old_schema.add_types() = CreateSchemaTypeConfig(kEmailType);
+
+  SchemaProto new_schema;
+  *new_schema.add_types() = CreateSchemaTypeConfig(kEmailType);
+  *new_schema.add_types() = CreateSchemaTypeConfig(kPersonType);
+
+  // An empty delta means nothing was deleted or made incompatible.
+  SchemaUtil::SchemaDelta expected_delta;
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+              Eq(expected_delta));
+}
+
+TEST_F(SchemaUtilTest, DeletingTypeIsNoted) {
+  // Removing a type orphans any stored documents of that type, so the
+  // deletion must be surfaced in the delta.
+  SchemaProto old_schema;
+  *old_schema.add_types() = CreateSchemaTypeConfig(kEmailType);
+  *old_schema.add_types() = CreateSchemaTypeConfig(kPersonType);
+
+  SchemaProto new_schema;
+  *new_schema.add_types() = CreateSchemaTypeConfig(kEmailType);
+
+  SchemaUtil::SchemaDelta expected_delta;
+  expected_delta.schema_types_deleted.emplace(kPersonType);
+  EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+              Eq(expected_delta));
+}
+
+TEST_F(SchemaUtilTest, ValidateNoTokenizer) {
+  SchemaProto schema;
+  auto* type = schema.add_types();
+  type->set_schema_type("MyType");
+
+  // An indexed (EXACT_ONLY) string property without a tokenizer is invalid:
+  // the indexer has no way to split the content into terms.
+  auto* prop = type->add_properties();
+  prop->set_property_name("Foo");
+  prop->set_data_type(PropertyConfigProto::DataType::STRING);
+  prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+  prop->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::EXACT_ONLY);
+  EXPECT_THAT(SchemaUtil::Validate(schema),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  // Setting a tokenizer makes the same schema valid.
+  prop->mutable_indexing_config()->set_tokenizer_type(
+      IndexingConfig::TokenizerType::PLAIN);
+  EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
+}
+
+TEST_F(SchemaUtilTest, ValidateDocumentNoTokenizer) {
+  SchemaProto schema;
+  auto* type = schema.add_types();
+  type->set_schema_type("OtherType");
+
+  type = schema.add_types();
+  type->set_schema_type("MyType");
+
+  // Unlike string properties, a DOCUMENT property may be indexed with
+  // tokenizer NONE: the nested document's own sections carry the tokenizers.
+  auto* prop = type->add_properties();
+  prop->set_property_name("SubType");
+  prop->set_schema_type("OtherType");
+  prop->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+  prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+  prop->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::EXACT_ONLY);
+  prop->mutable_indexing_config()->set_tokenizer_type(
+      IndexingConfig::TokenizerType::NONE);
+
+  EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc
new file mode 100644
index 0000000..85f0768
--- /dev/null
+++ b/icing/schema/section-manager.cc
@@ -0,0 +1,371 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/section-manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/key-mapper.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+// Maps a type config name to the ordered list of sections assigned to it.
+// NOTE(review): this alias does not appear to be referenced anywhere in this
+// file — verify whether it can be removed.
+using TypeSectionMap =
+    std::unordered_map<std::string, const std::vector<SectionMetadata>>;
+
+// This state helps detect infinite loops (e.g. two type configs referencing
+// each other) when assigning sections. The combination of 'number of sections
+// assigned' and 'current schema name' represents a unique state in the
+// section-assign process. If the same state is seen a second time, that means
+// there's an infinite loop.
+struct SectionAssigningState {
+  // How many sections had already been assigned when this type was visited.
+  size_t num_sections_assigned;
+  // Name of the schema type being visited.
+  std::string current_schema_name;
+
+  SectionAssigningState(size_t num_sections_assigned_in,
+                        std::string&& current_schema_name_in)
+      : num_sections_assigned(num_sections_assigned_in),
+        current_schema_name(std::move(current_schema_name_in)) {}
+};
+
+// Provides a hash value of this struct so that it can be stored in a hash
+// set.
+struct SectionAssigningStateHasher {
+  // FIX: operator() must be const — the Cpp17Hash requirement (and
+  // std::unordered_set, which is instantiated with this hasher below) needs
+  // the call operator to be invocable on a const hasher.
+  size_t operator()(const SectionAssigningState& state) const {
+    size_t str_hash = std::hash<std::string>()(state.current_schema_name);
+    size_t int_hash = std::hash<size_t>()(state.num_sections_assigned);
+    // Combine the two hashes: keep bits 16-31 of the string hash and bits
+    // 0-15 of the integer hash so both values influence the result.
+    return (str_hash & 0xFFFF0000) | (int_hash & 0x0000FFFF);
+  }
+};
+
+// Two states are equal iff both the assigned-section count and the schema
+// name match; required for membership tests in the visited-states hash set.
+bool operator==(const SectionAssigningState& lhs,
+                const SectionAssigningState& rhs) {
+  if (lhs.num_sections_assigned != rhs.num_sections_assigned) {
+    return false;
+  }
+  return lhs.current_schema_name == rhs.current_schema_name;
+}
+
+// Joins a parent section path with the next property name, inserting the
+// property separator only when the parent path is non-empty.
+std::string ConcatenatePath(const std::string& path,
+                            const std::string& next_property_name) {
+  if (!path.empty()) {
+    return absl_ports::StrCat(path, kPropertySeparator, next_property_name);
+  }
+  return next_property_name;
+}
+
+// Helper function to recursively identify sections from a type config and add
+// them to a section metadata list. current_section_path is the dotted path of
+// property names leading to type_config (empty for the top-level type).
+// visited_states detects cycles between type configs; metadata_list is the
+// output, with each section's id equal to its index in the list.
+libtextclassifier3::Status AssignSections(
+    const SchemaTypeConfigProto& type_config,
+    const std::string& current_section_path,
+    const SchemaUtil::TypeConfigMap& type_config_map,
+    std::unordered_set<SectionAssigningState, SectionAssigningStateHasher>*
+        visited_states,
+    std::vector<SectionMetadata>* metadata_list) {
+  if (!visited_states
+           ->emplace(metadata_list->size(),
+                     std::string(type_config.schema_type()))
+           .second) {
+    // Failed to insert, the same state has been seen before, there's an
+    // infinite loop in type configs
+    return absl_ports::InvalidArgumentError(
+        "Infinite loop detected in type configs");
+  }
+
+  // Sorts properties by name's alphabetical order so that order doesn't affect
+  // section assigning. Sorting a copy; the proto itself is not modified.
+  auto sorted_properties = type_config.properties();
+  std::sort(sorted_properties.pointer_begin(), sorted_properties.pointer_end(),
+            [](const PropertyConfigProto* p1, const PropertyConfigProto* p2) {
+              return p1->property_name() < p2->property_name();
+            });
+  for (const auto& property_config : sorted_properties) {
+    if (property_config.indexing_config().term_match_type() ==
+        TermMatchType::UNKNOWN) {
+      // No need to create section for current property (not indexed)
+      continue;
+    }
+
+    // Creates section metadata according to data type
+    if (property_config.data_type() == PropertyConfigProto::DataType::STRING ||
+        property_config.data_type() == PropertyConfigProto::DataType::INT64 ||
+        property_config.data_type() == PropertyConfigProto::DataType::DOUBLE) {
+      // Validates next section id, makes sure that section id is the same as
+      // the list index so that we could find any section metadata by id in O(1)
+      // later.
+      auto new_section_id = static_cast<SectionId>(metadata_list->size());
+      if (!IsSectionIdValid(new_section_id)) {
+        // Max number of sections reached
+        return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+            "Too many properties to be indexed, max number of properties "
+            "allowed: %d",
+            kMaxSectionId - kMinSectionId + 1));
+      }
+      // Creates section metadata from property config
+      metadata_list->emplace_back(
+          new_section_id, property_config.indexing_config().term_match_type(),
+          property_config.indexing_config().tokenizer_type(),
+          ConcatenatePath(current_section_path,
+                          property_config.property_name()));
+    } else if (property_config.data_type() ==
+               PropertyConfigProto::DataType::DOCUMENT) {
+      // Tries to find sections recursively inside the nested document type
+      auto nested_type_config_iter =
+          type_config_map.find(property_config.schema_type());
+      if (nested_type_config_iter == type_config_map.end()) {
+        return absl_ports::NotFoundError(absl_ports::StrCat(
+            "type config not found: ", property_config.schema_type()));
+      }
+      const SchemaTypeConfigProto& nested_type_config =
+          nested_type_config_iter->second;
+      ICING_RETURN_IF_ERROR(
+          AssignSections(nested_type_config,
+                         ConcatenatePath(current_section_path,
+                                         property_config.property_name()),
+                         type_config_map, visited_states, metadata_list));
+    }
+    // NOTE: we don't create sections for BOOLEAN and BYTES data types.
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Builds a vector of vectors that holds SectionMetadatas for all the schema
+// types. The outer vector's index corresponds with a type's SchemaTypeId. The
+// inner vector's index corresponds to the section's SectionId.
+//
+// Returns the first error from AssignSections (cycle detected, too many
+// sections, unknown nested type) or from the schema_type_mapper lookup.
+libtextclassifier3::StatusOr<std::vector<std::vector<SectionMetadata>>>
+BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap& type_config_map,
+                          const KeyMapper<SchemaTypeId>& schema_type_mapper) {
+  // Create our vector and reserve the number of schema types we have
+  std::vector<std::vector<SectionMetadata>> section_metadata_cache(
+      schema_type_mapper.num_keys());
+
+  // Reused across iterations and cleared per type: loop detection is scoped
+  // to a single top-level type's section assignment.
+  std::unordered_set<SectionAssigningState, SectionAssigningStateHasher>
+      visited_states;
+  for (const auto& name_and_type : type_config_map) {
+    // Assigns sections for each type config
+    visited_states.clear();
+    const std::string& type_config_name = name_and_type.first;
+    const SchemaTypeConfigProto& type_config = name_and_type.second;
+    std::vector<SectionMetadata> metadata_list;
+    ICING_RETURN_IF_ERROR(
+        AssignSections(type_config, /*current_section_path*/ "",
+                       type_config_map, &visited_states, &metadata_list));
+
+    // Insert the section metadata list at the index of the type's SchemaTypeId
+    ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+                           schema_type_mapper.Get(type_config_name));
+    section_metadata_cache[schema_type_id] = std::move(metadata_list);
+  }
+  return section_metadata_cache;
+}
+
+// Helper function to get string content from a property. Each repeated value
+// becomes its own string in the returned vector (values are NOT joined). We
+// only care about STRING, INT64, and DOUBLE data types; any other type yields
+// an empty vector.
+std::vector<std::string> GetPropertyContent(const PropertyProto& property) {
+  std::vector<std::string> values;
+  if (!property.string_values().empty()) {
+    std::copy(property.string_values().begin(), property.string_values().end(),
+              std::back_inserter(values));
+  } else if (!property.int64_values().empty()) {
+    std::transform(
+        property.int64_values().begin(), property.int64_values().end(),
+        std::back_inserter(values),
+        [](int64_t i) { return IcingStringUtil::StringPrintf("%" PRId64, i); });
+  } else {
+    // NOTE: "%f" formats with a fixed 6 decimal places.
+    std::transform(
+        property.double_values().begin(), property.double_values().end(),
+        std::back_inserter(values),
+        [](double d) { return IcingStringUtil::StringPrintf("%f", d); });
+  }
+  return values;
+}
+
+// Helper function to get the metadata list of a type config by name.
+// Propagates the mapper's error (e.g. NOT_FOUND) if the name is unknown.
+// NOTE(review): the vector is returned by value (copied out of the cache) —
+// consider returning a pointer if this shows up in profiles.
+libtextclassifier3::StatusOr<std::vector<SectionMetadata>> GetMetadataList(
+    const KeyMapper<SchemaTypeId>& schema_type_mapper,
+    const std::vector<std::vector<SectionMetadata>>& section_metadata_cache,
+    const std::string& type_config_name) {
+  ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+                         schema_type_mapper.Get(type_config_name));
+  return section_metadata_cache.at(schema_type_id);
+}
+
+} // namespace
+
+// Private constructor; callers must go through Create(), which supplies a
+// non-null mapper and a fully built metadata cache.
+SectionManager::SectionManager(
+    const KeyMapper<SchemaTypeId>* schema_type_mapper,
+    std::vector<std::vector<SectionMetadata>>&& section_metadata_cache)
+    : schema_type_mapper_(*schema_type_mapper),
+      section_metadata_cache_(std::move(section_metadata_cache)) {}
+
+// Factory: builds the per-type section metadata cache up front, then wraps it
+// in a SectionManager. Errors from BuildSectionMetadataCache (cycles, too
+// many sections, unknown types) are propagated to the caller.
+libtextclassifier3::StatusOr<std::unique_ptr<SectionManager>>
+SectionManager::Create(const SchemaUtil::TypeConfigMap& type_config_map,
+                       const KeyMapper<SchemaTypeId>* schema_type_mapper) {
+  ICING_ASSIGN_OR_RETURN(
+      std::vector<std::vector<SectionMetadata>> section_metadata_cache,
+      BuildSectionMetadataCache(type_config_map, *schema_type_mapper));
+  // std::make_unique can't reach the private constructor, hence raw new.
+  return std::unique_ptr<SectionManager>(new SectionManager(
+      schema_type_mapper, std::move(section_metadata_cache)));
+}
+
+// Resolves a dotted section path (e.g. "sender.name") against a document by
+// recursing through nested document properties, collecting all values at the
+// leaf. Returns NOT_FOUND when the path doesn't resolve or yields no content.
+libtextclassifier3::StatusOr<std::vector<std::string>>
+SectionManager::GetSectionContent(const DocumentProto& document,
+                                  std::string_view section_path) const {
+  // Finds the first property name in section_path
+  size_t separator_position = section_path.find(kPropertySeparator);
+  std::string_view current_property_name =
+      (separator_position == std::string::npos)
+          ? section_path
+          : section_path.substr(0, separator_position);
+
+  // Tries to match the property name with the ones in document
+  auto property_iterator =
+      std::find_if(document.properties().begin(), document.properties().end(),
+                   [current_property_name](const PropertyProto& property) {
+                     return property.name() == current_property_name;
+                   });
+
+  if (property_iterator == document.properties().end()) {
+    // Property name not found, it could be one of the following 2 cases:
+    // 1. The property is optional and it's not in the document
+    // 2. The property name is invalid
+    return absl_ports::NotFoundError(
+        absl_ports::StrCat("Section path ", section_path,
+                           " not found in type config ", document.schema()));
+  }
+
+  if (separator_position == std::string::npos) {
+    // Current property name is the last one in section path
+    std::vector<std::string> content = GetPropertyContent(*property_iterator);
+    if (content.empty()) {
+      // The content of property is explicitly set to empty, we'll treat it as
+      // NOT_FOUND because the index doesn't care about empty strings.
+      return absl_ports::NotFoundError(
+          absl_ports::StrCat("Section path ", section_path,
+                             " not found in type config ", document.schema()));
+    }
+    return content;
+  }
+
+  // Gets section content recursively from every nested document under this
+  // property; values from all of them are flattened into one list. A nested
+  // document that lacks the sub-path is skipped rather than treated as an
+  // error.
+  std::string_view sub_section_path =
+      section_path.substr(separator_position + 1);
+  std::vector<std::string> nested_document_content;
+  for (const auto& nested_document : property_iterator->document_values()) {
+    auto content_or = GetSectionContent(nested_document, sub_section_path);
+    if (content_or.ok()) {
+      std::vector<std::string> content = std::move(content_or).ValueOrDie();
+      std::move(content.begin(), content.end(),
+                std::back_inserter(nested_document_content));
+    }
+  }
+  if (nested_document_content.empty()) {
+    return absl_ports::NotFoundError(
+        absl_ports::StrCat("Section path ", section_path,
+                           " not found in type config ", document.schema()));
+  }
+  return nested_document_content;
+}
+
+// Looks up the section's path from the metadata for the document's type,
+// then delegates to the path-based overload above.
+libtextclassifier3::StatusOr<std::vector<std::string>>
+SectionManager::GetSectionContent(const DocumentProto& document,
+                                  SectionId section_id) const {
+  if (!IsSectionIdValid(section_id)) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Section id %d is greater than the max value %d", section_id,
+        kMaxSectionId));
+  }
+  // NOT_FOUND if the document's schema type is unknown to the mapper.
+  ICING_ASSIGN_OR_RETURN(
+      const std::vector<SectionMetadata>& metadata_list,
+      GetMetadataList(schema_type_mapper_, section_metadata_cache_,
+                      document.schema()));
+  // NOTE(review): signed/unsigned comparison — safe only because
+  // IsSectionIdValid above rules out negative ids; confirm kMinSectionId >= 0.
+  if (section_id >= metadata_list.size()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Section with id %d doesn't exist in type config %s", section_id,
+        document.schema().c_str()));
+  }
+  // The index of metadata list is the same as the section id, so we can use
+  // section id as the index.
+  return GetSectionContent(document, metadata_list[section_id].path);
+}
+
+// Returns a pointer (owned by this SectionManager) to the SectionMetadata of
+// the given section within the given schema type, or INVALID_ARGUMENT when
+// either id is out of range.
+libtextclassifier3::StatusOr<const SectionMetadata*>
+SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
+                                   SectionId section_id) const {
+  if (!IsSectionIdValid(section_id)) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Section id %d is greater than the max value %d", section_id,
+        kMaxSectionId));
+  }
+  // FIX: validate schema_type_id before indexing — the original indexed the
+  // cache unchecked, risking out-of-bounds access, even though the header
+  // documents INVALID_ARGUMENT for an invalid schema type id.
+  if (schema_type_id < 0 ||
+      static_cast<size_t>(schema_type_id) >= section_metadata_cache_.size()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Invalid schema type id %d", schema_type_id));
+  }
+  const std::vector<SectionMetadata>& section_metadatas =
+      section_metadata_cache_[schema_type_id];
+  if (section_id >= section_metadatas.size()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Section with id %d doesn't exist in type config with id %d",
+        section_id, schema_type_id));
+  }
+
+  // FIX: the original line read "return &sect;ion_metadatas[...]" — an
+  // HTML-entity-mangled "&section_metadatas" that does not compile.
+  // The index of metadata list is the same as the section id, so we can use
+  // section id as the index.
+  return &section_metadatas[section_id];
+}
+
+// Extracts every indexable section of the document. Iterating metadata_list
+// in order yields sections sorted by ascending section id; sections whose
+// content resolves to nothing are silently omitted.
+libtextclassifier3::StatusOr<std::vector<Section>>
+SectionManager::ExtractSections(const DocumentProto& document) const {
+  // NOT_FOUND if the document's schema type is unknown to the mapper.
+  ICING_ASSIGN_OR_RETURN(
+      const std::vector<SectionMetadata>& metadata_list,
+      GetMetadataList(schema_type_mapper_, section_metadata_cache_,
+                      document.schema()));
+  std::vector<Section> sections;
+  for (const auto& section_metadata : metadata_list) {
+    auto section_content_or =
+        GetSectionContent(document, section_metadata.path);
+    // Adds to result vector if section is found in document; a NOT_FOUND here
+    // just means the (possibly optional) property has no content.
+    if (section_content_or.ok()) {
+      sections.emplace_back(SectionMetadata(section_metadata),
+                            std::move(section_content_or).ValueOrDie());
+    }
+  }
+  return sections;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/section-manager.h b/icing/schema/section-manager.h
new file mode 100644
index 0000000..56045e8
--- /dev/null
+++ b/icing/schema/section-manager.h
@@ -0,0 +1,117 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_SECTION_MANAGER_H_
+#define ICING_SCHEMA_SECTION_MANAGER_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "utils/base/statusor.h"
+#include "icing/proto/document.pb.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/key-mapper.h"
+
+namespace icing {
+namespace lib {
+
+// Separator between property names in a section path, e.g. "sender.name".
+inline constexpr char kPropertySeparator[] = ".";
+
+// This class provides section-related operations. It assigns sections
+// according to type configs and extracts section / sections from documents.
+// Instances are immutable after Create() and hold no I/O resources.
+class SectionManager {
+ public:
+  SectionManager(const SectionManager&) = delete;
+  SectionManager& operator=(const SectionManager&) = delete;
+
+  // Creates a SectionManager from a type config map (type config name -> type
+  // config). schema_type_mapper must be non-null and outlive the returned
+  // SectionManager (only a reference is retained).
+  //
+  // Returns:
+  //   A SectionManager on success
+  //   INVALID_ARGUMENT if infinite loop detected in the type configs
+  //   OUT_OF_RANGE if number of properties need indexing exceeds the max
+  //   number
+  //   NOT_FOUND if any type config name not found in the map
+  static libtextclassifier3::StatusOr<std::unique_ptr<SectionManager>> Create(
+      const SchemaUtil::TypeConfigMap& type_config_map,
+      const KeyMapper<SchemaTypeId>* schema_type_mapper);
+
+  // Finds content of a section by section path (e.g. property1.property2)
+  //
+  // Returns:
+  //   A list of content strings on success (one entry per repeated value)
+  //   NOT_FOUND if:
+  //     1. Property is optional and not found in the document
+  //     2. section_path is invalid
+  //     3. Content is empty
+  libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
+      const DocumentProto& document, std::string_view section_path) const;
+
+  // Finds content of a section by id
+  //
+  // Returns:
+  //   A list of content strings on success
+  //   INVALID_ARGUMENT if section id is invalid
+  //   NOT_FOUND if type config name of document not found
+  libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
+      const DocumentProto& document, SectionId section_id) const;
+
+  // Returns the SectionMetadata associated with the SectionId that's in the
+  // SchemaTypeId. The pointer stays valid for the SectionManager's lifetime.
+  //
+  // Returns:
+  //   pointer to SectionMetadata on success
+  //   INVALID_ARGUMENT if schema type id or section is invalid
+  libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
+      SchemaTypeId schema_type_id, SectionId section_id) const;
+
+  // Extracts all sections from the given document, sections are sorted by
+  // section id in increasing order. Section ids start from 0. Sections with
+  // empty content won't be returned.
+  //
+  // Returns:
+  //   A list of sections on success
+  //   NOT_FOUND if type config name of document not found
+  libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections(
+      const DocumentProto& document) const;
+
+ private:
+  // Use SectionManager::Create() to instantiate
+  explicit SectionManager(
+      const KeyMapper<SchemaTypeId>* schema_type_mapper,
+      std::vector<std::vector<SectionMetadata>>&& section_metadata_cache);
+
+  // Maps schema types to a densely-assigned unique id. Not owned.
+  const KeyMapper<SchemaTypeId>& schema_type_mapper_;
+
+  // The index of section_metadata_cache_ corresponds to a schema type's
+  // SchemaTypeId. At that SchemaTypeId index, we store an inner vector. The
+  // inner vector's index corresponds to a section's SectionId. At the
+  // SectionId index, we store the SectionMetadata of that section.
+  //
+  // For example, pretend "email" had a SchemaTypeId of 0 and it had a section
+  // called "subject" with a SectionId of 1. Then there would exist a vector
+  // that holds the "subject" property's SectionMetadata at index 1. This
+  // vector would be stored at index 0 of the section_metadata_cache_ vector.
+  const std::vector<std::vector<SectionMetadata>> section_metadata_cache_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_SECTION_MANAGER_H_
diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc
new file mode 100644
index 0000000..38fb8b4
--- /dev/null
+++ b/icing/schema/section-manager_test.cc
@@ -0,0 +1,446 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/section-manager.h"
+
+#include <limits>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/schema-util.h"
+#include "icing/store/key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::HasSubstr;
+
+// type and property names of EmailMessage
+constexpr char kTypeEmail[] = "EmailMessage";
+constexpr char kPropertySubject[] = "subject";
+constexpr char kPropertyText[] = "text";
+constexpr char kPropertyTimestamp[] = "timestamp";
+constexpr char kPropertyAttachment[] = "attachment";
+constexpr char kPropertyRecipients[] = "recipients";
+// type and property names of Conversation
+constexpr char kTypeConversation[] = "Conversation";
+constexpr char kPropertyName[] = "name";
+constexpr char kPropertyEmails[] = "emails";
+
+class SectionManagerTest : public ::testing::Test {
+ protected:
+ SectionManagerTest() : test_dir_(GetTestTempDir() + "/icing") {
+ auto email_type = CreateEmailTypeConfig();
+ auto conversation_type = CreateConversationTypeConfig();
+ type_config_map_.emplace(email_type.schema_type(), email_type);
+ type_config_map_.emplace(conversation_type.schema_type(),
+ conversation_type);
+
+ email_document_ =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema(kTypeEmail)
+ .AddStringProperty(kPropertySubject, "the subject")
+ .AddStringProperty(kPropertyText, "the text")
+ .AddInt64Property(kPropertyTimestamp, 1234567890)
+ .AddBytesProperty(kPropertyAttachment, "attachment bytes")
+ .AddStringProperty(kPropertyRecipients, "recipient1", "recipient2",
+ "recipient3")
+ .Build();
+
+ conversation_document_ =
+ DocumentBuilder()
+ .SetKey("icing", "conversation/1")
+ .SetSchema(kTypeConversation)
+ .AddDocumentProperty(kPropertyEmails,
+ DocumentProto(email_document_),
+ DocumentProto(email_document_))
+ .Build();
+ }
+
+ void SetUp() override {
+ // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so
+ // the total KeyMapper should get 384KiB
+ int key_mapper_size = 3 * 128 * 1024;
+ ICING_ASSERT_OK_AND_ASSIGN(schema_type_mapper_,
+ KeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_, key_mapper_size));
+ ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeEmail, 0));
+ ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeConversation, 1));
+ }
+
+ static SchemaTypeConfigProto CreateEmailTypeConfig() {
+ SchemaTypeConfigProto type;
+ type.set_schema_type(kTypeEmail);
+
+ auto subject = type.add_properties();
+ subject->set_property_name(kPropertySubject);
+ subject->set_data_type(PropertyConfigProto::DataType::STRING);
+ subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ subject->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ subject->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ auto text = type.add_properties();
+ text->set_property_name(kPropertyText);
+ text->set_data_type(PropertyConfigProto::DataType::STRING);
+ text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ text->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::UNKNOWN);
+
+ auto timestamp = type.add_properties();
+ timestamp->set_property_name(kPropertyTimestamp);
+ timestamp->set_data_type(PropertyConfigProto::DataType::INT64);
+ timestamp->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ timestamp->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ timestamp->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ auto attachment = type.add_properties();
+ attachment->set_property_name(kPropertyAttachment);
+ attachment->set_data_type(PropertyConfigProto::DataType::BYTES);
+ attachment->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ attachment->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ attachment->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ auto recipients = type.add_properties();
+ recipients->set_property_name(kPropertyRecipients);
+ recipients->set_data_type(PropertyConfigProto::DataType::STRING);
+ recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+ recipients->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ recipients->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ return type;
+ }
+
+ static SchemaTypeConfigProto CreateConversationTypeConfig() {
+ SchemaTypeConfigProto type;
+ type.set_schema_type(kTypeConversation);
+
+ auto name = type.add_properties();
+ name->set_property_name(kPropertyName);
+ name->set_data_type(PropertyConfigProto::DataType::STRING);
+ name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ name->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ auto emails = type.add_properties();
+ emails->set_property_name(kPropertyEmails);
+ emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+ emails->set_schema_type(kTypeEmail);
+ emails->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ return type;
+ }
+
+ Filesystem filesystem_;
+ const std::string test_dir_;
+ SchemaUtil::TypeConfigMap type_config_map_;
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
+
+ DocumentProto email_document_;
+ DocumentProto conversation_document_;
+};
+
+TEST_F(SectionManagerTest, Create) {
+ {
+ ICING_ASSERT_OK(
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+ }
+ {
+ // Test infinite loop in schema
+ // Creates 2 type configs that reference each other
+ SchemaTypeConfigProto type_config1;
+ type_config1.set_schema_type("type1");
+ auto property1 = type_config1.add_properties();
+ property1->set_property_name("property1");
+ property1->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ property1->set_schema_type("type2"); // Here we reference type2
+ property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property1->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ SchemaTypeConfigProto type_config2;
+ type_config2.set_schema_type("type2");
+ auto property2 = type_config2.add_properties();
+ property2->set_property_name("property2");
+ property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ // Here we reference type1, which references type2 causing the infinite loop
+ property2->set_schema_type("type1");
+ property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property2->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace("type1", type_config1);
+ type_config_map.emplace("type2", type_config2);
+
+ EXPECT_THAT(
+ SectionManager::Create(type_config_map, schema_type_mapper_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Infinite loop detected")));
+ }
+ {
+ // Also test infinite loop.
+ // Creates a type config that has a section and references to self.
+ SchemaTypeConfigProto type_config;
+ type_config.set_schema_type("type");
+ auto property1 = type_config.add_properties();
+ property1->set_property_name("property1");
+ property1->set_data_type(PropertyConfigProto::DataType::STRING);
+ property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property1->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ auto property2 = type_config.add_properties();
+ property2->set_property_name("property2");
+ property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ // Here we're referencing our own type, causing an infinite loop
+ property2->set_schema_type("type");
+ property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property2->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace("type", type_config);
+
+ EXPECT_THAT(
+ SectionManager::Create(type_config_map, schema_type_mapper_.get()),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE,
+ HasSubstr("Too many properties")));
+ }
+ {
+ // Test number of sections that is more than allowed
+ SchemaTypeConfigProto type_config;
+ type_config.set_schema_type("type");
+ // Adds more properties than allowed
+ int max_num_sections_allowed = kMaxSectionId - kMinSectionId + 1;
+ for (int i = 0; i < max_num_sections_allowed + 1; i++) {
+ auto property = type_config.add_properties();
+ property->set_property_name("property" + std::to_string(i));
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ }
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace("type", type_config);
+
+ EXPECT_THAT(
+ SectionManager::Create(type_config_map, schema_type_mapper_.get()),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE,
+ HasSubstr("Too many properties")));
+ }
+ {
+ // Test unknown schema name
+ SchemaTypeConfigProto type_config;
+ type_config.set_schema_type("type");
+ auto property = type_config.add_properties();
+ property->set_property_name("property");
+ property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ property->set_schema_type("unknown_name");
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace("type", type_config);
+
+ EXPECT_THAT(
+ SectionManager::Create(type_config_map, schema_type_mapper_.get()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("type config not found")));
+ }
+}
+
+TEST_F(SectionManagerTest, GetSectionContent) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Test simple section paths
+ EXPECT_THAT(section_manager->GetSectionContent(email_document_,
+ /*section_path*/ "subject"),
+ IsOkAndHolds(ElementsAre("the subject")));
+ EXPECT_THAT(section_manager->GetSectionContent(email_document_,
+ /*section_path*/ "text"),
+ IsOkAndHolds(ElementsAre("the text")));
+
+ // Test repeated values, they are joined into one string
+ ICING_ASSERT_OK_AND_ASSIGN(auto content, section_manager->GetSectionContent(
+ email_document_,
+ /*section_path*/ "recipients"));
+ EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3"));
+
+ // Test concatenated section paths: "property1.property2"
+ ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetSectionContent(
+ conversation_document_,
+ /*section_path*/ "emails.subject"));
+ EXPECT_THAT(content, ElementsAre("the subject", "the subject"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetSectionContent(
+ conversation_document_,
+ /*section_path*/ "emails.text"));
+ EXPECT_THAT(content, ElementsAre("the text", "the text"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ content,
+ section_manager->GetSectionContent(conversation_document_,
+ /*section_path*/ "emails.recipients"));
+ EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3",
+ "recipient1", "recipient2", "recipient3"));
+
+ // Test non-existing paths
+ EXPECT_THAT(section_manager->GetSectionContent(email_document_,
+ /*section_path*/ "name"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(section_manager->GetSectionContent(email_document_,
+ /*section_path*/ "invalid"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(
+ section_manager->GetSectionContent(conversation_document_,
+ /*section_path*/ "emails.invalid"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Test other data types
+ // INT64
+ EXPECT_THAT(section_manager->GetSectionContent(email_document_,
+ /*section_path*/ "timestamp"),
+ IsOkAndHolds(ElementsAre("1234567890")));
+ // BYTES type can't be indexed, so content won't be returned
+ EXPECT_THAT(section_manager->GetSectionContent(email_document_,
+ /*section_path*/ "attachment"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // The following tests are similar to the ones above but use section ids
+ // instead of section paths
+
+  // EmailMessage (section id -> section path): 0 -> recipients, 1 -> subject, 2 -> timestamp
+ SectionId recipients_section_id = 0;
+ SectionId subject_section_id = 1;
+ SectionId timestamp_section_id = 2;
+ SectionId invalid_email_section_id = 3;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ content, section_manager->GetSectionContent(email_document_,
+ recipients_section_id));
+ EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3"));
+
+ EXPECT_THAT(
+ section_manager->GetSectionContent(email_document_, subject_section_id),
+ IsOkAndHolds(ElementsAre("the subject")));
+ EXPECT_THAT(
+ section_manager->GetSectionContent(email_document_, timestamp_section_id),
+ IsOkAndHolds(ElementsAre("1234567890")));
+
+ EXPECT_THAT(section_manager->GetSectionContent(email_document_,
+ invalid_email_section_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Conversation (section id -> section path):
+ // 0 -> emails.recipients
+ // 1 -> emails.subject
+ // 2 -> emails.timestamp
+ // 3 -> name
+ SectionId emails_recipients_section_id = 0;
+ SectionId emails_subject_section_id = 1;
+ SectionId emails_timestamp_section_id = 2;
+ SectionId name_section_id = 3;
+ SectionId invalid_conversation_section_id = 4;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ content, section_manager->GetSectionContent(
+ conversation_document_, emails_recipients_section_id));
+ EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3",
+ "recipient1", "recipient2", "recipient3"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ content, section_manager->GetSectionContent(conversation_document_,
+ emails_subject_section_id));
+ EXPECT_THAT(content, ElementsAre("the subject", "the subject"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ content, section_manager->GetSectionContent(conversation_document_,
+ emails_timestamp_section_id));
+ EXPECT_THAT(content, ElementsAre("1234567890", "1234567890"));
+
+ EXPECT_THAT(section_manager->GetSectionContent(conversation_document_,
+ name_section_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ EXPECT_THAT(section_manager->GetSectionContent(
+ conversation_document_, invalid_conversation_section_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SectionManagerTest, ExtractSections) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Extracts all sections from 'EmailMessage' document
+ ICING_ASSERT_OK_AND_ASSIGN(auto sections,
+ section_manager->ExtractSections(email_document_));
+ EXPECT_THAT(sections.size(), Eq(3));
+
+ EXPECT_THAT(sections[0].metadata.id, Eq(0));
+ EXPECT_THAT(sections[0].metadata.path, Eq("recipients"));
+ EXPECT_THAT(sections[0].content,
+ ElementsAre("recipient1", "recipient2", "recipient3"));
+
+ EXPECT_THAT(sections[1].metadata.id, Eq(1));
+ EXPECT_THAT(sections[1].metadata.path, Eq("subject"));
+ EXPECT_THAT(sections[1].content, ElementsAre("the subject"));
+
+ EXPECT_THAT(sections[2].metadata.id, Eq(2));
+ EXPECT_THAT(sections[2].metadata.path, Eq("timestamp"));
+ EXPECT_THAT(sections[2].content, ElementsAre("1234567890"));
+
+ // Extracts all sections from 'Conversation' document
+ ICING_ASSERT_OK_AND_ASSIGN(
+ sections, section_manager->ExtractSections(conversation_document_));
+ EXPECT_THAT(sections.size(), Eq(3));
+
+  // Section id 3 (name) not found in document, so only section ids 0, 1 and 2
+  // are returned below.
+ EXPECT_THAT(sections[0].metadata.id, Eq(0));
+ EXPECT_THAT(sections[0].metadata.path, Eq("emails.recipients"));
+ EXPECT_THAT(sections[0].content,
+ ElementsAre("recipient1", "recipient2", "recipient3",
+ "recipient1", "recipient2", "recipient3"));
+
+ EXPECT_THAT(sections[1].metadata.id, Eq(1));
+ EXPECT_THAT(sections[1].metadata.path, Eq("emails.subject"));
+ EXPECT_THAT(sections[1].content, ElementsAre("the subject", "the subject"));
+
+ EXPECT_THAT(sections[2].metadata.id, Eq(2));
+ EXPECT_THAT(sections[2].metadata.path, Eq("emails.timestamp"));
+ EXPECT_THAT(sections[2].content, ElementsAre("1234567890", "1234567890"));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/section.h b/icing/schema/section.h
new file mode 100644
index 0000000..daf4fd0
--- /dev/null
+++ b/icing/schema/section.h
@@ -0,0 +1,96 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_SECTION_H_
+#define ICING_SCHEMA_SECTION_H_
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+
+namespace icing {
+namespace lib {
+
+using SectionId = int8_t;
+// 4 bits for 16 values. NOTE: Increasing this value means that SectionIdMask
+// must increase from an int16_t to an int32_t
+inline constexpr int kSectionIdBits = 4;
+inline constexpr SectionId kInvalidSectionId = (1 << kSectionIdBits);
+inline constexpr SectionId kMaxSectionId = kInvalidSectionId - 1;
+inline constexpr SectionId kMinSectionId = 0;
+constexpr bool IsSectionIdValid(SectionId section_id) {
+ return section_id >= kMinSectionId && section_id <= kMaxSectionId;
+}
+
+using SectionIdMask = int16_t;
+inline constexpr SectionIdMask kSectionIdMaskAll = ~SectionIdMask{0};
+inline constexpr SectionIdMask kSectionIdMaskNone = SectionIdMask{0};
+
+static_assert(
+ kMaxSectionId < 8 * sizeof(SectionIdMask),
+ "SectionIdMask is not large enough to represent all section values!");
+
+// TODO(samzheng): add more metadata when needed, e.g. language of the content.
+struct SectionMetadata {
+ // Dot-joined property names, representing the location of section inside an
+ // document. E.g. "property1.property2"
+ std::string path;
+
+ // A unique id of property within a type config
+ SectionId id;
+
+ // How content in this section should be tokenized. It is invalid for a
+ // section to have tokenizer == 'NONE'.
+ IndexingConfig::TokenizerType::Code tokenizer;
+
+ // How tokens in this section should be matched.
+ //
+ // TermMatchType::UNKNOWN:
+ // Terms will not match anything
+ //
+ // TermMatchType::PREFIX:
+ // Terms will be stored as a prefix match, "fool" matches "foo" and "fool"
+ //
+ // TermMatchType::EXACT_ONLY:
+ // Terms will be only stored as an exact match, "fool" only matches "fool"
+ TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
+
+ SectionMetadata(SectionId id_in, TermMatchType::Code term_match_type_in,
+ IndexingConfig::TokenizerType::Code tokenizer,
+ std::string&& path_in)
+ : path(std::move(path_in)),
+ id(id_in),
+ tokenizer(tokenizer),
+ term_match_type(term_match_type_in) {}
+};
+
+// Section is an icing internal concept similar to document property but with
+// extra metadata. The content can be a value or the combination of repeated
+// values of a property.
+struct Section {
+ SectionMetadata metadata;
+ std::vector<std::string> content;
+
+ Section(SectionMetadata&& metadata_in, std::vector<std::string>&& content_in)
+ : metadata(std::move(metadata_in)), content(std::move(content_in)) {}
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_SECTION_H_
diff --git a/icing/scoring/ranker.cc b/icing/scoring/ranker.cc
new file mode 100644
index 0000000..e68fbd2
--- /dev/null
+++ b/icing/scoring/ranker.cc
@@ -0,0 +1,154 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/ranker.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "base/logging.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-id.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// For all the heap manipulations in this file, we use a vector to represent the
+// heap. The element at index 0 is the root node. For any node at index i, its
+// left child node is at 2 * i + 1, its right child node is at 2 * i + 2.
+
+// Helper function to wrap the heapify algorithm, it heapifies the target
+// subtree node in place.
+void Heapify(std::vector<ScoredDocumentHit>* scored_document_hits,
+ int target_subtree_root_index,
+ const ScoredDocumentHitComparator scored_document_hit_comparator) {
+ const int heap_size = scored_document_hits->size();
+ if (target_subtree_root_index >= heap_size) {
+ return;
+ }
+
+ // Initializes subtree root as the current best node.
+ int best = target_subtree_root_index;
+ // If we represent a heap in an array/vector, indices of left and right
+ // children can be calculated.
+ const int left = target_subtree_root_index * 2 + 1;
+ const int right = target_subtree_root_index * 2 + 2;
+
+ // If left child is better than current best
+ if (left < heap_size &&
+ scored_document_hit_comparator(scored_document_hits->at(left),
+ scored_document_hits->at(best))) {
+ best = left;
+ }
+
+ // If right child is better than current best
+ if (right < heap_size &&
+ scored_document_hit_comparator(scored_document_hits->at(right),
+ scored_document_hits->at(best))) {
+ best = right;
+ }
+
+ // If the best is not the subtree root, swap and continue heapifying the lower
+ // level subtree
+ if (best != target_subtree_root_index) {
+ std::swap(scored_document_hits->at(best),
+ scored_document_hits->at(target_subtree_root_index));
+ Heapify(scored_document_hits, best, scored_document_hit_comparator);
+ }
+}
+
+// Helper function to build a heap in place whose root is the best node defined
+// by scored_document_hit_comparator. Time complexity is O(n).
+void BuildHeap(
+ std::vector<ScoredDocumentHit>* scored_document_hits,
+ const ScoredDocumentHitComparator scored_document_hit_comparator) {
+ const int heap_size = scored_document_hits->size();
+ // Since we use a vector to represent the heap, [size / 2 - 1] is the index
+ // of the parent node of the last node.
+ for (int subtree_root_index = heap_size / 2 - 1; subtree_root_index >= 0;
+ subtree_root_index--) {
+ Heapify(scored_document_hits, subtree_root_index,
+ scored_document_hit_comparator);
+ }
+}
+
+// Helper function to extract the root from the heap. The heap structure will be
+// maintained.
+//
+// Returns:
+// The current root element on success
+// RESOURCE_EXHAUSTED_ERROR if heap is empty
+libtextclassifier3::StatusOr<ScoredDocumentHit> ExtractRoot(
+ std::vector<ScoredDocumentHit>* scored_document_hits,
+ ScoredDocumentHitComparator scored_document_hit_comparator) {
+ if (scored_document_hits->empty()) {
+    // Heap is empty; there is no root to extract.
+ return absl_ports::ResourceExhaustedError("Heap is empty");
+ }
+
+ // Steps to extract root from heap:
+ // 1. copy out root
+ ScoredDocumentHit root = scored_document_hits->at(0);
+ const size_t last_node_index = scored_document_hits->size() - 1;
+ // 2. swap root and the last node
+ std::swap(scored_document_hits->at(0),
+ scored_document_hits->at(last_node_index));
+ // 3. remove last node
+ scored_document_hits->pop_back();
+ // 4. heapify root
+ Heapify(scored_document_hits, /*target_subtree_root_index=*/0,
+ scored_document_hit_comparator);
+ return root;
+}
+
+std::vector<ScoredDocumentHit> HeapifyAndProduceTopN(
+ std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
+ bool is_descending) {
+ // Build a heap in place
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ is_descending);
+ BuildHeap(&scored_document_hits, scored_document_hit_comparator);
+
+ // Get best nodes from heap one by one
+ std::vector<ScoredDocumentHit> scored_document_hit_result;
+ int result_size =
+ std::min(num_result, static_cast<int>(scored_document_hits.size()));
+ while (result_size-- > 0) {
+ libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
+ ExtractRoot(&scored_document_hits, scored_document_hit_comparator);
+ if (next_best_document_hit_or.ok()) {
+ scored_document_hit_result.push_back(
+ std::move(next_best_document_hit_or).ValueOrDie());
+ } else {
+ ICING_VLOG(1) << next_best_document_hit_or.status().error_message();
+ }
+ }
+ return scored_document_hit_result;
+}
+
+} // namespace
+
+std::vector<ScoredDocumentHit> GetTopNFromScoredDocumentHits(
+ std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
+ bool is_descending) {
+ return HeapifyAndProduceTopN(std::move(scored_document_hits), num_result,
+ is_descending);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/ranker.h b/icing/scoring/ranker.h
new file mode 100644
index 0000000..1acd06c
--- /dev/null
+++ b/icing/scoring/ranker.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_RANKER_H_
+#define ICING_SCORING_RANKER_H_
+
+#include <vector>
+
+#include "icing/scoring/scored-document-hit.h"
+
+// Provides functionality to get the top N results from an unsorted vector.
+namespace icing {
+namespace lib {
+
+// Returns the top num_result results from scored_document_hits. The returned
+// vector will be sorted and contain no more than num_result elements.
+// is_descending indicates whether the result is in a descending score order
+// or an ascending score order.
+std::vector<ScoredDocumentHit> GetTopNFromScoredDocumentHits(
+ std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
+ bool is_descending);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_RANKER_H_
diff --git a/icing/scoring/ranker_benchmark.cc b/icing/scoring/ranker_benchmark.cc
new file mode 100644
index 0000000..f47ea9f
--- /dev/null
+++ b/icing/scoring/ranker_benchmark.cc
@@ -0,0 +1,95 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdlib>
+
+#include "testing/base/public/benchmark.h"
+#include "icing/scoring/ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/scoring:ranker_benchmark
+//
+// $ blaze-bin/icing/scoring/ranker_benchmark --benchmarks=all
+// --benchmark_memory_usage
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/scoring:ranker_benchmark
+//
+// $ adb push blaze-bin/icing/scoring/ranker_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/ranker_benchmark --benchmarks=all
+
+void BM_GetTopN(benchmark::State& state) {
+ int num_to_score = state.range(0);
+ int num_to_return = state.range(1);
+
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ uint seed = Clock().GetCurrentSeconds();
+ for (int i = 0; i < num_to_score; i++) {
+ int score = rand_r(&seed);
+ scored_document_hits.emplace_back(/*document_id=*/0,
+ /*hit_section_id_mask=*/0, score);
+ }
+
+ for (auto _ : state) {
+ auto result =
+ GetTopNFromScoredDocumentHits(scored_document_hits, num_to_return,
+ /*is_descending=*/true);
+ }
+}
+BENCHMARK(BM_GetTopN)
+ ->ArgPair(1000, 10) // (num_to_score, num_to_return)
+ ->ArgPair(3000, 10)
+ ->ArgPair(5000, 10)
+ ->ArgPair(7000, 10)
+ ->ArgPair(9000, 10)
+ ->ArgPair(11000, 10)
+ ->ArgPair(13000, 10)
+ ->ArgPair(15000, 10)
+ ->ArgPair(17000, 10)
+ ->ArgPair(19000, 10)
+ ->ArgPair(1000, 20)
+ ->ArgPair(3000, 20)
+ ->ArgPair(5000, 20)
+ ->ArgPair(7000, 20)
+ ->ArgPair(9000, 20)
+ ->ArgPair(11000, 20)
+ ->ArgPair(13000, 20)
+ ->ArgPair(15000, 20)
+ ->ArgPair(17000, 20)
+ ->ArgPair(19000, 20)
+ ->ArgPair(1000, 30)
+ ->ArgPair(3000, 30)
+ ->ArgPair(5000, 30)
+ ->ArgPair(7000, 30)
+ ->ArgPair(9000, 30)
+ ->ArgPair(11000, 30)
+ ->ArgPair(13000, 30)
+ ->ArgPair(15000, 30)
+ ->ArgPair(17000, 30)
+ ->ArgPair(19000, 30);
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/ranker_test.cc b/icing/scoring/ranker_test.cc
new file mode 100644
index 0000000..3aa94e6
--- /dev/null
+++ b/icing/scoring/ranker_test.cc
@@ -0,0 +1,179 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/ranker.h"
+
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/testing/common-matchers.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::Test;
+
+// Fixture providing five ScoredDocumentHits whose scores (1..5) are
+// deliberately out of step with their document ids, so ordering by score is
+// distinguishable from ordering by id.
+class RankerTest : public Test {
+ protected:
+  RankerTest()
+      : test_scored_document_hits_{
+            ScoredDocumentHit(/*document_id=*/3, /*hit_section_id_mask=*/3,
+                              /*score=*/1),
+            ScoredDocumentHit(/*document_id=*/1, /*hit_section_id_mask=*/1,
+                              /*score=*/2),
+            ScoredDocumentHit(/*document_id=*/2, /*hit_section_id_mask=*/2,
+                              /*score=*/3),
+            ScoredDocumentHit(/*document_id=*/5, /*hit_section_id_mask=*/5,
+                              /*score=*/4),
+            ScoredDocumentHit(/*document_id=*/4, /*hit_section_id_mask=*/4,
+                              /*score=*/5)} {}
+
+  const ScoredDocumentHit& test_scored_document_hit1() {
+    return test_scored_document_hits_[0];
+  }
+
+  const ScoredDocumentHit& test_scored_document_hit2() {
+    return test_scored_document_hits_[1];
+  }
+
+  const ScoredDocumentHit& test_scored_document_hit3() {
+    return test_scored_document_hits_[2];
+  }
+
+  const ScoredDocumentHit& test_scored_document_hit4() {
+    return test_scored_document_hits_[3];
+  }
+
+  const ScoredDocumentHit& test_scored_document_hit5() {
+    return test_scored_document_hits_[4];
+  }
+
+ private:
+  // Hit N (1-based) lives at index N-1 and carries score N.
+  ScoredDocumentHit test_scored_document_hits_[5];
+};
+
+TEST_F(RankerTest, ShouldHandleEmpty) {
+  std::vector<ScoredDocumentHit> empty_hits;
+
+  // Neither a zero nor a positive num_result produces anything from no input.
+  auto top_zero = GetTopNFromScoredDocumentHits(empty_hits, /*num_result=*/0,
+                                                /*is_descending=*/true);
+  EXPECT_THAT(top_zero, IsEmpty());
+
+  auto top_three = GetTopNFromScoredDocumentHits(empty_hits, /*num_result=*/3,
+                                                 /*is_descending=*/true);
+  EXPECT_THAT(top_three, IsEmpty());
+}
+
+TEST_F(RankerTest, ShouldCorrectlySortResults) {
+  std::vector<ScoredDocumentHit> scored_document_hits;
+  scored_document_hits.push_back(test_scored_document_hit2());
+  scored_document_hits.push_back(test_scored_document_hit1());
+  scored_document_hits.push_back(test_scored_document_hit5());
+  scored_document_hits.push_back(test_scored_document_hit4());
+  scored_document_hits.push_back(test_scored_document_hit3());
+
+  // Requesting all five hits yields a full sort, highest score first.
+  auto ranked =
+      GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/5,
+                                    /*is_descending=*/true);
+  EXPECT_THAT(
+      ranked,
+      ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit5()),
+                  EqualsScoredDocumentHit(test_scored_document_hit4()),
+                  EqualsScoredDocumentHit(test_scored_document_hit3()),
+                  EqualsScoredDocumentHit(test_scored_document_hit2()),
+                  EqualsScoredDocumentHit(test_scored_document_hit1())));
+}
+
+TEST_F(RankerTest, ShouldHandleSmallerNumResult) {
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      test_scored_document_hit2(), test_scored_document_hit1(),
+      test_scored_document_hit5(), test_scored_document_hit4(),
+      test_scored_document_hit3()};
+
+  // num_result = 3, smaller than the size 5
+  // Only the three highest-scored hits come back, still in descending order.
+  EXPECT_THAT(
+      GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/3,
+                                    /*is_descending=*/true),
+      ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit5()),
+                  EqualsScoredDocumentHit(test_scored_document_hit4()),
+                  EqualsScoredDocumentHit(test_scored_document_hit3())));
+}
+
+TEST_F(RankerTest, ShouldHandleGreaterNumResult) {
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      test_scored_document_hit2(), test_scored_document_hit1(),
+      test_scored_document_hit5(), test_scored_document_hit4(),
+      test_scored_document_hit3()};
+
+  // num_result = 10, greater than the size 5
+  // All five hits are returned (no padding), sorted descending.
+  EXPECT_THAT(
+      GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/10,
+                                    /*is_descending=*/true),
+      ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit5()),
+                  EqualsScoredDocumentHit(test_scored_document_hit4()),
+                  EqualsScoredDocumentHit(test_scored_document_hit3()),
+                  EqualsScoredDocumentHit(test_scored_document_hit2()),
+                  EqualsScoredDocumentHit(test_scored_document_hit1())));
+}
+
+// Renamed from "ShouldHandleAcsendingOrder" to fix the "Acsending" typo so
+// the test can be found by name filters spelled correctly.
+TEST_F(RankerTest, ShouldHandleAscendingOrder) {
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      test_scored_document_hit2(), test_scored_document_hit1(),
+      test_scored_document_hit5(), test_scored_document_hit4(),
+      test_scored_document_hit3()};
+
+  // With is_descending=false the lowest-scored hit comes first.
+  EXPECT_THAT(
+      GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/5,
+                                    /*is_descending=*/false),
+      ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit1()),
+                  EqualsScoredDocumentHit(test_scored_document_hit2()),
+                  EqualsScoredDocumentHit(test_scored_document_hit3()),
+                  EqualsScoredDocumentHit(test_scored_document_hit4()),
+                  EqualsScoredDocumentHit(test_scored_document_hit5())));
+}
+
+TEST_F(RankerTest, ShouldRespectDocumentIdWhenScoresAreEqual) {
+  // All three hits share score 100, so ordering falls back to document id.
+  ScoredDocumentHit scored_document_hit1(
+      /*document_id=*/1, /*hit_section_id_mask=*/0, /*score=*/100);
+  ScoredDocumentHit scored_document_hit2(
+      /*document_id=*/2, /*hit_section_id_mask=*/0, /*score=*/100);
+  ScoredDocumentHit scored_document_hit3(
+      /*document_id=*/3, /*hit_section_id_mask=*/0, /*score=*/100);
+
+  std::vector<ScoredDocumentHit> scored_document_hits = {
+      scored_document_hit3, scored_document_hit1, scored_document_hit2};
+
+  // Descending: larger document id is considered "better" on a score tie.
+  EXPECT_THAT(
+      GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/3,
+                                    /*is_descending=*/true),
+      ElementsAre(EqualsScoredDocumentHit(scored_document_hit3),
+                  EqualsScoredDocumentHit(scored_document_hit2),
+                  EqualsScoredDocumentHit(scored_document_hit1)));
+
+  // Ascending: the tie-break is reversed along with the score order.
+  EXPECT_THAT(
+      GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/3,
+                                    /*is_descending=*/false),
+      ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
+                  EqualsScoredDocumentHit(scored_document_hit2),
+                  EqualsScoredDocumentHit(scored_document_hit3)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scored-document-hit.h b/icing/scoring/scored-document-hit.h
new file mode 100644
index 0000000..efe0e2a
--- /dev/null
+++ b/icing/scoring/scored-document-hit.h
@@ -0,0 +1,84 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_SCORED_DOCUMENT_HIT_H_
+#define ICING_SCORING_SCORED_DOCUMENT_HIT_H_
+
+#include <type_traits>
+
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// A data class containing information about the document, hit sections, and a
+// score. The score is calculated against both the document and the hit
+// sections.
+class ScoredDocumentHit {
+ public:
+  ScoredDocumentHit(DocumentId document_id, SectionIdMask hit_section_id_mask,
+                    float score)
+      : document_id_(document_id),
+        hit_section_id_mask_(hit_section_id_mask),
+        score_(score) {}
+
+  // Orders by score, with ties broken by document id, so any two hits with
+  // comparable scores order deterministically.
+  // NOTE(review): if score_ can ever be NaN, both `<` / `>` checks below are
+  // false and the comparison silently degenerates to the document-id
+  // comparison — confirm scores are always real numbers.
+  bool operator<(const ScoredDocumentHit& other) const {
+    if (score() < other.score()) return true;
+    if (score() > other.score()) return false;
+    return document_id() < other.document_id();
+  }
+
+  DocumentId document_id() const { return document_id_; }
+
+  SectionIdMask hit_section_id_mask() const { return hit_section_id_mask_; }
+
+  float score() const { return score_; }
+
+ private:
+  DocumentId document_id_;
+  SectionIdMask hit_section_id_mask_;
+  float score_;
+} __attribute__((packed));  // Layout is pinned by the static_asserts below.
+
+static_assert(sizeof(ScoredDocumentHit) == 10,
+              "Size of ScoredDocHit should be 10");
+static_assert(icing_is_packed_pod<ScoredDocumentHit>::value, "go/icing-ubsan");
+
+// A custom comparator for ScoredDocumentHit that determines which
+// ScoredDocumentHit is better (should come first) based off of
+// ScoredDocumentHit itself and the order of its score.
+//
+// Returns true if left is better than right according to score and order.
+// Comparison is based off of score with ties broken by
+// ScoredDocumentHit.document_id().
+class ScoredDocumentHitComparator {
+ public:
+  explicit ScoredDocumentHitComparator(bool is_descending = true)
+      : is_descending_(is_descending) {}
+
+  bool operator()(const ScoredDocumentHit& lhs,
+                  const ScoredDocumentHit& rhs) const {
+    // This comparator must be a strict weak ordering to be usable with
+    // std::sort and friends. The previous form `is_descending_ == !(lhs <
+    // rhs)` returned true for equivalent elements in descending mode, which
+    // violates irreflexivity (comp(a, a) must be false) and is undefined
+    // behavior in STL sorting algorithms. Reversing the operands instead
+    // keeps the ordering strict in both directions.
+    if (is_descending_) {
+      return rhs < lhs;
+    }
+    return lhs < rhs;
+  }
+
+ private:
+  bool is_descending_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_SCORED_DOCUMENT_HIT_H_
diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc
new file mode 100644
index 0000000..20e4690
--- /dev/null
+++ b/icing/scoring/scorer.cc
@@ -0,0 +1,87 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/scorer.h"
+
+#include <memory>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/store/document-associated-score-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// Scorer that ranks by the score stored with each document
+// (DocumentAssociatedScoreData::document_score).
+class DocumentScoreScorer : public Scorer {
+ public:
+  // `document_store` must be non-null and must outlive this scorer; only a
+  // reference is retained.
+  explicit DocumentScoreScorer(const DocumentStore* document_store,
+                               float default_score)
+      : document_store_(*document_store), default_score_(default_score) {}
+
+  float GetScore(DocumentId document_id) override {
+    // If the score data lookup fails (e.g. unknown document id), the macro
+    // returns default_score_ from this function instead.
+    ICING_ASSIGN_OR_RETURN_VAL(
+        DocumentAssociatedScoreData score_data,
+        document_store_.GetDocumentAssociatedScoreData(document_id),
+        default_score_);
+
+    return static_cast<float>(score_data.document_score());
+  }
+
+ private:
+  const DocumentStore& document_store_;
+  float default_score_;
+};
+
+// Scorer that ranks by a document's creation timestamp (seconds), so newer
+// documents score higher.
+class DocumentCreationTimestampScorer : public Scorer {
+ public:
+  // `document_store` must be non-null and must outlive this scorer; only a
+  // reference is retained.
+  explicit DocumentCreationTimestampScorer(const DocumentStore* document_store,
+                                           float default_score)
+      : document_store_(*document_store), default_score_(default_score) {}
+
+  float GetScore(DocumentId document_id) override {
+    // If the score data lookup fails (e.g. unknown document id), the macro
+    // returns default_score_ from this function instead.
+    ICING_ASSIGN_OR_RETURN_VAL(
+        DocumentAssociatedScoreData score_data,
+        document_store_.GetDocumentAssociatedScoreData(document_id),
+        default_score_);
+
+    return score_data.creation_timestamp_secs();
+  }
+
+ private:
+  const DocumentStore& document_store_;
+  float default_score_;
+};
+
+// Creates the concrete Scorer matching the requested ranking strategy.
+// Returns INVALID_ARGUMENT for NONE and for unrecognized strategy values.
+libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create(
+    ScoringSpecProto::RankingStrategy::Code rank_by, float default_score,
+    const DocumentStore* document_store) {
+  switch (rank_by) {
+    case ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE:
+      return std::make_unique<DocumentScoreScorer>(document_store,
+                                                   default_score);
+    case ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP:
+      return std::make_unique<DocumentCreationTimestampScorer>(document_store,
+                                                               default_score);
+    case ScoringSpecProto::RankingStrategy::NONE:
+      return absl_ports::InvalidArgumentError(
+          "RankingStrategy NONE not supported");
+  }
+  // Proto enums are open: a value outside the declared enumerators would fall
+  // through the switch above, and flowing off the end of a non-void function
+  // is undefined behavior. Report such values explicitly instead.
+  return absl_ports::InvalidArgumentError("Unknown ranking strategy");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scorer.h b/icing/scoring/scorer.h
new file mode 100644
index 0000000..e6a9f17
--- /dev/null
+++ b/icing/scoring/scorer.h
@@ -0,0 +1,63 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_SCORER_H_
+#define ICING_SCORING_SCORER_H_
+
+#include <memory>
+
+#include "utils/base/statusor.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// Scorer calculates scores for documents.
+//
+// This is an abstract interface; concrete implementations are selected by the
+// Create() factory based on the requested ranking strategy.
+class Scorer {
+ public:
+  virtual ~Scorer() = default;
+
+  // Factory function to create a Scorer according to the ranking strategy and
+  // default score. The default score will be returned only if the scorer fails
+  // to find or calculate a score for the document.
+  //
+  // `document_store` must outlive the returned scorer.
+  //
+  // Returns:
+  //   A Scorer on success
+  //   INVALID_ARGUMENT if fails to create an instance
+  static libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Create(
+      ScoringSpecProto::RankingStrategy::Code rank_by, float default_score,
+      const DocumentStore* document_store);
+
+  // Returns a non-negative score of a document. The score can be a
+  // document-associated score which comes from the DocumentProto directly, an
+  // accumulated score, or even an inferred score. If it fails to find or
+  // calculate a score, the user-provided default score will be returned.
+  //
+  // Some examples of possible scores:
+  // 1. Document-associated scores: document score, creation timestamp score.
+  // 2. Accumulated scores: usage count score.
+  // 3. Inferred scores: a score calculated by a machine learning model.
+  //
+  // NOTE: This method is performance-sensitive as it's called for every
+  // potential result document. We're trying to avoid returning StatusOr<float>
+  // to save a little more time and memory.
+  virtual float GetScore(DocumentId document_id) = 0;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_SCORER_H_
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
new file mode 100644
index 0000000..af1d2bc
--- /dev/null
+++ b/icing/scoring/scorer_test.cc
@@ -0,0 +1,193 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/scorer.h"
+
+#include <memory>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+using ::testing::Test;
+
+// Fixture that stands up a real SchemaStore/DocumentStore under a temp
+// directory with a minimal "email" schema, plus two fake clocks carrying
+// distinct timestamps for creation-time scoring tests.
+class ScorerTest : public Test {
+ protected:
+  ScorerTest()
+      : test_dir_(GetTestTempDir() + "/icing"),
+        doc_store_dir_(test_dir_ + "/doc_store"),
+        schema_store_dir_(test_dir_ + "/schema_store") {}
+
+  void SetUp() override {
+    // Start each test from a clean directory tree.
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+    filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str());
+    filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+    // Two distinct fixed timestamps so tests can tell the clocks apart.
+    fake_clock1_.SetSeconds(1571100000);
+    fake_clock2_.SetSeconds(1572200000);
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        document_store_,
+        DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock1_,
+                              schema_store_.get()));
+
+    // Creates a simple email schema
+    SchemaProto test_email_schema;
+    auto type_config = test_email_schema.add_types();
+    type_config->set_schema_type("email");
+    auto subject = type_config->add_properties();
+    subject->set_property_name("subject");
+    subject->set_data_type(PropertyConfigProto::DataType::STRING);
+    subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+    ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema));
+  }
+
+  void TearDown() override {
+    // Release the stores before deleting their backing files.
+    document_store_.reset();
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  DocumentStore* document_store() { return document_store_.get(); }
+
+  const FakeClock& fake_clock1() { return fake_clock1_; }
+
+  const FakeClock& fake_clock2() { return fake_clock2_; }
+
+ private:
+  const std::string test_dir_;
+  const std::string doc_store_dir_;
+  const std::string schema_store_dir_;
+  Filesystem filesystem_;
+  std::unique_ptr<SchemaStore> schema_store_;
+  std::unique_ptr<DocumentStore> document_store_;
+  FakeClock fake_clock1_;
+  FakeClock fake_clock2_;
+};
+
+TEST_F(ScorerTest, ShouldFailToCreate) {
+  // RankingStrategy::NONE has no scorer implementation; creation must fail
+  // with INVALID_ARGUMENT.
+  EXPECT_THAT(Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
+                             /*default_score=*/0, document_store()),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(ScorerTest, ShouldGetDefaultScore) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Scorer> scorer,
+      Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+                     /*default_score=*/10, document_store()));
+
+  // Nothing has been put into the store, so this id misses and the
+  // caller-provided default score is returned.
+  constexpr DocumentId kNonExistingDocumentId = 1;
+  EXPECT_THAT(scorer->GetScore(kNonExistingDocumentId), Eq(10));
+}
+
+TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) {
+  // Creates a test document with the default document score 0
+  // (no SetScore() call, so the proto field keeps its default).
+  DocumentProto test_document =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .SetCreationTimestampSecs(fake_clock1().GetCurrentSeconds())
+          .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store()->Put(test_document));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Scorer> scorer,
+      Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+                     /*default_score=*/10, document_store()));
+
+  // The stored document score 0 is returned, not the scorer's default of 10:
+  // the default only applies when the lookup itself fails.
+  EXPECT_THAT(scorer->GetScore(document_id), Eq(0));
+}
+
+TEST_F(ScorerTest, ShouldGetCorrectDocumentScore) {
+  // Puts a document whose DocumentProto carries an explicit score of 5.
+  DocumentProto email =
+      DocumentBuilder()
+          .SetScore(5)
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .SetCreationTimestampSecs(fake_clock2().GetCurrentSeconds())
+          .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             document_store()->Put(email));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Scorer> scorer,
+      Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+                     /*default_score=*/0, document_store()));
+
+  // The stored document score is returned instead of the default of 0.
+  EXPECT_THAT(scorer->GetScore(document_id), Eq(5));
+}
+
+TEST_F(ScorerTest, ShouldGetCorrectCreationTimestampScore) {
+  // Creates test_document1 with fake timestamp1
+  DocumentProto test_document1 =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .SetCreationTimestampSecs(fake_clock1().GetCurrentSeconds())
+          .Build();
+  // Creates test_document2 with fake timestamp2
+  DocumentProto test_document2 =
+      DocumentBuilder()
+          .SetKey("icing", "email/2")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo 2")
+          .SetCreationTimestampSecs(fake_clock2().GetCurrentSeconds())
+          .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             document_store()->Put(test_document1));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             document_store()->Put(test_document2));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Scorer> scorer,
+      Scorer::Create(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP,
+                     /*default_score=*/0, document_store()));
+
+  // Each document's score equals the creation timestamp it was stored with.
+  EXPECT_THAT(scorer->GetScore(document_id1),
+              Eq(fake_clock1().GetCurrentSeconds()));
+  EXPECT_THAT(scorer->GetScore(document_id2),
+              Eq(fake_clock2().GetCurrentSeconds()));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scoring-processor.cc b/icing/scoring/scoring-processor.cc
new file mode 100644
index 0000000..8bf2ce9
--- /dev/null
+++ b/icing/scoring/scoring-processor.cc
@@ -0,0 +1,86 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/scoring-processor.h"
+
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/scoring/ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scorer.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Score used when a document's real score can't be retrieved (see
+// Scorer::Create's default_score). Chosen per sort order so failed lookups
+// rank last: the minimum useful value when sorting descending, the maximum
+// when sorting ascending.
+constexpr float kDefaultScoreInDescendingOrder = 0;
+constexpr float kDefaultScoreInAscendingOrder =
+    std::numeric_limits<float>::max();
+}  // namespace
+
+// Builds a ScoringProcessor (and its Scorer) from the scoring spec.
+// Returns INVALID_ARGUMENT if the spec's ranking strategy is unsupported.
+libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
+ScoringProcessor::Create(const ScoringSpecProto& scoring_spec,
+                         const DocumentStore* document_store) {
+  const bool is_descending_order =
+      scoring_spec.order_by() == ScoringSpecProto::Order::DESC;
+
+  // Failed score lookups should always sort last, so the fallback score
+  // depends on the requested order.
+  const float default_score = is_descending_order
+                                  ? kDefaultScoreInDescendingOrder
+                                  : kDefaultScoreInAscendingOrder;
+  ICING_ASSIGN_OR_RETURN(
+      std::unique_ptr<Scorer> scorer,
+      Scorer::Create(scoring_spec.rank_by(), default_score, document_store));
+
+  // Using `new` to access a non-public constructor.
+  return std::unique_ptr<ScoringProcessor>(
+      new ScoringProcessor(std::move(scorer), is_descending_order));
+}
+
+// Consumes doc_hit_info_iterator, scores every document it yields, and
+// returns at most num_to_return ScoredDocumentHits, best first according to
+// is_descending_.
+std::vector<ScoredDocumentHit> ScoringProcessor::ScoreAndRank(
+    std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator,
+    int num_to_return) {
+  std::vector<ScoredDocumentHit> scored_document_hits;
+
+  if (num_to_return <= 0) {
+    return scored_document_hits;  // Nothing requested; skip scoring entirely.
+  }
+
+  // TODO(b/145025400) Determine if we want to score all DocHitInfo or enforce
+  // an upper limit.
+  while (doc_hit_info_iterator->Advance().ok()) {
+    const DocHitInfo& doc_hit_info = doc_hit_info_iterator->doc_hit_info();
+    // TODO(b/144955274) Calculate hit demotion factor from HitScore
+    float hit_demotion_factor = 1.0;
+    // The final score of the doc_hit_info = score of doc * demotion factor of
+    // hit.
+    float score =
+        scorer_->GetScore(doc_hit_info.document_id()) * hit_demotion_factor;
+    scored_document_hits.emplace_back(
+        doc_hit_info.document_id(), doc_hit_info.hit_section_ids_mask(), score);
+  }
+
+  return GetTopNFromScoredDocumentHits(std::move(scored_document_hits),
+                                       num_to_return, is_descending_);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scoring-processor.h b/icing/scoring/scoring-processor.h
new file mode 100644
index 0000000..b472c14
--- /dev/null
+++ b/icing/scoring/scoring-processor.h
@@ -0,0 +1,65 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_SCORING_PROCESSOR_H_
+#define ICING_SCORING_SCORING_PROCESSOR_H_
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "utils/base/statusor.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scorer.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// ScoringProcessor is the top-level class that handles scoring.
+class ScoringProcessor {
+ public:
+  // Factory function to create a ScoringProcessor with its subcomponents
+  // according to the scoring spec.
+  //
+  // `document_store` must outlive the returned processor.
+  //
+  // Returns:
+  //   A ScoringProcessor on success
+  //   INVALID_ARGUMENT if unable to create what the spec specifies
+  static libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>> Create(
+      const ScoringSpecProto& scoring_spec,
+      const DocumentStore* document_store);
+
+  // Returns a vector of ScoredDocumentHits sorted by their scores. The size
+  // of it is no more than num_to_return. The iterator is consumed in the
+  // process.
+  std::vector<ScoredDocumentHit> ScoreAndRank(
+      std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator,
+      int num_to_return);
+
+ private:
+  explicit ScoringProcessor(std::unique_ptr<Scorer> scorer, bool is_descending)
+      : scorer_(std::move(scorer)), is_descending_(is_descending) {}
+
+  // Performs the per-document scoring.
+  std::unique_ptr<Scorer> scorer_;
+
+  // If true, the final result will be sorted in a descending order, otherwise
+  // ascending.
+  bool is_descending_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_SCORING_PROCESSOR_H_
diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc
new file mode 100644
index 0000000..5f61cb6
--- /dev/null
+++ b/icing/scoring/scoring-processor_test.cc
@@ -0,0 +1,541 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/scoring-processor.h"
+
+#include <cstdint>
+
+#include "utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+using ::testing::Test;
+
+class ScoringProcessorTest : public Test {
+ protected:
+ ScoringProcessorTest()
+ : test_dir_(GetTestTempDir() + "/icing"),
+ doc_store_dir_(test_dir_ + "/doc_store"),
+ schema_store_dir_(test_dir_ + "/schema_store") {}
+
+ void SetUp() override {
+ // Creates file directories
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Creates a simple email schema
+ SchemaProto test_email_schema;
+ auto type_config = test_email_schema.add_types();
+ type_config->set_schema_type("email");
+ auto subject = type_config->add_properties();
+ subject->set_property_name("subject");
+ subject->set_data_type(PropertyConfigProto::DataType::STRING);
+ subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema));
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ DocumentStore* document_store() { return document_store_.get(); }
+
+ private:
+ const std::string test_dir_;
+ const std::string doc_store_dir_;
+ const std::string schema_store_dir_;
+ Filesystem filesystem_;
+ FakeClock fake_clock_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<SchemaStore> schema_store_;
+};
+
+constexpr int kDefaultScore = 0;
+constexpr int64_t kDefaultCreationTimestampSecs = 1571100001;
+
+DocumentProto CreateDocument(const std::string& name_space,
+ const std::string& uri, int score,
+ int64_t creation_timestamp_secs) {
+ return DocumentBuilder()
+ .SetKey(name_space, uri)
+ .SetSchema("email")
+ .SetScore(score)
+ .SetCreationTimestampSecs(creation_timestamp_secs)
+ .Build();
+}
+
+libtextclassifier3::StatusOr<
+ std::pair<std::vector<DocHitInfo>, std::vector<ScoredDocumentHit>>>
+CreateAndInsertsDocumentsWithScores(DocumentStore* document_store,
+ const std::vector<int>& scores) {
+ std::vector<DocHitInfo> doc_hit_infos;
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ for (int i = 0; i < scores.size(); i++) {
+ ICING_ASSIGN_OR_RETURN(DocumentId document_id,
+ document_store->Put(CreateDocument(
+ "icing", "email/" + std::to_string(i),
+ scores.at(i), kDefaultCreationTimestampSecs)));
+ doc_hit_infos.emplace_back(document_id);
+ scored_document_hits.emplace_back(document_id, kSectionIdMaskNone,
+ scores.at(i));
+ }
+ return std::pair(doc_hit_infos, scored_document_hits);
+}
+
+TEST_F(ScoringProcessorTest, FailToCreateOnInvalidRankingStrategy) {
+ ScoringSpecProto spec_proto;
+ EXPECT_THAT(ScoringProcessor::Create(spec_proto, document_store()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(ScoringProcessorTest, ShouldCreateInstance) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ICING_EXPECT_OK(ScoringProcessor::Create(spec_proto, document_store()));
+}
+
+TEST_F(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Creates an empty DocHitInfoIterator
+ std::vector<DocHitInfo> doc_hit_infos = {};
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/5),
+ IsEmpty());
+}
+
+TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToReturn) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Sets up documents
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(CreateDocument("icing", "email/1", /*score=*/1,
+ kDefaultCreationTimestampSecs)));
+ DocHitInfo doc_hit_info1(document_id1);
+
+ // Creates a dummy DocHitInfoIterator
+ std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1};
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/-1),
+ IsEmpty());
+
+ doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/0),
+ IsEmpty());
+}
+
+TEST_F(ScoringProcessorTest, ShouldRespectNumToReturn) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Sets up documents
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto doc_hit_result_pair,
+ CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
+ std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
+
+  // Shuffles doc_hit_infos so the input order can't mask ranking bugs
+ std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
+ std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/2),
+ SizeIs(2));
+
+ doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/4),
+ SizeIs(3));
+}
+
+TEST_F(ScoringProcessorTest, ShouldRankByDocumentScoreDesc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Sets up documents, guaranteed relationship:
+ // document1 < document2 < document3
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto doc_hit_result_pair,
+ CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
+ std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ std::move(doc_hit_result_pair.second);
+
+  // Shuffles doc_hit_infos so the input order can't mask ranking bugs
+ std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
+ std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(2)),
+ EqualsScoredDocumentHit(scored_document_hits.at(1)),
+ EqualsScoredDocumentHit(scored_document_hits.at(0))));
+}
+
+TEST_F(ScoringProcessorTest, ShouldRankByDocumentScoreAsc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
+
+ // Sets up documents, guaranteed relationship:
+ // document1 < document2 < document3
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto doc_hit_result_pair,
+ CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
+ std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ std::move(doc_hit_result_pair.second);
+
+  // Shuffles doc_hit_infos so the input order can't mask ranking bugs
+ std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
+ std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(0)),
+ EqualsScoredDocumentHit(scored_document_hits.at(1)),
+ EqualsScoredDocumentHit(scored_document_hits.at(2))));
+}
+
+TEST_F(ScoringProcessorTest, ShouldRankByCreationTimestampDesc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ // Sets up documents, guaranteed relationship:
+ // document1 < document2 < document3
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_secs=*/1571100001);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_secs=*/1571100002);
+ DocumentProto document3 =
+ CreateDocument("icing", "email/3", kDefaultScore,
+ /*creation_timestamp_secs=*/1571100003);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store()->Put(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store()->Put(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store()->Put(document3));
+ DocHitInfo doc_hit_info1(document_id1);
+ DocHitInfo doc_hit_info2(document_id2);
+ DocHitInfo doc_hit_info3(document_id3);
+ ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
+ document1.creation_timestamp_secs());
+ ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
+ document2.creation_timestamp_secs());
+ ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone,
+ document3.creation_timestamp_secs());
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info2, doc_hit_info3,
+ doc_hit_info1};
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor which ranks in descending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit3),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit1)));
+}
+
+TEST_F(ScoringProcessorTest, ShouldRankByCreationTimestampAsc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
+
+ // Sets up documents, guaranteed relationship:
+ // document1 < document2 < document3
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_secs=*/1571100001);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_secs=*/1571100002);
+ DocumentProto document3 =
+ CreateDocument("icing", "email/3", kDefaultScore,
+ /*creation_timestamp_secs=*/1571100003);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store()->Put(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store()->Put(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store()->Put(document3));
+ DocHitInfo doc_hit_info1(document_id1);
+ DocHitInfo doc_hit_info2(document_id2);
+ DocHitInfo doc_hit_info3(document_id3);
+ ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
+ document1.creation_timestamp_secs());
+ ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
+ document2.creation_timestamp_secs());
+ ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone,
+ document3.creation_timestamp_secs());
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info2, doc_hit_info3,
+ doc_hit_info1};
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Creates a ScoringProcessor which ranks in ascending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit3)));
+}
+
+TEST_F(ScoringProcessorTest, ShouldHandleSameScoresDesc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Creates 3 documents with the same score.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto doc_hit_result_pair,
+ CreateAndInsertsDocumentsWithScores(document_store(), {100, 100, 100}));
+ std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ std::move(doc_hit_result_pair.second);
+
+  // Shuffles doc_hit_infos so the input order can't mask ranking bugs
+ std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
+ std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor which ranks in descending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ // Results should be ranked in descending document id order.
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(2)),
+ EqualsScoredDocumentHit(scored_document_hits.at(1)),
+ EqualsScoredDocumentHit(scored_document_hits.at(0))));
+}
+
+TEST_F(ScoringProcessorTest, ShouldHandleSameScoresAsc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
+
+ // Creates 3 documents with the same score.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto doc_hit_result_pair,
+ CreateAndInsertsDocumentsWithScores(document_store(), {100, 100, 100}));
+ std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ std::move(doc_hit_result_pair.second);
+
+  // Shuffles doc_hit_infos so the input order can't mask ranking bugs
+ std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
+ std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Creates a ScoringProcessor which ranks in ascending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+
+ // Results should be ranked in ascending document id order.
+ EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(0)),
+ EqualsScoredDocumentHit(scored_document_hits.at(1)),
+ EqualsScoredDocumentHit(scored_document_hits.at(2))));
+}
+
+TEST_F(ScoringProcessorTest, ShouldHandleNoScoresDesc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Sets up documents, guaranteed relationship:
+ // document1 < document2 < document3
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto doc_hit_result_pair,
+ CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
+ std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ std::move(doc_hit_result_pair.second);
+
+  // Shuffles doc_hit_infos so the input order can't mask ranking bugs
+ std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
+ std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+
+  // Creates a dummy DocHitInfoIterator with 4 results, one of which doesn't have
+ // a score.
+ doc_hit_infos.emplace(doc_hit_infos.begin(), /*document_id_in=*/4,
+ kSectionIdMaskNone);
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // The document hit without a score will be assigned the default score 0 in
+ // a descending order.
+ ScoredDocumentHit scored_document_hit_default_desc =
+ ScoredDocumentHit(4, kSectionIdMaskNone, /*score=*/0.0);
+
+ // Creates a ScoringProcessor which ranks in descending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+ EXPECT_THAT(
+ scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/4),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(2)),
+ EqualsScoredDocumentHit(scored_document_hits.at(1)),
+ EqualsScoredDocumentHit(scored_document_hits.at(0)),
+ EqualsScoredDocumentHit(scored_document_hit_default_desc)));
+}
+
+TEST_F(ScoringProcessorTest, ShouldHandleNoScoresAsc) {
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
+
+ // Sets up documents, guaranteed relationship:
+ // document1 < document2 < document3
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto doc_hit_result_pair,
+ CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
+ std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ std::move(doc_hit_result_pair.second);
+
+  // Shuffles doc_hit_infos so the input order can't mask ranking bugs
+ std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
+ std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+
+  // Creates a dummy DocHitInfoIterator with 4 results, one of which doesn't have
+ // a score.
+ doc_hit_infos.emplace(doc_hit_infos.begin(), /*document_id_in=*/4,
+ kSectionIdMaskNone);
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // The document hit without a score will be assigned the default score
+ // max of float in an ascending order.
+ ScoredDocumentHit scored_document_hit_default_asc = ScoredDocumentHit(
+ 4, kSectionIdMaskNone, /*score=*/std::numeric_limits<float>::max());
+
+ // Creates a ScoringProcessor which ranks in ascending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store()));
+ EXPECT_THAT(
+ scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
+ /*num_to_return=*/4),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(0)),
+ EqualsScoredDocumentHit(scored_document_hits.at(1)),
+ EqualsScoredDocumentHit(scored_document_hits.at(2)),
+ EqualsScoredDocumentHit(scored_document_hit_default_asc)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/snippet-retriever.cc b/icing/snippet-retriever.cc
new file mode 100644
index 0000000..d9242a3
--- /dev/null
+++ b/icing/snippet-retriever.cc
@@ -0,0 +1,341 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/snippet-retriever.h"
+
+#include <algorithm>
+#include <cctype>
+#include <memory>
+#include <string_view>
+#include <unordered_set>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/util/i18n-utils.h"
+#include "unicode/utf8.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+class TokenMatcher {
+ public:
+ virtual ~TokenMatcher() = default;
+ virtual bool Matches(Token token) const = 0;
+};
+
+class TokenMatcherExact : public TokenMatcher {
+ public:
+ explicit TokenMatcherExact(
+ const std::unordered_set<std::string>& unrestricted_query_terms,
+ const std::unordered_set<std::string>& restricted_query_terms)
+ : unrestricted_query_terms_(unrestricted_query_terms),
+ restricted_query_terms_(restricted_query_terms) {}
+
+ bool Matches(Token token) const override {
+ // TODO(tjbarron) : Add normalization of token.
+ std::string s(token.text);
+ return (unrestricted_query_terms_.count(s) > 0) ||
+ (restricted_query_terms_.count(s) > 0);
+ }
+
+ private:
+ const std::unordered_set<std::string>& unrestricted_query_terms_;
+ const std::unordered_set<std::string>& restricted_query_terms_;
+};
+
+class TokenMatcherPrefix : public TokenMatcher {
+ public:
+ explicit TokenMatcherPrefix(
+ const std::unordered_set<std::string>& unrestricted_query_terms,
+ const std::unordered_set<std::string>& restricted_query_terms)
+ : unrestricted_query_terms_(unrestricted_query_terms),
+ restricted_query_terms_(restricted_query_terms) {}
+
+ bool Matches(Token token) const override {
+ if (std::any_of(unrestricted_query_terms_.begin(),
+ unrestricted_query_terms_.end(),
+ [&token](const std::string& term) {
+ return term.length() <= token.text.length() &&
+ token.text.compare(0, term.length(), term) == 0;
+ })) {
+ return true;
+ }
+ return std::any_of(restricted_query_terms_.begin(),
+ restricted_query_terms_.end(),
+ [token](const std::string& term) {
+ return term.length() <= token.text.length() &&
+ token.text.compare(0, term.length(), term) == 0;
+ });
+ }
+
+ private:
+ const std::unordered_set<std::string>& unrestricted_query_terms_;
+ const std::unordered_set<std::string>& restricted_query_terms_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
+ TermMatchType::Code match_type,
+ const std::unordered_set<std::string>& unrestricted_query_terms,
+ const std::unordered_set<std::string>& restricted_query_terms) {
+ switch (match_type) {
+ case TermMatchType::EXACT_ONLY:
+ return std::make_unique<TokenMatcherExact>(unrestricted_query_terms,
+ restricted_query_terms);
+ case TermMatchType::PREFIX:
+ return std::make_unique<TokenMatcherPrefix>(unrestricted_query_terms,
+ restricted_query_terms);
+ case TermMatchType::UNKNOWN:
+ U_FALLTHROUGH;
+ default:
+ return absl_ports::InvalidArgumentError("Invalid match type provided.");
+ }
+}
+
+// Determines the start position of the snippet window around a match,
+// bounded by the snippet spec's max_window_bytes.
+
+// Returns:
+// the position of the window start if successful
+// INTERNAL_ERROR - if a tokenizer error is encountered
+libtextclassifier3::StatusOr<int> DetermineWindowStart(
+ const ResultSpecProto::SnippetSpecProto& snippet_spec,
+ std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
+ int window_start_min =
+ std::max((match_mid - snippet_spec.max_window_bytes() / 2), 0);
+ if (window_start_min == 0) {
+ return 0;
+ }
+ if (!iterator->ResetToTokenAfter(window_start_min - 1)) {
+ return absl_ports::InternalError(
+ "Couldn't reset tokenizer to determine snippet window!");
+ }
+ return iterator->GetToken().text.data() - value.data();
+}
+
+// Increments window_end_exclusive so long as the character at the position
+// of window_end_exclusive is punctuation and does not exceed
+// window_end_max_exclusive.
+int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive,
+ int window_end_max_exclusive) {
+ while (window_end_exclusive < window_end_max_exclusive) {
+ int char_len = 0;
+ if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) {
+ break;
+ }
+ if (window_end_exclusive + char_len > window_end_max_exclusive) {
+ // This is punctuation, but it goes beyond the window end max. Don't
+ // include it.
+ break;
+ }
+ // Expand window by char_len and check the next character.
+ window_end_exclusive += char_len;
+ }
+ return window_end_exclusive;
+}
+
+// Returns:
+// the position of the window end if successful
+// INTERNAL_ERROR - if a tokenizer error is encountered
+libtextclassifier3::StatusOr<int> DetermineWindowEnd(
+ const ResultSpecProto::SnippetSpecProto& snippet_spec,
+ std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
+ int window_end_max_exclusive =
+ std::min((match_mid + snippet_spec.max_window_bytes() / 2),
+ static_cast<int>(value.length()));
+ if (window_end_max_exclusive == value.length()) {
+ return window_end_max_exclusive;
+ }
+ if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) {
+ return absl_ports::InternalError(
+ "Couldn't reset tokenizer to determine snippet window!");
+ }
+ int window_end_exclusive = iterator->GetToken().text.data() - value.data() +
+ iterator->GetToken().text.length();
+ return IncludeTrailingPunctuation(value, window_end_exclusive,
+ window_end_max_exclusive);
+}
+
+struct SectionData {
+ std::string_view section_name;
+ std::string_view section_subcontent;
+ // Identifies which subsection of the section content, section_subcontent has
+ // come from.
+ // Ex. "recipient.address" :
+ // ["foo@google.com", "bar@google.com", "baz@google.com"]
+ // The subcontent_index of "bar@google.com" is 1.
+ int subcontent_index;
+};
+
+libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
+ const ResultSpecProto::SnippetSpecProto& snippet_spec,
+ const SectionData& value, Tokenizer::Iterator* iterator) {
+ SnippetMatchProto snippet_match;
+ snippet_match.set_values_index(value.subcontent_index);
+
+ Token match = iterator->GetToken();
+ int match_pos = match.text.data() - value.section_subcontent.data();
+ int match_mid = match_pos + match.text.length() / 2;
+
+ snippet_match.set_exact_match_position(match_pos);
+ snippet_match.set_exact_match_bytes(match.text.length());
+
+ if (snippet_spec.max_window_bytes() > match.text.length()) {
+ // Find the beginning of the window.
+ ICING_ASSIGN_OR_RETURN(
+ int window_start,
+ DetermineWindowStart(snippet_spec, value.section_subcontent, match_mid,
+ iterator));
+ snippet_match.set_window_position(window_start);
+
+ // Find the end of the window.
+ ICING_ASSIGN_OR_RETURN(
+ int window_end_exclusive,
+ DetermineWindowEnd(snippet_spec, value.section_subcontent, match_mid,
+ iterator));
+ snippet_match.set_window_bytes(window_end_exclusive - window_start);
+
+ // Reset the iterator back to the original position.
+ if (!iterator->ResetToTokenAfter(match_pos - 1)) {
+ return absl_ports::InternalError(
+ "Couldn't reset tokenizer to determine snippet window!");
+ }
+ }
+
+ return snippet_match;
+}
+
+struct MatchOptions {
+ const ResultSpecProto::SnippetSpecProto& snippet_spec;
+ int max_matches_remaining;
+};
+
+libtextclassifier3::StatusOr<SnippetProto::EntryProto> RetrieveMatches(
+ const TokenMatcher* matcher, const MatchOptions& match_options,
+ const SectionData& value, const Tokenizer* tokenizer) {
+ SnippetProto::EntryProto snippet_entry;
+ snippet_entry.set_property_name(std::string(value.section_name));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
+ tokenizer->Tokenize(value.section_subcontent));
+ while (iterator->Advance()) {
+ if (snippet_entry.snippet_matches_size() >=
+ match_options.max_matches_remaining) {
+ break;
+ }
+ Token token = iterator->GetToken();
+ if (matcher->Matches(token)) {
+ // If there was an error while retrieving the match, the tokenizer
+ // iterator is probably in an invalid state. There's nothing we can do
+ // here, so just return.
+ ICING_ASSIGN_OR_RETURN(
+ SnippetMatchProto match,
+ RetrieveMatch(match_options.snippet_spec, value, iterator.get()));
+ snippet_entry.mutable_snippet_matches()->Add(std::move(match));
+ }
+ }
+ if (snippet_entry.snippet_matches().empty()) {
+ return absl_ports::NotFoundError("No matches found in value!");
+ }
+ return snippet_entry;
+}
+
+} // namespace
+
+SnippetProto SnippetRetriever::RetrieveSnippet(
+ const SectionRestrictQueryTermsMap& query_terms,
+ TermMatchType::Code match_type,
+ const ResultSpecProto::SnippetSpecProto& snippet_spec,
+ const DocumentProto& document, SectionIdMask section_id_mask) const {
+ SnippetProto snippet_proto;
+ ICING_ASSIGN_OR_RETURN_VAL(SchemaTypeId type_id,
+ schema_store_.GetSchemaTypeId(document.schema()),
+ snippet_proto);
+ const std::unordered_set<std::string> empty_set;
+ auto itr = query_terms.find("");
+ const std::unordered_set<std::string>& unrestricted_set =
+ (itr != query_terms.end()) ? itr->second : empty_set;
+ while (section_id_mask != kSectionIdMaskNone) {
+ SectionId section_id = __builtin_ctz(section_id_mask);
+ // Remove this section from the mask.
+ section_id_mask &= ~(1u << section_id);
+
+ // Determine the section name and match type.
+ auto section_metadata_or =
+ schema_store_.GetSectionMetadata(type_id, section_id);
+ if (!section_metadata_or.ok()) {
+ continue;
+ }
+ const SectionMetadata* metadata = section_metadata_or.ValueOrDie();
+ MatchOptions match_options = {snippet_spec};
+ // Match type must be as restrictive as possible. Prefix matches for a
+ // snippet should only be included if both the query is Prefix and the
+ // section has prefixes enabled.
+ TermMatchType::Code section_match_type = TermMatchType::EXACT_ONLY;
+ if (match_type == TermMatchType::PREFIX &&
+ metadata->term_match_type == TermMatchType::PREFIX) {
+ section_match_type = TermMatchType::PREFIX;
+ }
+
+ itr = query_terms.find(metadata->path);
+ const std::unordered_set<std::string>& restricted_set =
+ (itr != query_terms.end()) ? itr->second : empty_set;
+ libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> matcher_or =
+ CreateTokenMatcher(section_match_type, unrestricted_set,
+ restricted_set);
+ if (!matcher_or.ok()) {
+ continue;
+ }
+ match_options.max_matches_remaining =
+ snippet_spec.num_matches_per_property();
+
+ // Retrieve values and snippet them.
+ auto values_or = schema_store_.GetSectionContent(document, metadata->path);
+ if (!values_or.ok()) {
+ continue;
+ }
+ auto tokenizer_or = tokenizer_factory::CreateIndexingTokenizer(
+ metadata->tokenizer, &language_segmenter_);
+ if (!tokenizer_or.ok()) {
+ // If we couldn't create the tokenizer properly, just skip this section.
+ continue;
+ }
+ std::vector<std::string> values = values_or.ValueOrDie();
+ for (int value_index = 0; value_index < values.size(); ++value_index) {
+ if (match_options.max_matches_remaining <= 0) {
+ break;
+ }
+ SectionData value = {metadata->path, values.at(value_index), value_index};
+ auto entry_or =
+ RetrieveMatches(matcher_or.ValueOrDie().get(), match_options, value,
+ tokenizer_or.ValueOrDie().get());
+
+ // Drop any entries that encountered errors or didn't find any matches.
+ if (entry_or.ok()) {
+ match_options.max_matches_remaining -=
+ entry_or.ValueOrDie().snippet_matches_size();
+ snippet_proto.mutable_entries()->Add(std::move(entry_or).ValueOrDie());
+ }
+ }
+ }
+ return snippet_proto;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/snippet-retriever.h b/icing/snippet-retriever.h
new file mode 100644
index 0000000..879b322
--- /dev/null
+++ b/icing/snippet-retriever.h
@@ -0,0 +1,65 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SNIPPET_RETRIEVER_H_
+#define ICING_SNIPPET_RETRIEVER_H_
+
+#include "icing/proto/document.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+// This class provides functions to retrieve snippets from documents. Snippets
+// are retrieved anywhere that content in the document matches query_terms
+// according to match_type. The behavior of snippet population is determined by
+// the SnippetSpecProto.
+//
+// This class does not take ownership of any of the provided pointers. The only
+// constraint for the lifecycle of this class is that it must be shorter than
+// that of the provided pointers.
+class SnippetRetriever {
+ public:
+ // Does not take any ownership, and all pointers must refer to valid objects
+ // that outlive the one constructed.
+ explicit SnippetRetriever(const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter)
+ : schema_store_(*schema_store),
+ language_segmenter_(*language_segmenter) {}
+
+ // Retrieve the snippet information for content in document. terms in
+ // query_terms are matched to content in document according to match_type.
+ // Only sections identified in section_id_mask are considered.
+ //
+ // Returns an empty SnippetProto if no snippets were found.
+ SnippetProto RetrieveSnippet(
+ const SectionRestrictQueryTermsMap& query_terms,
+ TermMatchType::Code match_type,
+ const ResultSpecProto::SnippetSpecProto& snippet_spec,
+ const DocumentProto& document, SectionIdMask section_id_mask) const;
+
+ private:
+ const SchemaStore& schema_store_;
+ const LanguageSegmenter& language_segmenter_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SNIPPET_RETRIEVER_H_
diff --git a/icing/snippet-retriever_test.cc b/icing/snippet-retriever_test.cc
new file mode 100644
index 0000000..4c53fa3
--- /dev/null
+++ b/icing/snippet-retriever_test.cc
@@ -0,0 +1,560 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/snippet-retriever.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section-manager.h"
+#include "icing/store/document-id.h"
+#include "icing/store/key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/snippet-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/i18n-utils.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+class SnippetRetrieverTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing";
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
+ LanguageSegmenter::Create(GetLangIdModelPath()));
+
+ // Setup the schema
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ SchemaProto schema;
+ SchemaTypeConfigProto* type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ PropertyConfigProto* prop_config = type_config->add_properties();
+ prop_config->set_property_name("subject");
+ prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
+ prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ prop_config->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ prop_config->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+ prop_config = type_config->add_properties();
+ prop_config->set_property_name("body");
+ prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
+ prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ prop_config->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ prop_config->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+ ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+
+ snippet_retriever_ = std::make_unique<SnippetRetriever>(
+ schema_store_.get(), language_segmenter_.get());
+
+ // Set limits to max - effectively no limit. Enable matching and request a
+ // window of 64 bytes.
+ snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
+ snippet_spec_.set_num_matches_per_property(
+ std::numeric_limits<int32_t>::max());
+ snippet_spec_.set_max_window_bytes(64);
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SnippetRetriever> snippet_retriever_;
+ ResultSpecProto::SnippetSpecProto snippet_spec_;
+ std::string test_dir_;
+};
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window starts at the beginning of "three" and ends in the middle of
+ // "three". len=4, orig_window="thre"
+ snippet_spec_.set_max_window_bytes(4);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq(""));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window starts at the space between "one" and "two". Window ends in the
+ // middle of "four".
+ // len=14, orig_window=" two three fou"
+ snippet_spec_.set_max_window_bytes(14);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("two three"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window starts in the middle of "one" and ends at the end of "four".
+ // len=16, orig_window="e two three four"
+ snippet_spec_.set_max_window_bytes(16);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("two three four"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window ends in the middle of all the punctuation and window starts at 0.
+ // len=20, orig_window="one two three four.."
+ snippet_spec_.set_max_window_bytes(20);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("one two three four.."));
+}
+
+TEST_F(SnippetRetrieverTest,
+ SnippetingWindowMaxWindowEndsInMiddleOfMultiBytePunctuation) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body",
+ "Is everything upside down in Australia¿ Crikey!")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
+
+ // Window ends in the middle of the two-byte character "¿" (\xC2\xBF).
+ // len=24, orig_window="pside down in Australia\xC2"
+ snippet_spec_.set_max_window_bytes(24);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("down in Australia"));
+}
+
+TEST_F(SnippetRetrieverTest,
+ SnippetingWindowMaxWindowEndsInMultiBytePunctuation) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body",
+ "Is everything upside down in Australia¿ Crikey!")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
+
+ // Window ends exactly at the end of the two-byte character "¿" (\xC2\xBF).
+ // len=26, orig_window="upside down in Australia\xC2\xBF"
+ snippet_spec_.set_max_window_bytes(26);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("upside down in Australia¿"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window starts before 0.
+ // len=22, orig_window="one two three four..."
+ snippet_spec_.set_max_window_bytes(22);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("one two three four..."));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window ends before "five" but after all the punctuation
+ // len=26, orig_window="one two three four.... "
+ snippet_spec_.set_max_window_bytes(26);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("one two three four...."));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window ends in the middle of "five"
+ // len=32, orig_window="one two three four.... fiv"
+ snippet_spec_.set_max_window_bytes(32);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("one two three four...."));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Max window size equals the size of the value.
+ // len=34, orig_window="one two three four.... five"
+ snippet_spec_.set_max_window_bytes(34);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("one two three four.... five"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Max window size exceeds the size of the value.
+ // len=36, orig_window="one two three four.... five"
+ snippet_spec_.set_max_window_bytes(36);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
+ Eq("one two three four.... five"));
+}
+
+TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "Only a fool would match this content.")
+ .Build();
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::PREFIX, snippet_spec_, document,
+ section_mask);
+
+ // Check the snippets. 'f' should match prefix-enabled property 'subject', but
+ // not exact-only property 'body'
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
+ EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+}
+
+TEST_F(SnippetRetrieverTest, ExactSnippeting) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "Only a fool would match this content.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+
+ // Check the snippets
+ EXPECT_THAT(snippet.entries(), IsEmpty());
+}
+
+TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "Only a fool would match this content.")
+ .Build();
+
+ snippet_spec_.set_max_window_bytes(0);
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+
+ // Check the snippets
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "subject", 0), IsEmpty());
+ EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::PREFIX, snippet_spec_, document,
+ section_mask);
+
+ // Check the snippets
+ EXPECT_THAT(snippet.entries(), SizeIs(2));
+ EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
+ EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+ EXPECT_THAT(
+ GetWindow(document, snippet, "body", 0),
+ Eq("Concerning the subject of foo, we need to begin considering"));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo"));
+ EXPECT_THAT(GetWindow(document, snippet, "body", 1),
+ Eq("our options regarding body bar."));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("bar"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+ // Section 1 "subject" is not in the section_mask, so no snippet information
+ // from that section should be returned by the SnippetRetriever.
+ SectionIdMask section_mask = 0b00000001;
+ SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::PREFIX, snippet_spec_, document,
+ section_mask);
+
+ // Check the snippets
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(
+ GetWindow(document, snippet, "body", 0),
+ Eq("Concerning the subject of foo, we need to begin considering"));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo"));
+ EXPECT_THAT(GetWindow(document, snippet, "body", 1),
+ Eq("our options regarding body bar."));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("bar"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+ SectionIdMask section_mask = 0b00000011;
+ // "subject" should match in both sections, but "foo" is restricted to "body"
+ // so it should only match in the 'body' section and not the 'subject'
+ // section.
+ SectionRestrictQueryTermsMap query_terms{{"", {"subject"}},
+ {"body", {"foo"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::PREFIX, snippet_spec_, document,
+ section_mask);
+
+ // Check the snippets
+ EXPECT_THAT(snippet.entries(), SizeIs(2));
+ // 'subject' section should only have the one match for "subject".
+ EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
+ EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("subject"));
+ EXPECT_THAT(GetWindow(document, snippet, "subject", 1), IsEmpty());
+ EXPECT_THAT(GetMatch(document, snippet, "subject", 1), IsEmpty());
+
+ // 'body' section should have matches for "subject" and "foo".
+ EXPECT_THAT(GetWindow(document, snippet, "body", 0),
+ Eq("Concerning the subject of foo, we need to begin"));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("subject"));
+ EXPECT_THAT(
+ GetWindow(document, snippet, "body", 1),
+ Eq("Concerning the subject of foo, we need to begin considering"));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("foo"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ snippet_spec_.set_num_matches_per_property(1);
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::PREFIX, snippet_spec_, document,
+ section_mask);
+
+ // Check the snippets
+ EXPECT_THAT(snippet.entries(), SizeIs(2));
+ EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
+ EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+ EXPECT_THAT(
+ GetWindow(document, snippet, "body", 0),
+ Eq("Concerning the subject of foo, we need to begin considering"));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo"));
+ EXPECT_THAT(GetWindow(document, snippet, "body", 1), IsEmpty());
+ EXPECT_THAT(GetMatch(document, snippet, "body", 1), IsEmpty());
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/document-associated-score-data.h b/icing/store/document-associated-score-data.h
new file mode 100644
index 0000000..65b35e1
--- /dev/null
+++ b/icing/store/document-associated-score-data.h
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_DOCUMENT_ASSOCIATED_SCORE_DATA_H_
+#define ICING_STORE_DOCUMENT_ASSOCIATED_SCORE_DATA_H_
+
+#include <cstdint>
+#include <type_traits>
+
+#include "icing/legacy/core/icing-packed-pod.h"
+
+namespace icing {
+namespace lib {
+
+// This is the cache entity of document-associated scores. It contains scores
+// that are related to the document itself. The ground-truth data is stored
+// somewhere else. The cache includes:
+// 1. Document score. It's defined in and passed from DocumentProto.score.
+// Positive values are required.
+// 2. Document creation timestamp. Unix timestamp of when the document is
+// created and inserted into Icing.
+class DocumentAssociatedScoreData {
+ public:
+ explicit DocumentAssociatedScoreData(int document_score,
+ int64_t creation_timestamp_secs)
+ : document_score_(document_score),
+ creation_timestamp_secs_(creation_timestamp_secs) {}
+
+ bool operator==(const DocumentAssociatedScoreData& other) const {
+ return document_score_ == other.document_score() &&
+ creation_timestamp_secs_ == other.creation_timestamp_secs();
+ }
+
+ int document_score() const { return document_score_; }
+
+ int64_t creation_timestamp_secs() const { return creation_timestamp_secs_; }
+
+ private:
+ int document_score_;
+ int64_t creation_timestamp_secs_;
+} __attribute__((packed));
+
+static_assert(sizeof(DocumentAssociatedScoreData) == 12,
+ "Size of DocumentAssociatedScoreData should be 12");
+static_assert(icing_is_packed_pod<DocumentAssociatedScoreData>::value,
+ "go/icing-ubsan");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_DOCUMENT_ASSOCIATED_SCORE_DATA_H_
diff --git a/icing/store/document-filter-data.h b/icing/store/document-filter-data.h
new file mode 100644
index 0000000..86d0efd
--- /dev/null
+++ b/icing/store/document-filter-data.h
@@ -0,0 +1,67 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_DOCUMENT_FILTER_DATA_H_
+#define ICING_STORE_DOCUMENT_FILTER_DATA_H_
+
+#include <cstdint>
+#include <type_traits>
+
+#include "icing/legacy/core/icing-packed-pod.h"
+
+namespace icing {
+namespace lib {
+
+using NamespaceId = int16_t;
+using SchemaTypeId = int16_t;
+
+class DocumentFilterData {
+ public:
+ explicit DocumentFilterData(NamespaceId namespace_id,
+ SchemaTypeId schema_type_id,
+ int64_t expiration_timestamp_secs)
+ : expiration_timestamp_secs_(expiration_timestamp_secs),
+ namespace_id_(namespace_id),
+ schema_type_id_(schema_type_id) {}
+
+ bool operator==(const DocumentFilterData& other) const {
+ return namespace_id_ == other.namespace_id() &&
+ schema_type_id_ == other.schema_type_id() &&
+ expiration_timestamp_secs_ == other.expiration_timestamp_secs();
+ }
+
+ NamespaceId namespace_id() const { return namespace_id_; }
+
+ SchemaTypeId schema_type_id() const { return schema_type_id_; }
+ void set_schema_type_id(SchemaTypeId schema_type_id) {
+ schema_type_id_ = schema_type_id;
+ }
+
+ int64_t expiration_timestamp_secs() const {
+ return expiration_timestamp_secs_;
+ }
+
+ private:
+ int64_t expiration_timestamp_secs_;
+ NamespaceId namespace_id_;
+ SchemaTypeId schema_type_id_;
+} __attribute__((packed));
+
+static_assert(sizeof(DocumentFilterData) == 12, "");
+static_assert(icing_is_packed_pod<DocumentFilterData>::value, "go/icing-ubsan");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_DOCUMENT_FILTER_DATA_H_
diff --git a/icing/store/document-id.h b/icing/store/document-id.h
new file mode 100644
index 0000000..cbe9959
--- /dev/null
+++ b/icing/store/document-id.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_DOCUMENT_ID_H_
+#define ICING_STORE_DOCUMENT_ID_H_
+
+#include <cstdint>
+
+namespace icing {
+namespace lib {
+
+// Id of a document
+using DocumentId = int32_t;
+
+// We use 20 bits to encode document_ids and use the largest value (1M - 1) to
+// represent an invalid document_id.
+inline constexpr int kDocumentIdBits = 20;
+inline constexpr DocumentId kInvalidDocumentId = (1u << kDocumentIdBits) - 1;
+inline constexpr DocumentId kMinDocumentId = 0;
+inline constexpr DocumentId kMaxDocumentId = kInvalidDocumentId - 1;
+
+constexpr bool IsDocumentIdValid(DocumentId document_id) {
+ return document_id >= kMinDocumentId && document_id <= kMaxDocumentId;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_DOCUMENT_ID_H_
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
new file mode 100644
index 0000000..b9b6738
--- /dev/null
+++ b/icing/store/document-store.cc
@@ -0,0 +1,1214 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/document-store.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "utils/hash/farmhash.h"
+#include "icing/absl_ports/annotate.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-proto-log.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-associated-score-data.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/key-mapper.h"
+#include "icing/util/clock.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Sentinel value stored in the DocumentId mapper to mark a document as
+// deleted; real entries hold a non-negative log file offset.
+constexpr int64_t kDocDeletedFlag = -1;
+constexpr char kDocumentLogFilename[] = "document_log";
+constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
+constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
+constexpr char kScoreCacheFilename[] = "score_cache";
+constexpr char kFilterCacheFilename[] = "filter_cache";
+constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
+
+constexpr int32_t kUriMapperMaxSize = 12 * 1024 * 1024; // 12 MiB
+
+// 384 KiB for a KeyMapper would allow each internal array to have a max of
+// 128 KiB for storage.
+constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024; // 384 KiB
+
+// Wraps a DocumentProto into the DocumentWrapper envelope written to the
+// proto log. The document is moved in, not copied.
+DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
+  DocumentWrapper document_wrapper;
+  *document_wrapper.mutable_document() = std::move(document);
+  return document_wrapper;
+}
+
+// Tombstone for a single document, identified by (namespace, uri). Recovery
+// (RegenerateDerivedFiles) distinguishes tombstone kinds by which of these
+// fields are non-empty.
+DocumentWrapper CreateDocumentTombstone(std::string_view document_namespace,
+                                        std::string_view document_uri) {
+  DocumentWrapper document_wrapper;
+  document_wrapper.set_deleted(true);
+  DocumentProto* document = document_wrapper.mutable_document();
+  document->set_namespace_(std::string(document_namespace));
+  document->set_uri(std::string(document_uri));
+  return document_wrapper;
+}
+
+// Tombstone for an entire namespace: deleted bit set, namespace set, uri left
+// empty so recovery can tell it apart from a single-document tombstone.
+DocumentWrapper CreateNamespaceTombstone(std::string_view document_namespace) {
+  DocumentWrapper document_wrapper;
+  document_wrapper.set_deleted(true);
+  DocumentProto* document = document_wrapper.mutable_document();
+  document->set_namespace_(std::string(document_namespace));
+  return document_wrapper;
+}
+
+// Tombstone for all documents of a schema type: deleted bit set, only the
+// schema field populated (namespace and uri empty).
+DocumentWrapper CreateSchemaTypeTombstone(
+    std::string_view document_schema_type) {
+  DocumentWrapper document_wrapper;
+  document_wrapper.set_deleted(true);
+  DocumentProto* document = document_wrapper.mutable_document();
+  document->set_schema(std::string(document_schema_type));
+  return document_wrapper;
+}
+
+// Filename helpers: every derived file lives directly under base_dir.
+std::string MakeHeaderFilename(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
+}
+
+std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
+}
+
+std::string MakeDocumentLogFilename(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kDocumentLogFilename);
+}
+
+std::string MakeScoreCacheFilename(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
+}
+
+std::string MakeFilterCacheFilename(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
+}
+
+std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
+  return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
+}
+
+// TODO(adorokhine): This class internally uses an 8-byte fingerprint of the
+// Key and stores the key/value in a file-backed-trie that adds an ~80 byte
+// overhead per key. As we know that these fingerprints are always 8-bytes in
+// length and that they're random, we might be able to store them more
+// compactly.
+std::string MakeFingerprint(std::string_view name_space, std::string_view uri) {
+  // Using a 64-bit fingerprint to represent the key could lead to collisions.
+  // But, even with 200K unique keys, the probability of collision is about
+  // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
+  uint64_t fprint =
+      tc3farmhash::Fingerprint64(absl_ports::StrCat(name_space, uri));
+
+  std::string encoded_fprint;
+  // DynamicTrie cannot handle keys with '0' as bytes. So, we encode it in
+  // base128 and add 1 to make sure that no byte is '0'. This increases the
+  // size of the encoded_fprint from 8-bytes to 10-bytes.
+  while (fprint) {
+    encoded_fprint.push_back((fprint & 0x7F) + 1);
+    fprint >>= 7;
+  }
+  return encoded_fprint;
+}
+
+// Returns creation_timestamp_secs + ttl_secs, saturating at int64_t max.
+// A ttl of 0 means "never expires".
+int64_t CalculateExpirationTimestampSecs(int64_t creation_timestamp_secs,
+                                         int64_t ttl_secs) {
+  if (ttl_secs == 0) {
+    // Special case where a TTL of 0 indicates the document should never
+    // expire. int64_t max, interpreted as seconds since epoch, represents
+    // some point in the year 292,277,026,596. So we're probably ok to use
+    // this as "never reaching this point".
+    return std::numeric_limits<int64_t>::max();
+  }
+
+  int64_t expiration_timestamp_secs;
+  if (__builtin_add_overflow(creation_timestamp_secs, ttl_secs,
+                             &expiration_timestamp_secs)) {
+    // Overflow detected. Treat overflow as the same behavior of just int64_t
+    // max
+    return std::numeric_limits<int64_t>::max();
+  }
+
+  return expiration_timestamp_secs;
+}
+
+} // namespace
+
+// Constructor does no I/O; all file-backed members are created by
+// Initialize(), which Create() invokes before handing the store to callers.
+DocumentStore::DocumentStore(const Filesystem* filesystem,
+                             const std::string_view base_dir,
+                             const Clock* clock,
+                             const SchemaStore* schema_store)
+    : filesystem_(filesystem),
+      base_dir_(base_dir),
+      clock_(*clock),
+      schema_store_(schema_store),
+      document_validator_(schema_store) {}
+
+// Copy overload: makes an explicit copy and delegates to the move overload,
+// which holds the real Put logic.
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
+    const DocumentProto& document) {
+  return Put(DocumentProto(document));
+}
+
+DocumentStore::~DocumentStore() {
+  // Best-effort flush on destruction; a destructor cannot return a status, so
+  // a failed PersistToDisk() can only be logged. Skipped entirely if
+  // Initialize() never succeeded.
+  if (initialized_) {
+    if (!PersistToDisk().ok()) {
+      ICING_LOG(ERROR)
+          << "Error persisting to disk in DocumentStore destructor";
+    }
+  }
+}
+
+// Factory: constructs a store and runs Initialize() so callers never receive
+// a partially-initialized instance. Uses `new` directly because the
+// constructor is private to this pattern (not reachable by make_unique).
+libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>>
+DocumentStore::Create(const Filesystem* filesystem, const std::string& base_dir,
+                      const Clock* clock, const SchemaStore* schema_store) {
+  auto document_store = std::unique_ptr<DocumentStore>(
+      new DocumentStore(filesystem, base_dir, clock, schema_store));
+  ICING_RETURN_IF_ERROR(document_store->Initialize());
+  return document_store;
+}
+
+// Opens the ground-truth document log, then either loads the derived files
+// (key mapper, id mapper, caches) or — on data loss / load failure —
+// regenerates them from the log. Sets initialized_ only on full success.
+libtextclassifier3::Status DocumentStore::Initialize() {
+  auto create_result_or = FileBackedProtoLog<DocumentWrapper>::Create(
+      filesystem_, MakeDocumentLogFilename(base_dir_),
+      FileBackedProtoLog<DocumentWrapper>::Options(
+          /*compress_in=*/true));
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  if (!create_result_or.ok()) {
+    ICING_LOG(ERROR) << create_result_or.status().error_message()
+                     << "Failed to initialize DocumentLog";
+    return create_result_or.status();
+  }
+  FileBackedProtoLog<DocumentWrapper>::CreateResult create_result =
+      std::move(create_result_or).ValueOrDie();
+  document_log_ = std::move(create_result.proto_log);
+
+  if (create_result.data_loss) {
+    // Log was truncated/corrupted: derived files can't be trusted, rebuild
+    // them from what remains of the ground truth.
+    ICING_LOG(WARNING)
+        << "Data loss in document log, regenerating derived files.";
+    libtextclassifier3::Status status = RegenerateDerivedFiles();
+    if (!status.ok()) {
+      ICING_LOG(ERROR)
+          << "Failed to regenerate derived files for DocumentStore";
+      return status;
+    }
+  } else {
+    // Derived-file load failure here is recoverable (regenerate), so it is
+    // logged at VLOG rather than treated as an error.
+    if (!InitializeDerivedFiles().ok()) {
+      ICING_VLOG(1)
+          << "Couldn't find derived files or failed to initialize them, "
+             "regenerating derived files for DocumentStore.";
+      libtextclassifier3::Status status = RegenerateDerivedFiles();
+      if (!status.ok()) {
+        ICING_LOG(ERROR)
+            << "Failed to regenerate derived files for DocumentStore";
+        return status;
+      }
+    }
+  }
+
+  initialized_ = true;
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Loads all derived files from disk and validates them against the persisted
+// header (magic + combined checksum). Any failure here signals the caller
+// (Initialize()) to fall back to RegenerateDerivedFiles().
+libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() {
+  if (!HeaderExists()) {
+    // Without a header, we don't know if things are consistent between each
+    // other so the caller should just regenerate everything from ground
+    // truth.
+    return absl_ports::InternalError("DocumentStore header doesn't exist");
+  }
+
+  DocumentStore::Header header;
+  if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
+                         sizeof(header))) {
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
+  }
+
+  if (header.magic != DocumentStore::Header::kMagic) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
+  }
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto document_key_mapper_or =
+      KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
+  if (!document_key_mapper_or.ok()) {
+    ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
+                     << "Failed to initialize KeyMapper";
+    return document_key_mapper_or.status();
+  }
+  document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
+      *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
+      MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+  if (!document_id_mapper_or.ok()) {
+    ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
+                     << "Failed to initialize DocumentIdMapper";
+    return document_id_mapper_or.status();
+  }
+  document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
+
+  ICING_ASSIGN_OR_RETURN(score_cache_,
+                         FileBackedVector<DocumentAssociatedScoreData>::Create(
+                             *filesystem_, MakeScoreCacheFilename(base_dir_),
+                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+
+  ICING_ASSIGN_OR_RETURN(filter_cache_,
+                         FileBackedVector<DocumentFilterData>::Create(
+                             *filesystem_, MakeFilterCacheFilename(base_dir_),
+                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+
+  ICING_ASSIGN_OR_RETURN(
+      namespace_mapper_,
+      KeyMapper<NamespaceId>::Create(*filesystem_,
+                                     MakeNamespaceMapperFilename(base_dir_),
+                                     kNamespaceMapperMaxSize));
+
+  // Cross-check: recomputed combined checksum must match the one persisted in
+  // the header, otherwise some derived file is stale/corrupt.
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  if (checksum.Get() != header.checksum) {
+    return absl_ports::InternalError(
+        "Combined checksum of DocStore was inconsistent");
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Rebuilds every derived file (key mapper, id mapper, score/filter caches,
+// namespace mapper) by replaying the ground-truth document log from the
+// beginning, then persists a fresh header. Tombstones are dispatched by which
+// fields are set: uri => single document, namespace only => whole namespace,
+// schema only => schema type.
+libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
+  ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
+  ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
+  ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
+  ICING_RETURN_IF_ERROR(ResetFilterCache());
+  ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
+
+  // Iterates through document log
+  auto iterator = document_log_->GetIterator();
+  auto iterator_status = iterator.Advance();
+  while (iterator_status.ok()) {
+    ICING_ASSIGN_OR_RETURN(DocumentWrapper document_wrapper,
+                           document_log_->ReadProto(iterator.GetOffset()));
+    if (document_wrapper.deleted()) {
+      if (!document_wrapper.document().uri().empty()) {
+        // Individual document deletion.
+        auto document_id_or =
+            GetDocumentId(document_wrapper.document().namespace_(),
+                          document_wrapper.document().uri());
+        // Updates document_id mapper with deletion
+        if (document_id_or.ok()) {
+          ICING_RETURN_IF_ERROR(document_id_mapper_->Set(
+              document_id_or.ValueOrDie(), kDocDeletedFlag));
+        } else if (!absl_ports::IsNotFound(document_id_or.status())) {
+          // Real error
+          return absl_ports::Annotate(
+              document_id_or.status(),
+              absl_ports::StrCat("Failed to find document id. namespace: ",
+                                 document_wrapper.document().namespace_(),
+                                 ", uri: ", document_wrapper.document().uri()));
+        }
+        // NOT_FOUND is intentionally ignored: a tombstone for a document we
+        // never indexed during replay is a no-op.
+      } else if (!document_wrapper.document().namespace_().empty()) {
+        // Namespace deletion.
+        ICING_RETURN_IF_ERROR(UpdateDerivedFilesNamespaceDeleted(
+            document_wrapper.document().namespace_()));
+
+      } else if (!document_wrapper.document().schema().empty()) {
+        // SchemaType deletion.
+        auto schema_type_id_or = schema_store_->GetSchemaTypeId(
+            document_wrapper.document().schema());
+
+        if (schema_type_id_or.ok()) {
+          ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(
+              schema_type_id_or.ValueOrDie()));
+        } else {
+          // The deleted schema type doesn't have a SchemaTypeId we can refer
+          // to in the FilterCache.
+          //
+          // TODO(cassiewang): We could avoid reading out all the documents.
+          // When we see a schema type doesn't have a SchemaTypeId, assign the
+          // unknown schema type a unique, temporary SchemaTypeId and store
+          // that in the FilterCache. Then, when we see the schema type
+          // tombstone here, we can look up its temporary SchemaTypeId and
+          // just iterate through the FilterCache to mark those documents as
+          // deleted.
+          int size = document_id_mapper_->num_elements();
+          for (DocumentId document_id = 0; document_id < size; document_id++) {
+            auto document_or = Get(document_id);
+            if (absl_ports::IsNotFound(document_or.status())) {
+              // Skip nonexistent documents
+              continue;
+            } else if (!document_or.ok()) {
+              // Real error, pass up
+              return absl_ports::Annotate(
+                  document_or.status(),
+                  IcingStringUtil::StringPrintf(
+                      "Failed to retrieve Document for DocumentId %d",
+                      document_id));
+            }
+
+            // Guaranteed to have a document now.
+            DocumentProto document = document_or.ValueOrDie();
+
+            if (document.schema() == document_wrapper.document().schema()) {
+              ICING_RETURN_IF_ERROR(
+                  document_id_mapper_->Set(document_id, kDocDeletedFlag));
+            }
+          }
+        }
+      } else {
+        return absl_ports::InternalError(
+            "Encountered an invalid tombstone during recovery!");
+      }
+    } else {
+      // Updates key mapper and document_id mapper with the new document
+      DocumentId new_document_id = document_id_mapper_->num_elements();
+      ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
+          MakeFingerprint(document_wrapper.document().namespace_(),
+                          document_wrapper.document().uri()),
+          new_document_id));
+      ICING_RETURN_IF_ERROR(
+          document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
+
+      ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+          new_document_id,
+          DocumentAssociatedScoreData(
+              document_wrapper.document().score(),
+              document_wrapper.document().creation_timestamp_secs())));
+
+      SchemaTypeId schema_type_id;
+      auto schema_type_id_or =
+          schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
+      if (absl_ports::IsNotFound(schema_type_id_or.status())) {
+        // Didn't find a SchemaTypeId. This means that the DocumentStore and
+        // the SchemaStore are out of sync. But DocumentStore can't do
+        // anything about it so just ignore this for now. This should be
+        // detected/handled by the owner of DocumentStore. Set it to some
+        // arbitrary invalid value for now, it'll get updated to the correct
+        // ID later.
+        schema_type_id = -1;
+      } else if (!schema_type_id_or.ok()) {
+        // Real error. Pass it up
+        return schema_type_id_or.status();
+      } else {
+        // We're guaranteed that SchemaTypeId is valid now
+        schema_type_id = schema_type_id_or.ValueOrDie();
+      }
+
+      ICING_ASSIGN_OR_RETURN(
+          NamespaceId namespace_id,
+          namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
+                                      namespace_mapper_->num_keys()));
+
+      int64_t expiration_timestamp_secs = CalculateExpirationTimestampSecs(
+          document_wrapper.document().creation_timestamp_secs(),
+          document_wrapper.document().ttl_secs());
+
+      ICING_RETURN_IF_ERROR(UpdateFilterCache(
+          new_document_id, DocumentFilterData(namespace_id, schema_type_id,
+                                              expiration_timestamp_secs)));
+    }
+    iterator_status = iterator.Advance();
+  }
+
+  // OUT_OF_RANGE is the iterator's normal end-of-log signal; anything else is
+  // a genuine iteration failure.
+  if (!absl_ports::IsOutOfRange(iterator_status)) {
+    ICING_LOG(WARNING)
+        << "Failed to iterate through proto log while regenerating "
+           "derived files";
+    return absl_ports::Annotate(iterator_status,
+                                "Failed to iterate through proto log.");
+  }
+
+  // Write the header
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Discards the on-disk key mapper (fingerprint -> DocumentId) and recreates
+// it empty.
+libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
+  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+  document_key_mapper_.reset();
+  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // that can support error logging.
+  libtextclassifier3::Status status =
+      KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete old key mapper";
+    return status;
+  }
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto document_key_mapper_or =
+      KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
+  if (!document_key_mapper_or.ok()) {
+    ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
+                     << "Failed to re-init key mapper";
+    return document_key_mapper_or.status();
+  }
+  document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
+  return libtextclassifier3::Status::OK;
+}
+
+// Discards the on-disk DocumentId -> log-offset mapper and recreates it empty.
+libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
+  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+  document_id_mapper_.reset();
+  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // that can support error logging.
+  libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
+      *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete old document_id mapper";
+    return status;
+  }
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
+      *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
+      MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+  if (!document_id_mapper_or.ok()) {
+    ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
+                     << "Failed to re-init document_id mapper";
+    return document_id_mapper_or.status();
+  }
+  document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
+  return libtextclassifier3::Status::OK;
+}
+
+// Discards the on-disk score cache and recreates it empty.
+libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
+  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+  score_cache_.reset();
+  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
+      *filesystem_, MakeScoreCacheFilename(base_dir_)));
+  ICING_ASSIGN_OR_RETURN(score_cache_,
+                         FileBackedVector<DocumentAssociatedScoreData>::Create(
+                             *filesystem_, MakeScoreCacheFilename(base_dir_),
+                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+  return libtextclassifier3::Status::OK;
+}
+
+// Discards the on-disk filter cache and recreates it empty.
+libtextclassifier3::Status DocumentStore::ResetFilterCache() {
+  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+  filter_cache_.reset();
+  ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
+      *filesystem_, MakeFilterCacheFilename(base_dir_)));
+  ICING_ASSIGN_OR_RETURN(filter_cache_,
+                         FileBackedVector<DocumentFilterData>::Create(
+                             *filesystem_, MakeFilterCacheFilename(base_dir_),
+                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+  return libtextclassifier3::Status::OK;
+}
+
+// Discards the on-disk namespace mapper and recreates it empty.
+libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
+  // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+  namespace_mapper_.reset();
+  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // that can support error logging.
+  libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete(
+      *filesystem_, MakeNamespaceMapperFilename(base_dir_));
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete old namespace_id mapper";
+    return status;
+  }
+  ICING_ASSIGN_OR_RETURN(
+      namespace_mapper_,
+      KeyMapper<NamespaceId>::Create(*filesystem_,
+                                     MakeNamespaceMapperFilename(base_dir_),
+                                     kNamespaceMapperMaxSize));
+  return libtextclassifier3::Status::OK;
+}
+
+// Computes a combined Crc32 over the document log and every derived file.
+// The append order below is part of the persisted format (the result is
+// compared against the header checksum), so it must not change.
+libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
+  Crc32 total_checksum;
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto checksum_or = document_log_->ComputeChecksum();
+  if (!checksum_or.ok()) {
+    ICING_LOG(ERROR) << checksum_or.status().error_message()
+                     << "Failed to compute checksum of DocumentLog";
+    return checksum_or.status();
+  }
+  Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
+
+  Crc32 document_key_mapper_checksum = document_key_mapper_->ComputeChecksum();
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  checksum_or = document_id_mapper_->ComputeChecksum();
+  if (!checksum_or.ok()) {
+    ICING_LOG(ERROR) << checksum_or.status().error_message()
+                     << "Failed to compute checksum of DocumentIdMapper";
+    return checksum_or.status();
+  }
+  Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  checksum_or = score_cache_->ComputeChecksum();
+  if (!checksum_or.ok()) {
+    ICING_LOG(ERROR) << checksum_or.status().error_message()
+                     << "Failed to compute checksum of score cache";
+    return checksum_or.status();
+  }
+  Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  checksum_or = filter_cache_->ComputeChecksum();
+  if (!checksum_or.ok()) {
+    ICING_LOG(ERROR) << checksum_or.status().error_message()
+                     << "Failed to compute checksum of filter cache";
+    return checksum_or.status();
+  }
+  Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
+
+  Crc32 namespace_mapper_checksum = namespace_mapper_->ComputeChecksum();
+
+  // Each component checksum is folded in via its decimal string form.
+  total_checksum.Append(std::to_string(document_log_checksum.Get()));
+  total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
+  total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
+  total_checksum.Append(std::to_string(score_cache_checksum.Get()));
+  total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
+  total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
+
+  return total_checksum;
+}
+
+// Returns true only if the header file exists AND has non-zero, readable
+// size; a zero-length (truncated) header is treated as absent.
+bool DocumentStore::HeaderExists() {
+  if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
+    return false;
+  }
+
+  int64_t file_size =
+      filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
+
+  // If it's been truncated to size 0 before, we consider it to be a new file
+  return file_size != 0 && file_size != Filesystem::kBadFileSize;
+}
+
+// Persists the header (magic + combined checksum) to disk, overwriting any
+// previous header file.
+libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
+  // Write the header
+  DocumentStore::Header header;
+  header.magic = DocumentStore::Header::kMagic;
+  header.checksum = checksum.Get();
+
+  // This should overwrite the header.
+  if (!filesystem_->Write(MakeHeaderFilename(base_dir_).c_str(), &header,
+                          sizeof(header))) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Validates and stores `document`, assigning it a fresh DocumentId. The
+// ground-truth log is always written first; derived files are updated after.
+// If a document with the same (namespace, uri) already exists, its old id is
+// marked deleted, so a Put acts as replace. Returns the new DocumentId.
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
+    DocumentProto&& document) {
+  ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
+
+  // Copy fields needed before they are moved
+  std::string name_space = document.namespace_();
+  std::string uri = document.uri();
+  std::string schema = document.schema();
+  int document_score = document.score();
+  int64_t creation_timestamp_secs = document.creation_timestamp_secs();
+
+  // Sets the creation timestamp if caller hasn't specified.
+  if (document.creation_timestamp_secs() == 0) {
+    creation_timestamp_secs = clock_.GetCurrentSeconds();
+    document.set_creation_timestamp_secs(creation_timestamp_secs);
+  }
+
+  int64_t expiration_timestamp_secs = CalculateExpirationTimestampSecs(
+      creation_timestamp_secs, document.ttl_secs());
+
+  // Update ground truth first
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto offset_or =
+      document_log_->WriteProto(CreateDocumentWrapper(std::move(document)));
+  if (!offset_or.ok()) {
+    ICING_LOG(ERROR) << offset_or.status().error_message()
+                     << "Failed to write document";
+    return offset_or.status();
+  }
+  int64_t file_offset = std::move(offset_or).ValueOrDie();
+
+  // Get existing document id
+  auto old_document_id_or = GetDocumentId(name_space, uri);
+  if (!old_document_id_or.ok() &&
+      !absl_ports::IsNotFound(old_document_id_or.status())) {
+    return absl_ports::InternalError("Failed to read from key mapper");
+  }
+
+  // Creates a new document id, updates key mapper and document_id mapper
+  DocumentId new_document_id = document_id_mapper_->num_elements();
+  ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
+      MakeFingerprint(name_space, uri), new_document_id));
+  ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
+
+  ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+      new_document_id,
+      DocumentAssociatedScoreData(document_score, creation_timestamp_secs)));
+
+  // Update namespace maps
+  ICING_ASSIGN_OR_RETURN(
+      NamespaceId namespace_id,
+      namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
+
+  ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+                         schema_store_->GetSchemaTypeId(schema));
+
+  ICING_RETURN_IF_ERROR(UpdateFilterCache(
+      new_document_id, DocumentFilterData(namespace_id, schema_type_id,
+                                          expiration_timestamp_secs)));
+
+  if (old_document_id_or.ok()) {
+    // Mark the old document id as deleted.
+    ICING_RETURN_IF_ERROR(document_id_mapper_->Set(
+        old_document_id_or.ValueOrDie(), kDocDeletedFlag));
+  }
+
+  return new_document_id;
+}
+
+// Looks up a document by (namespace, uri); delegates to the DocumentId
+// overload after resolving the id.
+libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
+    const std::string_view name_space, const std::string_view uri) const {
+  ICING_ASSIGN_OR_RETURN(DocumentId document_id,
+                         GetDocumentId(name_space, uri));
+  return Get(document_id);
+}
+
+// Reads the document at `document_id` from the log. Returns NOT_FOUND (via
+// DoesDocumentExistAndGetFileOffset) for deleted or expired documents.
+libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
+    DocumentId document_id) const {
+  ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
+                         DoesDocumentExistAndGetFileOffset(document_id));
+
+  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+  // that can support error logging.
+  auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
+  if (!document_wrapper_or.ok()) {
+    ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
+                     << "Failed to read from document log";
+    return document_wrapper_or.status();
+  }
+  DocumentWrapper document_wrapper =
+      std::move(document_wrapper_or).ValueOrDie();
+
+  // Move the inner document out of the wrapper to avoid a copy.
+  return std::move(*document_wrapper.mutable_document());
+}
+
+// Resolves (namespace, uri) to a DocumentId via the fingerprint key mapper.
+// Note: this only consults the key mapper — it does not check whether the
+// document has since been deleted or expired.
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
+    const std::string_view name_space, const std::string_view uri) const {
+  auto document_id_or =
+      document_key_mapper_->Get(MakeFingerprint(name_space, uri));
+  if (!document_id_or.ok()) {
+    return absl_ports::Annotate(
+        document_id_or.status(),
+        absl_ports::StrCat("Failed to find DocumentId by key: ", name_space,
+                           ", ", uri));
+  }
+
+  // Guaranteed to have a DocumentId now
+  return document_id_or.ValueOrDie();
+}
+
+// Returns the log file offset of a live document, or:
+//  - INVALID_ARGUMENT for an out-of-range DocumentId,
+//  - NOT_FOUND if the document was deleted, never existed, or has expired.
+libtextclassifier3::StatusOr<int64_t>
+DocumentStore::DoesDocumentExistAndGetFileOffset(DocumentId document_id) const {
+  if (!IsDocumentIdValid(document_id)) {
+    return absl_ports::InvalidArgumentError(
+        IcingStringUtil::StringPrintf("DocumentId %d is invalid", document_id));
+  }
+
+  auto file_offset_or = document_id_mapper_->Get(document_id);
+
+  bool deleted =
+      file_offset_or.ok() && *file_offset_or.ValueOrDie() == kDocDeletedFlag;
+  if (deleted || absl_ports::IsOutOfRange(file_offset_or.status())) {
+    // Document has been deleted or doesn't exist
+    return absl_ports::NotFoundError(
+        IcingStringUtil::StringPrintf("Document %d not found", document_id));
+  }
+
+  // TTL check: expiry is enforced lazily at read time from the filter cache.
+  ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
+                         filter_cache_->Get(document_id));
+  if (clock_.GetCurrentSeconds() >= filter_data->expiration_timestamp_secs()) {
+    // Past the expiration time, so also return NOT FOUND since it *shouldn't*
+    // exist anymore.
+    return absl_ports::NotFoundError(
+        IcingStringUtil::StringPrintf("Document %d not found", document_id));
+  }
+
+  // Propagate any remaining (non-OUT_OF_RANGE) mapper error before deref.
+  ICING_RETURN_IF_ERROR(file_offset_or.status());
+  return *file_offset_or.ValueOrDie();
+}
+
+// Convenience predicate over DoesDocumentExistAndGetFileOffset(); note that
+// any error status (not just NOT_FOUND) is reported as "does not exist".
+bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
+  // If we can successfully get the document log offset, the document exists.
+  return DoesDocumentExistAndGetFileOffset(document_id).ok();
+}
+
+// Deletes the document at (namespace, uri). Idempotent: deleting a document
+// that doesn't exist (or was already deleted) returns OK. The ground truth is
+// updated first by appending a tombstone; the physical bytes are reclaimed
+// later during Optimize().
+libtextclassifier3::Status DocumentStore::Delete(
+    const std::string_view name_space, const std::string_view uri) {
+  // Try to get the DocumentId first
+  auto document_id_or = GetDocumentId(name_space, uri);
+  if (absl_ports::IsNotFound(document_id_or.status())) {
+    // No need to delete nonexistent (name_space, uri)
+    return libtextclassifier3::Status::OK;
+  } else if (!document_id_or.ok()) {
+    // Real error
+    return absl_ports::Annotate(
+        document_id_or.status(),
+        absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
+                           ", uri: ", uri));
+  }
+
+  // Check if the DocumentId's Document still exists.
+  DocumentId document_id = document_id_or.ValueOrDie();
+  auto file_offset_or = DoesDocumentExistAndGetFileOffset(document_id);
+  if (absl_ports::IsNotFound(file_offset_or.status())) {
+    // No need to delete nonexistent documents
+    return libtextclassifier3::Status::OK;
+  } else if (!file_offset_or.ok()) {
+    // Real error, pass it up
+    return absl_ports::Annotate(
+        file_offset_or.status(),
+        IcingStringUtil::StringPrintf(
+            "Failed to retrieve file offset for DocumentId %d", document_id));
+  }
+
+  // Update ground truth first.
+  // To delete a proto we don't directly remove it. Instead, we mark it as
+  // deleted first by appending a tombstone of it and actually remove it from
+  // file later in Optimize()
+  // TODO(b/144458732): Implement a more robust version of ICING_RETURN_IF_ERROR
+  // that can support error logging.
+  libtextclassifier3::Status status =
+      document_log_->WriteProto(CreateDocumentTombstone(name_space, uri))
+          .status();
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete Document. namespace: " << name_space
+                     << ", uri: " << uri;
+    return status;
+  }
+
+  // Then mark the id deleted in the derived mapper.
+  ICING_RETURN_IF_ERROR(
+      document_id_mapper_->Set(document_id_or.ValueOrDie(), kDocDeletedFlag));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Thin pass-through to the namespace mapper; propagates its status
+// (including NOT_FOUND for unknown namespaces) unchanged.
+libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
+    std::string_view name_space) const {
+  return namespace_mapper_->Get(name_space);
+}
+
+// Looks up the cached score data for document_id in score_cache_. Does NOT
+// check whether the document still exists; deleted documents' entries are
+// returned as-is (see header comment).
+libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
+DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
+  auto score_data_or = score_cache_->Get(document_id);
+  if (!score_data_or.ok()) {
+    // Log the underlying error message as well; previously the log line
+    // started with a bare " while trying...", dropping the actual cause.
+    ICING_LOG(ERROR) << score_data_or.status().error_message()
+                     << " while trying to access DocumentId " << document_id
+                     << " from score_cache_";
+    return score_data_or.status();
+  }
+  // Get() returns a pointer into the cache; copy the value out for the
+  // caller.
+  return *std::move(score_data_or).ValueOrDie();
+}
+
+// Looks up the cached filter data for document_id in filter_cache_. Does NOT
+// check whether the document still exists; deleted documents' entries are
+// returned as-is (see header comment).
+libtextclassifier3::StatusOr<DocumentFilterData>
+DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
+  auto filter_data_or = filter_cache_->Get(document_id);
+  if (!filter_data_or.ok()) {
+    // Log the underlying error message as well; previously the log line
+    // started with a bare " while trying...", dropping the actual cause.
+    ICING_LOG(ERROR) << filter_data_or.status().error_message()
+                     << " while trying to access DocumentId " << document_id
+                     << " from filter_cache_";
+    return filter_data_or.status();
+  }
+  // Get() returns a pointer into the cache; copy the value out for the
+  // caller.
+  return *std::move(filter_data_or).ValueOrDie();
+}
+
+// Deletes every document in name_space by (1) appending a namespace tombstone
+// to the ground-truth log, then (2) flagging the matching ids in the derived
+// files. Ground truth must be written first so the derived files can always
+// be regenerated from the log after a crash.
+libtextclassifier3::Status DocumentStore::DeleteByNamespace(
+    std::string_view name_space) {
+  auto namespace_id_or = namespace_mapper_->Get(name_space);
+  if (absl_ports::IsNotFound(namespace_id_or.status())) {
+    // Namespace doesn't exist. Don't need to delete anything.
+    return libtextclassifier3::Status::OK;
+  } else if (!namespace_id_or.ok()) {
+    // Real error, pass it up.
+    return namespace_id_or.status();
+  }
+
+  // Update ground truth first.
+  // To delete an entire namespace, we append a tombstone that only contains
+  // the deleted bit and the name of the deleted namespace.
+  // TODO(b/144458732): Implement a more robust version of
+  // ICING_RETURN_IF_ERROR that can support error logging.
+  libtextclassifier3::Status status =
+      document_log_->WriteProto(CreateNamespaceTombstone(name_space)).status();
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete namespace. namespace = "
+                     << name_space;
+    return status;
+  }
+
+  // Tombstone is durably recorded; now mark matching documents as deleted in
+  // the derived files (this re-resolves name_space to its NamespaceId).
+  return UpdateDerivedFilesNamespaceDeleted(name_space);
+}
+
+// Flags, in the derived files only, every document belonging to name_space as
+// deleted. Assumes the namespace tombstone has already been written to the
+// ground-truth log; space is reclaimed later by Optimize().
+libtextclassifier3::Status DocumentStore::UpdateDerivedFilesNamespaceDeleted(
+    std::string_view name_space) {
+  auto namespace_id_or = namespace_mapper_->Get(name_space);
+  if (absl_ports::IsNotFound(namespace_id_or.status())) {
+    // Unknown namespace; nothing in the derived files to update.
+    return libtextclassifier3::Status::OK;
+  }
+  if (!namespace_id_or.ok()) {
+    // Any other error is real; propagate it.
+    return namespace_id_or.status();
+  }
+
+  // Guaranteed to have a NamespaceId now.
+  const NamespaceId deleted_namespace_id = namespace_id_or.ValueOrDie();
+
+  // Scan the filter cache and flag every document of the deleted namespace in
+  // the document id mapper.
+  const auto num_documents = filter_cache_->num_elements();
+  for (DocumentId document_id = 0; document_id < num_documents;
+       ++document_id) {
+    // Get() can only fail for ids outside [0, num_elements), which the loop
+    // bounds rule out, so this error SHOULD NEVER HAPPEN.
+    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
+                           filter_cache_->Get(document_id));
+    if (filter_data->namespace_id() != deleted_namespace_id) {
+      continue;
+    }
+    // Set() can only fail for out-of-range ids; document_id_mapper_ always
+    // covers at least as many ids as filter_cache_, so this SHOULD NEVER
+    // HAPPEN either.
+    ICING_RETURN_IF_ERROR(
+        document_id_mapper_->Set(document_id, kDocDeletedFlag));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Deletes every document of schema_type by (1) appending a schema-type
+// tombstone to the ground-truth log, then (2) flagging the matching ids in
+// the derived files. Ground truth is written first so derived files can be
+// regenerated from the log after a crash.
+libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
+    std::string_view schema_type) {
+  auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
+  if (absl_ports::IsNotFound(schema_type_id_or.status())) {
+    // SchemaType doesn't exist. Don't need to delete anything.
+    return libtextclassifier3::Status::OK;
+  } else if (!schema_type_id_or.ok()) {
+    // Real error, pass it up.
+    return schema_type_id_or.status();
+  }
+
+  // Update ground truth first.
+  // To delete an entire schema type, we append a tombstone that only contains
+  // the deleted bit and the name of the deleted schema type.
+  // TODO(b/144458732): Implement a more robust version of
+  // ICING_RETURN_IF_ERROR that can support error logging.
+  libtextclassifier3::Status status =
+      document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
+          .status();
+  if (!status.ok()) {
+    ICING_LOG(ERROR) << status.error_message()
+                     << "Failed to delete schema_type. schema_type = "
+                     << schema_type;
+    return status;
+  }
+
+  // Guaranteed to have a SchemaTypeId now
+  SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
+
+  // Tombstone is durably recorded; mark matching documents as deleted in the
+  // derived files.
+  ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(schema_type_id));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Flags, in the derived files only, every document whose type matches
+// schema_type_id as deleted. Assumes the schema-type tombstone has already
+// been written to the ground-truth log; space is reclaimed by Optimize().
+libtextclassifier3::Status DocumentStore::UpdateDerivedFilesSchemaTypeDeleted(
+    SchemaTypeId schema_type_id) {
+  // Scan the filter cache and flag every document of the deleted schema type
+  // in the document id mapper.
+  const auto num_documents = filter_cache_->num_elements();
+  for (DocumentId document_id = 0; document_id < num_documents;
+       ++document_id) {
+    // Get() can only fail for ids outside [0, num_elements), which the loop
+    // bounds rule out, so this error SHOULD NEVER HAPPEN.
+    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
+                           filter_cache_->Get(document_id));
+    if (filter_data->schema_type_id() != schema_type_id) {
+      continue;
+    }
+    // Set() can only fail for out-of-range ids; document_id_mapper_ always
+    // covers at least as many ids as filter_cache_, so this SHOULD NEVER
+    // HAPPEN either.
+    ICING_RETURN_IF_ERROR(
+        document_id_mapper_->Set(document_id, kDocDeletedFlag));
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Flushes the ground-truth log and every derived file to disk, then stamps
+// the header with the combined checksum. Any failure aborts the sequence and
+// is returned to the caller.
+libtextclassifier3::Status DocumentStore::PersistToDisk() {
+  ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
+  ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
+  ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
+  ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
+  ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
+  ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
+
+  // Update the combined checksum and write to header file.
+  ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
+  ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Computes the store's total on-disk footprint by querying each sub-component
+// in turn; the first component that fails short-circuits the computation.
+libtextclassifier3::StatusOr<int64_t> DocumentStore::GetDiskUsage() const {
+  int64_t total_disk_usage = 0;
+
+  ICING_ASSIGN_OR_RETURN(const int64_t log_usage,
+                         document_log_->GetDiskUsage());
+  total_disk_usage += log_usage;
+
+  ICING_ASSIGN_OR_RETURN(const int64_t key_mapper_usage,
+                         document_key_mapper_->GetDiskUsage());
+  total_disk_usage += key_mapper_usage;
+
+  ICING_ASSIGN_OR_RETURN(const int64_t id_mapper_usage,
+                         document_id_mapper_->GetDiskUsage());
+  total_disk_usage += id_mapper_usage;
+
+  ICING_ASSIGN_OR_RETURN(const int64_t score_cache_usage,
+                         score_cache_->GetDiskUsage());
+  total_disk_usage += score_cache_usage;
+
+  ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_usage,
+                         filter_cache_->GetDiskUsage());
+  total_disk_usage += filter_cache_usage;
+
+  ICING_ASSIGN_OR_RETURN(const int64_t namespace_mapper_usage,
+                         namespace_mapper_->GetDiskUsage());
+  total_disk_usage += namespace_mapper_usage;
+
+  return total_disk_usage;
+}
+
+// Rewires this store to a new SchemaStore and revalidates every document
+// against it: compatible documents get their SchemaTypeId refreshed in the
+// filter cache; incompatible ones are deleted. May be expensive — reads every
+// live document from the log.
+libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
+    const SchemaStore* schema_store) {
+  // Update all references to the SchemaStore
+  schema_store_ = schema_store;
+  document_validator_.UpdateSchemaStore(schema_store);
+
+  // Iterate over every DocumentId ever assigned, live or deleted.
+  int size = document_id_mapper_->num_elements();
+  for (DocumentId document_id = 0; document_id < size; document_id++) {
+    auto document_or = Get(document_id);
+    if (absl_ports::IsNotFound(document_or.status())) {
+      // Skip nonexistent documents
+      continue;
+    } else if (!document_or.ok()) {
+      // Real error, pass up
+      return absl_ports::Annotate(
+          document_or.status(),
+          IcingStringUtil::StringPrintf(
+              "Failed to retrieve Document for DocumentId %d", document_id));
+    }
+
+    // Guaranteed to have a document now.
+    DocumentProto document = document_or.ValueOrDie();
+
+    // Revalidate that this document is still compatible
+    if (document_validator_.Validate(document).ok()) {
+      // Update the SchemaTypeId for this entry; the type may have been
+      // assigned a different id in the new SchemaStore.
+      ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+                             schema_store_->GetSchemaTypeId(document.schema()));
+      filter_cache_->mutable_array()[document_id].set_schema_type_id(
+          schema_type_id);
+    } else {
+      // Document is no longer valid with the new SchemaStore. Mark as
+      // deleted
+      ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri()));
+    }
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// Like UpdateSchemaStore, but uses set_schema_result to touch only the
+// documents actually affected by the schema change: deleted types get one
+// shared tombstone, and documents are read from the log only when their type
+// id changed or needs revalidation.
+libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
+    const SchemaStore* schema_store,
+    const SchemaStore::SetSchemaResult& set_schema_result) {
+  if (!set_schema_result.success) {
+    // No new schema was set, no work to be done
+    return libtextclassifier3::Status::OK;
+  }
+
+  // Update all references to the SchemaStore
+  schema_store_ = schema_store;
+  document_validator_.UpdateSchemaStore(schema_store);
+
+  // Append a tombstone for each deleted schema type. This way, we don't have
+  // to read out each document, check if the schema type has been deleted, and
+  // append a tombstone per-document.
+  for (const auto& schema_type :
+       set_schema_result.schema_types_deleted_by_name) {
+    // TODO(b/144458732): Implement a more robust version of
+    // ICING_RETURN_IF_ERROR that can support error logging.
+    libtextclassifier3::Status status =
+        document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
+            .status();
+    if (!status.ok()) {
+      ICING_LOG(ERROR) << status.error_message()
+                       << "Failed to delete schema_type. schema_type = "
+                       << schema_type;
+      return status;
+    }
+  }
+
+  // Iterate over every DocumentId ever assigned, live or deleted.
+  int size = document_id_mapper_->num_elements();
+  for (DocumentId document_id = 0; document_id < size; document_id++) {
+    auto exists_or = DoesDocumentExistAndGetFileOffset(document_id);
+    if (absl_ports::IsNotFound(exists_or.status())) {
+      // Skip nonexistent documents
+      continue;
+    } else if (!exists_or.ok()) {
+      // Real error, pass up
+      return absl_ports::Annotate(
+          exists_or.status(),
+          IcingStringUtil::StringPrintf("Failed to retrieve DocumentId %d",
+                                        document_id));
+    }
+
+    // Guaranteed that the document exists now.
+    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
+                           filter_cache_->Get(document_id));
+
+    if (set_schema_result.schema_types_deleted_by_id.count(
+            filter_data->schema_type_id()) != 0) {
+      // We already created a tombstone for this deleted type. Just update the
+      // derived files now.
+      ICING_RETURN_IF_ERROR(
+          document_id_mapper_->Set(document_id, kDocDeletedFlag));
+      continue;
+    }
+
+    // Check if we need to update the FilterCache entry for this document. It
+    // may have been assigned a different SchemaTypeId in the new SchemaStore.
+    bool update_filter_cache =
+        set_schema_result.old_schema_type_ids_changed.count(
+            filter_data->schema_type_id()) != 0;
+
+    // Check if we need to revalidate this document if the type is now
+    // incompatible
+    bool revalidate_document =
+        set_schema_result.schema_types_incompatible_by_id.count(
+            filter_data->schema_type_id()) != 0;
+
+    // Only pay the cost of reading the document from the log if one of the
+    // two updates above is actually needed.
+    if (update_filter_cache || revalidate_document) {
+      ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));
+
+      if (update_filter_cache) {
+        ICING_ASSIGN_OR_RETURN(
+            SchemaTypeId schema_type_id,
+            schema_store_->GetSchemaTypeId(document.schema()));
+        filter_cache_->mutable_array()[document_id].set_schema_type_id(
+            schema_type_id);
+      }
+
+      if (revalidate_document) {
+        if (!document_validator_.Validate(document).ok()) {
+          // Document is no longer valid with the new SchemaStore. Mark as
+          // deleted
+          ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri()));
+        }
+      }
+    }
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+// TODO(b/121227117): Implement Optimize()
+libtextclassifier3::Status DocumentStore::Optimize() {
+  // Currently a no-op: space for deleted/expired documents is not yet
+  // reclaimed in place. See OptimizeInto() for the copy-based alternative.
+  return libtextclassifier3::Status::OK;
+}
+
+// Compacts the store by copying only live documents into a fresh
+// DocumentStore rooted at new_directory. DocumentIds are reassigned in the
+// new store, so id-based references held elsewhere become stale.
+libtextclassifier3::Status DocumentStore::OptimizeInto(
+    const std::string& new_directory) {
+  // Validates directory
+  if (new_directory == base_dir_) {
+    return absl_ports::InvalidArgumentError(
+        "New directory is the same as the current one.");
+  }
+
+  // Spin up a brand-new store in the target directory sharing our filesystem,
+  // clock, and schema store.
+  ICING_ASSIGN_OR_RETURN(auto new_doc_store,
+                         DocumentStore::Create(filesystem_, new_directory,
+                                               &clock_, schema_store_));
+
+  // Writes all valid docs into new document store (new directory)
+  int size = document_id_mapper_->num_elements();
+  for (DocumentId document_id = 0; document_id < size; document_id++) {
+    auto document_or = Get(document_id);
+    if (absl_ports::IsNotFound(document_or.status())) {
+      // Skip nonexistent documents
+      continue;
+    } else if (!document_or.ok()) {
+      // Real error, pass up
+      return absl_ports::Annotate(
+          document_or.status(),
+          IcingStringUtil::StringPrintf(
+              "Failed to retrieve Document for DocumentId %d", document_id));
+    }
+
+    // Guaranteed to have a document now.
+    DocumentProto document_to_keep = document_or.ValueOrDie();
+    // TODO(b/144458732): Implement a more robust version of
+    // ICING_RETURN_IF_ERROR that can support error logging.
+    libtextclassifier3::Status status =
+        new_doc_store->Put(std::move(document_to_keep)).status();
+    if (!status.ok()) {
+      ICING_LOG(ERROR) << status.error_message()
+                       << "Failed to write into new document store";
+      return status;
+    }
+  }
+
+  // Make sure everything copied so far is durable before the caller swaps
+  // directories.
+  ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk());
+  return libtextclassifier3::Status::OK;
+}
+
+// Overwrites the score-cache entry for document_id; errors from the
+// underlying FileBackedVector (e.g. out-of-range id) are passed through.
+libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
+    DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
+  return score_cache_->Set(document_id, score_data);
+}
+
+// Overwrites the filter-cache entry for document_id; errors from the
+// underlying FileBackedVector (e.g. out-of-range id) are passed through.
+libtextclassifier3::Status DocumentStore::UpdateFilterCache(
+    DocumentId document_id, const DocumentFilterData& filter_data) {
+  return filter_cache_->Set(document_id, filter_data);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
new file mode 100644
index 0000000..018e19e
--- /dev/null
+++ b/icing/store/document-store.h
@@ -0,0 +1,450 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_DOCUMENT_STORE_H_
+#define ICING_STORE_DOCUMENT_STORE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/file/file-backed-proto-log.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-associated-score-data.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/key-mapper.h"
+#include "icing/util/clock.h"
+#include "icing/util/crc32.h"
+#include "icing/util/document-validator.h"
+
+namespace icing {
+namespace lib {
+
+// Provides storage interfaces for documents.
+class DocumentStore {
+ public:
+  struct Header {
+    static constexpr int32_t kMagic = 0x746f7265;
+
+    // Holds the magic as a quick sanity check against file corruption.
+    int32_t magic;
+
+    // Checksum of the DocumentStore's sub-component's checksums.
+    uint32_t checksum;
+  };
+
+  // Not copyable
+  DocumentStore(const DocumentStore&) = delete;
+  DocumentStore& operator=(const DocumentStore&) = delete;
+
+  // Persists and updates checksum of subcomponents.
+  ~DocumentStore();
+
+  // Factory method to create, initialize, and return a DocumentStore. The base
+  // directory is used to persist document store files. If document store was
+  // previously initialized with this directory, it will reload the files saved
+  // by the last instance.
+  //
+  // Does not take any ownership, and all pointers must refer to valid objects
+  // that outlive the one constructed.
+  //
+  // TODO(cassiewang): Consider returning a status indicating that derived
+  // files were regenerated. This may be helpful in logs.
+  //
+  // Returns:
+  //   A valid document store on success
+  //   INTERNAL_ERROR on IO error
+  static libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>> Create(
+      const Filesystem* filesystem, const std::string& base_dir,
+      const Clock* clock, const SchemaStore* schema_store);
+
+  // Returns the maximum DocumentId that the DocumentStore has assigned. If
+  // there has not been any DocumentIds assigned, i.e. the DocumentStore is
+  // empty, then kInvalidDocumentId is returned. This does not filter out
+  // DocumentIds of deleted documents.
+  DocumentId last_added_document_id() const {
+    if (document_id_mapper_->num_elements() == 0) {
+      return kInvalidDocumentId;
+    }
+    return document_id_mapper_->num_elements() - 1;
+  }
+
+  // Puts the document into document store.
+  //
+  // Returns:
+  //   A newly generated document id on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<DocumentId> Put(const DocumentProto& document);
+  libtextclassifier3::StatusOr<DocumentId> Put(DocumentProto&& document);
+
+  // Finds and returns the document identified by the given key (namespace +
+  // uri)
+  //
+  // Returns:
+  //   The document found on success
+  //   NOT_FOUND if the key doesn't exist or document has been deleted
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<DocumentProto> Get(std::string_view name_space,
+                                                  std::string_view uri) const;
+
+  // Finds and returns the document identified by the given document id
+  //
+  // Returns:
+  //   The document found on success
+  //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
+  //     maximum value
+  //   NOT_FOUND if the document doesn't exist or has been deleted
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<DocumentProto> Get(DocumentId document_id) const;
+
+  // Returns true if there's an existing document associated with the given
+  // document id.
+  bool DoesDocumentExist(DocumentId document_id) const;
+
+  // Deletes the document identified by the given namespace and uri
+  //
+  // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+  // called.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status Delete(std::string_view name_space,
+                                    std::string_view uri);
+
+  // Returns the NamespaceId of the string namespace
+  //
+  // Returns:
+  //   NamespaceId on success
+  //   NOT_FOUND if the namespace doesn't exist
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
+      std::string_view name_space) const;
+
+  // Returns the DocumentAssociatedScoreData of the document specified by the
+  // DocumentId.
+  //
+  // NOTE: This does not check if the document exists and will return the
+  // DocumentAssociatedScoreData of the document even if it has been deleted.
+  // Users should check DoesDocumentExist(document_id) if they only want
+  // existing documents' DocumentAssociatedScoreData.
+  //
+  // Returns:
+  //   DocumentAssociatedScoreData on success
+  //   OUT_OF_RANGE if document_id is negative or exceeds previously seen
+  //     DocumentIds
+  libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
+  GetDocumentAssociatedScoreData(DocumentId document_id) const;
+
+  // Returns the DocumentFilterData of the document specified by the
+  // DocumentId.
+  //
+  // NOTE: This does not check if the document exists and will return the
+  // DocumentFilterData of the document even if it has been deleted. Users
+  // should check DoesDocumentExist(document_id) if they only want existing
+  // documents' DocumentFilterData.
+  //
+  // Returns:
+  //   DocumentFilterData on success
+  //   OUT_OF_RANGE if document_id is negative or exceeds previously seen
+  //     DocumentIds
+  libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
+      DocumentId document_id) const;
+
+  // Deletes all documents belonging to the given namespace.
+  //
+  // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+  // called.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status DeleteByNamespace(std::string_view name_space);
+
+  // Deletes all documents belonging to the given schema type
+  //
+  // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+  // called.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type);
+
+  // Syncs all the data and metadata changes to disk.
+  // Returns any encountered IO errors.
+  libtextclassifier3::Status PersistToDisk();
+
+  // Calculates and returns the disk usage in bytes.
+  //
+  // Returns:
+  //   Disk usage on success
+  //   INTERNAL_ERROR on IO error
+  //
+  // TODO(samzheng): consider returning a struct which has the breakdown of
+  // each component.
+  libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+  // Update any derived data off of the SchemaStore with the new SchemaStore.
+  // This may include pointers, SchemaTypeIds, etc.
+  //
+  // NOTE: This function may delete documents. A document may be invalidated by
+  // the new SchemaStore, such as failing validation or having its schema type
+  // deleted from the schema.
+  //
+  // This is best used if the caller is unsure about what's changed in the
+  // SchemaStore, and wants to update all information no matter what. If the
+  // caller does know what has changed, then it's recommended to call
+  // OptimizedUpdateSchemaStore.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store);
+
+  // Performs the same functionality as UpdateSchemaStore, but this can be more
+  // optimized in terms of less disk reads and less work if we know exactly
+  // what's changed between the old and new SchemaStore.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status OptimizedUpdateSchemaStore(
+      const SchemaStore* schema_store,
+      const SchemaStore::SetSchemaResult& set_schema_result);
+
+  // Reduces internal file sizes by reclaiming space of deleted documents and
+  // regenerating derived files.
+  //
+  // NOTE: The tasks in this method are too expensive to be executed in
+  // real-time. The caller should decide how frequently and when to call this
+  // method based on device usage.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status Optimize();
+
+  // Copy data from current base directory into a new directory. Any outdated
+  // or deleted data won't be copied. During the process, document ids will be
+  // reassigned so any files / classes that are based on old document ids may
+  // be outdated.
+  //
+  // NOTE: The tasks in this method are too expensive to be executed in
+  // real-time. The caller should decide how frequently and when to call this
+  // method based on device usage.
+  //
+  // Returns:
+  //   OK on success
+  //   INVALID_ARGUMENT if new_directory is same as current base directory
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status OptimizeInto(const std::string& new_directory);
+
+  // Computes the combined checksum of the document store - includes the ground
+  // truth and all derived files.
+  //
+  // Returns:
+  //   Combined checksum on success
+  //   INTERNAL_ERROR on compute error
+  libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
+
+ private:
+  // Use DocumentStore::Create() to instantiate.
+  DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
+                const Clock* clock, const SchemaStore* schema_store);
+
+  const Filesystem* const filesystem_;
+  const std::string base_dir_;
+  const Clock& clock_;
+
+  // Handles the ground truth schema and all of the derived data off of the
+  // schema
+  const SchemaStore* schema_store_;
+
+  // Used to validate incoming documents
+  DocumentValidator document_validator_;
+
+  // A log used to store all documents, it serves as a ground truth of doc
+  // store. key_mapper_ and document_id_mapper_ can be regenerated from it.
+  std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log_;
+
+  // Key (namespace + uri) to DocumentId mapping
+  std::unique_ptr<KeyMapper<DocumentId>> document_key_mapper_;
+
+  // DocumentId to file offset mapping
+  std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;
+
+  // A cache of document associated scores. The ground truth of the scores is
+  // DocumentProto stored in document_log_. This cache contains:
+  //   - Document score
+  //   - Document creation timestamp in seconds
+  std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
+
+  // A cache of data, indexed by DocumentId, used to filter documents.
+  // Currently contains:
+  //   - NamespaceId
+  //   - SchemaTypeId
+  //   - Expiration timestamp in seconds
+  std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
+
+  // Maps namespaces to a densely-assigned unique id. Namespaces are assigned
+  // an id when the first document belonging to that namespace is added to the
+  // DocumentStore. Namespaces may be removed from the mapper during
+  // compaction.
+  std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;
+
+  // Used internally to indicate whether the class has been initialized. This
+  // is to guard against cases where the object has been created, but
+  // Initialize fails in the constructor. If we have successfully exited the
+  // constructor, then this field can be ignored. Clients of DocumentStore
+  // should not need to worry about this field.
+  bool initialized_ = false;
+
+  libtextclassifier3::Status Initialize();
+
+  // Creates sub-components and verifies the integrity of each sub-component.
+  //
+  // Returns an error if subcomponents failed to initialize successfully.
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status InitializeDerivedFiles();
+
+  // Re-generates all files derived from the ground truth: the document log.
+  //
+  // NOTE: if this function fails, the only thing we can do is to retry it
+  // until it succeeds or prevent the initialization of a DocumentStore. The
+  // DocumentStore object wouldn't work reliably if this fails.
+  //
+  // Steps:
+  //   1. Delete all derived files.
+  //   2. Iterate through document log, put data into new key mapper and
+  //      document_id mapper.
+  //   3. Create header and store the updated combined checksum
+  libtextclassifier3::Status RegenerateDerivedFiles();
+
+  // Resets the unique_ptr to the document_key_mapper, deletes the underlying
+  // file, and re-creates a new instance of the document_key_mapper.
+  //
+  // Returns OK or any IO errors.
+  libtextclassifier3::Status ResetDocumentKeyMapper();
+
+  // Resets the unique_ptr to the document_id_mapper, deletes the underlying
+  // file, and re-creates a new instance of the document_id_mapper.
+  //
+  // Returns OK or any IO errors.
+  libtextclassifier3::Status ResetDocumentIdMapper();
+
+  // Resets the unique_ptr to the score_cache, deletes the underlying file, and
+  // re-creates a new instance of the score_cache.
+  //
+  // Returns OK or any IO errors.
+  libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
+
+  // Resets the unique_ptr to the filter_cache, deletes the underlying file,
+  // and re-creates a new instance of the filter_cache.
+  //
+  // Returns OK or any IO errors.
+  libtextclassifier3::Status ResetFilterCache();
+
+  // Resets the unique_ptr to the namespace_mapper, deletes the underlying
+  // file, and re-creates a new instance of the namespace_mapper.
+  //
+  // Returns OK or any IO errors.
+  libtextclassifier3::Status ResetNamespaceMapper();
+
+  // Checks if the header exists already. This does not create the header file
+  // if it doesn't exist.
+  bool HeaderExists();
+
+  // Update and replace the header file. Creates the header file if it doesn't
+  // exist.
+  libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
+
+  // Update derived files that `name_space` has been deleted. This is primarily
+  // useful if we're trying to update derived files when we've already seen a
+  // namespace tombstone, and don't need to write another tombstone.
+  //
+  // NOTE: Space is not reclaimed in the derived files until Optimize() is
+  // called.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status UpdateDerivedFilesNamespaceDeleted(
+      std::string_view name_space);
+
+  // Update derived files that the schema type schema_type_id has been deleted.
+  // This is primarily useful if we're trying to update derived files when
+  // we've already seen a schema type tombstone, and don't need to write
+  // another tombstone.
+  //
+  // NOTE: Space is not reclaimed in the derived files until Optimize() is
+  // called.
+  //
+  // Returns:
+  //   OK on success
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::Status UpdateDerivedFilesSchemaTypeDeleted(
+      SchemaTypeId schema_type_id);
+
+  // Helper method to find a DocumentId that is associated with the given
+  // namespace and uri.
+  //
+  // NOTE: The DocumentId may refer to a invalid document (deleted
+  // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
+  // refers to a valid Document.
+  //
+  // Returns:
+  //   A DocumentId on success
+  //   NOT_FOUND if the key doesn't exist
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
+      std::string_view name_space, std::string_view uri) const;
+
+  // Helper method to validate the document id and return the file offset of
+  // the associated document in document_log_.
+  //
+  // This can be a more informative call than just DoesDocumentExist because it
+  // can return more status errors on whether the Document actually doesn't
+  // exist or if there was an internal error while accessing files.
+  //
+  // Returns:
+  //   The file offset on success
+  //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
+  //     maximum value
+  //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
+  //   INTERNAL_ERROR on IO error
+  libtextclassifier3::StatusOr<int64_t> DoesDocumentExistAndGetFileOffset(
+      DocumentId document_id) const;
+
+  // Updates the entry in the score cache for document_id.
+  libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
+      DocumentId document_id, const DocumentAssociatedScoreData& score_data);
+
+  // Updates the entry in the filter cache for document_id.
+  libtextclassifier3::Status UpdateFilterCache(
+      DocumentId document_id, const DocumentFilterData& filter_data);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_DOCUMENT_STORE_H_
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
new file mode 100644
index 0000000..45e2b9c
--- /dev/null
+++ b/icing/store/document-store_test.cc
@@ -0,0 +1,1886 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/document-store.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::_;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Not;
+using ::testing::Return;
+
+class DocumentStoreTest : public ::testing::Test {
+ protected:
+ DocumentStoreTest()
+ : test_dir_(GetTestTempDir() + "/icing"),
+ document_store_dir_(test_dir_ + "/document_store"),
+ schema_store_dir_(test_dir_ + "/schema_store") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+ test_document1_ =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetScore(document1_score_)
+ .SetCreationTimestampSecs(
+ document1_creation_timestamp_) // A random timestamp
+ .SetTtlSecs(document1_ttl_)
+ .Build();
+ test_document2_ =
+ DocumentBuilder()
+ .SetKey("icing", "email/2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo 2")
+ .AddStringProperty("body", "body bar 2")
+ .SetScore(document2_score_)
+ .SetCreationTimestampSecs(
+ document2_creation_timestamp_) // A random timestamp
+ .SetTtlSecs(document2_ttl_)
+ .Build();
+ }
+
+ void SetUp() override {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+
+ auto subject = type_config->add_properties();
+ subject->set_property_name("subject");
+ subject->set_data_type(PropertyConfigProto::DataType::STRING);
+ subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ subject->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ subject->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ auto body = type_config->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ body->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ body->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+ FakeClock fake_clock_;
+ const std::string document_store_dir_;
+ const std::string schema_store_dir_;
+ DocumentProto test_document1_;
+ DocumentProto test_document2_;
+ std::unique_ptr<SchemaStore> schema_store_;
+
+ // Document1 values
+ const int document1_score_ = 1;
+ const int64_t document1_creation_timestamp_ = 1;
+ const int64_t document1_ttl_ = 0;
+ const int64_t document1_expiration_timestamp_ =
+      std::numeric_limits<int64_t>::max();  // special case where ttl=0
+
+ // Document2 values
+ const int document2_score_ = 2;
+ const int64_t document2_creation_timestamp_ = 2;
+ const int64_t document2_ttl_ = 1;
+ const int64_t document2_expiration_timestamp_ = 3; // creation + ttl
+};
+
+TEST_F(DocumentStoreTest, InitializationFailure) {
+ MockFilesystem mock_filesystem;
+ ON_CALL(mock_filesystem, OpenForWrite(_)).WillByDefault(Return(false));
+
+ EXPECT_THAT(DocumentStore::Create(&mock_filesystem, document_store_dir_,
+ &fake_clock_, schema_store_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(DocumentStoreTest, PutAndGetInSameNamespaceOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Both documents have namespace of "icing"
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(test_document1_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(DocumentProto(test_document2_)));
+
+ EXPECT_THAT(doc_store->Get(document_id1),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+ EXPECT_THAT(doc_store->Get(document_id2),
+ IsOkAndHolds(EqualsProto(test_document2_)));
+}
+
+TEST_F(DocumentStoreTest, PutAndGetAcrossNamespacesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Can handle different namespaces with same url
+ DocumentProto foo_document = DocumentBuilder()
+ .SetKey("foo", "1")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(0)
+ .Build();
+ DocumentProto bar_document = DocumentBuilder()
+ .SetKey("bar", "1")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(0)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(foo_document));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(DocumentProto(bar_document)));
+
+ EXPECT_THAT(doc_store->Get(document_id1),
+ IsOkAndHolds(EqualsProto(foo_document)));
+ EXPECT_THAT(doc_store->Get(document_id2),
+ IsOkAndHolds(EqualsProto(bar_document)));
+}
+
+// Validates that putting a document with the same key will overwrite previous
+// document and old doc ids are not getting reused.
+TEST_F(DocumentStoreTest, PutSameKey) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Creates two documents with the same key (namespace + uri)
+ DocumentProto document1 = DocumentProto(test_document1_);
+ DocumentProto document2 = DocumentProto(test_document1_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(document2));
+ EXPECT_THAT(document_id1, Not(document_id2));
+ // document2 overrides document1, so document_id1 becomes invalid
+ EXPECT_THAT(doc_store->Get(document_id1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document_id2),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ // Makes sure that old doc ids are not getting reused.
+ DocumentProto document3 = DocumentProto(test_document1_);
+ document3.set_uri("another/uri/1");
+ EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1)));
+}
+
+TEST_F(DocumentStoreTest, IsDocumentExisting) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(DocumentProto(test_document1_)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(DocumentProto(test_document2_)));
+
+ EXPECT_THAT(doc_store->DoesDocumentExist(document_id1), IsTrue());
+ EXPECT_THAT(doc_store->DoesDocumentExist(document_id2), IsTrue());
+
+ DocumentId invalid_document_id_negative = -1;
+ EXPECT_THAT(doc_store->DoesDocumentExist(invalid_document_id_negative),
+ IsFalse());
+
+ DocumentId invalid_document_id_greater_than_max = kMaxDocumentId + 2;
+ EXPECT_THAT(
+ doc_store->DoesDocumentExist(invalid_document_id_greater_than_max),
+ IsFalse());
+
+ EXPECT_THAT(doc_store->DoesDocumentExist(kInvalidDocumentId), IsFalse());
+
+ DocumentId invalid_document_id_out_of_range = document_id2 + 1;
+ EXPECT_THAT(doc_store->DoesDocumentExist(invalid_document_id_out_of_range),
+ IsFalse());
+}
+
+TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(10)
+ .SetTtlSecs(100)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(document));
+ EXPECT_THAT(document_store->Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(document)));
+
+ // Some arbitrary time before the document's creation time (10) + ttl (100)
+ fake_clock_.SetSeconds(109);
+ EXPECT_THAT(document_store->Get("namespace", "uri"),
+ IsOkAndHolds(EqualsProto(document)));
+
+ // Some arbitrary time equal to the document's creation time (10) + ttl (100)
+ fake_clock_.SetSeconds(110);
+ EXPECT_THAT(document_store->Get("namespace", "uri"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Some arbitrary time past the document's creation time (10) + ttl (100)
+ fake_clock_.SetSeconds(200);
+ EXPECT_THAT(document_store->Get("namespace", "uri"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, GetInvalidDocumentId) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(DocumentProto(test_document1_)));
+
+ DocumentId invalid_document_id_negative = -1;
+ EXPECT_THAT(doc_store->Get(invalid_document_id_negative),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ DocumentId invalid_document_id_greater_than_max = kMaxDocumentId + 2;
+ EXPECT_THAT(doc_store->Get(invalid_document_id_greater_than_max),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(doc_store->Get(kInvalidDocumentId),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ DocumentId invalid_document_id_out_of_range = document_id + 1;
+ EXPECT_THAT(doc_store->Get(invalid_document_id_out_of_range),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, Delete) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Get() after Delete() returns NOT_FOUND
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(DocumentProto(test_document1_)));
+ EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+ EXPECT_THAT(doc_store->Get(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ // icing + email/1 has already been deleted.
+ EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ DocumentProto document1 = test_document1_;
+ document1.set_namespace_("namespace.1");
+ document1.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document1));
+
+ DocumentProto document2 = test_document1_;
+ document2.set_namespace_("namespace.2");
+ document2.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document2));
+
+ DocumentProto document3 = test_document1_;
+ document3.set_namespace_("namespace.3");
+ document3.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document3));
+
+ DocumentProto document4 = test_document1_;
+ document4.set_namespace_("namespace.1");
+ document4.set_uri("uri2");
+ ICING_ASSERT_OK(doc_store->Put(document4));
+
+  // DELETE namespace.1. document1 and document4 should be deleted. document2
+ // and document3 should still be retrievable.
+ ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1"));
+ EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
+ IsOkAndHolds(EqualsProto(document2)));
+ EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()),
+ IsOkAndHolds(EqualsProto(document3)));
+ EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ ICING_EXPECT_OK(doc_store->DeleteByNamespace("nonexistent_namespace"));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
+ DocumentProto document1 = test_document1_;
+ document1.set_namespace_("namespace.1");
+ document1.set_uri("uri1");
+
+ DocumentProto document2 = test_document1_;
+ document2.set_namespace_("namespace.2");
+ document2.set_uri("uri1");
+
+ DocumentProto document3 = test_document1_;
+ document3.set_namespace_("namespace.3");
+ document3.set_uri("uri1");
+
+ DocumentProto document4 = test_document1_;
+ document4.set_namespace_("namespace.1");
+ document4.set_uri("uri2");
+
+ int64_t ground_truth_size_before;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_ASSERT_OK(doc_store->Put(document1));
+ ICING_ASSERT_OK(doc_store->Put(document2));
+ ICING_ASSERT_OK(doc_store->Put(document3));
+ ICING_ASSERT_OK(doc_store->Put(document4));
+
+    // DELETE namespace.1. document1 and document4 should be deleted. document2
+ // and document3 should still be retrievable.
+ ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1"));
+
+ ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ } // Destructors should update checksum and persist all data to file.
+
+ // Change the DocStore's header combined checksum so that it won't match the
+ // recalculated checksum on initialization. This will force a regeneration of
+ // derived files from ground truth.
+ const std::string header_file =
+ absl_ports::StrCat(document_store_dir_, "/document_store_header");
+ DocumentStore::Header header;
+ header.magic = DocumentStore::Header::kMagic;
+ header.checksum = 10; // Arbitrary garbage checksum
+ filesystem_.DeleteFile(header_file.c_str());
+ filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+
+ // Successfully recover from a corrupt derived file issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Make sure we didn't add anything to the ground truth after we recovered.
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_EQ(ground_truth_size_before, ground_truth_size_after);
+
+ EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
+ IsOkAndHolds(EqualsProto(document2)));
+ EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()),
+ IsOkAndHolds(EqualsProto(document3)));
+ EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+ type_config = schema.add_types();
+ type_config->set_schema_type("person");
+
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ DocumentProto email_document_1 = DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_1_document_id,
+ document_store->Put(email_document_1));
+
+ DocumentProto email_document_2 = DocumentBuilder()
+ .SetKey("namespace2", "2")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_2_document_id,
+ document_store->Put(email_document_2));
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("message")
+ .SetCreationTimestampSecs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store->Put(message_document));
+
+ DocumentProto person_document = DocumentBuilder()
+ .SetKey("namespace", "4")
+ .SetSchema("person")
+ .SetCreationTimestampSecs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id,
+ document_store->Put(person_document));
+
+ // Delete the "email" type and ensure that it works across both
+ // email_document's namespaces. And that other documents aren't affected.
+ ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+
+ // Delete the "message" type and check that other documents aren't affected
+ ICING_EXPECT_OK(document_store->DeleteBySchemaType("message"));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+}
+
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ ICING_EXPECT_OK(document_store->DeleteBySchemaType("nonexistent_type"));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ DocumentId email_document_id;
+ DocumentId message_document_id;
+
+ DocumentProto email_document = DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(1)
+ .Build();
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
+ .SetCreationTimestampSecs(1)
+ .Build();
+ int64_t ground_truth_size_before;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(email_document_id,
+ document_store->Put(email_document));
+ ICING_ASSERT_OK_AND_ASSIGN(message_document_id,
+ document_store->Put(message_document));
+
+ // Delete "email". "message" documents should still be retrievable.
+ ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+
+ ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ } // Destructors should update checksum and persist all data to file.
+
+ // Change the DocumentStore's header combined checksum so that it won't match
+ // the recalculated checksum on initialization. This will force a regeneration
+ // of derived files from ground truth.
+ const std::string header_file =
+ absl_ports::StrCat(document_store_dir_, "/document_store_header");
+ DocumentStore::Header header;
+ header.magic = DocumentStore::Header::kMagic;
+ header.checksum = 10; // Arbitrary garbage checksum
+ filesystem_.DeleteFile(header_file.c_str());
+ filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+
+ // Successfully recover from a corrupt derived file issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ // Make sure we didn't add anything to the ground truth after we recovered.
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_EQ(ground_truth_size_before, ground_truth_size_after);
+
+ EXPECT_THAT(document_store->Get(email_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+}
+
+TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ DocumentId email_document_id;
+ DocumentId message_document_id;
+
+ DocumentProto email_document = DocumentBuilder()
+ .SetKey("namespace", "email")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(1)
+ .Build();
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetKey("namespace", "message")
+ .SetSchema("message")
+ .SetCreationTimestampSecs(1)
+ .Build();
+ int64_t ground_truth_size_before;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(email_document_id,
+ document_store->Put(email_document));
+ ICING_ASSERT_OK_AND_ASSIGN(message_document_id,
+ document_store->Put(message_document));
+
+ // Delete "email". "message" documents should still be retrievable.
+ ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+
+ EXPECT_THAT(document_store->Get(email_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+
+ ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ } // Destructors should update checksum and persist all data to file.
+
+ // Change the DocumentStore's header combined checksum so that it won't match
+ // the recalculated checksum on initialization. This will force a regeneration
+ // of derived files from ground truth.
+ const std::string header_file =
+ absl_ports::StrCat(document_store_dir_, "/document_store_header");
+ DocumentStore::Header header;
+ header.magic = DocumentStore::Header::kMagic;
+ header.checksum = 10; // Arbitrary garbage checksum
+ filesystem_.DeleteFile(header_file.c_str());
+ filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+
+ SchemaProto new_schema;
+ type_config = new_schema.add_types();
+ type_config->set_schema_type("message");
+
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ new_schema, /*ignore_errors_and_delete_documents=*/true));
+
+ // Successfully recover from a corrupt derived file issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ // Make sure we didn't add anything to the ground truth after we recovered.
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_EQ(ground_truth_size_before, ground_truth_size_after);
+
+ EXPECT_THAT(document_store->Get(email_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+}
+
+TEST_F(DocumentStoreTest, OptimizeInto) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(100)
+ .SetTtlSecs(1000)
+ .Build();
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(100)
+ .SetTtlSecs(1000)
+ .Build();
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(100)
+ .SetTtlSecs(100)
+ .Build();
+
+ // Nothing should have expired yet.
+ fake_clock_.SetSeconds(100);
+
+ ICING_ASSERT_OK(doc_store->Put(document1));
+ ICING_ASSERT_OK(doc_store->Put(document2));
+ ICING_ASSERT_OK(doc_store->Put(document3));
+
+ std::string original_document_log = document_store_dir_ + "/document_log";
+ int64_t original_size =
+ filesystem_.GetFileSize(original_document_log.c_str());
+
+ // Optimizing into the same directory is not allowed
+ EXPECT_THAT(doc_store->OptimizeInto(document_store_dir_),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("directory is the same")));
+
+ std::string optimized_dir = document_store_dir_ + "_optimize";
+ std::string optimized_document_log = optimized_dir + "/document_log";
+
+ // Validates that the optimized document log has the same size if nothing is
+ // deleted
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ int64_t optimized_size1 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_EQ(original_size, optimized_size1);
+
+ // Validates that the optimized document log has a smaller size if something
+ // is deleted
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1"));
+ ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ int64_t optimized_size2 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(original_size, Gt(optimized_size2));
+
+ // Document3 has expired since this is past its creation (100) + ttl (100).
+ // But document1 and document2 should be fine since their ttl's were 1000.
+ fake_clock_.SetSeconds(300);
+
+ // Validates that the optimized document log has a smaller size if something
+ // expired
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ int64_t optimized_size3 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(optimized_size2, Gt(optimized_size3));
+}
+
+// Verifies that DocumentStore::Create survives a corrupted tail in the
+// ground-truth document log: un-checksummed bytes appended to the log are
+// discarded on initialization, and all state written before the corruption
+// (documents, deletions, derived filter/score caches) is preserved.
+TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) {
+  DocumentId document_id1, document_id2;
+  {
+    // Can put and delete fine.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<DocumentStore> doc_store,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store_.get()));
+    ICING_ASSERT_OK_AND_ASSIGN(document_id1,
+                               doc_store->Put(DocumentProto(test_document1_)));
+    ICING_ASSERT_OK_AND_ASSIGN(document_id2,
+                               doc_store->Put(DocumentProto(test_document2_)));
+    EXPECT_THAT(doc_store->Get(document_id1),
+                IsOkAndHolds(EqualsProto(test_document1_)));
+    EXPECT_THAT(doc_store->Get(document_id2),
+                IsOkAndHolds(EqualsProto(test_document2_)));
+    EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+    EXPECT_THAT(doc_store->Get(document_id1),
+                StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+    EXPECT_THAT(doc_store->Get(document_id2),
+                IsOkAndHolds(EqualsProto(test_document2_)));
+  }
+
+  // "Corrupt" the content written in the log by adding non-checksummed data to
+  // it. This will mess up the checksum of the proto log, forcing it to rewind
+  // to the last saved point.
+  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+  const std::string serialized_document = document.SerializeAsString();
+
+  const std::string document_log_file =
+      absl_ports::StrCat(document_store_dir_, "/document_log");
+  int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str());
+  filesystem_.PWrite(document_log_file.c_str(), file_size,
+                     serialized_document.data(), serialized_document.size());
+
+  // Successfully recover from a data loss issue.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  EXPECT_THAT(doc_store->Get(document_id1),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(doc_store->Get(document_id2),
+              IsOkAndHolds(EqualsProto(test_document2_)));
+
+  // Checks derived filter cache
+  EXPECT_THAT(doc_store->GetDocumentFilterData(document_id2),
+              IsOkAndHolds(DocumentFilterData(
+                  /*namespace_id=*/0,
+                  /*schema_type_id=*/0, document2_expiration_timestamp_)));
+  // Checks derived score cache
+  EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
+              IsOkAndHolds(DocumentAssociatedScoreData(
+                  document2_score_, document2_creation_timestamp_)));
+}
+
+// Verifies that DocumentStore::Create recovers when a derived file (here the
+// document-id mapper) has been tampered with: the corrupt derived file is
+// detected and regenerated from the ground-truth document log, leaving the
+// documents, deletions, and derived caches in their pre-corruption state.
+TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
+  DocumentId document_id1, document_id2;
+  {
+    // Can put and delete fine.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<DocumentStore> doc_store,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store_.get()));
+    ICING_ASSERT_OK_AND_ASSIGN(document_id1,
+                               doc_store->Put(DocumentProto(test_document1_)));
+    ICING_ASSERT_OK_AND_ASSIGN(document_id2,
+                               doc_store->Put(DocumentProto(test_document2_)));
+    EXPECT_THAT(doc_store->Get(document_id1),
+                IsOkAndHolds(EqualsProto(test_document1_)));
+    EXPECT_THAT(doc_store->Get(document_id2),
+                IsOkAndHolds(EqualsProto(test_document2_)));
+    EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+    EXPECT_THAT(doc_store->Get(document_id1),
+                StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+    EXPECT_THAT(doc_store->Get(document_id2),
+                IsOkAndHolds(EqualsProto(test_document2_)));
+  }
+
+  // "Corrupt" one of the derived files by adding non-checksummed data to
+  // it. This will mess up the checksum and throw an error on the derived file's
+  // initialization.
+  const std::string document_id_mapper_file =
+      absl_ports::StrCat(document_store_dir_, "/document_id_mapper");
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper,
+      FileBackedVector<int64_t>::Create(
+          filesystem_, document_id_mapper_file,
+          MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+  // Arbitrary garbage mapping written behind the DocumentStore's back.
+  int64_t corrupt_document_id = 3;
+  int64_t corrupt_offset = 3;
+  EXPECT_THAT(document_id_mapper->Set(corrupt_document_id, corrupt_offset),
+              IsOk());
+
+  // Successfully recover from a corrupt derived file issue.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  EXPECT_THAT(doc_store->Get(document_id1),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(doc_store->Get(document_id2),
+              IsOkAndHolds(EqualsProto(test_document2_)));
+
+  // Checks derived filter cache
+  EXPECT_THAT(doc_store->GetDocumentFilterData(document_id2),
+              IsOkAndHolds(DocumentFilterData(
+                  /*namespace_id=*/0,
+                  /*schema_type_id=*/0, document2_expiration_timestamp_)));
+  // Checks derived score cache
+  EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
+              IsOkAndHolds(DocumentAssociatedScoreData(
+                  document2_score_, document2_creation_timestamp_)));
+}
+
+// Verifies that DocumentStore::Create recovers when the store's header carries
+// a stale/garbage combined checksum: the mismatch forces a regeneration of all
+// derived files from the ground-truth document log, and the resulting state
+// matches what was persisted before the header was clobbered.
+TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) {
+  DocumentId document_id1, document_id2;
+  {
+    // Can put and delete fine.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<DocumentStore> doc_store,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store_.get()));
+    ICING_ASSERT_OK_AND_ASSIGN(document_id1,
+                               doc_store->Put(DocumentProto(test_document1_)));
+    ICING_ASSERT_OK_AND_ASSIGN(document_id2,
+                               doc_store->Put(DocumentProto(test_document2_)));
+    EXPECT_THAT(doc_store->Get(document_id1),
+                IsOkAndHolds(EqualsProto(test_document1_)));
+    EXPECT_THAT(doc_store->Get(document_id2),
+                IsOkAndHolds(EqualsProto(test_document2_)));
+    EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+    EXPECT_THAT(doc_store->Get(document_id1),
+                StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+    EXPECT_THAT(doc_store->Get(document_id2),
+                IsOkAndHolds(EqualsProto(test_document2_)));
+  }
+
+  // Change the DocStore's header combined checksum so that it won't match the
+  // recalculated checksum on initialization. This will force a regeneration of
+  // derived files from ground truth.
+  const std::string header_file =
+      absl_ports::StrCat(document_store_dir_, "/document_store_header");
+  DocumentStore::Header header;
+  header.magic = DocumentStore::Header::kMagic;
+  header.checksum = 10;  // Arbitrary garbage checksum
+  filesystem_.DeleteFile(header_file.c_str());
+  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+
+  // Successfully recover from the bad header checksum; derived files are
+  // regenerated from the document log.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  EXPECT_THAT(doc_store->Get(document_id1),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(doc_store->Get(document_id2),
+              IsOkAndHolds(EqualsProto(test_document2_)));
+
+  // Checks derived filter cache
+  EXPECT_THAT(doc_store->GetDocumentFilterData(document_id2),
+              IsOkAndHolds(DocumentFilterData(
+                  /*namespace_id=*/0,
+                  /*schema_type_id=*/0, document2_expiration_timestamp_)));
+  // Checks derived score cache
+  EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
+              IsOkAndHolds(DocumentAssociatedScoreData(
+                  document2_score_, document2_creation_timestamp_)));
+}
+
+// Verifies that GetDiskUsage reports a positive, monotonically growing size as
+// documents are added, and surfaces INTERNAL when the filesystem reports a bad
+// file size.
+TEST_F(DocumentStoreTest, GetDiskUsage) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  // Even an empty store occupies some disk space (log + derived files).
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t size1, doc_store->GetDiskUsage());
+  EXPECT_THAT(size1, Gt(0));
+
+  ICING_ASSERT_OK(doc_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(int64_t size2, doc_store->GetDiskUsage());
+  EXPECT_THAT(size2, Gt(size1));
+
+  ICING_ASSERT_OK(doc_store->Put(test_document2_));
+  EXPECT_THAT(doc_store->GetDiskUsage(), IsOkAndHolds(Gt(size2)));
+  // Destroy the store before re-opening the same directory with a mock.
+  doc_store.reset();
+
+  // Bad file system
+  MockFilesystem mock_filesystem;
+  ON_CALL(mock_filesystem, GetDiskUsage(A<const char *>()))
+      .WillByDefault(Return(Filesystem::kBadFileSize));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store_with_mock_filesystem,
+      DocumentStore::Create(&mock_filesystem, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+  EXPECT_THAT(doc_store_with_mock_filesystem->GetDiskUsage(),
+              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+// Verifies last_added_document_id(): kInvalidDocumentId on an empty store,
+// then always the most recently assigned DocumentId — even after that
+// document is deleted.
+TEST_F(DocumentStoreTest, MaxDocumentId) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  // Since the DocumentStore is empty, we get an invalid DocumentId
+  EXPECT_THAT(doc_store->last_added_document_id(), Eq(kInvalidDocumentId));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(DocumentProto(test_document1_)));
+  EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id1));
+
+  // Still returns the last DocumentId even if it was deleted
+  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
+  EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id1));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(DocumentProto(test_document2_)));
+  EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id2));
+}
+
+// Verifies that NamespaceIds are assigned sequentially in the order each
+// namespace is first seen, and that the mapping survives document deletion.
+TEST_F(DocumentStoreTest, GetNamespaceId) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  DocumentProto document_namespace1 =
+      DocumentBuilder().SetKey("namespace1", "1").SetSchema("email").Build();
+  DocumentProto document_namespace2 =
+      DocumentBuilder().SetKey("namespace2", "2").SetSchema("email").Build();
+
+  ICING_ASSERT_OK(doc_store->Put(DocumentProto(document_namespace1)));
+  ICING_ASSERT_OK(doc_store->Put(DocumentProto(document_namespace2)));
+
+  // NamespaceId of 0 since it was the first namespace seen by the DocumentStore
+  EXPECT_THAT(doc_store->GetNamespaceId("namespace1"), IsOkAndHolds(Eq(0)));
+
+  // NamespaceId of 1 since it was the second namespace seen by the
+  // DocumentStore
+  EXPECT_THAT(doc_store->GetNamespaceId("namespace2"), IsOkAndHolds(Eq(1)));
+
+  // NamespaceMapper doesn't care if the document has been deleted
+  EXPECT_THAT(doc_store->GetNamespaceId("namespace1"), IsOkAndHolds(Eq(0)));
+}
+
+// Verifies that two documents sharing one namespace reuse the same
+// NamespaceId — the namespace is only registered once.
+TEST_F(DocumentStoreTest, GetDuplicateNamespaceId) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  DocumentProto document1 =
+      DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build();
+  DocumentProto document2 =
+      DocumentBuilder().SetKey("namespace", "2").SetSchema("email").Build();
+
+  ICING_ASSERT_OK(doc_store->Put(document1));
+  ICING_ASSERT_OK(doc_store->Put(document2));
+
+  // NamespaceId of 0 since it was the first namespace seen by the DocumentStore
+  EXPECT_THAT(doc_store->GetNamespaceId("namespace"), IsOkAndHolds(Eq(0)));
+}
+
+// Verifies that looking up a namespace that was never seen by the store
+// returns NOT_FOUND rather than fabricating an id.
+TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  EXPECT_THAT(doc_store->GetNamespaceId("nonexistent_namespace"),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// Verifies that GetDocumentFilterData keeps returning the same
+// DocumentFilterData for a DocumentId after the document is deleted — the
+// filter cache is indexed by id and is not purged on delete.
+TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+                             doc_store->Put(test_document1_));
+
+  EXPECT_THAT(
+      doc_store->GetDocumentFilterData(document_id),
+      IsOkAndHolds(DocumentFilterData(
+          /*namespace_id=*/0,
+          /*schema_type_id=*/0,
+          /*expiration_timestamp_secs=*/document1_expiration_timestamp_)));
+
+  // FilterCache doesn't care if the document has been deleted
+  ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
+  EXPECT_THAT(
+      doc_store->GetDocumentFilterData(document_id),
+      IsOkAndHolds(DocumentFilterData(
+          /*namespace_id=*/0,
+          /*schema_type_id=*/0,
+          /*expiration_timestamp_secs=*/document1_expiration_timestamp_)));
+}
+
+// Verifies expiration math for the common case: with creation_timestamp=100
+// and ttl=1000, the cached expiration timestamp is their sum, 1100.
+TEST_F(DocumentStoreTest,
+       ExpirationTimestampIsSumOfNonZeroTtlAndCreationTimestamp) {
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace1", "1")
+                               .SetSchema("email")
+                               .SetCreationTimestampSecs(100)
+                               .SetTtlSecs(1000)
+                               .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+
+  EXPECT_THAT(
+      doc_store->GetDocumentFilterData(document_id),
+      IsOkAndHolds(DocumentFilterData(/*namespace_id=*/0,
+                                      /*schema_type_id=*/0,
+                                      /*expiration_timestamp_secs=*/1100)));
+}
+
+// Verifies that a ttl of 0 means "never expires": the cached expiration
+// timestamp is int64 max rather than creation_timestamp + 0.
+TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) {
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("namespace1", "1")
+                               .SetSchema("email")
+                               .SetCreationTimestampSecs(100)
+                               .SetTtlSecs(0)
+                               .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+
+  EXPECT_THAT(
+      doc_store->GetDocumentFilterData(document_id),
+      IsOkAndHolds(DocumentFilterData(
+          /*namespace_id=*/0,
+          /*schema_type_id=*/0,
+          /*expiration_timestamp_secs=*/std::numeric_limits<int64_t>::max())));
+}
+
+// Verifies that creation_timestamp + ttl saturates to int64 max instead of
+// overflowing when the sum would exceed the int64 range.
+TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) {
+  DocumentProto document =
+      DocumentBuilder()
+          .SetKey("namespace1", "1")
+          .SetSchema("email")
+          .SetCreationTimestampSecs(std::numeric_limits<int64_t>::max() - 1)
+          .SetTtlSecs(std::numeric_limits<int64_t>::max() - 1)
+          .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+
+  EXPECT_THAT(
+      doc_store->GetDocumentFilterData(document_id),
+      IsOkAndHolds(DocumentFilterData(
+          /*namespace_id=*/0,
+          /*schema_type_id=*/0,
+          /*expiration_timestamp_secs=*/std::numeric_limits<int64_t>::max())));
+}
+
+// Verifies that Put stamps the clock's current time onto a document whose
+// creation_timestamp_secs was left unset, and that the stamped value is
+// visible on a subsequent Get.
+TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) {
+  // Creates a document without a given creation timestamp
+  DocumentProto document_without_creation_timestamp =
+      DocumentBuilder()
+          .SetKey("icing", "email/1")
+          .SetSchema("email")
+          .AddStringProperty("subject", "subject foo")
+          .AddStringProperty("body", "body bar")
+          .Build();
+
+  std::time_t fake_real_time = 100;
+  fake_clock_.SetSeconds(fake_real_time);
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentId document_id,
+      doc_store->Put(document_without_creation_timestamp));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentProto document_with_creation_timestamp,
+                             doc_store->Get(document_id));
+
+  // Now the creation timestamp should be set by document store.
+  EXPECT_THAT(document_with_creation_timestamp.creation_timestamp_secs(),
+              Eq(fake_real_time));
+}
+
+// Verifies that the score cache records the default score (0) for a document
+// without an explicit score and the given score (5) for one that sets it.
+// NOTE(review): document1 and document2 share the same key
+// ("icing", "email/1"), so the second Put supersedes the first; the score
+// data for document_id1 is still retrievable from the cache. Confirm the
+// duplicate key is intentional rather than a typo for "email/2".
+TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) {
+  DocumentProto document1 = DocumentBuilder()
+                                .SetKey("icing", "email/1")
+                                .SetSchema("email")
+                                .AddStringProperty("subject", "subject foo")
+                                // With default doc score 0
+                                .Build();
+  DocumentProto document2 = DocumentBuilder()
+                                .SetKey("icing", "email/1")
+                                .SetSchema("email")
+                                .AddStringProperty("subject", "subject foo")
+                                .SetScore(5)
+                                .Build();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> doc_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+                             doc_store->Put(document1));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+                             doc_store->Put(document2));
+
+  EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id1),
+              IsOkAndHolds(DocumentAssociatedScoreData(
+                  /*document_score=*/0, /*creation_timestamp_secs=*/0)));
+
+  EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
+              IsOkAndHolds(DocumentAssociatedScoreData(
+                  /*document_score=*/5, /*creation_timestamp_secs=*/0)));
+}
+
+// Verifies that ComputeChecksum is deterministic: two consecutive calls with
+// no intervening mutation return the same Crc32.
+TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> document_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_EXPECT_OK(document_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum());
+
+  // Calling ComputeChecksum again shouldn't change anything
+  EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum));
+}
+
+// Verifies that the checksum is a property of the persisted state, not the
+// in-memory instance: destroying and recreating the DocumentStore over the
+// same directory yields the same Crc32.
+TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> document_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_EXPECT_OK(document_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum());
+
+  // Destroy the previous instance and recreate DocumentStore
+  document_store.reset();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      document_store, DocumentStore::Create(&filesystem_, document_store_dir_,
+                                            &fake_clock_, schema_store_.get()));
+
+  EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum));
+}
+
+// Verifies that mutating the store (adding another document) changes the
+// computed Crc32 — the checksum actually covers the stored content.
+TEST_F(DocumentStoreTest, ComputeChecksumChangesOnModification) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> document_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store_.get()));
+
+  ICING_EXPECT_OK(document_store->Put(test_document1_));
+  ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum());
+
+  ICING_EXPECT_OK(document_store->Put(test_document2_));
+  EXPECT_THAT(document_store->ComputeChecksum(),
+              IsOkAndHolds(Not(Eq(checksum))));
+}
+
+// Verifies that regenerating derived files (forced here by a garbage header
+// checksum) tolerates documents whose schema type no longer exists in the
+// SchemaStore: the known "email" document keeps a valid SchemaTypeId while the
+// now-unknown "message" document gets SchemaTypeId -1, and all other filter
+// data (namespace id, expiration) is preserved for both.
+TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
+  const std::string schema_store_dir = schema_store_dir_ + "_custom";
+
+  DocumentId email_document_id;
+  NamespaceId email_namespace_id;
+  int64_t email_expiration_timestamp;
+  DocumentProto email_document = DocumentBuilder()
+                                     .SetKey("namespace", "email_uri")
+                                     .SetSchema("email")
+                                     .SetCreationTimestampSecs(0)
+                                     .Build();
+
+  DocumentId message_document_id;
+  NamespaceId message_namespace_id;
+  int64_t message_expiration_timestamp;
+  DocumentProto message_document = DocumentBuilder()
+                                       .SetKey("namespace", "message_uri")
+                                       .SetSchema("message")
+                                       .SetCreationTimestampSecs(0)
+                                       .Build();
+
+  {
+    // Set a schema with "email" and "message"
+    filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+    filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+    ICING_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<SchemaStore> schema_store,
+        SchemaStore::Create(&filesystem_, schema_store_dir));
+    SchemaProto schema;
+    auto type_config = schema.add_types();
+    type_config->set_schema_type("email");
+    type_config = schema.add_types();
+    type_config->set_schema_type("message");
+    ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+    ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+                               schema_store->GetSchemaTypeId("email"));
+    ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId message_schema_type_id,
+                               schema_store->GetSchemaTypeId("message"));
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<DocumentStore> document_store,
+        DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                              schema_store.get()));
+
+    // Insert and verify an "email" document
+    ICING_ASSERT_OK_AND_ASSIGN(
+        email_document_id, document_store->Put(DocumentProto(email_document)));
+    EXPECT_THAT(document_store->Get(email_document_id),
+                IsOkAndHolds(EqualsProto(email_document)));
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentFilterData email_data,
+        document_store->GetDocumentFilterData(email_document_id));
+    EXPECT_THAT(email_data.schema_type_id(), Eq(email_schema_type_id));
+    // Remember the pre-regeneration values for comparison below.
+    email_namespace_id = email_data.namespace_id();
+    email_expiration_timestamp = email_data.expiration_timestamp_secs();
+
+    // Insert and verify a "message" document
+    ICING_ASSERT_OK_AND_ASSIGN(
+        message_document_id,
+        document_store->Put(DocumentProto(message_document)));
+    EXPECT_THAT(document_store->Get(message_document_id),
+                IsOkAndHolds(EqualsProto(message_document)));
+    ICING_ASSERT_OK_AND_ASSIGN(
+        DocumentFilterData message_data,
+        document_store->GetDocumentFilterData(message_document_id));
+    EXPECT_THAT(message_data.schema_type_id(), Eq(message_schema_type_id));
+    message_namespace_id = message_data.namespace_id();
+    message_expiration_timestamp = message_data.expiration_timestamp_secs();
+  }  // Everything destructs and commits changes to file
+
+  // Change the DocumentStore's header combined checksum so that it won't match
+  // the recalculated checksum on initialization. This will force a regeneration
+  // of derived files from ground truth.
+  const std::string header_file =
+      absl_ports::StrCat(document_store_dir_, "/document_store_header");
+  DocumentStore::Header header;
+  header.magic = DocumentStore::Header::kMagic;
+  header.checksum = 10;  // Arbitrary garbage checksum
+  filesystem_.DeleteFile(header_file.c_str());
+  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+
+  // Change the schema so that we don't know of the Document's type anymore.
+  // Since we can't set backwards incompatible changes, we do some file-level
+  // hacks to "reset" the schema. Without a previously existing schema, the new
+  // schema isn't considered backwards incompatible
+  filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir));
+  SchemaProto schema;
+  auto type_config = schema.add_types();
+  type_config->set_schema_type("email");
+  ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+                             schema_store->GetSchemaTypeId("email"));
+
+  // Successfully recover from a corrupt derived file issue. We don't fail just
+  // because the "message" schema type is missing
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> document_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store.get()));
+
+  // "email" document is fine
+  EXPECT_THAT(document_store->Get(email_document_id),
+              IsOkAndHolds(EqualsProto(email_document)));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentFilterData email_data,
+      document_store->GetDocumentFilterData(email_document_id));
+  EXPECT_THAT(email_data.schema_type_id(), Eq(email_schema_type_id));
+  // Make sure that all the other fields are still valid/the same
+  EXPECT_THAT(email_data.namespace_id(), Eq(email_namespace_id));
+  EXPECT_THAT(email_data.expiration_timestamp_secs(),
+              Eq(email_expiration_timestamp));
+
+  // "message" document has an invalid SchemaTypeId
+  EXPECT_THAT(document_store->Get(message_document_id),
+              IsOkAndHolds(EqualsProto(message_document)));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentFilterData message_data,
+      document_store->GetDocumentFilterData(message_document_id));
+  EXPECT_THAT(message_data.schema_type_id(), Eq(-1));
+  // Make sure that all the other fields are still valid/the same
+  EXPECT_THAT(message_data.namespace_id(), Eq(message_namespace_id));
+  EXPECT_THAT(message_data.expiration_timestamp_secs(),
+              Eq(message_expiration_timestamp));
+}
+
+// Verifies that UpdateSchemaStore refreshes the SchemaTypeIds cached in the
+// filter cache after the schema's type ordering changes: reordering "email"
+// and "message" reassigns their SchemaTypeIds, and the documents' cached ids
+// follow suit.
+TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) {
+  const std::string schema_store_dir = test_dir_ + "_custom";
+  filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+  // Set a schema
+  SchemaProto schema;
+  auto type_config = schema.add_types();
+  type_config->set_schema_type("email");
+  type_config = schema.add_types();
+  type_config->set_schema_type("message");
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir));
+  ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_email_schema_type_id,
+                             schema_store->GetSchemaTypeId("email"));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_message_schema_type_id,
+                             schema_store->GetSchemaTypeId("message"));
+
+  DocumentProto email_document = DocumentBuilder()
+                                     .SetNamespace("namespace")
+                                     .SetUri("email_uri")
+                                     .SetSchema("email")
+                                     .Build();
+
+  DocumentProto message_document = DocumentBuilder()
+                                       .SetNamespace("namespace")
+                                       .SetUri("message_uri")
+                                       .SetSchema("message")
+                                       .Build();
+
+  // Add the documents and check SchemaTypeIds match
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> document_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+                             document_store->Put(email_document));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentFilterData email_data,
+      document_store->GetDocumentFilterData(email_document_id));
+  EXPECT_THAT(email_data.schema_type_id(), Eq(old_email_schema_type_id));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+                             document_store->Put(message_document));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      DocumentFilterData message_data,
+      document_store->GetDocumentFilterData(message_document_id));
+  EXPECT_THAT(message_data.schema_type_id(), Eq(old_message_schema_type_id));
+
+  // Rearrange the schema types. Since SchemaTypeId is assigned based on order,
+  // this should change the SchemaTypeIds.
+  schema.clear_types();
+  type_config = schema.add_types();
+  type_config->set_schema_type("message");
+  type_config = schema.add_types();
+  type_config->set_schema_type("email");
+
+  ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId new_email_schema_type_id,
+                             schema_store->GetSchemaTypeId("email"));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId new_message_schema_type_id,
+                             schema_store->GetSchemaTypeId("message"));
+
+  // SchemaTypeIds should have changed.
+  EXPECT_NE(old_email_schema_type_id, new_email_schema_type_id);
+  EXPECT_NE(old_message_schema_type_id, new_message_schema_type_id);
+
+  ICING_EXPECT_OK(document_store->UpdateSchemaStore(schema_store.get()));
+
+  // Check that the FilterCache holds the new SchemaTypeIds
+  ICING_ASSERT_OK_AND_ASSIGN(
+      email_data, document_store->GetDocumentFilterData(email_document_id));
+  EXPECT_THAT(email_data.schema_type_id(), Eq(new_email_schema_type_id));
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      message_data, document_store->GetDocumentFilterData(message_document_id));
+  EXPECT_THAT(message_data.schema_type_id(), Eq(new_message_schema_type_id));
+}
+
+// Verifies that UpdateSchemaStore deletes documents invalidated by a
+// backwards-incompatible schema change: after "subject" goes from OPTIONAL to
+// REQUIRED, the email lacking a subject is removed while the one with a
+// subject is untouched.
+TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
+  const std::string schema_store_dir = test_dir_ + "_custom";
+  filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+  // Set a schema
+  SchemaProto schema;
+  auto type_config = schema.add_types();
+  type_config->set_schema_type("email");
+
+  auto property_config = type_config->add_properties();
+  property_config->set_property_name("subject");
+  property_config->set_data_type(PropertyConfigProto::DataType::STRING);
+  property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+  property_config->mutable_indexing_config()->set_term_match_type(
+      TermMatchType::EXACT_ONLY);
+  property_config->mutable_indexing_config()->set_tokenizer_type(
+      IndexingConfig::TokenizerType::PLAIN);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir));
+  ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  // Add two documents, with and without a subject
+  DocumentProto email_without_subject = DocumentBuilder()
+                                            .SetNamespace("namespace")
+                                            .SetUri("email_uri_without_subject")
+                                            .SetSchema("email")
+                                            .SetCreationTimestampSecs(0)
+                                            .Build();
+
+  DocumentProto email_with_subject = DocumentBuilder()
+                                         .SetNamespace("namespace")
+                                         .SetUri("email_uri_with_subject")
+                                         .SetSchema("email")
+                                         .AddStringProperty("subject", "foo")
+                                         .SetCreationTimestampSecs(0)
+                                         .Build();
+
+  // Insert documents and check they're ok
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> document_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_without_subject_document_id,
+                             document_store->Put(email_without_subject));
+  EXPECT_THAT(document_store->Get(email_without_subject_document_id),
+              IsOkAndHolds(EqualsProto(email_without_subject)));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_with_subject_document_id,
+                             document_store->Put(email_with_subject));
+  EXPECT_THAT(document_store->Get(email_with_subject_document_id),
+              IsOkAndHolds(EqualsProto(email_with_subject)));
+
+  // Changing an OPTIONAL field to REQUIRED is backwards incompatible, and will
+  // invalidate all documents that don't have this property set
+  schema.mutable_types(0)->mutable_properties(0)->set_cardinality(
+      PropertyConfigProto::Cardinality::REQUIRED);
+
+  ICING_EXPECT_OK(schema_store->SetSchema(
+      schema, /*ignore_errors_and_delete_documents=*/true));
+
+  ICING_EXPECT_OK(document_store->UpdateSchemaStore(schema_store.get()));
+
+  // The email without a subject should be marked as deleted
+  EXPECT_THAT(document_store->Get(email_without_subject_document_id),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+  // The email with a subject should be unaffected
+  EXPECT_THAT(document_store->Get(email_with_subject_document_id),
+              IsOkAndHolds(EqualsProto(email_with_subject)));
+}
+
+// Verifies that UpdateSchemaStore deletes documents whose schema type was
+// removed entirely: after a new schema drops "email", the email document is
+// deleted while the "message" document is untouched.
+TEST_F(DocumentStoreTest,
+       UpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) {
+  const std::string schema_store_dir = test_dir_ + "_custom";
+  filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+  filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+  // Set a schema
+  SchemaProto schema;
+  auto type_config = schema.add_types();
+  type_config->set_schema_type("email");
+  type_config = schema.add_types();
+  type_config->set_schema_type("message");
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, schema_store_dir));
+  ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  // Add a "email" and "message" document
+  DocumentProto email_document = DocumentBuilder()
+                                     .SetNamespace("namespace")
+                                     .SetUri("email_uri")
+                                     .SetSchema("email")
+                                     .SetCreationTimestampSecs(0)
+                                     .Build();
+
+  DocumentProto message_document = DocumentBuilder()
+                                       .SetNamespace("namespace")
+                                       .SetUri("message_uri")
+                                       .SetSchema("message")
+                                       .SetCreationTimestampSecs(0)
+                                       .Build();
+
+  // Insert documents and check they're ok
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<DocumentStore> document_store,
+      DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+                            schema_store.get()));
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+                             document_store->Put(email_document));
+  EXPECT_THAT(document_store->Get(email_document_id),
+              IsOkAndHolds(EqualsProto(email_document)));
+
+  ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+                             document_store->Put(message_document));
+  EXPECT_THAT(document_store->Get(message_document_id),
+              IsOkAndHolds(EqualsProto(message_document)));
+
+  // New schema drops the "email" type; removing a type is backwards
+  // incompatible, so errors must be explicitly ignored.
+  SchemaProto new_schema;
+  type_config = new_schema.add_types();
+  type_config->set_schema_type("message");
+
+  ICING_EXPECT_OK(
+      schema_store->SetSchema(new_schema,
+                              /*ignore_errors_and_delete_documents=*/true));
+
+  ICING_EXPECT_OK(document_store->UpdateSchemaStore(schema_store.get()));
+
+  // The "email" type is unknown now, so the "email" document should be deleted
+  EXPECT_THAT(document_store->Get(email_document_id),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+  // The "message" document should be unaffected
+  EXPECT_THAT(document_store->Get(message_document_id),
+              IsOkAndHolds(EqualsProto(message_document)));
+}
+
+TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) {
+ const std::string schema_store_dir = test_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+ // Set a schema
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+ ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_email_schema_type_id,
+ schema_store->GetSchemaTypeId("email"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_message_schema_type_id,
+ schema_store->GetSchemaTypeId("message"));
+
+ DocumentProto email_document = DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri("email_uri")
+ .SetSchema("email")
+ .Build();
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri("message_uri")
+ .SetSchema("message")
+ .Build();
+
+ // Add the documents and check SchemaTypeIds match
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store->Put(email_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentFilterData email_data,
+ document_store->GetDocumentFilterData(email_document_id));
+ EXPECT_THAT(email_data.schema_type_id(), Eq(old_email_schema_type_id));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store->Put(message_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentFilterData message_data,
+ document_store->GetDocumentFilterData(message_document_id));
+ EXPECT_THAT(message_data.schema_type_id(), Eq(old_message_schema_type_id));
+
+ // Rearrange the schema types. Since SchemaTypeId is assigned based on order,
+ // this should change the SchemaTypeIds.
+ schema.clear_types();
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+ type_config = schema.add_types();
+ type_config->set_schema_type("email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaStore::SetSchemaResult set_schema_result,
+ schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId new_email_schema_type_id,
+ schema_store->GetSchemaTypeId("email"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId new_message_schema_type_id,
+ schema_store->GetSchemaTypeId("message"));
+
+ // SchemaTypeIds should have changed.
+ EXPECT_NE(old_email_schema_type_id, new_email_schema_type_id);
+ EXPECT_NE(old_message_schema_type_id, new_message_schema_type_id);
+
+ ICING_EXPECT_OK(document_store->OptimizedUpdateSchemaStore(
+ schema_store.get(), set_schema_result));
+
+ // Check that the FilterCache holds the new SchemaTypeIds
+ ICING_ASSERT_OK_AND_ASSIGN(
+ email_data, document_store->GetDocumentFilterData(email_document_id));
+ EXPECT_THAT(email_data.schema_type_id(), Eq(new_email_schema_type_id));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ message_data, document_store->GetDocumentFilterData(message_document_id));
+ EXPECT_THAT(message_data.schema_type_id(), Eq(new_message_schema_type_id));
+}
+
+TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
+ const std::string schema_store_dir = test_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+ // Set a schema
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+
+ auto property_config = type_config->add_properties();
+ property_config->set_property_name("subject");
+ property_config->set_data_type(PropertyConfigProto::DataType::STRING);
+ property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property_config->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property_config->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+ ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+ // Add two documents, with and without a subject
+ DocumentProto email_without_subject = DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri("email_uri_without_subject")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(0)
+ .Build();
+
+ DocumentProto email_with_subject = DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri("email_uri_with_subject")
+ .SetSchema("email")
+ .AddStringProperty("subject", "foo")
+ .SetCreationTimestampSecs(0)
+ .Build();
+
+ // Insert documents and check they're ok
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_without_subject_document_id,
+ document_store->Put(email_without_subject));
+ EXPECT_THAT(document_store->Get(email_without_subject_document_id),
+ IsOkAndHolds(EqualsProto(email_without_subject)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_with_subject_document_id,
+ document_store->Put(email_with_subject));
+ EXPECT_THAT(document_store->Get(email_with_subject_document_id),
+ IsOkAndHolds(EqualsProto(email_with_subject)));
+
+ // Changing an OPTIONAL field to REQUIRED is backwards incompatible, and will
+ // invalidate all documents that don't have this property set
+ schema.mutable_types(0)->mutable_properties(0)->set_cardinality(
+ PropertyConfigProto::Cardinality::REQUIRED);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaStore::SetSchemaResult set_schema_result,
+ schema_store->SetSchema(schema,
+ /*ignore_errors_and_delete_documents=*/true));
+
+ ICING_EXPECT_OK(document_store->OptimizedUpdateSchemaStore(
+ schema_store.get(), set_schema_result));
+
+ // The email without a subject should be marked as deleted
+ EXPECT_THAT(document_store->Get(email_without_subject_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // The email with a subject should be unaffected
+ EXPECT_THAT(document_store->Get(email_with_subject_document_id),
+ IsOkAndHolds(EqualsProto(email_with_subject)));
+}
+
+TEST_F(DocumentStoreTest,
+ OptimizedUpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) {
+ const std::string schema_store_dir = test_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+ // Set a schema
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+ ICING_EXPECT_OK(schema_store->SetSchema(schema));
+
+  // Add an "email" document and a "message" document
+ DocumentProto email_document = DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri("email_uri")
+ .SetSchema("email")
+ .SetCreationTimestampSecs(0)
+ .Build();
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri("message_uri")
+ .SetSchema("message")
+ .SetCreationTimestampSecs(0)
+ .Build();
+
+ // Insert documents and check they're ok
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store->Put(email_document));
+ EXPECT_THAT(document_store->Get(email_document_id),
+ IsOkAndHolds(EqualsProto(email_document)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store->Put(message_document));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+
+ SchemaProto new_schema;
+ type_config = new_schema.add_types();
+ type_config->set_schema_type("message");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaStore::SetSchemaResult set_schema_result,
+ schema_store->SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/true));
+
+ ICING_EXPECT_OK(document_store->OptimizedUpdateSchemaStore(
+ schema_store.get(), set_schema_result));
+
+ // The "email" type is unknown now, so the "email" document should be deleted
+ EXPECT_THAT(document_store->Get(email_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // The "message" document should be unaffected
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h
new file mode 100644
index 0000000..1d2d455
--- /dev/null
+++ b/icing/store/key-mapper.h
@@ -0,0 +1,267 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_KEY_MAPPER_H_
+#define ICING_STORE_KEY_MAPPER_H_
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <type_traits>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// File-backed mapping between the string key and a trivially copyable value
+// type.
+//
+// KeyMapper is thread-compatible
+template <typename T>
+class KeyMapper {
+ public:
+ // Returns an initialized instance of KeyMapper that can immediately handle
+ // read/write operations.
+ // Returns any encountered IO errors.
+ //
+ // base_dir : Base directory used to save all the files required to persist
+ // KeyMapper. If this base_dir was previously used to create a
+ // KeyMapper, then this existing data would be loaded. Otherwise,
+ // an empty KeyMapper would be created.
+ // maximum_size_bytes : The maximum allowable size of the key mapper storage.
+ static libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<T>>> Create(
+ const Filesystem& filesystem, std::string_view base_dir,
+ int maximum_size_bytes);
+
+  // Deletes all the files associated with the KeyMapper. Returns success or
+  // any encountered IO errors.
+ //
+ // base_dir : Base directory used to save all the files required to persist
+ // KeyMapper. Should be the same as passed into Create().
+ static libtextclassifier3::Status Delete(const Filesystem& filesystem,
+ std::string_view base_dir);
+
+ ~KeyMapper() = default;
+
+ // Inserts/Updates value for key.
+ // Returns any encountered IO errors.
+ //
+ // NOTE: Put() doesn't automatically flush changes to disk and relies on
+ // either explicit calls to PersistToDisk() or a clean shutdown of the class.
+ libtextclassifier3::Status Put(std::string_view key, T value);
+
+ // Finds the current value for key and returns it. If key is not present, it
+ // is inserted with next_value and next_value is returned.
+ //
+ // Returns any IO errors that may occur during Put.
+ libtextclassifier3::StatusOr<T> GetOrPut(std::string_view key, T next_value);
+
+ // Returns the value corresponding to the key.
+ //
+ // Returns NOT_FOUND error if the key was missing.
+ // Returns any encountered IO errors.
+ libtextclassifier3::StatusOr<T> Get(std::string_view key) const;
+
+ // Count of unique keys stored in the KeyMapper.
+ int32_t num_keys() const { return trie_.size(); }
+
+ // Syncs all the changes made to the KeyMapper to disk.
+ // Returns any encountered IO errors.
+ //
+ // NOTE: To control disk-churn, Put() doesn't automatically persist every
+ // change to disk. The caller should explicitly call PersistToDisk() to make
+ // sure that the data is durable.
+ libtextclassifier3::Status PersistToDisk();
+
+ // Calculates and returns the disk usage in bytes.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+ // Computes and returns the checksum of the header and contents.
+ Crc32 ComputeChecksum();
+
+ private:
+ static constexpr char kKeyMapperDir[] = "key_mapper_dir";
+ static constexpr char kKeyMapperPrefix[] = "key_mapper";
+
+ // Use KeyMapper::Create() to instantiate.
+ explicit KeyMapper(std::string_view key_mapper_dir);
+
+ // Load any existing KeyMapper data from disk, or creates a new instance
+ // of KeyMapper on disk and gets ready to process read/write operations.
+ //
+ // Returns any encountered IO errors.
+ libtextclassifier3::Status Initialize(int maximum_size_bytes);
+
+ const std::string file_prefix_;
+
+  // TODO(adorokhine) Filesystem is a forked class that's available in two
+  // different namespaces. We will need icing::Filesystem in order
+  // to use IcingDynamicTrie. Filesystem class should be fully refactored
+  // to have a single definition across both namespaces. Such a class should
+  // use icing (and general google3) coding conventions and behave like
+  // a proper C++ class.
+ const IcingFilesystem icing_filesystem_;
+ IcingDynamicTrie trie_;
+
+ static_assert(std::is_trivially_copyable<T>::value,
+ "T must be trivially copyable");
+};
+
+template <typename T>
+libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<T>>>
+KeyMapper<T>::Create(const Filesystem& filesystem, std::string_view base_dir,
+ int maximum_size_bytes) {
+ // We create a subdirectory since the trie creates and stores multiple files.
+ // This makes it easier to isolate the trie files away from other files that
+ // could potentially be in the same base_dir, and makes it easier to delete.
+ const std::string key_mapper_dir =
+ absl_ports::StrCat(base_dir, "/", kKeyMapperDir);
+ if (!filesystem.CreateDirectoryRecursively(key_mapper_dir.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to create KeyMapper directory: ", key_mapper_dir));
+ }
+ auto mapper = std::unique_ptr<KeyMapper<T>>(new KeyMapper<T>(key_mapper_dir));
+ ICING_RETURN_IF_ERROR(mapper->Initialize(maximum_size_bytes));
+ return mapper;
+}
+
+template <typename T>
+libtextclassifier3::Status KeyMapper<T>::Delete(const Filesystem& filesystem,
+ std::string_view base_dir) {
+ std::string key_mapper_dir = absl_ports::StrCat(base_dir, "/", kKeyMapperDir);
+ if (!filesystem.DeleteDirectoryRecursively(key_mapper_dir.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to delete KeyMapper directory: ", key_mapper_dir));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+KeyMapper<T>::KeyMapper(std::string_view key_mapper_dir)
+ : file_prefix_(absl_ports::StrCat(key_mapper_dir, "/", kKeyMapperPrefix)),
+ trie_(file_prefix_,
+ IcingDynamicTrie::RuntimeOptions().set_storage_policy(
+ IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc),
+ &icing_filesystem_) {}
+
+template <typename T>
+libtextclassifier3::Status KeyMapper<T>::Initialize(int maximum_size_bytes) {
+ IcingDynamicTrie::Options options;
+ // Divide the max space between the three internal arrays: nodes, nexts and
+ // suffixes. MaxNodes and MaxNexts are in units of their own data structures.
+ // MaxSuffixesSize is in units of bytes.
+ options.max_nodes = maximum_size_bytes / (3 * sizeof(IcingDynamicTrie::Node));
+ options.max_nexts = options.max_nodes;
+ options.max_suffixes_size =
+ sizeof(IcingDynamicTrie::Node) * options.max_nodes;
+ options.value_size = sizeof(T);
+
+ if (!trie_.CreateIfNotExist(options)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create KeyMapper file: ", file_prefix_));
+ }
+ if (!trie_.Init()) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to init KeyMapper file: ", file_prefix_));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<T> KeyMapper<T>::GetOrPut(std::string_view key,
+ T next_value) {
+ std::string string_key(key);
+ uint32_t value_index;
+ if (!trie_.Insert(string_key.c_str(), &next_value, &value_index,
+ /*replace=*/false)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Unable to insert key ", key, " into KeyMapper ", file_prefix_, "."));
+ }
+ // This memory address could be unaligned since we're just grabbing the value
+ // from somewhere in the trie's suffix array. The suffix array is filled with
+ // chars, so the address might not be aligned to T values.
+ const T* unaligned_value =
+ static_cast<const T*>(trie_.GetValueAtIndex(value_index));
+
+ // memcpy the value to ensure that the returned value here is in a T-aligned
+ // address
+ T aligned_value;
+ memcpy(&aligned_value, unaligned_value, sizeof(T));
+ return aligned_value;
+}
+
+template <typename T>
+libtextclassifier3::Status KeyMapper<T>::Put(std::string_view key, T value) {
+ std::string string_key(key);
+ if (!trie_.Insert(string_key.c_str(), &value)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Unable to insert key ", key, " into KeyMapper ", file_prefix_, "."));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<T> KeyMapper<T>::Get(std::string_view key) const {
+ std::string string_key(key);
+ T value;
+ if (!trie_.Find(string_key.c_str(), &value)) {
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Key not found ", key, " in KeyMapper ", file_prefix_, "."));
+ }
+ return value;
+}
+
+template <typename T>
+libtextclassifier3::Status KeyMapper<T>::PersistToDisk() {
+ if (!trie_.Sync()) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to sync KeyMapper file: ", file_prefix_));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetDiskUsage() const {
+ int64_t size = trie_.GetDiskUsage();
+ if (size == IcingFilesystem::kBadFileSize || size < 0) {
+ return absl_ports::InternalError("Failed to get disk usage of key mapper");
+ }
+ return size;
+}
+
+template <typename T>
+Crc32 KeyMapper<T>::ComputeChecksum() {
+ return Crc32(trie_.UpdateCrc());
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_KEY_MAPPER_H_
diff --git a/icing/store/key-mapper_test.cc b/icing/store/key-mapper_test.cc
new file mode 100644
index 0000000..c75c203
--- /dev/null
+++ b/icing/store/key-mapper_test.cc
@@ -0,0 +1,168 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/key-mapper.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+using ::testing::_;
+using ::testing::HasSubstr;
+
+namespace icing {
+namespace lib {
+namespace {
+constexpr int kMaxKeyMapperSize = 3 * 1024 * 1024; // 3 MiB
+
+class KeyMapperTest : public testing::Test {
+ protected:
+ void SetUp() override { base_dir_ = GetTestTempDir() + "/key_mapper"; }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ std::string base_dir_;
+ Filesystem filesystem_;
+};
+
+TEST_F(KeyMapperTest, InvalidBaseDir) {
+ ASSERT_THAT(
+ KeyMapper<DocumentId>::Create(filesystem_, "/dev/null", kMaxKeyMapperSize)
+ .status()
+ .error_message(),
+ HasSubstr("Failed to create KeyMapper"));
+}
+
+TEST_F(KeyMapperTest, NegativeMaxKeyMapperSizeReturnsInternalError) {
+ ASSERT_THAT(KeyMapper<DocumentId>::Create(filesystem_, base_dir_, -1),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(KeyMapperTest, TooLargeMaxKeyMapperSizeReturnsInternalError) {
+ ASSERT_THAT(KeyMapper<DocumentId>::Create(filesystem_, base_dir_,
+ std::numeric_limits<int>::max()),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(KeyMapperTest, CreateNewKeyMapper) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ EXPECT_THAT(key_mapper->num_keys(), 0);
+}
+
+TEST_F(KeyMapperTest, CanUpdateSameKeyMultipleTimes) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
+ ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 50));
+
+ EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100));
+
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 200));
+ EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(200));
+ EXPECT_THAT(key_mapper->num_keys(), 2);
+
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300));
+ EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300));
+ EXPECT_THAT(key_mapper->num_keys(), 2);
+}
+
+TEST_F(KeyMapperTest, GetOrPutOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+
+ EXPECT_THAT(key_mapper->Get("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(key_mapper->GetOrPut("foo", 1), IsOkAndHolds(1));
+ EXPECT_THAT(key_mapper->Get("foo"), IsOkAndHolds(1));
+}
+
+TEST_F(KeyMapperTest, CanPersistToDiskRegularly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ // Can persist an empty KeyMapper.
+ ICING_EXPECT_OK(key_mapper->PersistToDisk());
+ EXPECT_THAT(key_mapper->num_keys(), 0);
+
+ // Can persist the smallest KeyMapper.
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
+ ICING_EXPECT_OK(key_mapper->PersistToDisk());
+ EXPECT_THAT(key_mapper->num_keys(), 1);
+ EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100));
+
+ // Can continue to add keys after PersistToDisk().
+ ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 200));
+ EXPECT_THAT(key_mapper->num_keys(), 2);
+ EXPECT_THAT(key_mapper->Get("default-youtube.com"), IsOkAndHolds(200));
+
+ // Can continue to update the same key after PersistToDisk().
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300));
+ EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300));
+ EXPECT_THAT(key_mapper->num_keys(), 2);
+}
+
+TEST_F(KeyMapperTest, CanUseAcrossMultipleInstances) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
+ ICING_EXPECT_OK(key_mapper->PersistToDisk());
+
+ key_mapper.reset();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ EXPECT_THAT(key_mapper->num_keys(), 1);
+ EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100));
+
+ // Can continue to read/write to the KeyMapper.
+ ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 200));
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300));
+ EXPECT_THAT(key_mapper->num_keys(), 2);
+ EXPECT_THAT(key_mapper->Get("default-youtube.com"), IsOkAndHolds(200));
+ EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300));
+}
+
+TEST_F(KeyMapperTest, CanDeleteAndRestartKeyMapping) {
+ // Can delete even if there's nothing there
+ ICING_EXPECT_OK(KeyMapper<DocumentId>::Delete(filesystem_, base_dir_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
+ ICING_EXPECT_OK(key_mapper->PersistToDisk());
+ ICING_EXPECT_OK(KeyMapper<DocumentId>::Delete(filesystem_, base_dir_));
+
+ key_mapper.reset();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ EXPECT_THAT(key_mapper->num_keys(), 0);
+ ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
+ EXPECT_THAT(key_mapper->num_keys(), 1);
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h
new file mode 100644
index 0000000..3e007d1
--- /dev/null
+++ b/icing/testing/common-matchers.h
@@ -0,0 +1,295 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_COMMON_MATCHERS_H_
+#define ICING_TESTING_COMMON_MATCHERS_H_
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+// Used to match Token(Token::Type type, std::string_view text)
+MATCHER_P2(EqualsToken, type, text, "") {
+ if (arg.type != type || arg.text != text) {
+ *result_listener << IcingStringUtil::StringPrintf(
+ "(Expected: type=%d, text=\"%s\". Actual: type=%d, text=\"%s\")", type,
+ &text[0], arg.type, arg.text.data());
+ return false;
+ }
+ return true;
+}
+
+// Used to match a DocHitInfo
+MATCHER_P2(EqualsDocHitInfo, document_id, section_ids, "") {
+ const DocHitInfo& actual = arg;
+ SectionIdMask section_mask = kSectionIdMaskNone;
+ for (SectionId section_id : section_ids) {
+ section_mask |= 1U << section_id;
+ }
+ *result_listener << IcingStringUtil::StringPrintf(
+ "(actual is {document_id=%d, section_mask=%d}, but expected was "
+ "{document_id=%d, section_mask=%d}.)",
+ actual.document_id(), actual.hit_section_ids_mask(), document_id,
+ section_mask);
+ return actual.document_id() == document_id &&
+ actual.hit_section_ids_mask() == section_mask;
+}
+
+// Used to match a ScoredDocumentHit
+MATCHER_P(EqualsScoredDocumentHit, expected_scored_document_hit, "") {
+ if (arg.document_id() != expected_scored_document_hit.document_id() ||
+ arg.hit_section_id_mask() !=
+ expected_scored_document_hit.hit_section_id_mask() ||
+ arg.score() != expected_scored_document_hit.score()) {
+ *result_listener << IcingStringUtil::StringPrintf(
+ "Expected: document_id=%d, hit_section_id_mask=%d, score=%.2f. Actual: "
+ "document_id=%d, hit_section_id_mask=%d, score=%.2f",
+ expected_scored_document_hit.document_id(),
+ expected_scored_document_hit.hit_section_id_mask(),
+ expected_scored_document_hit.score(), arg.document_id(),
+ arg.hit_section_id_mask(), arg.score());
+ return false;
+ }
+ return true;
+}
+
+MATCHER_P(EqualsSetSchemaResult, expected, "") {
+ const SchemaStore::SetSchemaResult& actual = arg;
+
+ if (actual.success == expected.success &&
+ actual.index_incompatible == expected.index_incompatible &&
+ actual.old_schema_type_ids_changed ==
+ expected.old_schema_type_ids_changed &&
+ actual.schema_types_deleted_by_name ==
+ expected.schema_types_deleted_by_name &&
+ actual.schema_types_deleted_by_id ==
+ expected.schema_types_deleted_by_id &&
+ actual.schema_types_incompatible_by_name ==
+ expected.schema_types_incompatible_by_name &&
+ actual.schema_types_incompatible_by_id ==
+ expected.schema_types_incompatible_by_id) {
+ return true;
+ }
+
+ // Format schema_type_ids_changed
+ std::string actual_old_schema_type_ids_changed = absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(actual.old_schema_type_ids_changed, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+
+ std::string expected_old_schema_type_ids_changed = absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(expected.old_schema_type_ids_changed, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+
+ // Format schema_types_deleted_by_name
+ std::string actual_schema_types_deleted_by_name = absl_ports::StrCat(
+ "[", absl_ports::StrJoin(actual.schema_types_deleted_by_name, ","), "]");
+
+ std::string expected_schema_types_deleted_by_name = absl_ports::StrCat(
+ "[", absl_ports::StrJoin(expected.schema_types_deleted_by_name, ","),
+ "]");
+
+ // Format schema_types_deleted_by_id
+ std::string actual_schema_types_deleted_by_id = absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(actual.schema_types_deleted_by_id, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+
+ std::string expected_schema_types_deleted_by_id = absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(expected.schema_types_deleted_by_id, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+
+ // Format schema_types_incompatible_by_name
+ std::string actual_schema_types_incompatible_by_name = absl_ports::StrCat(
+ "[", absl_ports::StrJoin(actual.schema_types_incompatible_by_name, ","),
+ "]");
+
+ std::string expected_schema_types_incompatible_by_name = absl_ports::StrCat(
+ "[", absl_ports::StrJoin(expected.schema_types_incompatible_by_name, ","),
+ "]");
+
+ // Format schema_types_incompatible_by_id
+ std::string actual_schema_types_incompatible_by_id = absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(actual.schema_types_incompatible_by_id, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+
+ std::string expected_schema_types_incompatible_by_id = absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(expected.schema_types_incompatible_by_id, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+
+ *result_listener << IcingStringUtil::StringPrintf(
+ "\nExpected {\n"
+ "\tsuccess=%d,\n"
+ "\tindex_incompatible=%d,\n"
+ "\told_schema_type_ids_changed=%s,\n"
+ "\tschema_types_deleted_by_name=%s,\n"
+ "\tschema_types_deleted_by_id=%s,\n"
+ "\tschema_types_incompatible_by_name=%s,\n"
+ "\tschema_types_incompatible_by_id=%s\n"
+ "}\n"
+ "Actual {\n"
+ "\tsuccess=%d,\n"
+ "\tindex_incompatible=%d,\n"
+ "\told_schema_type_ids_changed=%s,\n"
+ "\tschema_types_deleted_by_name=%s,\n"
+ "\tschema_types_deleted_by_id=%s,\n"
+ "\tschema_types_incompatible_by_name=%s,\n"
+ "\tschema_types_incompatible_by_id=%s\n"
+ "}\n",
+ expected.success, expected.index_incompatible,
+ expected_old_schema_type_ids_changed.c_str(),
+ expected_schema_types_deleted_by_name.c_str(),
+ expected_schema_types_deleted_by_id.c_str(),
+ expected_schema_types_incompatible_by_name.c_str(),
+ expected_schema_types_incompatible_by_id.c_str(), actual.success,
+ actual.index_incompatible, actual_old_schema_type_ids_changed.c_str(),
+ actual_schema_types_deleted_by_name.c_str(),
+ actual_schema_types_deleted_by_id.c_str(),
+ actual_schema_types_incompatible_by_name.c_str(),
+ actual_schema_types_incompatible_by_id.c_str());
+
+ return false;
+}
+
+std::string StatusCodeToString(libtextclassifier3::StatusCode code) {  // Human-readable code name for matcher failure messages.
+  switch (code) {
+    case libtextclassifier3::StatusCode::OK:
+      return "OK";
+    case libtextclassifier3::StatusCode::CANCELLED:
+      return "CANCELLED";
+    case libtextclassifier3::StatusCode::UNKNOWN:
+      return "UNKNOWN";
+    case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
+      return "INVALID_ARGUMENT";
+    case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
+      return "DEADLINE_EXCEEDED";
+    case libtextclassifier3::StatusCode::NOT_FOUND:
+      return "NOT_FOUND";
+    case libtextclassifier3::StatusCode::ALREADY_EXISTS:
+      return "ALREADY_EXISTS";
+    case libtextclassifier3::StatusCode::PERMISSION_DENIED:
+      return "PERMISSION_DENIED";
+    case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
+      return "RESOURCE_EXHAUSTED";
+    case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
+      return "FAILED_PRECONDITION";
+    case libtextclassifier3::StatusCode::ABORTED:
+      return "ABORTED";
+    case libtextclassifier3::StatusCode::OUT_OF_RANGE:
+      return "OUT_OF_RANGE";
+    case libtextclassifier3::StatusCode::UNIMPLEMENTED:
+      return "UNIMPLEMENTED";
+    case libtextclassifier3::StatusCode::INTERNAL:
+      return "INTERNAL";
+    case libtextclassifier3::StatusCode::UNAVAILABLE:
+      return "UNAVAILABLE";
+    case libtextclassifier3::StatusCode::DATA_LOSS:
+      return "DATA_LOSS";
+    case libtextclassifier3::StatusCode::UNAUTHENTICATED:
+      return "UNAUTHENTICATED";
+    default:
+      return "";  // Unknown/unlisted codes render as an empty string rather than crashing.
+  }
+}
+
+MATCHER(IsOk, "") {  // Matches any status-like arg (Status or StatusOr) that is OK.
+  absl_ports::StatusAdapter adapter(arg);  // Adapts arg to a plain Status view.
+  if (adapter.status().ok()) {
+    return true;
+  }
+  *result_listener << IcingStringUtil::StringPrintf(
+      "Expected OK, actual was (%s:%s)",
+      StatusCodeToString(adapter.status().CanonicalCode()).c_str(),
+      adapter.status().error_message().c_str());
+  return false;
+}
+
+MATCHER_P(IsOkAndHolds, matcher, "") {  // Matches an OK StatusOr whose value satisfies `matcher`.
+  if (!arg.ok()) {
+    *result_listener << IcingStringUtil::StringPrintf(
+        "Expected OK, actual was (%s:%s)",
+        StatusCodeToString(arg.status().CanonicalCode()).c_str(),
+        arg.status().error_message().c_str());
+    return false;
+  }
+  return ExplainMatchResult(matcher, arg.ValueOrDie(), result_listener);  // Delegate to the inner matcher on the held value.
+}
+
+MATCHER_P(StatusIs, status_code, "") {  // Canonical code must equal status_code; message is ignored.
+  absl_ports::StatusAdapter adapter(arg);
+  if (adapter.status().CanonicalCode() == status_code) {
+    return true;
+  }
+  *result_listener << IcingStringUtil::StringPrintf(
+      "Expected (%s:), actual was (%s:%s)",
+      StatusCodeToString(status_code).c_str(),
+      StatusCodeToString(adapter.status().CanonicalCode()).c_str(),
+      adapter.status().error_message().c_str());
+  return false;
+}
+
+MATCHER_P2(StatusIs, status_code, error_matcher, "") {  // Code must match AND message must satisfy error_matcher.
+  absl_ports::StatusAdapter adapter(arg);
+  if (adapter.status().CanonicalCode() != status_code) {
+    *result_listener << IcingStringUtil::StringPrintf(
+        "Expected (%s:), actual was (%s:%s)",
+        StatusCodeToString(status_code).c_str(),
+        StatusCodeToString(adapter.status().CanonicalCode()).c_str(),
+        adapter.status().error_message().c_str());
+    return false;
+  }
+  return ExplainMatchResult(error_matcher, adapter.status().error_message(),
+                            result_listener);
+}
+
+// TODO(tjbarron) Remove this once icing has switched to depend on TC3 Status
+#define ICING_STATUS_MACROS_CONCAT_NAME(x, y) \
+  ICING_STATUS_MACROS_CONCAT_IMPL(x, y)  // Extra indirection so __COUNTER__ expands before pasting.
+#define ICING_STATUS_MACROS_CONCAT_IMPL(x, y) x##y  // Token-paste to build a unique temp name.
+
+#define ICING_EXPECT_OK(func) EXPECT_THAT(func, IsOk())
+#define ICING_ASSERT_OK(func) ASSERT_THAT(func, IsOk())
+#define ICING_ASSERT_OK_AND_ASSIGN(lhs, rexpr) \
+  ICING_ASSERT_OK_AND_ASSIGN_IMPL( \
+      ICING_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, \
+      rexpr)
+#define ICING_ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \
+  auto statusor = (rexpr); \
+  ICING_ASSERT_OK(statusor.status()); \
+  lhs = std::move(statusor).ValueOrDie()  // No trailing semicolon: supplied at the call site.
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_COMMON_MATCHERS_H_
diff --git a/icing/testing/fake-clock.h b/icing/testing/fake-clock.h
new file mode 100644
index 0000000..c3b3af5
--- /dev/null
+++ b/icing/testing/fake-clock.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_FAKE_CLOCK_H_
+#define ICING_TESTING_FAKE_CLOCK_H_
+
+#include <ctime>
+
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+
+// Test-only Clock whose "current" time is a fixed value set by the test via
+// SetSeconds(); inject it into the class under test to control time.
+class FakeClock : public Clock {
+ public:
+  std::time_t GetCurrentSeconds() const override { return seconds_; }  // Returns the last value set (default 0); never advances.
+
+  void SetSeconds(std::time_t seconds) { seconds_ = seconds; }
+
+ private:
+  std::time_t seconds_ = 0;  // Fake "current" time in seconds.
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_FAKE_CLOCK_H_
diff --git a/icing/testing/fake-clock_test.cc b/icing/testing/fake-clock_test.cc
new file mode 100644
index 0000000..3e85b35
--- /dev/null
+++ b/icing/testing/fake-clock_test.cc
@@ -0,0 +1,41 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/fake-clock.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+
+TEST(FakeClockTest, GetSetOk) {
+  FakeClock fake_clock;
+  EXPECT_THAT(fake_clock.GetCurrentSeconds(), Eq(0));  // Default is 0 before any SetSeconds().
+
+  fake_clock.SetSeconds(10);
+  EXPECT_THAT(fake_clock.GetCurrentSeconds(), Eq(10));
+
+  fake_clock.SetSeconds(-1);  // Negative values are stored verbatim; no validation.
+  EXPECT_THAT(fake_clock.GetCurrentSeconds(), Eq(-1));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/i18n-test-utils.cc b/icing/testing/i18n-test-utils.cc
new file mode 100644
index 0000000..3839dc8
--- /dev/null
+++ b/icing/testing/i18n-test-utils.cc
@@ -0,0 +1,46 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/i18n-test-utils.h"
+
+#include <cstdint>
+#include <string>
+
+#include "icing/util/logging.h"
+#include "unicode/umachine.h"
+#include "unicode/utf8.h"
+
+namespace icing {
+namespace lib {
+
+std::string UcharToString(UChar32 uchar) {  // Encodes one code point as UTF-8; "" on failure.
+  std::string result;
+  uint8_t utf8_buffer[4];  // U8_APPEND writes 0 to 4 bytes
+
+  int utf8_index = 0;
+  UBool has_error = FALSE;
+
+  // utf8_index is advanced to the end of the contents if successful
+  U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);
+
+  if (has_error) {  // Set by U8_APPEND for code points it cannot encode.
+    ICING_VLOG(1) << "Error converting UChar32 to UTF8";
+    return "";
+  }
+  result.append(reinterpret_cast<char*>(utf8_buffer), utf8_index);
+  return result;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/i18n-test-utils.h b/icing/testing/i18n-test-utils.h
new file mode 100644
index 0000000..4e8a3b8
--- /dev/null
+++ b/icing/testing/i18n-test-utils.h
@@ -0,0 +1,30 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_I18N_TEST_UTILS_H_
+#define ICING_TESTING_I18N_TEST_UTILS_H_
+
+#include <string>
+
+#include "unicode/umachine.h"
+
+namespace icing {
+namespace lib {
+
+std::string UcharToString(UChar32 uchar);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_I18N_TEST_UTILS_H_
diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h
new file mode 100644
index 0000000..1510e15
--- /dev/null
+++ b/icing/testing/random-string.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_RANDOM_STRING_H_
+#define ICING_TESTING_RANDOM_STRING_H_
+
+#include <algorithm>
+#include <random>
+#include <string>
+#include <string_view>
+
+namespace icing {
+namespace lib {
+
+// 62-character alphanumeric alphabet for random test identifiers.
+inline constexpr std::string_view kAlNumAlphabet =
+    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+// Returns a string of `len` characters drawn uniformly at random from
+// `alphabet`, using the caller-provided generator `gen`.
+template <typename Gen>
+std::string RandomString(const std::string_view alphabet, size_t len,
+                         Gen* gen) {
+  // uniform_int_distribution samples the CLOSED range [a, b]; the upper
+  // bound must be size() - 1 or alphabet[uniform(*gen)] can index one past
+  // the end of the view, which is undefined behavior.
+  std::uniform_int_distribution<size_t> uniform(0u, alphabet.size() - 1);
+  std::string result(len, '\0');
+  std::generate(
+      std::begin(result), std::end(result),
+      [&gen, &alphabet, &uniform]() { return alphabet[uniform(*gen)]; });
+
+  return result;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_RANDOM_STRING_H_
diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc
new file mode 100644
index 0000000..fde0004
--- /dev/null
+++ b/icing/testing/snippet-helpers.cc
@@ -0,0 +1,80 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/snippet-helpers.h"
+
+#include <algorithm>
+#include <string_view>
+
+#include "icing/proto/search.pb.h"
+
+namespace icing {
+namespace lib {
+
+const SnippetMatchProto* GetSnippetMatch(const SnippetProto& snippet_proto,
+                                         const std::string& property_name,
+                                         int snippet_index) {
+  auto iterator = std::find_if(
+      snippet_proto.entries().begin(), snippet_proto.entries().end(),
+      [&property_name](const SnippetProto::EntryProto& entry) {
+        return entry.property_name() == property_name;
+      });
+  if (iterator == snippet_proto.entries().end() ||
+      iterator->snippet_matches_size() <= snippet_index) {  // No entry, or index out of range.
+    return nullptr;
+  }
+  return &iterator->snippet_matches(snippet_index);
+}
+
+const PropertyProto* GetProperty(const DocumentProto& document,
+                                 const std::string& property_name) {
+  const PropertyProto* property = nullptr;
+  for (const PropertyProto& prop : document.properties()) {
+    if (prop.name() == property_name) {
+      property = &prop;  // No break: the LAST matching property wins.
+    }
+  }
+  return property;  // nullptr when no property has this name.
+}
+
+std::string GetWindow(const DocumentProto& document,
+                      const SnippetProto& snippet_proto,
+                      const std::string& property_name, int snippet_index) {
+  const SnippetMatchProto* match =
+      GetSnippetMatch(snippet_proto, property_name, snippet_index);
+  const PropertyProto* property = GetProperty(document, property_name);
+  if (match == nullptr || property == nullptr) {
+    return "";  // No such snippet match or property.
+  }
+  std::string_view value = property->string_values(match->values_index());
+  return std::string(
+      value.substr(match->window_position(), match->window_bytes()));  // Offsets are bytes, per field names.
+}
+
+std::string GetMatch(const DocumentProto& document,
+                     const SnippetProto& snippet_proto,
+                     const std::string& property_name, int snippet_index) {
+  const SnippetMatchProto* match =
+      GetSnippetMatch(snippet_proto, property_name, snippet_index);
+  const PropertyProto* property = GetProperty(document, property_name);
+  if (match == nullptr || property == nullptr) {
+    return "";  // No such snippet match or property.
+  }
+  std::string_view value = property->string_values(match->values_index());
+  return std::string(
+      value.substr(match->exact_match_position(), match->exact_match_bytes()));  // Exact-match span, not the window.
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/snippet-helpers.h b/icing/testing/snippet-helpers.h
new file mode 100644
index 0000000..124e421
--- /dev/null
+++ b/icing/testing/snippet-helpers.h
@@ -0,0 +1,60 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_SNIPPET_HELPERS_H_
+#define ICING_TESTING_SNIPPET_HELPERS_H_
+
+#include <string>
+
+#include "icing/proto/document.pb.h"
+#include "icing/proto/search.pb.h"
+
+namespace icing {
+namespace lib {
+
+// Retrieve pointer to the snippet_index'th SnippetMatchProto within the
+// EntryProto identified by property_name within snippet_proto.
+// Returns nullptr
+// - if there is no EntryProto within snippet_proto corresponding to
+// property_name.
+// - if there is no SnippetMatchProto at snippet_index within the EntryProto
+const SnippetMatchProto* GetSnippetMatch(const SnippetProto& snippet_proto,
+ const std::string& property_name,
+ int snippet_index);
+
+// Retrieve pointer to the PropertyProto identified by property_name.
+// Returns nullptr if no such property exists.
+const PropertyProto* GetProperty(const DocumentProto& document,
+ const std::string& property_name);
+
+// Retrieves the window defined by the SnippetMatchProto returned by
+// GetSnippetMatch(snippet_proto, property_name, snippet_index) for the property
+// returned by GetProperty(document, property_name).
+// Returns "" if no such property, snippet or window exists.
+std::string GetWindow(const DocumentProto& document,
+ const SnippetProto& snippet_proto,
+ const std::string& property_name, int snippet_index);
+
+// Retrieves the match defined by the SnippetMatchProto returned by
+// GetSnippetMatch(snippet_proto, property_name, snippet_index) for the property
+// returned by GetProperty(document, property_name).
+// Returns "" if no such property or snippet exists.
+std::string GetMatch(const DocumentProto& document,
+ const SnippetProto& snippet_proto,
+ const std::string& property_name, int snippet_index);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_SNIPPET_HELPERS_H_
diff --git a/icing/testing/test-data.cc b/icing/testing/test-data.cc
new file mode 100644
index 0000000..9e74531
--- /dev/null
+++ b/icing/testing/test-data.cc
@@ -0,0 +1,78 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/test-data.h"
+
+#include <sys/mman.h>
+
+#include <cstdint>
+
+#include "devtools/build/runtime/get_runfiles_dir.h"
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "unicode/udata.h"
+#include "unicode/utypes.h"
+
+namespace icing {
+namespace lib {
+namespace {
+constexpr char kGoogle3LangIdModelPath[] =
+    "nlp/saft/components/lang_id/mobile/fb_model/models/latest_model.smfb";
+} // namespace
+
+std::string GetTestFilePath(const std::string& google3_relative_file_path) {
+  return absl_ports::StrCat(devtools_build::testonly::GetTestSrcdir(),
+                            "/google3/", google3_relative_file_path);
+}
+
+std::string GetLangIdModelPath() {
+  return GetTestFilePath(kGoogle3LangIdModelPath);
+}
+
+libtextclassifier3::Status SetUpICUDataFile(
+    const std::string& icu_data_file_relative_path) {
+  const std::string& file_path = GetTestFilePath(icu_data_file_relative_path);
+
+  Filesystem filesystem;
+  int64_t file_size = filesystem.GetFileSize(file_path.c_str());
+  ScopedFd fd(filesystem.OpenForRead(file_path.c_str()));
+
+  // TODO(samzheng): figure out why icing::MemoryMappedFile causes
+  // segmentation fault here.
+  const void* data =
+      mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd.get(), 0);
+  if (data == MAP_FAILED) {
+    // Also covers a missing/unreadable file: mmap fails (EBADF/EINVAL) when
+    // the fd or size is bad, instead of handing ICU a garbage pointer.
+    return absl_ports::InternalError(
+        absl_ports::StrCat("Failed to mmap ICU data file: ", file_path));
+  }
+
+  UErrorCode status = U_ZERO_ERROR;
+  // ICU retains `data` for the life of the process; the mapping is
+  // intentionally never munmap'd.
+  udata_setCommonData(data, &status);
+
+  if (U_FAILURE(status)) {
+    return absl_ports::InternalError(
+        "Failed to set up ICU data, please check if you have the data file at "
+        "the given path.");
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/test-data.h b/icing/testing/test-data.h
new file mode 100644
index 0000000..c780f0e
--- /dev/null
+++ b/icing/testing/test-data.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_TEST_DATA_H_
+#define ICING_TESTING_TEST_DATA_H_
+
+#include <string>
+
+#include "utils/base/status.h"
+
+// This file provides functions for getting / setting up absolute test file
+// paths. They are specific to Blaze and Google3 and should be changed when used
+// in AOSP / Gerrit.
+namespace icing {
+namespace lib {
+
+// The input path should be a relative path under google3. The function returns
+// an absolute path to the file during unit testing. Before calling this
+// function, please make sure the test file is added in "data" attribute in
+// portable_cc_test or any other test build rules.
+std::string GetTestFilePath(const std::string& google3_relative_file_path);
+
+// Returns the latest LangId model in Google3.
+std::string GetLangIdModelPath();
+
+// This is for unit testing in Google3. The library binary doesn't contain any
+// ICU data files, so we generate a .dat file at compile time and here make ICU
+// use that file.
+//
+// Returns:
+// Ok on success
+// INTERNAL_ERROR if failed on any errors
+libtextclassifier3::Status SetUpICUDataFile(
+ const std::string& icu_data_file_relative_path);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_TEST_DATA_H_
diff --git a/icing/testing/tmp-directory.cc b/icing/testing/tmp-directory.cc
new file mode 100644
index 0000000..ea25fe2
--- /dev/null
+++ b/icing/testing/tmp-directory.cc
@@ -0,0 +1,49 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/tmp-directory.h"
+
+#include <cstdlib>
+#include <string>
+
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+
+// Some other options for getting a tmp directory:
+// 1. FLAGS_test_tmpdir. We don't use FLAGS_test_tmpdir because it only exists
+//    in the internal version of googletest.h (as of June 2019)
+// 2. ::testing::TempDir(). It returns "/sdcard" for Android emulators in which
+//    the sdcard file format is FAT32
+//    (https://developer.android.com/studio/command-line/mksdcard). FAT32
+//    doesn't support sparse files so that it fails some tests in
+//    //icing/file/filesystem_test.cc.
+//    The sparse file related methods are mostly for reporting/logging purposes
+//    and not affecting any system behaviors.
+std::string GetTestTempDir() {
+#ifdef __ANDROID__
+  return "/data/local/tmp";
+#elif defined(__APPLE__)
+  // $HOME can be unset (e.g. some sandboxed runners); fall back to /tmp
+  // rather than passing nullptr to StrCat, which is undefined behavior.
+  const char* home = getenv("HOME");
+  return home != nullptr ? absl_ports::StrCat(home, "/tmp") : "/tmp";
+#else
+  return "/tmp";
+#endif // __ANDROID__
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/tmp-directory.h b/icing/testing/tmp-directory.h
new file mode 100644
index 0000000..0999007
--- /dev/null
+++ b/icing/testing/tmp-directory.h
@@ -0,0 +1,30 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_TMP_DIRECTORY_H_
+#define ICING_TESTING_TMP_DIRECTORY_H_
+
+#include <string>
+
+namespace icing {
+namespace lib {
+
+// Returns an absolute path to a tmpdir on the test running the test.
+// The caller should clean up all files that it created in this dir.
+std::string GetTestTempDir();
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_TMP_DIRECTORY_H_
diff --git a/icing/tokenization/language-detector.cc b/icing/tokenization/language-detector.cc
new file mode 100644
index 0000000..aa29fc3
--- /dev/null
+++ b/icing/tokenization/language-detector.cc
@@ -0,0 +1,71 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/language-detector.h"
+
+#include "utils/base/statusor.h"
+#include "nlp/saft/components/lang_id/mobile/fb_model/lang-id-from-fb.h"
+#include "nlp/saft/components/lang_id/mobile/lang-id.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+using ::nlp_saft::mobile::lang_id::GetLangIdFromFlatbufferFile;
+using ::nlp_saft::mobile::lang_id::LangId;
+
+class LanguageDetectorWithLangId : public LanguageDetector {  // LangId-backed implementation; contract documented in language-detector.h.
+ public:
+  static libtextclassifier3::StatusOr<
+      std::unique_ptr<LanguageDetectorWithLangId>>
+  Create(const std::string& lang_id_model_path) {
+    auto language_detector = std::unique_ptr<LanguageDetectorWithLangId>(
+        new LanguageDetectorWithLangId(lang_id_model_path));
+    if (language_detector->is_valid()) {
+      return language_detector;
+    }
+    return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+        "Failed to create a language detector with LangId model path: ",
+        lang_id_model_path));
+  }
+
+  libtextclassifier3::StatusOr<std::string> DetectLanguage(
+      std::string_view text) const override {
+    const std::string& lang_found =
+        lang_id_->FindLanguage(text.data(), text.length());
+    if (lang_found == LangId::kUnknownLanguageCode) {  // Model could not classify the text.
+      return absl_ports::NotFoundError(
+          absl_ports::StrCat("Language not found in text: ", text));
+    }
+    return lang_found;
+  }
+
+ private:
+  // TODO(samzheng): Use GetLangIdWithParamsFromCc() as a fallback when it's
+  // available in AOSP
+  explicit LanguageDetectorWithLangId(const std::string& lang_id_model_path)
+      : lang_id_(GetLangIdFromFlatbufferFile(lang_id_model_path)) {}
+
+  std::unique_ptr<LangId> lang_id_;
+
+  bool is_valid() { return lang_id_->is_valid(); }  // NOTE(review): assumes GetLangIdFromFlatbufferFile never returns null -- verify.
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageDetector>>
+LanguageDetector::CreateWithLangId(const std::string& lang_id_model_path) {
+  return LanguageDetectorWithLangId::Create(lang_id_model_path);  // Concrete type stays private to this file.
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-detector.h b/icing/tokenization/language-detector.h
new file mode 100644
index 0000000..07b31ff
--- /dev/null
+++ b/icing/tokenization/language-detector.h
@@ -0,0 +1,53 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_LANGUAGE_DETECTOR_H_
+#define ICING_TOKENIZATION_LANGUAGE_DETECTOR_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+class LanguageDetector {
+ public:
+ virtual ~LanguageDetector() = default;
+
+ // Creates a language detector that uses the given LangId model.
+ //
+ // Returns:
+ // A LanguageDetector on success
+ // INVALID_ARGUMENT if fails to load model
+ static libtextclassifier3::StatusOr<std::unique_ptr<LanguageDetector>>
+ CreateWithLangId(const std::string& lang_id_model_path);
+
+ // Detects the language of the given text, if there're multiple languages, the
+ // one with the biggest possibility will be returned. The two-letter language
+ // code uses the ISO-639 standard (https://en.wikipedia.org/wiki/ISO_639).
+ //
+ // Returns:
+ // language code on success
+ // NOT_FOUND if no language detected
+ virtual libtextclassifier3::StatusOr<std::string> DetectLanguage(
+ std::string_view text) const = 0;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_LANGUAGE_DETECTOR_H_
diff --git a/icing/tokenization/language-detector_test.cc b/icing/tokenization/language-detector_test.cc
new file mode 100644
index 0000000..5958e5a
--- /dev/null
+++ b/icing/tokenization/language-detector_test.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/language-detector.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/test-data.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::Eq;
+
+TEST(LanguageDetectorTest, BadFilePath) {
+ EXPECT_THAT(LanguageDetector::CreateWithLangId("Bad file path"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// TODO(samzheng): more tests for other languages and mixed languages
+TEST(LanguageDetectorTest, DetectLanguage) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_detector,
+ LanguageDetector::CreateWithLangId(GetLangIdModelPath()));
+
+ EXPECT_THAT(language_detector->DetectLanguage(" , "),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ EXPECT_THAT(language_detector->DetectLanguage("hello world"),
+ IsOkAndHolds(Eq("en"))); // English
+
+ EXPECT_THAT(language_detector->DetectLanguage("Selam Dünya"),
+ IsOkAndHolds(Eq("tr"))); // Turkish
+
+ EXPECT_THAT(language_detector->DetectLanguage("Bonjour le monde"),
+ IsOkAndHolds(Eq("fr"))); // French
+
+ EXPECT_THAT(language_detector->DetectLanguage("你好世界"),
+ IsOkAndHolds(Eq("zh"))); // Chinese
+
+ EXPECT_THAT(language_detector->DetectLanguage("こんにちは世界"),
+ IsOkAndHolds(Eq("ja"))); // Japanese
+
+ EXPECT_THAT(language_detector->DetectLanguage("สวัสดีชาวโลก"),
+ IsOkAndHolds(Eq("th"))); // Thai
+
+ EXPECT_THAT(language_detector->DetectLanguage("안녕 세상"),
+ IsOkAndHolds(Eq("ko"))); // Korean
+
+ EXPECT_THAT(language_detector->DetectLanguage("Hallo Wereld"),
+ IsOkAndHolds(Eq("nl"))); // Dutch
+
+ EXPECT_THAT(language_detector->DetectLanguage("Hola Mundo"),
+ IsOkAndHolds(Eq("es"))); // Spanish
+
+ EXPECT_THAT(language_detector->DetectLanguage("नमस्ते दुनिया"),
+ IsOkAndHolds(Eq("hi"))); // Hindi
+
+ EXPECT_THAT(language_detector->DetectLanguage("مرحبا بالعالم"),
+ IsOkAndHolds(Eq("ar"))); // Arabic
+
+ EXPECT_THAT(language_detector->DetectLanguage("Привет, мир"),
+ IsOkAndHolds(Eq("ru"))); // Russian
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter.cc b/icing/tokenization/language-segmenter.cc
new file mode 100644
index 0000000..8c64f96
--- /dev/null
+++ b/icing/tokenization/language-segmenter.cc
@@ -0,0 +1,196 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/language-segmenter.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/tokenization/language-detector.h"
+#include "icing/util/i18n-utils.h"
+#include "unicode/ubrk.h"
+#include "unicode/uchar.h"
+#include "unicode/umachine.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Space character used by Iterator::GetTerm() to collapse runs of ASCII
+// whitespace into a single-space term (Rule 3 in language-segmenter.h).
+constexpr char kASCIISpace = ' ';
+}  // namespace
+
+// Takes ownership of the detector and of the default locale string.
+// The locale parameter is deliberately non-const: top-level const on a
+// by-value parameter is not part of the function signature (so this still
+// matches the header declaration), and dropping it lets std::move below
+// perform a real move instead of silently copying.
+LanguageSegmenter::LanguageSegmenter(
+    std::unique_ptr<LanguageDetector> language_detector,
+    std::string default_locale)
+    : language_detector_(std::move(language_detector)),
+      default_locale_(std::move(default_locale)) {}
+
+// Factory: builds the LangId-based language detector first, then wires it
+// into a new LanguageSegmenter. Failures from the detector factory are
+// propagated to the caller unchanged.
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>>
+LanguageSegmenter::Create(const std::string& lang_id_model_path,
+                          const std::string& default_locale) {
+  ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageDetector> detector,
+                         LanguageDetector::CreateWithLangId(lang_id_model_path));
+  return std::unique_ptr<LanguageSegmenter>(
+      new LanguageSegmenter(std::move(detector), default_locale));
+}
+
+// Segments `text` using a locale chosen by language detection, falling back
+// to default_locale_ when no language can be detected (e.g. for input that
+// is only punctuation).
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+LanguageSegmenter::Segment(const std::string_view text) const {
+  // TODO(b/143769125): Remove LangId for now.
+  libtextclassifier3::StatusOr<std::string> language_or =
+      language_detector_->DetectLanguage(text);
+  const std::string& locale =
+      language_or.ok() ? language_or.ValueOrDie() : default_locale_;
+  return LanguageSegmenter::Iterator::Create(text, locale);
+}
+
+// Convenience wrapper around Segment(): drains the iterator and collects
+// every term into a vector. The returned string_views point into the
+// caller's `text` buffer.
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+LanguageSegmenter::GetAllTerms(const std::string_view text) const {
+  ICING_ASSIGN_OR_RETURN(std::unique_ptr<Iterator> term_iterator,
+                         Segment(text));
+  std::vector<std::string_view> all_terms;
+  while (term_iterator->Advance()) {
+    all_terms.push_back(term_iterator->GetTerm());
+  }
+  return all_terms;
+}
+
+// Creates and initializes an iterator over `text` using ICU word-break rules
+// for `locale`. Returns INTERNAL if the underlying ICU objects cannot be
+// set up.
+//
+// NOTE(review): `locale` is a const by-value parameter, so the std::move
+// below degrades to a copy; fixing it requires also changing the private
+// Iterator constructor's `const std::string&&` parameter in the header.
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+LanguageSegmenter::Iterator::Create(std::string_view text,
+                                    const std::string locale) {
+  std::unique_ptr<Iterator> iterator(new Iterator(text, std::move(locale)));
+  if (iterator->Initialize()) {
+    return iterator;
+  }
+  return absl_ports::InternalError("Unable to create a term iterator");
+}
+
+// Members start in an empty/unstarted state; the ICU objects are created
+// later in Initialize(), not here.
+//
+// NOTE(review): a const rvalue reference cannot actually be moved from, so
+// std::move(locale) below performs a copy — consider changing the parameter
+// to `std::string locale` (header declaration must change too).
+LanguageSegmenter::Iterator::Iterator(const std::string_view text,
+                                      const std::string&& locale)
+    : break_iterator_(nullptr),
+      text_(text),
+      locale_(std::move(locale)),
+      u_text_(UTEXT_INITIALIZER),
+      term_start_index_(0),
+      term_end_index_exclusive_(0) {}
+
+// Releases the ICU break iterator and the UText wrapper created in
+// Initialize(). NOTE(review): if Initialize() was never called or ubrk_open
+// failed, break_iterator_ is nullptr here — verify that ubrk_close()
+// tolerates a null argument.
+LanguageSegmenter::Iterator::~Iterator() {
+  ubrk_close(break_iterator_);
+  utext_close(&u_text_);
+}
+
+// Opens the UText wrapper over text_ and an ICU word-break iterator for
+// locale_, then binds them together. Returns true on success.
+//
+// All three ICU calls share one UErrorCode: each call is a no-op if a
+// previous call has already failed, so a single check at the end suffices.
+bool LanguageSegmenter::Iterator::Initialize() {
+  UErrorCode status = U_ZERO_ERROR;
+  // Pass the explicit byte length: text_ is a string_view and is not
+  // guaranteed to be NUL-terminated, which the previous length of -1
+  // (meaning "NUL-terminated") required.
+  utext_openUTF8(&u_text_, text_.data(), /*length=*/text_.length(), &status);
+  break_iterator_ = ubrk_open(UBRK_WORD, locale_.c_str(), /*text=*/nullptr,
+                              /*textLength=*/0, &status);
+  ubrk_setUText(break_iterator_, &u_text_, &status);
+  return !U_FAILURE(status);
+}
+
+// Moves the iterator to the next term that should be reported to callers,
+// skipping non-ASCII, non-alphabetic segments (see Rules 1 and 2 in
+// language-segmenter.h). Returns false when the end of text is reached.
+//
+// Implemented as a loop rather than the previous tail recursion so that a
+// long run of skipped segments (e.g. pages of full-width punctuation) cannot
+// overflow the stack.
+bool LanguageSegmenter::Iterator::Advance() {
+  while (true) {
+    // Prerequisite check: a previous call already hit the end.
+    if (term_end_index_exclusive_ == UBRK_DONE) {
+      return false;
+    }
+
+    if (term_end_index_exclusive_ == 0) {
+      // First Advance() call
+      term_start_index_ = ubrk_first(break_iterator_);
+    } else {
+      term_start_index_ = term_end_index_exclusive_;
+    }
+    term_end_index_exclusive_ = ubrk_next(break_iterator_);
+
+    // Reached the end
+    if (term_end_index_exclusive_ == UBRK_DONE) {
+      return false;
+    }
+
+    // Rule 1: all ASCII terms will be returned.
+    // We know it's an ASCII term by checking the first char.
+    if (i18n_utils::IsAscii(text_[term_start_index_])) {
+      return true;
+    }
+
+    UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
+                                               term_start_index_);
+    // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+    // We know it's an alphabetic term by checking the first unicode character.
+    if (u_isUAlphabetic(uchar32)) {
+      return true;
+    }
+    // Otherwise skip this segment and continue with the next one.
+  }
+}
+
+// Returns a view into text_ for the current term. Only valid after Advance()
+// has returned true; the view stays valid as long as the original text does.
+std::string_view LanguageSegmenter::Iterator::GetTerm() const {
+  if (text_[term_start_index_] == kASCIISpace) {
+    // Rule 3: multiple continuous whitespaces are treated as one.
+    return std::string_view(&text_[term_start_index_], 1);
+  }
+  return text_.substr(term_start_index_,
+                      term_end_index_exclusive_ - term_start_index_);
+}
+
+// Repositions the iterator to the first break boundary following `offset`
+// and advances once past it to establish the term's end. Returns the new
+// term start, or NOT_FOUND when no boundary exists after offset.
+//
+// NOTE(review): unlike Advance(), this does not apply the ASCII/alphabetic
+// filtering rules, so it may land on a segment Advance() would have
+// skipped — confirm whether that is intended.
+libtextclassifier3::StatusOr<int32_t>
+LanguageSegmenter::Iterator::ResetToTermStartingAfter(int32_t offset) {
+  term_start_index_ = ubrk_following(break_iterator_, offset);
+  if (term_start_index_ == UBRK_DONE) {
+    return absl_ports::NotFoundError("");
+  }
+  term_end_index_exclusive_ = ubrk_next(break_iterator_);
+  if (term_end_index_exclusive_ == UBRK_DONE) {
+    return absl_ports::NotFoundError("");
+  }
+  return term_start_index_;
+}
+
+// Repositions the iterator to the last break boundary preceding `offset`,
+// then advances once to establish the term's end. Returns NOT_FOUND when no
+// boundary exists before offset.
+//
+// NOTE(review): like ResetToTermStartingAfter(), this bypasses the
+// filtering rules that Advance() applies.
+libtextclassifier3::Status
+LanguageSegmenter::Iterator::ResetToTermStartingBefore(int32_t offset) {
+  term_start_index_ = ubrk_preceding(break_iterator_, offset);
+  if (term_start_index_ == UBRK_DONE) {
+    return absl_ports::NotFoundError("");
+  }
+  term_end_index_exclusive_ = ubrk_next(break_iterator_);
+  if (term_end_index_exclusive_ == UBRK_DONE) {
+    return absl_ports::NotFoundError("");
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Resets to the last term that ends at or before `offset`: first jumps to
+// the term starting before offset, and if that term runs past offset, steps
+// back one more term. Returns the resulting term start.
+libtextclassifier3::StatusOr<int32_t>
+LanguageSegmenter::Iterator::ResetToTermEndingBefore(int32_t offset) {
+  ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset));
+  if (term_end_index_exclusive_ > offset) {
+    // This term ends after offset. So we need to get the term just before this
+    // one.
+    ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(term_start_index_));
+  }
+  return term_start_index_;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h
new file mode 100644
index 0000000..2b8b9aa
--- /dev/null
+++ b/icing/tokenization/language-segmenter.h
@@ -0,0 +1,192 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/tokenization/language-detector.h"
+#include "unicode/ubrk.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+// This class is used to segment sentences into words based on rules
+// (https://unicode.org/reports/tr29/#Word_Boundaries) and language
+// understanding. Based on the basic segmentation done by UBreakIterator,
+// some extra rules are applied in this class:
+//
+// 1. All ASCII terms will be returned.
+// 2. For non-ASCII terms, only the alphabetic terms are returned, which means
+// non-ASCII punctuation and special characters are left out.
+// 3. Multiple continuous whitespaces are treated as one.
+//
+// The rules above are common to the high-level tokenizers that might use this
+// class. Other special tokenization logic will be in each tokenizer.
+class LanguageSegmenter {
+ public:
+  LanguageSegmenter(const LanguageSegmenter&) = delete;
+  LanguageSegmenter& operator=(const LanguageSegmenter&) = delete;
+
+  // Creates a language segmenter that uses the given LangId model. Default
+  // locale is used when language can't be detected.
+  //
+  // Returns:
+  //   A LanguageSegmenter on success
+  //   INVALID_ARGUMENT if fails to load model
+  static libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>>
+  Create(const std::string& lang_id_model_path,
+         const std::string& default_locale = ULOC_US);
+
+  // An iterator helping to find terms in the input text.
+  // Example usage:
+  //
+  // while (iterator.Advance()) {
+  //   const std::string_view term = iterator.GetTerm();
+  //   // Do something
+  // }
+  class Iterator {
+   public:
+    // Factory function to create a segment iterator based on the given locale.
+    //
+    // Returns:
+    //   An iterator on success
+    //   INTERNAL_ERROR if unable to create
+    static libtextclassifier3::StatusOr<
+        std::unique_ptr<LanguageSegmenter::Iterator>>
+    Create(std::string_view text, const std::string locale);
+
+    // Iterator owns raw ICU resources (break_iterator_ and u_text_) that are
+    // released exactly once in the destructor, so copying must be disallowed
+    // to prevent a double close.
+    Iterator(const Iterator&) = delete;
+    Iterator& operator=(const Iterator&) = delete;
+
+    ~Iterator();
+
+    // Advances to the next term. Returns false if it has reached the end.
+    bool Advance();
+
+    // Returns the current term. It can be called only when Advance() returns
+    // true.
+    std::string_view GetTerm() const;
+
+    // Resets the iterator to point to the first term that starts after offset.
+    // GetTerm will now return that term.
+    //
+    // Returns:
+    //   On success, the starting position of the first term that starts after
+    //   offset.
+    //   NOT_FOUND if an error occurred or there are no terms that start after
+    //   offset.
+    libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+        int32_t offset);
+
+    // Resets the iterator to point to the first term that ends before offset.
+    // GetTerm will now return that term.
+    //
+    // Returns:
+    //   On success, the starting position of the first term that ends before
+    //   offset.
+    //   NOT_FOUND if an error occurred or there are no terms that ends before
+    //   offset.
+    libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+        int32_t offset);
+
+   private:
+    Iterator(std::string_view text, const std::string&& locale);
+
+    // Returns true on success
+    bool Initialize();
+
+    // Resets the iterator to point to the first term that starts before offset.
+    // GetTerm will now return that term.
+    //
+    // Returns:
+    //   OK on success
+    //   NOT_FOUND if an error occurred or there are no terms that start before
+    //   offset.
+    libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset);
+
+    // The underlying class that does the segmentation, ubrk_close() must be
+    // called after using.
+    UBreakIterator* break_iterator_;
+
+    // Text to be segmented
+    const std::string_view text_;
+
+    // Locale of the input text, used to help segment more accurately. If a
+    // wrong locale is set, text could probably still be segmented correctly
+    // because the default break iterator behavior is used for most locales.
+    const std::string locale_;
+
+    // A thin wrapper around the input UTF8 text, needed by break_iterator_.
+    // utext_close() must be called after using.
+    UText u_text_;
+
+    // The start and end indices are used to track the positions of current
+    // term.
+    int term_start_index_;
+    int term_end_index_exclusive_;
+  };
+
+  // Segments the input text into terms. The segmentation depends on the
+  // language detected in the input text.
+  //
+  // Returns:
+  //   An iterator of terms on success
+  //   INTERNAL_ERROR if any error occurs
+  //
+  // Note: The underlying char* data of the input string won't be copied but
+  // shared with the return strings, so please make sure the input string
+  // outlives the returned iterator.
+  //
+  // Note: It could happen that the language detected from text is wrong, then
+  // there would be a small chance that the text is segmented incorrectly.
+  libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+  Segment(std::string_view text) const;
+
+  // Segments and returns all terms in the input text. The segmentation depends
+  // on the language detected in the input text.
+  //
+  // Returns:
+  //   A list of terms on success
+  //   INTERNAL_ERROR if any error occurs
+  //
+  // Note: The underlying char* data of the input string won't be copied but
+  // shared with the return strings, so please make sure the input string
+  // outlives the returned terms.
+  //
+  // Note: It could happen that the language detected from text is wrong, then
+  // there would be a small chance that the text is segmented incorrectly.
+  libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
+      std::string_view text) const;
+
+ private:
+  LanguageSegmenter(std::unique_ptr<LanguageDetector> language_detector,
+                    const std::string default_locale);
+
+  // Used to detect languages in text
+  const std::unique_ptr<LanguageDetector> language_detector_;
+
+  // Used as default locale when language can't be detected in text
+  const std::string default_locale_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
new file mode 100644
index 0000000..889763b
--- /dev/null
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -0,0 +1,181 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/tokenization:language-segmenter_benchmark
+//
+// $ blaze-bin/icing/tokenization/language-segmenter_benchmark
+// --benchmarks=all
+//
+// Run on an Android device:
+// Make target //icing/tokenization:language-segmenter depend on
+// //third_party/icu
+//
+// Download LangId model file from
+// //nlp/saft/components/lang_id/mobile/fb_model:models/latest_model.smfb and
+// put it into your device:
+// $ adb push [your model path] /data/local/tmp/
+//
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/tokenization:language-segmenter_benchmark
+//
+// $ adb push
+// blaze-bin/icing/tokenization/language-segmenter_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/language-segmenter_benchmark --benchmarks=all
+// --adb
+
+// Flag to tell the benchmark that it'll be run on an Android device via adb,
+// the benchmark will set up data files accordingly.
+ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Builds a segmenter using the on-device model path when running under adb,
+// otherwise the test-data model path. Crashes on failure, which is
+// acceptable for a benchmark binary.
+std::unique_ptr<LanguageSegmenter> CreateLanguageSegmenter() {
+  if (absl::GetFlag(FLAGS_adb)) {
+    return LanguageSegmenter::Create("/data/local/tmp/latest_model.smfb")
+        .ValueOrDie();
+  }
+  return LanguageSegmenter::Create(GetLangIdModelPath()).ValueOrDie();
+}
+
+// Benchmarks segmentation of one long ASCII run with no delimiters;
+// state.range(0) is the input length in bytes.
+void BM_SegmentNoSpace(benchmark::State& state) {
+  bool run_via_adb = absl::GetFlag(FLAGS_adb);
+  if (!run_via_adb) {
+    ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+  }
+
+  std::unique_ptr<LanguageSegmenter> language_segmenter =
+      CreateLanguageSegmenter();
+
+  // A single token of repeated 'A's, so the segmenter sees no boundaries.
+  std::string input_string(state.range(0), 'A');
+
+  for (auto _ : state) {
+    std::unique_ptr<LanguageSegmenter::Iterator> iterator =
+        language_segmenter->Segment(input_string).ValueOrDie();
+    while (iterator->Advance()) {
+      iterator->GetTerm();
+    }
+  }
+}
+BENCHMARK(BM_SegmentNoSpace)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
+// Benchmarks segmentation of alternating letter/space input;
+// state.range(0) is the input length in bytes.
+void BM_SegmentWithSpaces(benchmark::State& state) {
+  bool run_via_adb = absl::GetFlag(FLAGS_adb);
+  if (!run_via_adb) {
+    ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+  }
+
+  std::unique_ptr<LanguageSegmenter> language_segmenter =
+      CreateLanguageSegmenter();
+
+  std::string input_string(state.range(0), 'A');
+  // Turn every other character into a space. Use the string's unsigned size
+  // type for the index to avoid a signed/unsigned comparison with length().
+  for (std::string::size_type i = 1; i < input_string.length(); i += 2) {
+    input_string[i] = ' ';
+  }
+
+  for (auto _ : state) {
+    std::unique_ptr<LanguageSegmenter::Iterator> iterator =
+        language_segmenter->Segment(input_string).ValueOrDie();
+    while (iterator->Advance()) {
+      iterator->GetTerm();
+    }
+  }
+}
+BENCHMARK(BM_SegmentWithSpaces)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
+// Benchmarks segmentation of CJK text (no whitespace word delimiters);
+// state.range(0) is the minimum input length in bytes.
+void BM_SegmentCJK(benchmark::State& state) {
+  bool run_via_adb = absl::GetFlag(FLAGS_adb);
+  if (!run_via_adb) {
+    ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+  }
+
+  std::unique_ptr<LanguageSegmenter> language_segmenter =
+      CreateLanguageSegmenter();
+
+  std::string input_string;
+  // Cast the benchmark argument to the string's unsigned size type to avoid
+  // a signed/unsigned comparison with length().
+  const auto target_length =
+      static_cast<std::string::size_type>(state.range(0));
+  while (input_string.length() < target_length) {
+    input_string.append("你好こんにちは안녕하세요");
+  }
+
+  for (auto _ : state) {
+    std::unique_ptr<LanguageSegmenter::Iterator> iterator =
+        language_segmenter->Segment(input_string).ValueOrDie();
+    while (iterator->Advance()) {
+      iterator->GetTerm();
+    }
+  }
+}
+BENCHMARK(BM_SegmentCJK)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter_test.cc b/icing/tokenization/language-segmenter_test.cc
new file mode 100644
index 0000000..d87dca4
--- /dev/null
+++ b/icing/tokenization/language-segmenter_test.cc
@@ -0,0 +1,314 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/language-segmenter.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/i18n-test-utils.h"
+#include "icing/testing/test-data.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+// Test fixture that loads the ICU data file (required by the ICU break
+// iterator) before each test runs.
+class LanguageSegmenterTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    ICING_ASSERT_OK(
+        // File generated via icu_data_file rule in //icing/BUILD.
+        SetUpICUDataFile("icing/icu.dat"));
+  }
+};
+
+// Creating a segmenter with a nonexistent LangId model must fail with
+// INVALID_ARGUMENT rather than crash.
+TEST_F(LanguageSegmenterTest, BadModelPath) {
+  EXPECT_THAT(LanguageSegmenter::Create("Bad Model Path"),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// An empty input produces an empty term list, not an error.
+TEST_F(LanguageSegmenterTest, EmptyText) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+// ASCII words are split on whitespace, and the whitespace itself is also
+// returned as a term.
+TEST_F(LanguageSegmenterTest, SimpleText) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+              IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+// Rule 1: ASCII punctuation marks are returned as their own terms.
+TEST_F(LanguageSegmenterTest, ASCII_Punctuation) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // ASCII punctuation marks are kept
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms("Hello, World!!!"),
+      IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+              IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+              IsOkAndHolds(ElementsAre("100", "%")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
+              IsOkAndHolds(ElementsAre("A", "&", "B")));
+}
+
+// Rule 1: ASCII special characters (incl. control chars like tab) are kept.
+TEST_F(LanguageSegmenterTest, ASCII_SpecialCharacter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // ASCII special characters are kept
+  EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
+              IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
+              IsOkAndHolds(ElementsAre("A", "+", "B")));
+  // 0x0009 is the unicode for tab (within ASCII range).
+  std::string text_with_tab = absl_ports::StrCat(
+      "Hello", UcharToString(0x0009), UcharToString(0x0009), "World");
+  EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
+              IsOkAndHolds(ElementsAre("Hello", UcharToString(0x0009),
+                                       UcharToString(0x0009), "World")));
+}
+
+// Rule 2: non-ASCII, non-alphabetic segments are dropped from the output.
+TEST_F(LanguageSegmenterTest, Non_ASCII_Non_Alphabetic) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // Full-width (non-ASCII) punctuation marks and special characters are left
+  // out.
+  EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
+              IsOkAndHolds(ElementsAre("Hello")));
+}
+
+// Dotted acronyms stay together except for a trailing period, while commas
+// and spaces still split.
+TEST_F(LanguageSegmenterTest, Acronym) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
+              IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
+              IsOkAndHolds(ElementsAre("I.B.M", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
+              IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
+              IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
+}
+
+// Verifies which punctuation characters act as word connectors (keep the
+// surrounding word together) versus ordinary separators.
+TEST_F(LanguageSegmenterTest, WordConnector) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // According to unicode word break rules
+  // WB6(https://unicode.org/reports/tr29/#WB6),
+  // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
+  // punctuation characters are used as word connectors. That is, words don't
+  // break before and after them. Here we just test some that we care about.
+
+  // Word connectors
+  EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
+              IsOkAndHolds(ElementsAre("com.google.android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
+              IsOkAndHolds(ElementsAre("com:google:android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
+              IsOkAndHolds(ElementsAre("com'google'android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
+              IsOkAndHolds(ElementsAre("com_google_android")));
+
+  // Word connectors can be mixed
+  EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
+              IsOkAndHolds(ElementsAre("com.google.android:icing")));
+
+  // Any heading and trailing characters are not connectors
+  EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
+              IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
+
+  // Not word connectors
+  EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
+              IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
+              IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
+              IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
+              IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+              IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
+              IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
+              IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
+              IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
+              IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
+              IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms("com\"google\"android"),
+      IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+}
+
+// Mid-word apostrophes keep contractions together; leading/trailing
+// apostrophes split off as their own terms.
+TEST_F(LanguageSegmenterTest, Apostrophes) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
+              IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
+              IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
+              IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
+  EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
+              IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
+  // 0x2019 is the single right quote, should be treated the same as "'"
+  std::string token_with_quote =
+      absl_ports::StrCat("He", UcharToString(0x2019), "ll");
+  std::string text_with_quote =
+      absl_ports::StrCat(token_with_quote, " be back.");
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms(text_with_quote),
+      IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
+}
+
+// Parentheses always split off as separate terms.
+TEST_F(LanguageSegmenterTest, Parentheses) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
+              IsOkAndHolds(ElementsAre("(", "Hello", ")")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
+              IsOkAndHolds(ElementsAre(")", "Hello", "(")));
+}
+
+// Surrounding quotes (double or single) split off as separate terms.
+TEST_F(LanguageSegmenterTest, Quotes) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
+              IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
+              IsOkAndHolds(ElementsAre("'", "Hello", "'")));
+}
+
+// Mixed letter/digit tokens are kept as single terms.
+TEST_F(LanguageSegmenterTest, Alphanumeric) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+
+  // Alphanumeric terms are allowed
+  EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+              IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+// Decimal points and thousands separators stay inside a number, but a
+// leading minus sign splits off.
+TEST_F(LanguageSegmenterTest, Number) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+
+  // Alphanumeric terms are allowed
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+      IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+              IsOkAndHolds(ElementsAre("3,456.789")));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+              IsOkAndHolds(ElementsAre("-", "123")));
+}
+
+// Rule 3: a run of whitespace collapses to a single one-space term.
+TEST_F(LanguageSegmenterTest, ContinuousWhitespaces) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // Multiple continuous whitespaces are treated as one.
+  const int kNumSeparators = 256;
+  const std::string text_with_spaces =
+      absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+  EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+              IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+// Languages without whitespace word delimiters are segmented by the ICU
+// dictionary-based rules.
+TEST_F(LanguageSegmenterTest, CJKT) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
+  // have whitespaces as word delimiter.
+
+  // Chinese
+  EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
+              IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
+  // Japanese
+  EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
+              IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
+                                       "い", "てい", "ます")));
+  // Khmer
+  EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+              IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
+  // Thai
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
+      IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน")));
+}
+
+// Accented Latin letters are alphabetic (Rule 2) and stay in one term.
+TEST_F(LanguageSegmenterTest, LatinLettersWithAccents) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
+              IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
+}
+
+// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
+TEST_F(LanguageSegmenterTest, WhitespaceSplitLanguages) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // Turkish
+  EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
+              IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
+  // Korean
+  EXPECT_THAT(
+      language_segmenter->GetAllTerms("나는 매일 출근합니다."),
+      IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
+}
+
+// TODO(samzheng): more mixed languages test cases
+// Scripts can change mid-string; each script's segmentation rules apply to
+// its own span.
+TEST_F(LanguageSegmenterTest, MixedLanguages) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
+              IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
+                                       "吗", "お", "元気", "です", "か")));
+}
+
+// The returned string_views must alias the caller's buffer (zero-copy
+// contract documented on GetAllTerms()).
+TEST_F(LanguageSegmenterTest, NotCopyStrings) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  // Validates that the input strings are not copied
+  const std::string text = "Hello World";
+  const char* word1_address = text.c_str();
+  const char* word2_address = text.c_str() + 6;
+  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+                             language_segmenter->GetAllTerms(text));
+  ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+  const char* word1_result_address = terms.at(0).data();
+  const char* word2_result_address = terms.at(2).data();
+
+  // The underlying char* should be the same
+  EXPECT_THAT(word1_address, Eq(word1_result_address));
+  EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
new file mode 100644
index 0000000..9cf6a0b
--- /dev/null
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -0,0 +1,125 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/plain-tokenizer.h"
+
+#include <cstdint>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/i18n-utils.h"
+#include "unicode/umachine.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Helper function to validate a term.
+// A term is valid if:
+// 1. it's not empty
+// 2. it's not a whitespace
+// 3. it's not a punctuation mark
+//
+// Inspecting only the first code point is sufficient because the language
+// segmenter appears to emit whitespace runs and punctuation as their own
+// terms (see the segmenter tests) -- confirm if that contract ever changes.
+//
+// TODO(b/141007791): figure out how we'd like to support special characters
+// like "+", "&", "@", "#" in indexing and query tokenizers.
+bool IsValidTerm(std::string_view term) {
+  if (term.empty()) {
+    return false;
+  }
+  // Gets the first unicode character. We can know what the whole term is by
+  // checking only the first character.
+  UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), 0);
+  return !u_isUWhiteSpace(uchar32) && !u_ispunct(uchar32);
+}
+}  // namespace
+
+// Plain tokenizer applies its rules to the results from language segmenter. It
+// simply filters out invalid terms from language segmenter and returns
+// everything else as tokens. Please refer to IsValidTerm() above for what terms
+// are valid.
+class PlainTokenIterator : public Tokenizer::Iterator {
+ public:
+  explicit PlainTokenIterator(
+      std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)
+      : base_iterator_(std::move(base_iterator)) {}
+
+  // Advances to the next valid term, skipping whitespace/punctuation terms.
+  // Returns false once the underlying segmenter iterator is exhausted.
+  bool Advance() override {
+    bool found_next_valid_term = false;
+    while (!found_next_valid_term && base_iterator_->Advance()) {
+      current_term_ = base_iterator_->GetTerm();
+      found_next_valid_term = IsValidTerm(current_term_);
+    }
+    return found_next_valid_term;
+  }
+
+  // Returns the current term as a REGULAR token, or an INVALID token if no
+  // term has been produced yet (current_term_ is still empty).
+  Token GetToken() const override {
+    if (current_term_.empty()) {
+      return Token(Token::INVALID);
+    }
+    return Token(Token::REGULAR, current_term_);
+  }
+
+  // Repositions to the first valid token starting after 'offset'.
+  // NOTE(review): if the underlying reset fails, current_term_ keeps its
+  // previous value, so a subsequent GetToken() returns the stale token --
+  // confirm callers always check the return value first.
+  bool ResetToTokenAfter(int32_t offset) override {
+    if (!base_iterator_->ResetToTermStartingAfter(offset).ok()) {
+      return false;
+    }
+    current_term_ = base_iterator_->GetTerm();
+    if (!IsValidTerm(current_term_)) {
+      // If the current value isn't valid, advance to the next valid value.
+      return Advance();
+    }
+    return true;
+  }
+
+  // Repositions to the last valid token ending before 'offset', walking
+  // backwards through invalid (whitespace/punctuation) terms. The macro
+  // assigns the new offset on success and makes this method return false if
+  // the underlying reset fails (i.e. nothing precedes 'offset').
+  bool ResetToTokenBefore(int32_t offset) override {
+    ICING_ASSIGN_OR_RETURN_VAL(
+        offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+    current_term_ = base_iterator_->GetTerm();
+    while (!IsValidTerm(current_term_)) {
+      // Haven't found a valid term yet. Retrieve the term prior to this one
+      // from the segmenter.
+      ICING_ASSIGN_OR_RETURN_VAL(
+          offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+      current_term_ = base_iterator_->GetTerm();
+    }
+    return true;
+  }
+
+ private:
+  std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_;
+  // View into the text being tokenized (not a copy); only valid while that
+  // text outlives this iterator.
+  std::string_view current_term_;
+};
+
+// Creates an iterator that lazily filters the language segmenter's terms
+// down to valid tokens (see IsValidTerm() above).
+//
+// Returns:
+//   A PlainTokenIterator on success
+//   Propagates any error from LanguageSegmenter::Segment()
+libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
+PlainTokenizer::Tokenize(std::string_view text) const {
+  ICING_ASSIGN_OR_RETURN(
+      std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
+      language_segmenter_.Segment(text));
+  return std::make_unique<PlainTokenIterator>(std::move(base_iterator));
+}
+
+// Convenience wrapper around Tokenize(): eagerly drains the iterator and
+// returns every token in order of appearance in 'text'.
+//
+// Returns:
+//   All tokens on success (possibly empty for all-invalid input)
+//   Propagates any error from Tokenize()
+libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll(
+    std::string_view text) const {
+  ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
+                         Tokenize(text));
+  std::vector<Token> tokens;
+  while (iterator->Advance()) {
+    tokens.push_back(iterator->GetToken());
+  }
+  return tokens;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/plain-tokenizer.h b/icing/tokenization/plain-tokenizer.h
new file mode 100644
index 0000000..cc3fe2e
--- /dev/null
+++ b/icing/tokenization/plain-tokenizer.h
@@ -0,0 +1,45 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
+#define ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
+
+#include "utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/tokenizer.h"
+
+namespace icing {
+namespace lib {
+
+// Provides basic tokenization on input text: runs the language segmenter and
+// keeps every segment except whitespace and punctuation (see the filtering
+// rules in plain-tokenizer.cc).
+class PlainTokenizer : public Tokenizer {
+ public:
+  // Does not take ownership; 'language_segmenter' must be non-null (it is
+  // dereferenced here) and must outlive this tokenizer, which stores only a
+  // reference.
+  explicit PlainTokenizer(const LanguageSegmenter* language_segmenter)
+      : language_segmenter_(*language_segmenter) {}
+
+  libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
+      std::string_view text) const override;
+
+  libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
+      std::string_view text) const override;
+
+ private:
+  // Used to segment input texts based on language understanding
+  const LanguageSegmenter& language_segmenter_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
new file mode 100644
index 0000000..a3790f9
--- /dev/null
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -0,0 +1,313 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/plain-tokenizer.h"
+
+#include <string_view>
+
+#include "icing/absl_ports/str_cat.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/i18n-test-utils.h"
+#include "icing/testing/test-data.h"
+#include "gmock/gmock.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+
+// Test fixture: loads the ICU data file before each test so the language
+// segmenter underneath the tokenizer can do Unicode-aware segmentation.
+class PlainTokenizerTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    ICING_ASSERT_OK(
+        // File generated via icu_data_file rule in //icing/BUILD.
+        SetUpICUDataFile("icing/icu.dat"));
+  }
+};
+
+// Basic English tokenization: empty input yields no tokens; whitespace and
+// punctuation are dropped while the words come back as REGULAR tokens.
+TEST_F(PlainTokenizerTest, Simple) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+
+  EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
+
+  EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+                                       EqualsToken(Token::REGULAR, "World"))));
+
+  EXPECT_THAT(
+      plain_tokenizer->TokenizeAll(
+          "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
+          "Duis efficitur iaculis auctor."),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"),
+                               EqualsToken(Token::REGULAR, "ipsum"),
+                               EqualsToken(Token::REGULAR, "dolor"),
+                               EqualsToken(Token::REGULAR, "sit"),
+                               EqualsToken(Token::REGULAR, "amet"),
+                               EqualsToken(Token::REGULAR, "consectetur"),
+                               EqualsToken(Token::REGULAR, "adipiscing"),
+                               EqualsToken(Token::REGULAR, "elit"),
+                               EqualsToken(Token::REGULAR, "Duis"),
+                               EqualsToken(Token::REGULAR, "efficitur"),
+                               EqualsToken(Token::REGULAR, "iaculis"),
+                               EqualsToken(Token::REGULAR, "auctor"))));
+}
+
+// Verifies that non-space Unicode whitespace characters also separate tokens
+// and are themselves filtered out.
+TEST_F(PlainTokenizerTest, Whitespace) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+
+  // There are many Unicode whitespace characters; tabs stand in for the rest
+  // here.
+
+  // 0x0009 is horizontal tab, considered as a whitespace
+  std::string text_with_horizontal_tab =
+      absl_ports::StrCat("Hello", UcharToString(0x0009), "World");
+  EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+                                       EqualsToken(Token::REGULAR, "World"))));
+
+  // 0x000B is vertical tab, considered as a whitespace
+  std::string text_with_vertical_tab =
+      absl_ports::StrCat("Hello", UcharToString(0x000B), "World");
+  EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+                                       EqualsToken(Token::REGULAR, "World"))));
+}
+
+// Verifies that both half-width (ASCII) and full-width (CJK) punctuation
+// marks are dropped from the token stream.
+TEST_F(PlainTokenizerTest, Punctuation) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+
+  // Half-width punctuation marks are filtered out.
+  EXPECT_THAT(plain_tokenizer->TokenizeAll(
+                  "Hello, World! Hello: World. \"Hello\" World?"),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
+                                       EqualsToken(Token::REGULAR, "World"),
+                                       EqualsToken(Token::REGULAR, "Hello"),
+                                       EqualsToken(Token::REGULAR, "World"),
+                                       EqualsToken(Token::REGULAR, "Hello"),
+                                       EqualsToken(Token::REGULAR, "World"))));
+
+  // Full-width punctuation marks are filtered out.
+  EXPECT_THAT(
+      plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"),
+                               EqualsToken(Token::REGULAR, "世界"),
+                               EqualsToken(Token::REGULAR, "你好"),
+                               EqualsToken(Token::REGULAR, "世界"),
+                               EqualsToken(Token::REGULAR, "你好"),
+                               EqualsToken(Token::REGULAR, "世界"))));
+}
+
+// Documents current behavior for symbols like "+" and "$": they are not
+// punctuation or whitespace, so they come through as their own tokens.
+// See TODO(b/141007791) in plain-tokenizer.cc about future special handling.
+TEST_F(PlainTokenizerTest, SpecialCharacters) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+
+  // Right now we don't have special logic for these characters, just output
+  // them as tokens.
+
+  EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"),
+                                       EqualsToken(Token::REGULAR, "+"),
+                                       EqualsToken(Token::REGULAR, "1"))));
+
+  EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"),
+                                       EqualsToken(Token::REGULAR, "50"))));
+}
+
+// Sanity-checks CJKT scripts end to end. The plain tokenizer treats CJKT
+// terms like any others; the expected splits below reflect the language
+// segmenter's output with punctuation removed.
+TEST_F(PlainTokenizerTest, CJKT) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+
+  // In plain tokenizer, CJKT characters are handled the same way as non-CJKT
+  // characters, just add these tests as sanity checks.
+
+  // Chinese
+  EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"),
+                                       EqualsToken(Token::REGULAR, "每天"),
+                                       EqualsToken(Token::REGULAR, "走路"),
+                                       EqualsToken(Token::REGULAR, "去"),
+                                       EqualsToken(Token::REGULAR, "上班"))));
+  // Japanese
+  EXPECT_THAT(
+      plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+      IsOkAndHolds(ElementsAre(
+          EqualsToken(Token::REGULAR, "私"), EqualsToken(Token::REGULAR, "は"),
+          EqualsToken(Token::REGULAR, "毎日"),
+          EqualsToken(Token::REGULAR, "仕事"),
+          EqualsToken(Token::REGULAR, "に"), EqualsToken(Token::REGULAR, "歩"),
+          EqualsToken(Token::REGULAR, "い"),
+          EqualsToken(Token::REGULAR, "てい"),
+          EqualsToken(Token::REGULAR, "ます"))));
+  // Khmer
+  EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"),
+                                       EqualsToken(Token::REGULAR, "ដើរទៅ"),
+                                       EqualsToken(Token::REGULAR, "ធ្វើការ"),
+                                       EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ"))));
+  // Korean
+  EXPECT_THAT(
+      plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"),
+                               EqualsToken(Token::REGULAR, "매일"),
+                               EqualsToken(Token::REGULAR, "출근합니다"))));
+
+  // Thai
+  EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
+                                       EqualsToken(Token::REGULAR, "เดิน"),
+                                       EqualsToken(Token::REGULAR, "ไป"),
+                                       EqualsToken(Token::REGULAR, "ทำงาน"),
+                                       EqualsToken(Token::REGULAR, "ทุก"),
+                                       EqualsToken(Token::REGULAR, "วัน"))));
+}
+
+// Minimal ResetToTokenAfter() check on "f b": after offset 0 ("f") the next
+// token is "b"; after offset 2 (the last character) there is nothing left.
+TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+  constexpr std::string_view kText = "f b";
+  auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+
+  EXPECT_TRUE(iterator->ResetToTokenAfter(0));
+  EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b"));
+
+  EXPECT_FALSE(iterator->ResetToTokenAfter(2));
+}
+
+// Minimal ResetToTokenBefore() check on "f b": before offset 2 ("b") the
+// previous token is "f"; before offset 0 (the first character) nothing exists.
+TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+  constexpr std::string_view kText = "f b";
+  auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+
+  EXPECT_TRUE(iterator->ResetToTokenBefore(2));
+  EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f"));
+
+  EXPECT_FALSE(iterator->ResetToTokenBefore(0));
+}
+
+// Exhaustively walks every offset of kText and checks that
+// ResetToTokenAfter(i) lands on the first valid token that starts after
+// offset i (per the expected_text table), and fails once no token follows.
+TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+
+  constexpr std::string_view kText = " foo . bar baz.. bat ";
+  EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
+                                       EqualsToken(Token::REGULAR, "bar"),
+                                       EqualsToken(Token::REGULAR, "baz"),
+                                       EqualsToken(Token::REGULAR, "bat"))));
+  // Index i holds the token expected after offset i; the trailing comment
+  // shows a window of kText starting at that offset.
+  std::vector<std::string> expected_text = {
+      "foo",  // 0: " foo . bar"
+      "bar",  // 1: "foo . bar "
+      "bar",  // 2: "oo . bar b"
+      "bar",  // 3: "o . bar ba"
+      "bar",  // 4: " . bar baz"
+      "bar",  // 5: ". bar baz."
+      "bar",  // 6: " bar baz.."
+      "baz",  // 7: "bar baz.. b"
+      "baz",  // 8: "ar baz.. ba"
+      "baz",  // 9: "r baz.. bat"
+      "baz",  // 10: " baz.. bat"
+      "bat",  // 11: "baz.. bat"
+      "bat",  // 12: "az.. bat"
+      "bat",  // 13: "z.. bat"
+      "bat",  // 14: ".. bat"
+      "bat",  // 15: ". bat"
+      "bat",  // 16: " bat"
+  };
+
+  auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+  EXPECT_TRUE(iterator->Advance());
+  EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+  // NOTE(review): 'int i' vs size_t length()/size() triggers -Wsign-compare;
+  // consider size_t (or a cast) if warnings are treated as errors.
+  for (int i = 0; i < kText.length(); ++i) {
+    if (i < expected_text.size()) {
+      EXPECT_TRUE(iterator->ResetToTokenAfter(i));
+      EXPECT_THAT(iterator->GetToken(),
+                  EqualsToken(Token::REGULAR, expected_text[i]));
+    } else {
+      EXPECT_FALSE(iterator->ResetToTokenAfter(i));
+    }
+  }
+}
+
+// Mirror of ResetToTokenAfter: walks offsets from the end of kText backwards
+// and checks that ResetToTokenBefore(i) lands on the last valid token ending
+// before offset i, failing once no token precedes the offset.
+TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+                             LanguageSegmenter::Create(GetLangIdModelPath()));
+  std::unique_ptr<Tokenizer> plain_tokenizer =
+      std::make_unique<PlainTokenizer>(language_segmenter.get());
+
+  constexpr std::string_view kText = " foo . bar baz.. bat ";
+  EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
+              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
+                                       EqualsToken(Token::REGULAR, "bar"),
+                                       EqualsToken(Token::REGULAR, "baz"),
+                                       EqualsToken(Token::REGULAR, "bat"))));
+  // Entries are ordered by descending offset i (shown in the comments); the
+  // loop below maps i to an index from the end of kText.
+  std::vector<std::string> expected_text = {
+      "bat",  // 20: "baz.. bat "
+      "baz",  // 19: " baz.. bat"
+      "baz",  // 18: "r baz.. ba"
+      "baz",  // 17: "ar baz.. b"
+      "baz",  // 16: "bar baz.. "
+      "baz",  // 15: " bar baz.."
+      "baz",  // 14: ". bar baz."
+      "bar",  // 13: " . bar baz"
+      "bar",  // 12: "o . bar ba"
+      "bar",  // 11: "oo . bar b"
+      "bar",  // 10: "foo . bar "
+      "foo",  // 9: "foo . bar"
+      "foo",  // 8: "foo . ba"
+      "foo",  // 7: "foo . b"
+      "foo",  // 6: "foo . "
+      "foo",  // 5: "foo ."
+      "foo",  // 4: "foo "
+  };
+
+  auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+  EXPECT_TRUE(iterator->Advance());
+  EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+  // NOTE(review): mixed int/size_t comparisons here trigger -Wsign-compare;
+  // consider casting kText.length() once to int.
+  for (int i = kText.length() - 1; i >= 0; --i) {
+    int expected_index = kText.length() - 1 - i;
+    if (expected_index < expected_text.size()) {
+      EXPECT_TRUE(iterator->ResetToTokenBefore(i));
+      EXPECT_THAT(iterator->GetToken(),
+                  EqualsToken(Token::REGULAR, expected_text[expected_index]));
+    } else {
+      EXPECT_FALSE(iterator->ResetToTokenBefore(i));
+    }
+  }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
new file mode 100644
index 0000000..779a555
--- /dev/null
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -0,0 +1,552 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/raw-query-tokenizer.h"
+
+#include "utils/base/status.h"
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/util/i18n-utils.h"
+
+// This file provides rules that tell the tokenizer what to do when it sees a
+// term.
+//
+// Some definitions:
+//
+// 1. State: We treat raw query tokenizer as a state machine, it has different
+// states when processing different terms.
+// 2. TermType: type of input terms from language segmenter
+// 3. Rule: a rule here is the combination of State and TermType, a rule tells
+// the raw query tokenizer what to do when it's in a certain State and
+// sees a certain TermType.
+//
+// There are 2 kinds of rules here:
+// 3.1 State transition rule: it tells the raw query tokenizer what new state
+// to transition into.
+// 3.2 Action rule: it tells the raw query tokenizer whether to output the
+// current term as a token or skip.
+//
+// Then a rule can be described as:
+// [current state] + [next term type] -> [new state] + [action]
+//
+// Since there are currently 9 states and 8 term types, we need 9 * 8 = 72
+// rules to cover all possible cases for both state transition and action.
+//
+// Besides the 72 rules, there are 4 extra rules that we handle separately:
+// 1. Property name must be in ASCII.
+// 2. "OR" is ignored if there's no valid token on its left.
+// 3. "OR" is ignored if there's no valid token on its right.
+// 4. Parentheses must appear in pairs.
+namespace icing {
+namespace lib {
+
+namespace {
+// Single ASCII characters and the "OR" keyword that carry special meaning in
+// the raw query syntax (see the state machine description above).
+constexpr char kWhitespace = ' ';
+constexpr char kColon = ':';
+constexpr char kLeftParentheses = '(';
+constexpr char kRightParentheses = ')';
+constexpr char kExclusion = '-';
+constexpr char kOrOperator[] = "OR";
+
+// NOTE: the numeric values below are used as row indices into
+// state_transition_rules and action_rules; keep them dense, in order, and in
+// sync with STATE_COUNT.
+enum State {
+  // Ready to process any terms
+  READY = 0,
+
+  // When seeing an alphanumeric term
+  PROCESSING_ALPHANUMERIC_TERM = 1,
+
+  // When seeing an exclusion operator "-"
+  PROCESSING_EXCLUSION = 2,
+
+  // When seeing an exclusion operator + alphanumeric term
+  PROCESSING_EXCLUSION_TERM = 3,
+
+  // When seeing ASCII alphanumeric term + colon
+  PROCESSING_PROPERTY_RESTRICT = 4,
+
+  // When seeing ASCII alphanumeric term + colon + alphanumeric term
+  PROCESSING_PROPERTY_TERM = 5,
+
+  // When seeing OR operator
+  PROCESSING_OR = 6,
+
+  // When seeing left parentheses
+  OPENING_PARENTHESES = 7,
+
+  // When seeing right parentheses
+  CLOSING_PARENTHESES = 8,
+
+  // Valid state count
+  STATE_COUNT = 9,
+
+  // Sentinel: not a table index; transitioning here is a syntax error.
+  INVALID = 10
+};
+
+// NOTE: the numeric values below are used as column indices into
+// state_transition_rules and action_rules; keep them dense, in order, and in
+// sync with TYPE_COUNT.
+enum TermType {
+  // " "
+  WHITESPACE = 0,
+
+  // A term that consists of unicode alphabetic and numeric characters
+  ALPHANUMERIC_TERM = 1,
+
+  // "("
+  LEFT_PARENTHESES = 2,
+
+  // ")"
+  RIGHT_PARENTHESES = 3,
+
+  // "-"
+  EXCLUSION_OPERATOR = 4,
+
+  // "OR"
+  OR_OPERATOR = 5,
+
+  // ":"
+  COLON = 6,
+
+  // All the other characters seen that are not the types above
+  OTHER = 7,
+
+  TYPE_COUNT = 8
+};
+
+// Actions to take on the current term, or error codes. ProcessTerm() treats
+// any value >= ERROR_UNKNOWN as an error and converts it to a status via
+// GetErrorMessage().
+enum ActionOrError {
+  // Output the current term as token
+  OUTPUT = 0,
+
+  // Do nothing and wait for more information as it's not clear what the current
+  // term is used for.
+  KEEP = 1,
+
+  // Ignore / throw away the current term
+  IGNORE = 2,
+
+  // Errors
+  ERROR_UNKNOWN = 100,
+  ERROR_NO_WHITESPACE_AROUND_OR = 101,
+  ERROR_GROUP_AFTER_EXCLUSION = 102,
+  ERROR_GROUP_AS_PROPERTY_NAME = 103,
+  ERROR_GROUP_AFTER_PROPERTY_RESTRICTION = 104,
+  ERROR_EXCLUSION_PROPERTY_TOGETHER = 105,
+  ERROR_EXCLUSION_OR_TOGETHER = 106,
+  ERROR_PROPERTY_OR_TOGETHER = 107,
+};
+
+// Maps an ActionOrError to a human-readable message. Non-error action values
+// (and unrecognized codes) map to an empty string.
+std::string_view GetErrorMessage(ActionOrError maybe_error) {
+  switch (maybe_error) {
+    case ERROR_UNKNOWN:
+      return "Unknown error";
+    case ERROR_NO_WHITESPACE_AROUND_OR:
+      return "No whitespaces before or after OR operator";
+    case ERROR_GROUP_AFTER_EXCLUSION:
+      return "Exclusion on groups is not supported";
+    case ERROR_GROUP_AS_PROPERTY_NAME:
+      return "Property name can't be a group";
+    case ERROR_GROUP_AFTER_PROPERTY_RESTRICTION:
+      return "Property restriction on groups is not supported";
+    case ERROR_EXCLUSION_PROPERTY_TOGETHER:
+      return "Exclusion and property restriction operators can't be used "
+             "together";
+    case ERROR_EXCLUSION_OR_TOGETHER:
+      return "Exclusion and OR operators can't be used together";
+    case ERROR_PROPERTY_OR_TOGETHER:
+      return "Property restriction and OR operators can't be used together";
+    default:
+      return "";
+  }
+}
+
+// The following state transition table uses numbers to represent states and
+// letters to represent actions:
+//
+// States:
+//
+// READY = 0
+// PROCESSING_ALPHANUMERIC_TERM = 1
+// PROCESSING_EXCLUSION = 2
+// PROCESSING_EXCLUSION_TERM = 3
+// PROCESSING_PROPERTY_RESTRICT = 4
+// PROCESSING_PROPERTY_TERM = 5
+// PROCESSING_OR = 6
+// OPENING_PARENTHESES = 7
+// CLOSING_PARENTHESES = 8
+//
+// Actions:
+//
+// OUTPUT = a
+// KEEP = b
+// IGNORE = c
+//
+// ========================================================
+// Transition Table || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
+// ===========================================================================
+// WHITESPACE || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a |
+// ALPHANUMERIC_TERM || 1,c | 1,a | 3,a | 1,a | 5,a | 1,a |ERROR| 1,a | 1,a |
+// LEFT_PARENTHESES || 7,c | 7,a |ERROR| 7,a |ERROR| 7,a | 7,a | 7,a | 7,a |
+// RIGHT_PARENTHESES || 8,c | 8,a | 8,c | 8,a | 8,a | 8,a | 8,c | 8,a | 8,a |
+// EXCLUSION_OPERATOR || 2,c | 0,a | 2,c | 0,a |ERROR| 0,a |ERROR| 2,a | 2,a |
+// OR_OPERATOR || 6,c |ERROR|ERROR|ERROR|ERROR|ERROR|ERROR| 7,b | 6,a |
+// COLON || 0,c | 4,b |ERROR|ERROR| 4,b | 0,a |ERROR| 0,a |ERROR|
+// OTHER || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a |
+//
+// Each cell is a rule that consists of 4 things:
+// [current state] + [next term type] -> [new state] + [action]
+//
+// E.g. the cell at intersection of "0" and "ALPHANUMERIC_TERM" means that when
+// we're at state 0 (READY) and seeing a new term with type "ALPHANUMERIC_TERM",
+// we'll transition into a new state 1 (PROCESSING_ALPHANUMERIC_TERM) and take
+// action c (IGNORE the current term).
+
+// We use a 2D array to encode the state transition rules,
+// The value of state_transition_rules[state1][term_type1] means "what state we
+// need to transition into when the current state is state1 and the next term
+// type is term_type1".
+//
+// Rows are indexed by State; each row lists the target state per TermType in
+// declaration order: WHITESPACE, ALPHANUMERIC_TERM, LEFT_PARENTHESES,
+// RIGHT_PARENTHESES, EXCLUSION_OPERATOR, OR_OPERATOR, COLON, OTHER.
+//
+// NOTE: Please update the state transition table above if this is updated.
+//
+// TODO(samzheng): support syntax "-property1:term1", right now we don't allow
+// exclusion and property restriction applied on the same term.
+// TODO(b/141007791): figure out how we'd like to support special characters
+// like "+", "&", "@", "#" in indexing and query tokenizers.
+constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
+    /*State: Ready*/
+    {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+     CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, READY, READY},
+    /*State: PROCESSING_ALPHANUMERIC_TERM*/
+    {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+     CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_RESTRICT, READY},
+    /*State: PROCESSING_EXCLUSION*/
+    {READY, PROCESSING_EXCLUSION_TERM, INVALID, CLOSING_PARENTHESES,
+     PROCESSING_EXCLUSION, INVALID, INVALID, READY},
+    /*State: PROCESSING_EXCLUSION_TERM*/
+    {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+     CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
+    /*State: PROCESSING_PROPERTY_RESTRICT*/
+    {READY, PROCESSING_PROPERTY_TERM, INVALID, CLOSING_PARENTHESES, INVALID,
+     INVALID, PROCESSING_PROPERTY_RESTRICT, READY},
+    /*State: PROCESSING_PROPERTY_TERM*/
+    {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+     CLOSING_PARENTHESES, READY, INVALID, READY, READY},
+    /*State: PROCESSING_OR*/
+    {READY, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID, INVALID,
+     INVALID, READY},
+    /*State: OPENING_PARENTHESES*/
+    {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+     CLOSING_PARENTHESES, PROCESSING_EXCLUSION, OPENING_PARENTHESES, READY,
+     READY},
+    /*State: CLOSING_PARENTHESES*/
+    {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+     CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, INVALID, READY}};
+
+// We use a 2D array to encode the action rules,
+// The value of action_rules[state1][term_type1] means "what action we need to
+// take when the current state is state1 and the next term type is term_type1".
+//
+// Rows are indexed by State; columns follow TermType declaration order, the
+// same layout as state_transition_rules above.
+//
+// NOTE: Please update the state transition table above if this is updated.
+constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = {
+    /*State: Ready*/
+    {IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE},
+    /*State: PROCESSING_ALPHANUMERIC_TERM*/
+    {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
+     KEEP, OUTPUT},
+    /*State: PROCESSING_EXCLUSION*/
+    {IGNORE, OUTPUT, ERROR_GROUP_AFTER_EXCLUSION, IGNORE, IGNORE,
+     ERROR_EXCLUSION_OR_TOGETHER, ERROR_EXCLUSION_PROPERTY_TOGETHER, IGNORE},
+    /*State: PROCESSING_EXCLUSION_TERM*/
+    {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
+     ERROR_EXCLUSION_PROPERTY_TOGETHER, OUTPUT},
+    /*State: PROCESSING_PROPERTY_RESTRICT*/
+    {OUTPUT, OUTPUT, ERROR_GROUP_AFTER_PROPERTY_RESTRICTION, OUTPUT,
+     ERROR_EXCLUSION_PROPERTY_TOGETHER, ERROR_PROPERTY_OR_TOGETHER, KEEP,
+     OUTPUT},
+    /*State: PROCESSING_PROPERTY_TERM*/
+    {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
+     OUTPUT, OUTPUT},
+    /*State: PROCESSING_OR*/
+    {OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT, IGNORE,
+     ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR,
+     ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT},
+    /*State: OPENING_PARENTHESES*/
+    {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, KEEP, OUTPUT, OUTPUT},
+    /*State: CLOSING_PARENTHESES*/
+    {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+     ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT}};
+
+// Helper function to get the TermType of the input term.
+// Single-character terms are matched against the special ASCII operators
+// first; a two-character term equal to "OR" is the OR operator; everything
+// else is classified by its first character only.
+TermType GetTermType(std::string_view term) {
+  if (term.length() == 1) {
+    // Must be an ASCII char
+    const char& first_term_char = term[0];
+    if (first_term_char == kWhitespace) {
+      return WHITESPACE;
+    } else if (first_term_char == kColon) {
+      return COLON;
+    } else if (first_term_char == kLeftParentheses) {
+      return LEFT_PARENTHESES;
+    } else if (first_term_char == kRightParentheses) {
+      return RIGHT_PARENTHESES;
+    } else if (first_term_char == kExclusion) {
+      return EXCLUSION_OPERATOR;
+    }
+  } else if (term.length() == 2 && term == kOrOperator) {
+    return OR_OPERATOR;
+  }
+  // Checks the first char to see if it's an ASCII term
+  if (i18n_utils::IsAscii(term[0])) {
+    // Passing the raw char is safe here: it's known ASCII, so promotion to
+    // the wider ICU character type preserves the value.
+    if (u_isalnum(term[0])) {
+      return ALPHANUMERIC_TERM;
+    }
+    return OTHER;
+  }
+  // All non-ASCII terms are alphabetic since language segmenter already
+  // filters out non-ASCII and non-alphabetic terms
+  return ALPHANUMERIC_TERM;
+}
+
+// Helper function to remove the last token if it's OR operator. This is used to
+// correct the queries where there're no valid tokens after "OR", e.g. [cat OR]
+// and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no
+// valid token on its right.
+void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) {
+  if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) {
+    tokens->pop_back();
+  }
+}
+
+// Helper function to output an "OR" token while asserting the extra rule 2:
+// "OR" is ignored if there's no valid token on its left.
+//
+// Returns:
+//   OK always -- an "OR" with no valid left operand is silently dropped
+//   rather than treated as an error.
+libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) {
+  if (tokens->empty()) {
+    // Ignores "OR" because it's the first token.
+    return libtextclassifier3::Status::OK;
+  }
+  Token::Type last_token_type = tokens->back().type;
+  switch (last_token_type) {
+    case Token::REGULAR:
+    case Token::QUERY_RIGHT_PARENTHESES:
+      tokens->emplace_back(Token::QUERY_OR);
+      break;
+    case Token::QUERY_OR:
+      // Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2"
+      break;
+    default:
+      // Ignores "OR" because there isn't a valid token on its left.
+      break;
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+// Helper function to output a token according to current term and new state.
+// The new token will be added to 'tokens'.
+//
+// NOTE: how we output the current term is depending on the new state and not
+// the current state. E.g. for these two queries: [property1: ] and
+// [property1:term], "property1" is a regular term in the first query but a
+// property name in the second. The meaning of "property1" is determined when
+// we read the content after the colon. That's why we need to get the new state
+// here.
+//
+// Returns:
+//   OK on success
+//   INVALID_ARGUMENT with error message on invalid query syntax
+libtextclassifier3::Status OutputToken(State new_state,
+                                       std::string_view current_term,
+                                       TermType current_term_type,
+                                       std::vector<Token>* tokens) {
+  switch (current_term_type) {
+    case ALPHANUMERIC_TERM:
+      if (new_state == PROCESSING_PROPERTY_TERM) {
+        // Asserts extra rule 1: property name must be in ASCII
+        if (!i18n_utils::IsAscii(current_term[0])) {
+          return absl_ports::InvalidArgumentError(
+              "Characters in property name must all be ASCII.");
+        }
+        tokens->emplace_back(Token::QUERY_PROPERTY, current_term);
+      } else {
+        tokens->emplace_back(Token::REGULAR, current_term);
+      }
+      break;
+    case LEFT_PARENTHESES:
+      tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES);
+      break;
+    case RIGHT_PARENTHESES:
+      // Ignores "OR" if it's followed by right parentheses.
+      RemoveLastTokenIfOrOperator(tokens);
+      tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES);
+      break;
+    case EXCLUSION_OPERATOR:
+      tokens->emplace_back(Token::QUERY_EXCLUSION);
+      break;
+    case OR_OPERATOR:
+      return OutputOrOperatorToken(tokens);
+    default:
+      // WHITESPACE, COLON, and OTHER produce no token.
+      break;
+  }
+  return libtextclassifier3::Status::OK;
+}
+
// Helper function to apply proper rules on current state and next term type.
// 'current_state' and other output parameters will be modified to new values,
// new token will be added to 'tokens' if possible.
//
// The transition is fully table-driven: 'state_transition_rules' yields the
// new state and 'action_rules' yields what to do with the buffered
// 'current_term' (both tables are indexed by [state][term type]).
//
// Returns:
//   OK on success
//   INVALID_ARGUMENT with error message on invalid query syntax
libtextclassifier3::Status ProcessTerm(State* current_state,
                                       std::string_view* current_term,
                                       TermType* current_term_type,
                                       int* unclosed_parentheses_count,
                                       const std::string_view next_term,
                                       TermType next_term_type,
                                       std::vector<Token>* tokens) {
  // Asserts extra rule 4: parentheses must appear in pairs.
  // The count is updated eagerly, before the state tables are consulted, so
  // an unmatched ")" is reported even if the state machine would otherwise
  // accept it.
  if (next_term_type == LEFT_PARENTHESES) {
    ++(*unclosed_parentheses_count);
  } else if (next_term_type == RIGHT_PARENTHESES &&
             --(*unclosed_parentheses_count) < 0) {
    return absl_ports::InvalidArgumentError("Too many right parentheses.");
  }
  // Asks the rules what action to take and what the new state is based on
  // current state and next term.
  ActionOrError action_or_error = action_rules[*current_state][next_term_type];
  State new_state = state_transition_rules[*current_state][next_term_type];
  // Sanity check
  // NOTE(review): when new_state == INVALID but action_or_error is a plain
  // action (< ERROR_UNKNOWN), GetErrorMessage receives a non-error code —
  // presumably it returns a generic message then; verify.
  if (action_or_error >= ERROR_UNKNOWN || new_state == INVALID) {
    return absl_ports::InvalidArgumentError(GetErrorMessage(action_or_error));
  }
  switch (action_or_error) {
    case OUTPUT:
      // Flush the buffered term as a token, then (fall through) buffer the
      // new term.
      ICING_RETURN_IF_ERROR(
          OutputToken(new_state, *current_term, *current_term_type, tokens));
      U_FALLTHROUGH;
    case IGNORE:
      // Discard (or, after OUTPUT, replace) the buffered term with the new
      // one.
      *current_term = next_term;
      *current_term_type = next_term_type;
      break;
    case KEEP:
      // Keep the buffered term; the new term is dropped.
      break;
    default:
      return absl_ports::InvalidArgumentError(GetErrorMessage(ERROR_UNKNOWN));
  }
  *current_state = new_state;
  return libtextclassifier3::Status::OK;
}
+
+// Processes all the terms from base iterator and produces a list of tokens
+// based on the raw query syntax rules.
+//
+// Returns:
+// A list of tokens on success
+// INVALID_ARGUMENT with error message on invalid query syntax
+libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
+ std::unique_ptr<LanguageSegmenter::Iterator> base_iterator) {
+ std::vector<Token> tokens;
+ State current_state = READY;
+ std::string_view current_term;
+ TermType current_term_type;
+ int unclosed_parentheses_count = 0;
+ while (base_iterator->Advance()) {
+ const std::string_view next_term = base_iterator->GetTerm();
+ size_t colon_position = next_term.find(kColon);
+ // Since colon ":" is a word connector per ICU's rule
+ // (https://unicode.org/reports/tr29/#Word_Boundaries), strings like
+ // "foo:bar" are returned by LanguageSegmenter as one term. Here we're
+ // trying to find the first colon as it represents property restriction in
+ // raw query.
+ if (colon_position == std::string_view::npos) {
+ // No colon found
+ ICING_RETURN_IF_ERROR(ProcessTerm(¤t_state, ¤t_term,
+ ¤t_term_type,
+ &unclosed_parentheses_count, next_term,
+ GetTermType(next_term), &tokens));
+ } else if (next_term.size() == 1 && next_term[0] == kColon) {
+ // The whole term is a colon
+ ICING_RETURN_IF_ERROR(
+ ProcessTerm(¤t_state, ¤t_term, ¤t_term_type,
+ &unclosed_parentheses_count, next_term, COLON, &tokens));
+ } else {
+ // String before the colon is the property name
+ std::string_view property_name = next_term.substr(0, colon_position);
+ ICING_RETURN_IF_ERROR(
+ ProcessTerm(¤t_state, ¤t_term, ¤t_term_type,
+ &unclosed_parentheses_count, property_name,
+ GetTermType(property_name), &tokens));
+ ICING_RETURN_IF_ERROR(
+ ProcessTerm(¤t_state, ¤t_term, ¤t_term_type,
+ &unclosed_parentheses_count, std::string_view(&kColon, 1),
+ COLON, &tokens));
+ // String after the colon is the term that property restriction is applied
+ // on.
+ std::string_view property_term = next_term.substr(colon_position + 1);
+ ICING_RETURN_IF_ERROR(
+ ProcessTerm(¤t_state, ¤t_term, ¤t_term_type,
+ &unclosed_parentheses_count, property_term,
+ GetTermType(property_term), &tokens));
+ }
+ }
+ // Adds a fake whitespace at the end to flush the last term.
+ ICING_RETURN_IF_ERROR(
+ ProcessTerm(¤t_state, ¤t_term, ¤t_term_type,
+ &unclosed_parentheses_count,
+ std::string_view(&kWhitespace, 1), WHITESPACE, &tokens));
+ if (unclosed_parentheses_count > 0) {
+ return absl_ports::InvalidArgumentError("Unclosed left parentheses.");
+ }
+ // Ignores "OR" if it's at the end.
+ RemoveLastTokenIfOrOperator(&tokens);
+ return tokens;
+}
+
+// For raw query, it's easier to produce all the tokens together one time and
+// pass them to the iterator because the meaning of each term may relate to the
+// terms before or after it.
+class RawQueryTokenIterator : public Tokenizer::Iterator {
+ public:
+ explicit RawQueryTokenIterator(std::vector<Token>&& tokens)
+ : tokens_(std::move(tokens)) {}
+
+ bool Advance() override { return ++current_ < tokens_.size(); }
+
+ Token GetToken() const override {
+ if (current_ < 0 || current_ >= tokens_.size()) {
+ return Token(Token::INVALID);
+ }
+ return tokens_.at(current_);
+ }
+
+ private:
+ const std::vector<Token> tokens_;
+ int current_ = -1;
+};
+
+} // namespace
+
+libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
+RawQueryTokenizer::Tokenize(std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens, TokenizeAll(text));
+ return std::make_unique<RawQueryTokenIterator>(std::move(tokens));
+}
+
+libtextclassifier3::StatusOr<std::vector<Token>> RawQueryTokenizer::TokenizeAll(
+ std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
+ language_segmenter_.Segment(text));
+ return ProcessTerms(std::move(base_iterator));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/raw-query-tokenizer.h b/icing/tokenization/raw-query-tokenizer.h
new file mode 100644
index 0000000..570a652
--- /dev/null
+++ b/icing/tokenization/raw-query-tokenizer.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_RAW_QUERY_TOKENIZER_H_
+#define ICING_TOKENIZATION_RAW_QUERY_TOKENIZER_H_
+
+#include <string_view>
+
+#include "utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/tokenizer.h"
+
+namespace icing {
+namespace lib {
+
// Tokenizer for raw search queries, e.g. [foo OR (bar -baz)]. In addition to
// regular terms it recognizes the operators OR, exclusion ("-"), property
// restriction (":") and parentheses (see the .cc for the full syntax rules).
class RawQueryTokenizer : public Tokenizer {
 public:
  // Does not take ownership of 'language_segmenter', which must be non-null
  // and outlive this tokenizer.
  explicit RawQueryTokenizer(const LanguageSegmenter* language_segmenter)
      : language_segmenter_(*language_segmenter) {}

  // Returns an iterator over the tokens of 'text'.
  //
  // Returns:
  //   An iterator on success
  //   INVALID_ARGUMENT with error message on invalid query syntax
  libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
      std::string_view text) const override;

  // Tokenizes all of 'text' eagerly.
  //
  // Returns:
  //   A list of tokens on success
  //   INVALID_ARGUMENT with error message on invalid query syntax
  libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
      std::string_view text) const override;

 private:
  // Used to segment input texts based on language understanding
  const LanguageSegmenter& language_segmenter_;
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_RAW_QUERY_TOKENIZER_H_
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
new file mode 100644
index 0000000..38420c2
--- /dev/null
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -0,0 +1,536 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/raw-query-tokenizer.h"
+
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/tokenizer.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::HasSubstr;
+
// Test fixture that loads the ICU data file (needed by LanguageSegmenter for
// word-boundary rules) before each test runs.
class RawQueryTokenizerTest : public ::testing::Test {
 protected:
  void SetUp() override {
    ICING_ASSERT_OK(
        // File generated via icu_data_file rule in //icing/BUILD.
        SetUpICUDataFile("icing/icu.dat"));
  }
};
+
// A plain query becomes REGULAR tokens; punctuation ("!") is dropped.
TEST_F(RawQueryTokenizerTest, Simple) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
                                       EqualsToken(Token::REGULAR, "World"))));
}
+
// Parentheses produce QUERY_LEFT/RIGHT_PARENTHESES tokens, may nest, need no
// surrounding whitespace, and must be balanced.
TEST_F(RawQueryTokenizerTest, Parentheses) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Nested groups.
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term3"),
                  EqualsToken(Token::REGULAR, "term4"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // No whitespace is required around parentheses.
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("(term1)term2"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                               EqualsToken(Token::REGULAR, "term1"),
                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                               EqualsToken(Token::REGULAR, "term2"))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Operators directly after ")" still work.
  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("(term1)-term2"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                               EqualsToken(Token::REGULAR, "term1"),
                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                               EqualsToken(Token::QUERY_EXCLUSION, ""),
                               EqualsToken(Token::REGULAR, "term2"))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                               EqualsToken(Token::REGULAR, "term1"),
                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                               EqualsToken(Token::QUERY_OR, ""),
                               EqualsToken(Token::REGULAR, "term2"))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_OR, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Error cases: property restriction on a group, and unbalanced parentheses.
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                       HasSubstr("Property name can't be a group")));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1)"),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                       HasSubstr("Unclosed left parentheses")));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1))"),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                       HasSubstr("Too many right parentheses")));
}
+
// Exclusion operator "-": applies only to the single term immediately after
// it, is ignored when dangling, and can't be combined with groups, OR, or
// property restriction.
// NOTE(review): "Exclustion" is a typo of "Exclusion"; renaming would change
// the gtest filter id, so it is only flagged here.
TEST_F(RawQueryTokenizerTest, Exclustion) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
                                       EqualsToken(Token::REGULAR, "term1"))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_EXCLUSION, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Exclusion operator is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));

  // Exclusion operator is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
                                       EqualsToken(Token::REGULAR, "term2"))));

  // Exclusion operator is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // First exclusion operator is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
                                       EqualsToken(Token::REGULAR, "term1"))));

  // First "-" is exclusion operator, second is not and will be discarded.
  // In other words, exclusion only applies to the term right after it.
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
                                       EqualsToken(Token::REGULAR, "term1"),
                                       EqualsToken(Token::REGULAR, "term2"))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                       HasSubstr("Exclusion on groups is not supported")));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("-OR"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("Exclusion and OR operators can't be used together")));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-:term1"),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                       HasSubstr("Exclusion and property restriction operators "
                                 "can't be used together")));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-property1:term1"),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                       HasSubstr("Exclusion and property restriction operators "
                                 "can't be used together")));
}
+
// Property restriction "property:term": the name before the first colon
// becomes a QUERY_PROPERTY token; dangling colons are ignored; groups, OR
// and exclusion can't be the restricted operand.
TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1:term1"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
                               EqualsToken(Token::REGULAR, "term1"))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_PROPERTY, "property1"),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Colon is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));

  // Colon is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Colon is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));

  // property name can be a path
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"),
              IsOkAndHolds(
                  ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"),
                              EqualsToken(Token::REGULAR, "hello"))));

  // The first colon ":" triggers property restriction, the second colon is used
  // as a word connector per ICU's rule
  // (https://unicode.org/reports/tr29/#Word_Boundaries).
  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property:foo:bar"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"),
                               EqualsToken(Token::REGULAR, "foo:bar"))));

  // Property restriction only applies to the term right after it.
  // Note: "term1:term2" is not a term but 2 terms because word connectors
  // don't apply to numbers and alphabets.
  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
                               EqualsToken(Token::REGULAR, "term1"),
                               EqualsToken(Token::REGULAR, "term2"))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1:term1-"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
                               EqualsToken(Token::REGULAR, "term1"))));

  // Multiple continuous colons will still be recognized as a property
  // restriction operator
  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1::term1"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
                               EqualsToken(Token::REGULAR, "term1"))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1:(term1)"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("Property restriction on groups is not supported")));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1:OR"),
      StatusIs(
          libtextclassifier3::StatusCode::INVALID_ARGUMENT,
          HasSubstr(
              "Property restriction and OR operators can't be used together")));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:-term1"),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                       HasSubstr("Exclusion and property restriction operators "
                                 "can't be used together")));
}
+
// The OR operator: must be all uppercase, is deduplicated when repeated,
// is dropped when dangling, and needs whitespace (or parentheses) around it.
TEST_F(RawQueryTokenizerTest, OR) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
                                       EqualsToken(Token::QUERY_OR, ""),
                                       EqualsToken(Token::REGULAR, "term2"))));

  // Two continuous "OR"s are treated as one
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
                                       EqualsToken(Token::QUERY_OR, ""),
                                       EqualsToken(Token::REGULAR, "term2"))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                               EqualsToken(Token::REGULAR, "term1"),
                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                               EqualsToken(Token::QUERY_OR, ""),
                               EqualsToken(Token::REGULAR, "term2"))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_OR, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_OR, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Only "OR" (all in uppercase) is the operator
  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
                               EqualsToken(Token::REGULAR, "or"),
                               EqualsToken(Token::REGULAR, "term2"),
                               EqualsToken(Token::REGULAR, "Or"),
                               EqualsToken(Token::REGULAR, "term3"),
                               EqualsToken(Token::REGULAR, "oR"),
                               EqualsToken(Token::REGULAR, "term4"))));

  // "OR" is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));

  // "OR" is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));

  // "OR" is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // "OR" is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // "OR" is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // "OR" is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // "OR" is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // A left parenthesis right after OR is allowed (acts like whitespace).
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_OR, ""),
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term2"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("term1 OR-term2"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("No whitespaces before or after OR operator")));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("term1 OR:term2"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("No whitespaces before or after OR operator")));
}
+
// CJKT are treated the same way by language segmenter and raw tokenizer, so
// here we test Chinese and Japanese to represent CJKT.
TEST_F(RawQueryTokenizerTest, CJKT) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  // Exclusion only applies to the term right after it.
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-今天天气很好"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
                                       EqualsToken(Token::REGULAR, "今天"),
                                       EqualsToken(Token::REGULAR, "天气"),
                                       EqualsToken(Token::REGULAR, "很好"))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1:你好"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
                               EqualsToken(Token::REGULAR, "你好"))));

  // Property names must be ASCII (extra rule 1).
  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("标题:你好"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("Characters in property name must all be ASCII")));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"),
                                       EqualsToken(Token::QUERY_OR, ""),
                                       EqualsToken(Token::REGULAR, "ねこ"))));

  // OR still requires whitespace even next to CJKT terms.
  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("cat ORねこ"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("No whitespaces before or after OR operator")));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("ねこOR cat"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("No whitespaces before or after OR operator")));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("-ねこOR cat"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("No whitespaces before or after OR operator")));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property:ねこOR cat"),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
               HasSubstr("No whitespaces before or after OR operator")));
}
+
// Raw tokenizer identifies all characters that it doesn't know as OTHER type,
// so we can choose comma "," to represent all OTHER characters.
TEST_F(RawQueryTokenizerTest, OtherChars) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  // Comma is ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));

  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"),
              IsOkAndHolds(ElementsAre(
                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
                  EqualsToken(Token::REGULAR, "term1"),
                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));

  // Exclusion operator and comma are ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));

  // A trailing comma does not disturb a valid exclusion.
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
                                       EqualsToken(Token::REGULAR, "term1"))));

  // Colon and comma are ignored
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"),
                                       EqualsToken(Token::REGULAR, "term1"))));

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
                               EqualsToken(Token::REGULAR, "term1"),
                               EqualsToken(Token::REGULAR, "term2"))));

  // This is a special case for OR, unknown chars are treated the same as
  // whitespaces before and after OR.
  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
                                       EqualsToken(Token::QUERY_OR, ""),
                                       EqualsToken(Token::REGULAR, "term2"))));
}
+
// End-to-end check mixing Japanese, English, Chinese and Thai with property
// restriction, OR, grouping and exclusion in a single query.
TEST_F(RawQueryTokenizerTest, Mix) {
  ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
                             LanguageSegmenter::Create(GetLangIdModelPath()));
  std::unique_ptr<Tokenizer> raw_query_tokenizer =
      std::make_unique<RawQueryTokenizer>(language_segmenter.get());

  EXPECT_THAT(
      raw_query_tokenizer->TokenizeAll(
          "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"),
      IsOkAndHolds(ElementsAre(
          EqualsToken(Token::REGULAR, "こんにちは"),
          EqualsToken(Token::REGULAR, "good"),
          EqualsToken(Token::REGULAR, "afternoon"),
          EqualsToken(Token::QUERY_PROPERTY, "title"),
          EqualsToken(Token::REGULAR, "今天"), EqualsToken(Token::QUERY_OR, ""),
          EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
          EqualsToken(Token::REGULAR, "ใน"), EqualsToken(Token::REGULAR, "วัน"),
          EqualsToken(Token::REGULAR, "นี้"),
          EqualsToken(Token::QUERY_EXCLUSION, ""),
          EqualsToken(Token::REGULAR, "B12"),
          EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
new file mode 100644
index 0000000..0bb3aaf
--- /dev/null
+++ b/icing/tokenization/token.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_TOKEN_H_
+#define ICING_TOKENIZATION_TOKEN_H_
+
+#include <string_view>
+
+namespace icing {
+namespace lib {
+
// TODO(samzheng) Add group id support if needed. Right now in raw query we
// don't need group ids since all our query operators (OR, Exclusion, Property
// Restriction) only apply to the token right after them (vs. applying to
// multiple tokens after them). The "groups" of tokens can be easily recognized.
struct Token {
  enum Type {
    // Common types
    REGULAR,  // A token without special meanings, the value of it will be
              // indexed or searched directly

    // Types only used in raw query
    QUERY_OR,                 // Indicates OR logic between its left and right tokens
    QUERY_EXCLUSION,          // Indicates exclusion operation on next token
    QUERY_PROPERTY,           // Indicates property restrict on next token
    QUERY_LEFT_PARENTHESES,   // Left parentheses
    QUERY_RIGHT_PARENTHESES,  // Right parentheses

    // Indicates errors
    INVALID,
  };

  // The input text should outlive the Token instance: 'text' is a view into
  // it, not a copy.
  explicit Token(Type type_in, std::string_view text_in = "")
      : type(type_in), text(text_in) {}

  // The type of token.
  // NOTE: members are const, so Token is copy-constructible but not
  // assignable.
  const Type type;

  // The content of token. Empty for operator/parenthesis tokens.
  const std::string_view text;
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_TOKEN_H_
diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc
new file mode 100644
index 0000000..09bf2d1
--- /dev/null
+++ b/icing/tokenization/tokenizer-factory.cc
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/tokenizer-factory.h"
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/tokenization/plain-tokenizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace tokenizer_factory {
+
+libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>>
+CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type,
+ const LanguageSegmenter* lang_segmenter) {
+ switch (type) {
+ case IndexingConfig::TokenizerType::PLAIN:
+ return std::make_unique<PlainTokenizer>(lang_segmenter);
+ case IndexingConfig::TokenizerType::NONE:
+ U_FALLTHROUGH;
+ default:
+ // This should never happen.
+ return absl_ports::InvalidArgumentError(
+ "Invalid tokenizer type for an indexed section");
+ }
+}
+
+} // namespace tokenizer_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/tokenizer-factory.h b/icing/tokenization/tokenizer-factory.h
new file mode 100644
index 0000000..8a22f29
--- /dev/null
+++ b/icing/tokenization/tokenizer-factory.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_TOKENIZER_FACTORY_H_
+#define ICING_TOKENIZATION_TOKENIZER_FACTORY_H_
+
+#include <memory>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/tokenizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace tokenizer_factory {
+
// Creates an indexing tokenizer of the given type. The returned tokenizer
// does not take ownership of 'lang_segmenter', which must outlive it.
//
// Returns:
//   A tokenizer on success
//   INVALID_ARGUMENT if 'type' is not a valid tokenizer type for an indexed
//     section
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>>
CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type,
                        const LanguageSegmenter* lang_segmenter);
+
+} // namespace tokenizer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_TOKENIZER_FACTORY_H_
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
new file mode 100644
index 0000000..96e3231
--- /dev/null
+++ b/icing/tokenization/tokenizer.h
@@ -0,0 +1,115 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_TOKENIZER_H_
+#define ICING_TOKENIZATION_TOKENIZER_H_
+
+#include <cstdint>
+
+#include "utils/base/statusor.h"
+#include "icing/tokenization/token.h"
+namespace icing {
+namespace lib {
+
// A virtual class that all other tokenizers should inherit. It provides
// interfaces that allow callers to tokenize text. The return value could be an
// iterator or a list of tokens. Example usage:
//
// std::unique_ptr<Tokenizer> tokenizer = GetTokenizer();
// ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter,
//                        tokenizer->Tokenize(text));
// ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
//                        tokenizer->TokenizeAll(text));
//
// NOTE(review): this header relies on transitive includes for
// std::unique_ptr and std::vector — consider adding <memory> and <vector>.
class Tokenizer {
 public:
  virtual ~Tokenizer() = default;

  // Identifies the concrete tokenizer implementation.
  enum Type {
    // Index tokenizers
    PLAIN,  // Used to tokenize plain text input

    // Query tokenizers
    RAW_QUERY,  // Used to tokenize raw queries
  };

  // An iterator helping to get tokens.
  // Example usage:
  //
  // while (iterator.Advance()) {
  //   const Token& token = iterator.GetToken();
  //   // Do something
  // }
  class Iterator {
   public:
    virtual ~Iterator() = default;

    // Advances to the next token. Returns false if it has reached the end.
    virtual bool Advance() = 0;

    // Returns the current token. It can be called only when Advance() returns
    // true, otherwise an invalid token could be returned.
    virtual Token GetToken() const = 0;

    // Sets the tokenizer to point at the first token that *starts* *after*
    // offset. Returns false if there are no valid tokens starting after
    // offset.
    // Ex.
    // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
    // iterator.ResetToTokenAfter(4);
    // // The first full token starting after position 4 (the 'b' in "bar") is
    // // "baz".
    // PrintToken(iterator.GetToken()); // prints "baz"
    //
    // The default implementation signals that resetting is unsupported by
    // always returning false; subclasses may override.
    virtual bool ResetToTokenAfter(int32_t offset) { return false; }

    // Sets the tokenizer to point at the first token that *ends* *before*
    // offset. Returns false if there are no valid tokens ending
    // before offset.
    // Ex.
    // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
    // iterator.ResetToTokenBefore(4);
    // // The first full token ending before position 4 (the 'b' in "bar") is
    // // "foo".
    // PrintToken(iterator.GetToken()); // prints "foo"
    //
    // Like ResetToTokenAfter(), unsupported by default (returns false).
    virtual bool ResetToTokenBefore(int32_t offset) { return false; }
  };

  // Tokenizes the input text. The input text should outlive the returned
  // iterator.
  //
  // Returns:
  //   A token iterator on success
  //   INVALID_ARGUMENT with error message if input text has a wrong syntax
  //                    according to implementations of different tokenizer
  //                    types.
  //   INTERNAL_ERROR if any other errors occur
  virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize(
      std::string_view text) const = 0;

  // Tokenizes and returns all tokens in the input text. The input text should
  // outlive the returned vector.
  //
  // Returns:
  //   A list of tokens on success
  //   INVALID_ARGUMENT with error message if input text has a wrong syntax
  //                    according to implementations of different tokenizer
  //                    types.
  //   INTERNAL_ERROR if any other errors occur
  virtual libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
      std::string_view text) const = 0;
};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_TOKENIZER_H_
diff --git a/icing/tools/document-store-dump.cc b/icing/tools/document-store-dump.cc
new file mode 100644
index 0000000..45c9bf5
--- /dev/null
+++ b/icing/tools/document-store-dump.cc
@@ -0,0 +1,119 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tools/document-store-dump.h"
+
+#include <cinttypes>
+
+#include "icing/absl_ports/str_cat.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
// Appends a human-readable, proto-text-like rendering of 'doc' to *output.
// The output format exists only for debugging dumps and is not a stable
// interface.
// NOTE(review): created_timestamp_ms is cast to int64_t but printed with
// PRIu64 — consider PRId64 for consistency.
void AppendDocumentProto(DocId document_id, const Document& doc,
                         std::string* output) {
  absl_ports::StrAppend(
      output, IcingStringUtil::StringPrintf(
                  "Document {\n document_id: %d\n corpus_id: %d\n uri: "
                  "'%s'\n score: %d\n created_timestamp_ms: %" PRIu64 "\n",
                  static_cast<int>(document_id), doc.corpus_id(),
                  doc.uri().c_str(), static_cast<int>(doc.score()),
                  static_cast<int64_t>(doc.created_timestamp_ms())));
  // One "section { ... }" entry per section, including its config.
  for (const auto& section : doc.sections()) {
    absl_ports::StrAppend(
        output, IcingStringUtil::StringPrintf(
                    " section {\n id: %d\n indexed_length: "
                    "%d\n content: '%s'\n snippet: '%s'\n",
                    static_cast<int>(section.id()),
                    static_cast<int>(section.indexed_length()),
                    section.content().c_str(), section.snippet().c_str()));
    for (int64_t extracted_number : section.extracted_numbers()) {
      absl_ports::StrAppend(output, IcingStringUtil::StringPrintf(
                                        " extracted_numbers: %" PRId64 "\n",
                                        extracted_number));
    }
    for (const std::string& annotation_token : section.annotation_tokens()) {
      absl_ports::StrAppend(
          output, IcingStringUtil::StringPrintf(" annotation_tokens: '%s'\n",
                                                annotation_token.c_str()));
    }
    // StringPrintf has no bool conversion, so spell booleans out by hand.
    std::string indexed = (section.config().indexed()) ? "true" : "false";
    std::string index_prefixes =
        (section.config().index_prefixes()) ? "true" : "false";
    absl_ports::StrAppend(
        output,
        IcingStringUtil::StringPrintf(
            " config {\n name: '%s'\n indexed: %s\n "
            "tokenizer: %d\n weight: %d\n index_prefixes: %s\n "
            "subsection_separator: '%s'\n",
            section.config().name().c_str(), indexed.c_str(),
            section.config().tokenizer(),
            static_cast<int>(section.config().weight()), index_prefixes.c_str(),
            section.config().subsection_separator().c_str()));
    for (const auto& variant_generator :
         section.config().variant_generators()) {
      absl_ports::StrAppend(
          output, IcingStringUtil::StringPrintf(
                      " variant_generators: %d\n", variant_generator));
    }
    absl_ports::StrAppend(
        output,
        IcingStringUtil::StringPrintf(
            " common_term_legacy_hit_score: %d\n "
            "rfc822_host_name_term_legacy_hit_score: %d\n "
            "semantic_property: '%s'\n universal_section_id: %d\n "
            "omnibox_section_type: %d\n st_section_type: %d\n }\n }\n",
            section.config().common_term_legacy_hit_score(),
            section.config().rfc822_host_name_term_legacy_hit_score(),
            section.config().semantic_property().c_str(),
            section.config().universal_section_id(),
            section.config().omnibox_section_type(),
            section.config().st_section_type()));
  }
  // One "languages { ... }" entry per detected language.
  for (const auto& language : doc.languages()) {
    std::string used_classifier =
        (language.used_classifier()) ? "true" : "false";
    absl_ports::StrAppend(
        output, IcingStringUtil::StringPrintf(
                    " languages {\n language: %d\n score: %d\n "
                    "used_classifier: %s\n }\n",
                    language.language(), static_cast<int>(language.score()),
                    used_classifier.c_str()));
  }
  absl_ports::StrAppend(
      output, IcingStringUtil::StringPrintf(
                  " ANNOTATIONS PRINTING NOT IMPLEMENTED YET IN ICING-TOOL\n"));
}
+
+} // namespace
+
+std::string GetDocumentStoreDump(const DocumentStore& document_store) {
+ std::string output;
+ for (DocId document_id = 0; document_id < document_store.num_documents();
+ document_id++) {
+ Document doc;
+ if (!document_store.ReadDocument(document_id, &doc)) {
+ ICING_LOG(FATAL) << "Failed to read document";
+ }
+
+ AppendDocumentProto(document_id, doc, &output);
+ }
+ return output;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tools/document-store-dump.h b/icing/tools/document-store-dump.h
new file mode 100644
index 0000000..023b301
--- /dev/null
+++ b/icing/tools/document-store-dump.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
+#define ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
+
+#include <string>
+
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// Utility function for dumping the complete document store content.
+// This provides a human-readable representation of the document store, mainly
+// provided for easier understandability for developers.
+// The output of this class should only be available on cmdline-tool-level
+// (with root access), or unit tests. In other words it should not be possible
+// to trigger this on a release key device, for data protection reasons.
+std::string GetDocumentStoreDump(const DocumentStore& document_store);
+
+} // namespace lib
+} // namespace icing
+#endif // ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
diff --git a/icing/tools/icing-tool.cc b/icing/tools/icing-tool.cc
new file mode 100644
index 0000000..72a11e9
--- /dev/null
+++ b/icing/tools/icing-tool.cc
@@ -0,0 +1,306 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: ulas@google.com (Ulas Kirazci)
+//
+// A tool to debug the native index.
+
+#include <getopt.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/core/string-util.h"
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/doc-property-filter.h"
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/dynamic-trie.h"
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/filesystem.h"
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/mobstore.h"
+#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/native-index-impl.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/tools/document-store-dump.h"
+#include "icing/util/logging.h"
+
+using std::vector;
+using ::wireless_android_play_playlog::icing::IndexRestorationStats;
+
+namespace icing {
+namespace lib {
+
+// 256KB for debugging.
+const size_t kMaxDocumentSizeForDebugging = 1u << 18;
+// Dump dynamic trie stats and contents.
+void ProcessDynamicTrie(const char* filename) {
+ Filesystem filesystem;
+ DynamicTrie trie(filename, DynamicTrie::RuntimeOptions(), &filesystem);
+ if (!trie.Init()) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Opening trie %s failed",
+ filename);
+ return;
+ }
+
+ std::string out;
+ trie.GetDebugInfo(true, &out);
+ printf("Stats:\n%s", out.c_str());
+
+ std::ostringstream contents;
+ vector<std::string> keys;
+ trie.DumpTrie(&contents, &keys);
+ printf("Contents:\n%s", contents.str().c_str());
+}
+
// Opens the native index rooted at 'root_dir', restores its in-memory state,
// and returns it. Crashes (FATAL) if initialization fails.
//
// Ownership of the returned NativeIndexImpl transfers to the caller (callers
// in this file wrap it in std::unique_ptr immediately).
NativeIndexImpl* MakeIndex(const char* root_dir) {
  NativeConfig native_config;
  native_config.set_max_document_size(kMaxDocumentSizeForDebugging);
  FlashIndexOptions flash_index_options(
      NativeIndexImpl::GetNativeIndexDir(root_dir));
  NativeIndexImpl* ni =
      new NativeIndexImpl(root_dir, native_config, flash_index_options);
  InitStatus init_status;
  if (!ni->Init(&init_status)) {
    ICING_LOG(FATAL) << "Failed to initialize legacy native index impl";
  }

  // Bring the index up to date before serving queries; the restoration
  // stats are not needed by this tool.
  IndexRestorationStats unused;
  ni->RestoreIndex(IndexRequestSpec::default_instance(), &unused);
  return ni;
}
+
+void RunQuery(NativeIndexImpl* ni, const std::string& query, int start,
+ int num_results) {
+ // Pull out corpusids and uris.
+ QueryRequestSpec spec;
+ spec.set_no_corpus_filter(true);
+ spec.set_want_uris(true);
+ spec.set_scoring_verbosity_level(1);
+ spec.set_prefix_match(true);
+
+ QueryResponse response;
+ ni->ExecuteQuery(query, spec, 10000, start, num_results, &response);
+
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Query [%s] num results %u", query.c_str(), response.num_results());
+
+ for (int i = 0, uri_offset = 0; i < response.num_results(); i++) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "%d: (cid=%u) uri %.*s", i, response.corpus_ids(i),
+ response.uri_lengths(i), response.uri_buffer().data() + uri_offset);
+ uri_offset += response.uri_lengths(i);
+ }
+}
+
+void RunSuggest(NativeIndexImpl* ni, const std::string& prefix,
+ int num_results) {
+ SuggestionResponse results;
+ ni->Suggest(prefix, num_results, vector<CorpusId>(), &results);
+
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Query [%s] num results %zu", prefix.c_str(),
+ static_cast<size_t>(results.suggestions_size()));
+
+ for (size_t i = 0; i < results.suggestions_size(); i++) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Sugg: [%s] display text [%s]", results.suggestions(i).query().c_str(),
+ results.suggestions(i).display_text().c_str());
+ }
+}
+
+int IcingTool(int argc, char** argv) {
+ auto file_storage = CreatePosixFileStorage();
+ enum Options {
+ OPT_FILENAME,
+ OPT_OP,
+ OPT_QUERY,
+ NUM_OPT,
+ };
+ static const option kOptions[NUM_OPT + 1] = {
+ {"filename", 1, nullptr, 0},
+ {"op", 1, nullptr, 0},
+ {"query", 1, nullptr, 0},
+ {nullptr, 0, nullptr, 0},
+ };
+ const char* opt_values[NUM_OPT];
+ memset(opt_values, 0, sizeof(opt_values));
+
+ while (true) {
+ int opt_idx = -1;
+ int ret = getopt_long(argc, argv, "", kOptions, &opt_idx);
+ if (ret != 0) break;
+
+ if (opt_idx >= 0 && opt_idx < NUM_OPT) {
+ opt_values[opt_idx] = optarg;
+ }
+ }
+
+ if (!opt_values[OPT_OP]) {
+ ICING_LOG(ERROR) << "No op specified";
+ return -1;
+ }
+
+ if (!opt_values[OPT_FILENAME]) {
+ ICING_LOG(ERROR) << "No filename specified";
+ return -1;
+ }
+ if (!strncmp(
+ opt_values[OPT_FILENAME],
+ "/data/data/com.google.android.gms/files/AppDataSearch",
+ strlen("/data/data/com.google.android.gms/files/AppDataSearch"))) {
+ ICING_LOG(ERROR)
+ << "Should not read directly from the file in gmscore - "
+ "icing-tool also commits writes as side-effects which corrupts "
+ "the index on concurrent modification";
+ return -1;
+ }
+
+ const char* op = opt_values[OPT_OP];
+ DocumentStore::Options options(file_storage.get(),
+ kMaxDocumentSizeForDebugging);
+ if (!strcmp(op, "dyntrie")) {
+ std::string full_file_path =
+ absl_ports::StrCat(opt_values[OPT_FILENAME], "/idx.lexicon");
+ ProcessDynamicTrie(full_file_path.c_str());
+ } else if (!strcmp(op, "verify")) {
+ std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
+ ni->CheckVerify();
+ } else if (!strcmp(op, "query")) {
+ if (opt_values[OPT_QUERY] == nullptr) {
+ ICING_LOG(FATAL) << "Opt value is null";
+ }
+
+ std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
+ RunQuery(ni.get(), opt_values[OPT_QUERY], 0, 100);
+ } else if (!strcmp(op, "suggest")) {
+ if (opt_values[OPT_QUERY] == nullptr) {
+ ICING_LOG(FATAL) << "Opt value is null";
+ }
+
+ std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
+ RunSuggest(ni.get(), opt_values[OPT_QUERY], 100);
+ } else if (!strcmp(op, "dump-all-docs")) {
+ DocumentStore ds(opt_values[OPT_FILENAME], options);
+ if (!ds.Init()) {
+ ICING_LOG(FATAL) << "Legacy document store failed to initialize";
+ }
+
+ printf(
+ "------ Document Store Dump Start ------\n"
+ "%s\n"
+ "------ Document Store Dump End ------\n",
+ GetDocumentStoreDump(ds).c_str());
+ } else if (!strcmp(op, "dump-uris")) {
+ CorpusId corpus_id = kInvalidCorpusId;
+ if (opt_values[OPT_QUERY]) {
+ // Query is corpus id.
+ corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT
+ }
+ DocumentStore ds(opt_values[OPT_FILENAME], options);
+ if (!ds.Init()) {
+ ICING_LOG(FATAL) << "Legacy document store failed to initialize";
+ }
+
+ DocPropertyFilter dpf;
+ ds.AddDeletedTagFilter(&dpf);
+
+ // Dump with format "<corpusid> <uri> <tagname>*".
+ int filtered = 0;
+ vector<std::string> tagnames;
+ for (DocId document_id = 0; document_id < ds.num_documents();
+ document_id++) {
+ Document doc;
+ if (!ds.ReadDocument(document_id, &doc)) {
+ ICING_LOG(FATAL) << "Failed to read document.";
+ }
+
+ if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
+ filtered++;
+ continue;
+ }
+ if (dpf.Match(0, document_id)) {
+ filtered++;
+ continue;
+ }
+
+ tagnames.clear();
+ ds.GetAllSetUserTagNames(document_id, &tagnames);
+
+ printf("%d %s %s\n", doc.corpus_id(), doc.uri().c_str(),
+ StringUtil::JoinStrings("/", tagnames).c_str());
+ }
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Processed %u filtered %d", ds.num_documents(), filtered);
+ } else if (!strcmp(op, "dump-docs")) {
+ std::string out_filename = opt_values[OPT_FILENAME];
+ out_filename.append("/docs-dump");
+ CorpusId corpus_id = kInvalidCorpusId;
+ if (opt_values[OPT_QUERY]) {
+ // Query is corpus id.
+ corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT
+ out_filename.push_back('.');
+ out_filename.append(opt_values[OPT_QUERY]);
+ }
+ DocumentStore ds(opt_values[OPT_FILENAME], options);
+ if (!ds.Init()) {
+ ICING_LOG(FATAL) << "Legacy document store failed to initialize";
+ }
+
+ DocPropertyFilter dpf;
+ ds.AddDeletedTagFilter(&dpf);
+
+ // Dump with format (<32-bit length><serialized content>)*.
+ FILE* fp = fopen(out_filename.c_str(), "w");
+ int filtered = 0;
+ for (DocId document_id = 0; document_id < ds.num_documents();
+ document_id++) {
+ Document doc;
+ if (!ds.ReadDocument(document_id, &doc)) {
+ ICING_LOG(FATAL) << "Failed to read document.";
+ }
+
+ if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
+ filtered++;
+ continue;
+ }
+ if (dpf.Match(0, document_id)) {
+ filtered++;
+ continue;
+ }
+
+ std::string serialized = doc.SerializeAsString();
+ uint32_t length = serialized.size();
+ if (fwrite(&length, 1, sizeof(length), fp) != sizeof(length)) {
+ ICING_LOG(FATAL) << "Failed to write length information to file";
+ }
+
+ if (fwrite(serialized.data(), 1, serialized.size(), fp) !=
+ serialized.size()) {
+ ICING_LOG(FATAL) << "Failed to write document to file";
+ }
+ }
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Processed %u filtered %d", ds.num_documents(), filtered);
+ fclose(fp);
+ } else {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unknown op %s", op);
+ return -1;
+ }
+
+ return 0;
+}
+
+} // namespace lib
+} // namespace icing
+
+int main(int argc, char** argv) { return icing::lib::IcingTool(argc, argv); }
diff --git a/icing/transform/normalizer.cc b/icing/transform/normalizer.cc
new file mode 100644
index 0000000..7553e28
--- /dev/null
+++ b/icing/transform/normalizer.cc
@@ -0,0 +1,229 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/transform/normalizer.h"
+
+#include <cctype>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/logging.h"
+#include "unicode/umachine.h"
+#include "unicode/unorm2.h"
+#include "unicode/utrans.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// The following is the compound id used to tell UTransliterator how to
+// transform terms. The basic normalization forms NFD (canonical normalization
+// form decomposition) and NFKC (compatible normalization form composition)
+// are applied as well as some other rules we need. More information at
+// http://www.unicode.org/reports/tr15/
+// TODO(samzheng) Figure out if we need to support small hiragana to katakana
+// transformation.
+constexpr UChar kTransformRulesUtf16[] =
+ u"Lower; " // Lowercase
+ "Latin-ASCII; " // Map Latin characters to ASCII characters
+ "Hiragana-Katakana; " // Map hiragana to katakana
+ "[:Latin:] NFD; " // Decompose Latin letters
+ "[:Nonspacing Mark:] Remove; " // Remove accent / diacritic marks
+ "NFKC"; // Decompose and compose everything
+
+// Length of the transform rules excluding the terminating NULL.
+constexpr int kTransformRulesLength =
+ sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
+
+// An invalid value defined by Unicode.
+constexpr UChar32 kInvalidUchar32 = 0xFFFD;
+} // namespace
+
// Creates a Normalizer with a valid TermTransformer instance.
//
// Note: UTokenizer2 is also an option to normalize Unicode strings, but since
// we need some custom transform rules other than NFC/NFKC we have to use
// TermTransformer as a custom transform rule executor.
//
// Returns:
//   A Normalizer on success
//   INVALID_ARGUMENT if max_term_byte_size <= 0
//   INTERNAL_ERROR if the underlying ICU transliterator can't be created
//     (propagated from TermTransformer::Create())
libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Normalizer::Create(
    int max_term_byte_size) {
  if (max_term_byte_size <= 0) {
    return absl_ports::InvalidArgumentError(
        "max_term_byte_size must be greater than zero.");
  }

  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<Normalizer::TermTransformer> term_transformer,
      Normalizer::TermTransformer::Create());

  // The constructor is invoked directly (not make_unique) because it is not
  // publicly accessible; Create() is the only way to build a Normalizer.
  return std::unique_ptr<Normalizer>(
      new Normalizer(std::move(term_transformer), max_term_byte_size));
}
+
// Constructor used by Create(), which has already validated both arguments
// (non-null transformer, positive max_term_byte_size).
Normalizer::Normalizer(
    std::unique_ptr<Normalizer::TermTransformer> term_transformer,
    int max_term_byte_size)
    : term_transformer_(std::move(term_transformer)),
      max_term_byte_size_(max_term_byte_size) {}
+
+std::string Normalizer::NormalizeTerm(const std::string_view term) const {
+ std::string normalized_text;
+
+ if (term.empty()) {
+ return normalized_text;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ // ICU manages the singleton instance
+ const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
+ if (U_FAILURE(status)) {
+ ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
+ }
+
+ // Checks if the first character is within ASCII range or can be transformed
+ // into an ASCII char. Since the term is tokenized, we know that the whole
+ // term can be transformed into ASCII if the first character can.
+ UChar32 first_uchar32 =
+ i18n_utils::GetUChar32At(term.data(), term.length(), 0);
+ if (normalizer2 != nullptr && first_uchar32 != kInvalidUchar32 &&
+ i18n_utils::DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) {
+ // This is a faster method to normalize Latin terms.
+ normalized_text = NormalizeLatin(normalizer2, term);
+ } else {
+ normalized_text = term_transformer_->Transform(term);
+ }
+
+ if (normalized_text.length() > max_term_byte_size_) {
+ i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_);
+ }
+
+ return normalized_text;
+}
+
+std::string Normalizer::NormalizeLatin(const UNormalizer2* normalizer2,
+ const std::string_view term) const {
+ std::string result;
+ result.reserve(term.length());
+ for (int i = 0; i < term.length(); i++) {
+ if (i18n_utils::IsAscii(term[i])) {
+ result.push_back(std::tolower(term[i]));
+ } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
+ if (uchar32 == kInvalidUchar32) {
+ ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
+ << " at position" << i;
+ continue;
+ }
+ char ascii_char;
+ if (i18n_utils::DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) {
+ result.push_back(std::tolower(ascii_char));
+ } else {
+ // We don't know how to transform / decompose this Unicode character, it
+ // probably means that some other Unicode characters are mixed with
+ // Latin characters. This shouldn't happen if input term is properly
+ // tokenized. We handle it here in case there're something wrong with
+ // the tokenizers.
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ absl_ports::StrAppend(&result, term.substr(i, utf8_length));
+ }
+ }
+ }
+
+ return result;
+}
+
// Creates a TermTransformer wrapping an ICU transliterator compiled from the
// compound rules in kTransformRulesUtf16.
//
// Returns:
//   A TermTransformer on success
//   INTERNAL_ERROR if the ICU transliterator can't be created
libtextclassifier3::StatusOr<std::unique_ptr<Normalizer::TermTransformer>>
Normalizer::TermTransformer::Create() {
  UErrorCode status = U_ZERO_ERROR;
  UTransliterator* term_transformer = utrans_openU(
      kTransformRulesUtf16, kTransformRulesLength, UTRANS_FORWARD,
      /*rules=*/nullptr, /*rulesLength=*/0, /*parseError=*/nullptr, &status);

  if (U_FAILURE(status)) {
    return absl_ports::InternalError("Failed to create UTransliterator.");
  }

  return std::unique_ptr<Normalizer::TermTransformer>(
      new Normalizer::TermTransformer(term_transformer));
}
+
// Takes ownership of 'u_transliterator'; it is closed in the destructor.
Normalizer::TermTransformer::TermTransformer(UTransliterator* u_transliterator)
    : u_transliterator_(u_transliterator) {}
+
// Releases the owned ICU transliterator, if any.
Normalizer::TermTransformer::~TermTransformer() {
  if (u_transliterator_ != nullptr) {
    utrans_close(u_transliterator_);
  }
}
+
// Runs the full ICU transliteration over 'term': converts UTF8 -> UTF16,
// transforms in place (growing the buffer once if the result needs more
// UTF16 units than the input), then converts back to UTF8. On any conversion
// or transform failure the original term is returned unchanged.
std::string Normalizer::TermTransformer::Transform(
    const std::string_view term) const {
  auto utf16_term_or = i18n_utils::Utf8ToUtf16(term);
  if (!utf16_term_or.ok()) {
    ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16";
    return std::string(term);
  }
  std::u16string utf16_term = std::move(utf16_term_or).ValueOrDie();
  UErrorCode status = U_ZERO_ERROR;
  // In/out parameter: on input the text length, on output the length the
  // transformed text has (or would need, on buffer overflow).
  int utf16_term_desired_length = utf16_term.length();
  int limit = utf16_term.length();
  utrans_transUChars(u_transliterator_, &utf16_term[0],
                     &utf16_term_desired_length, utf16_term.length(),
                     /*start=*/0, &limit, &status);

  // For most cases, one Unicode character is normalized to exact one Unicode
  // character according to our transformation rules. However, there could be
  // some rare cases where the normalized text is longer than the original
  // one. E.g. "¼" (1 character) -> "1/4" (3 characters). That causes a buffer
  // overflow error and we need to increase our buffer size and try again.
  if (status == U_BUFFER_OVERFLOW_ERROR) {
    // 'utf16_term_desired_length' has already been set to the desired value
    // by utrans_transUChars(), here we increase the buffer size to that
    // value.
    //
    // NOTE: we need to call resize() but not reserve() because values can't
    // be set at positions after length().
    int original_content_length = utf16_term.length();
    utf16_term.resize(utf16_term_desired_length);
    utf16_term_desired_length = original_content_length;
    limit = original_content_length;
    status = U_ZERO_ERROR;
    utrans_transUChars(u_transliterator_, &utf16_term[0],
                       &utf16_term_desired_length, utf16_term.length(),
                       /*start=*/0, &limit, &status);
  }

  if (U_FAILURE(status)) {
    // Failed to transform, return its original form.
    ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
    return std::string(term);
  }

  auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term);
  if (!utf8_term_or.ok()) {
    ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8";
    return std::string(term);
  }
  return std::move(utf8_term_or).ValueOrDie();
}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h
new file mode 100644
index 0000000..7f6350a
--- /dev/null
+++ b/icing/transform/normalizer.h
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_NORMALIZER_H_
+#define ICING_TRANSFORM_NORMALIZER_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "utils/base/statusor.h"
+#include "unicode/unorm2.h"
+#include "unicode/utrans.h"
+
+namespace icing {
+namespace lib {
+
+// Used to normalize UTF8 strings for text matching. It enforces a set of rules:
+// 1. Transforms text to be lowercase UTF8.
+// 2. Transforms full-width Latin characters to ASCII characters if possible.
+// 3. Transforms hiragana to katakana.
+// 4. Removes accent / diacritic marks on Latin characters.
+// 5. Normalized text must be less than or equal to max_term_byte_size,
+// otherwise it will be truncated.
+//
+// There are some other rules from ICU not listed here, please see .cc file for
+// details.
+//
+// Example use:
+// ICING_ASSIGN_OR_RETURN(auto normalizer,
+// Normalizer::Create(/*max_term_byte_size=*/5));
+//
+// std::string normalized_text = normalizer->NormalizeTerm("HELLO!");
+// ICING_LOG(INFO) << normalized_text; // prints "hello"
+class Normalizer {
+ public:
+ Normalizer(const Normalizer&) = delete;
+ Normalizer& operator=(const Normalizer&) = delete;
+
+ // Creates a normalizer with the subcomponents it needs. max_term_byte_size
+ // enforces the max size of text after normalization, text will be truncated
+ // if exceeds the max size.
+ //
+ // Returns:
+ // A normalizer on success
+ // INVALID_ARGUMENT if max_term_byte_size <= 0
+ // INTERNAL_ERROR if failed to create any subcomponent
+ static libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size);
+
+ // Normalizes the input term based on rules. See .cc file for rule details.
+ //
+ // NOTE: Term should not mix Latin and non-Latin characters. Doing so may
+ // result in the non-Latin characters not properly being normalized
+ std::string NormalizeTerm(std::string_view term) const;
+
+ private:
+ // A handler class that helps manage the lifecycle of UTransliterator. It's
+ // used in Normalizer to transform terms into the formats we need.
+ class TermTransformer {
+ public:
+ // Creates TermTransformer with a valid UTransliterator instance
+ //
+ // Returns:
+ // A term transformer on success
+ // INTERNAL_ERROR if failed to create any subcomponent
+ static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>>
+ Create();
+
+ // Closes the UTransliterator instance
+ ~TermTransformer();
+
+ // Transforms the text based on our rules described at top of this file
+ std::string Transform(std::string_view term) const;
+
+ private:
+ explicit TermTransformer(UTransliterator* u_transliterator);
+
+ // An ICU class to execute custom term transformation / normalization rules.
+ // utrans_close() must be called after use.
+ UTransliterator* u_transliterator_;
+ };
+
+ explicit Normalizer(std::unique_ptr<TermTransformer> term_transformer,
+ int max_term_byte_size);
+
+ // Helper method to normalize Latin terms only. Rules applied:
+ // 1. Uppercase to lowercase
+ // 2. Remove diacritic (accent) marks
+ std::string NormalizeLatin(const UNormalizer2* normalizer2,
+ std::string_view term) const;
+
+ // Used to transform terms into their normalized forms.
+ std::unique_ptr<TermTransformer> term_transformer_;
+
+ // The maximum term length allowed after normalization.
+ const int max_term_byte_size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_NORMALIZER_H_
diff --git a/icing/transform/normalizer_benchmark.cc b/icing/transform/normalizer_benchmark.cc
new file mode 100644
index 0000000..53adac7
--- /dev/null
+++ b/icing/transform/normalizer_benchmark.cc
@@ -0,0 +1,157 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/test-data.h"
+#include "icing/transform/normalizer.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/transform:normalizer_benchmark
+//
+// $ blaze-bin/icing/transform/normalizer_benchmark
+// --benchmarks=all
+//
+// Run on an Android device:
+// Make target //icing/transform:normalizer depend on
+// //third_party/icu
+//
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/transform:normalizer_benchmark
+//
+// $ adb push blaze-bin/icing/transform/normalizer_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/normalizer_benchmark --benchmarks=all --adb
+
+// Flag to tell the benchmark that it'll be run on an Android device via adb,
+// the benchmark will set up data files accordingly.
+ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+void BM_NormalizeUppercase(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ Normalizer::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string(state.range(0), 'A');
+ // DoNotOptimize keeps the returned string from being optimized away.
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(normalizer->NormalizeTerm(input_string));
+ }
+}
+BENCHMARK(BM_NormalizeUppercase)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_NormalizeAccent(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ Normalizer::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ while (input_string.length() < static_cast<size_t>(state.range(0))) {
+ input_string.append("àáâãā");
+ }
+ // DoNotOptimize keeps the returned string from being optimized away.
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(normalizer->NormalizeTerm(input_string));
+ }
+}
+BENCHMARK(BM_NormalizeAccent)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_NormalizeHiragana(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ Normalizer::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ while (input_string.length() < static_cast<size_t>(state.range(0))) {
+ input_string.append("あいうえお");
+ }
+ // DoNotOptimize keeps the returned string from being optimized away.
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(normalizer->NormalizeTerm(input_string));
+ }
+}
+BENCHMARK(BM_NormalizeHiragana)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/normalizer_test.cc b/icing/transform/normalizer_test.cc
new file mode 100644
index 0000000..ec0a782
--- /dev/null
+++ b/icing/transform/normalizer_test.cc
@@ -0,0 +1,164 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/transform/normalizer.h"
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/i18n-test-utils.h"
+#include "icing/testing/test-data.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::Eq;
+
+class NormalizerTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ SetUpICUDataFile("icing/icu.dat"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, Normalizer::Create(
+ /*max_term_byte_size=*/1024));
+ }
+
+ std::unique_ptr<Normalizer> normalizer_;
+};
+
+TEST_F(NormalizerTest, Creation) {
+ EXPECT_THAT(Normalizer::Create(5), IsOk());
+ EXPECT_THAT(Normalizer::Create(0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(Normalizer::Create(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Strings that are already normalized won't change if normalized again.
+TEST_F(NormalizerTest, AlreadyNormalized) {
+ EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq(""));
+ EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요"));
+}
+
+TEST_F(NormalizerTest, UppercaseToLowercase) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing"));
+}
+
+TEST_F(NormalizerTest, LatinLetterRemoveAccent) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("āăąḃḅḇčćç"), Eq("aaabbbccc"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÁȦÄḂḄḆĆČḈ"), Eq("aaabbbccc"));
+}
+
+// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
+// Japanese and Greek
+TEST_F(NormalizerTest, NonLatinLetterNotRemoveAccent) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
+}
+
+TEST_F(NormalizerTest, FullWidthCharsToASCII) {
+ // Full-width punctuation to ASCII punctuation
+ EXPECT_THAT(normalizer_->NormalizeTerm("。,!?:”"), Eq(".,!?:\""));
+ // 0xff10 is the full-width number 0
+ EXPECT_THAT(normalizer_->NormalizeTerm(UcharToString(0xff10)), Eq("0"));
+ // 0xff21 is the full-width letter A
+ EXPECT_THAT(normalizer_->NormalizeTerm(UcharToString(0xff21)), Eq("a"));
+ // 0xff41 is the full-width letter a
+ EXPECT_THAT(normalizer_->NormalizeTerm(UcharToString(0xff41)), Eq("a"));
+}
+
+// For Katakana, each character is normalized to its full-width version.
+TEST_F(NormalizerTest, KatakanaHalfWidthToFullWidth) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ"));
+}
+
+TEST_F(NormalizerTest, HiraganaToKatakana) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ぎゃぎゅぎょ"), Eq("ギャギュギョ"));
+}
+
+TEST_F(NormalizerTest, SuperscriptAndSubscriptToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9"));
+}
+
+TEST_F(NormalizerTest, CircledCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a"));
+}
+
+TEST_F(NormalizerTest, RotatedCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}"));
+}
+
+TEST_F(NormalizerTest, SquaredCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート"));
+}
+
+TEST_F(NormalizerTest, FractionsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6"));
+}
+
+TEST_F(NormalizerTest, Truncate) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer,
+ Normalizer::Create(/*max_term_byte_size=*/5));
+
+ // Terms within the 5-byte limit are left untouched.
+ EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+ // Truncated to length 5.
+ EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+
+ // Each Japanese character has 3 bytes, so truncating to length 5 results in
+ // only 1 character.
+ EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ"));
+
+ // Each Greek character has 2 bytes, so truncating to length 5 results in 2
+ // characters.
+ EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ"));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer,
+ Normalizer::Create(/*max_term_byte_size=*/2));
+ // The Japanese character has 3 bytes, truncating it results in an empty
+ // string.
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq(""));
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/bit-util.h b/icing/util/bit-util.h
new file mode 100644
index 0000000..e2bb817
--- /dev/null
+++ b/icing/util/bit-util.h
@@ -0,0 +1,68 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_BIT_UTIL_H_
+#define ICING_UTIL_BIT_UTIL_H_
+
+#include <cstdint>
+
+namespace icing {
+namespace lib {
+
+namespace bit_util {
+
+// Manipulating bit fields.
+//
+// x value containing the bit field(s)
+// offset offset of bit field in x
+// len len of bit field in x
+//
+// REQUIREMENTS
+//
+// - x an unsigned integer <= 64 bits
+// - offset + len <= sizeof(x) * 8
+//
+// There is no error checking so you will get garbage if you don't
+// ensure the above.
+//
+// BitfieldSet() only ORs bits in, so to overwrite a field clear it first.
+
+// Shifting by more than the word length is undefined (on ARM it has the
+// intended effect, but on Intel it shifts by % word length), so check the
+// length.
+inline uint64_t BitfieldMask(uint32_t len) {
+ return ((len == 0) ? 0U : ((~uint64_t{0}) >> (64 - (len))));
+}
+inline uint64_t BitfieldGet(uint64_t mask, uint32_t lsb_offset, uint32_t len) {
+ return ((mask) >> (lsb_offset)) & BitfieldMask(len);
+}
+inline void BitfieldSet(uint32_t value, uint32_t lsb_offset, uint32_t len,
+ uint32_t* mask) {
+ // Conservatively mask 'value' to 'len' bits so '*mask' is not corrupted
+ // if value >= 1 << len.
+ *mask |= (uint64_t{value} & BitfieldMask(len)) << (lsb_offset);
+}
+inline void BitfieldSet(uint64_t value, uint32_t lsb_offset, uint32_t len,
+ uint64_t* mask) {
+ // Conservatively mask 'value' to 'len' bits so '*mask' is not corrupted
+ // if value >= 1 << len.
+ *mask |= (value & BitfieldMask(len)) << (lsb_offset);
+}
+
+} // namespace bit_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_BIT_UTIL_H_
diff --git a/icing/util/clock.h b/icing/util/clock.h
new file mode 100644
index 0000000..09cb375
--- /dev/null
+++ b/icing/util/clock.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_CLOCK_H_
+#define ICING_UTIL_CLOCK_H_
+
+#include <ctime>
+
+namespace icing {
+namespace lib {
+
+// Wrapper around real-time clock functions. This is separated primarily so
+// tests can override this clock and inject it into the class under test.
+//
+// A few things to note about std::time_t :
+// From cppreference:
+// "Although not defined, this is almost always an integral value holding the
+// number of seconds (not counting leap seconds) since 00:00, Jan 1 1970 UTC,
+// corresponding to POSIX time"
+//
+// From Wikipedia:
+// "ISO C defines time_t as an arithmetic type, but does not specify any
+// particular type, range, resolution, or encoding for it. Also unspecified
+// are the meanings of arithmetic operations applied to time values."
+class Clock {
+ public:
+ virtual ~Clock() = default;
+
+ // Returns:
+ // The current time defined by the clock on success
+ // std::time_t(-1) on error
+ virtual std::time_t GetCurrentSeconds() const { return std::time(nullptr); }
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_CLOCK_H_
diff --git a/icing/util/crc32.cc b/icing/util/crc32.cc
new file mode 100644
index 0000000..8b2243a
--- /dev/null
+++ b/icing/util/crc32.cc
@@ -0,0 +1,96 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/crc32.h"
+
+#include <cstdint>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/zlib.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+uint32_t UpdateCrc32(uint32_t crc, const std::string_view str) {
+ if (str.length() > 0) {
+ // crc32() already includes a pre- and post-condition of taking the one's
+ // complement of the value.
+ crc =
+ ~crc32(~crc, reinterpret_cast<const Bytef*>(str.data()), str.length());
+ }
+ return crc;
+}
+} // namespace
+
+uint32_t Crc32::Get() const { return crc_; }
+
+uint32_t Crc32::Append(const std::string_view str) {
+ crc_ = UpdateCrc32(crc_, str);
+ return crc_;
+}
+
+libtextclassifier3::StatusOr<uint32_t> Crc32::UpdateWithXor(
+ const std::string_view xored_str, int full_data_size, int position) {
+ // For appending new data at the end, use Append() instead.
+ if (position + xored_str.length() > full_data_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "offset position %d + length %zu > full data size %d", position,
+ xored_str.length(), full_data_size));
+ }
+
+ // We have CRC(A|U|B) and we want CRC(A|V|B) where U is the slice
+ // that updated to V.
+ //
+ // xored_str = X = U ^ V
+ //
+ // Some terminology:
+ // `|`: denotes concatenation, NOT the bitwise operator OR
+ //
+ // (A|U|B): a concatenated string of A+U+B
+ //
+ // CRC(A|U|B): The crc of a concatenated string of A+U+B
+ //
+ // 0_lenA: a string of 0's of the length of string A
+ //
+ //
+ // (A|V|B) = (0_lenA|X|0_lenB) ^ (A|U|B)
+ //
+ // since CRC(D) = CRC(E) ^ CRC(F), where D = E ^ F:
+ // CRC(A|V|B)
+ // = CRC(0_lenA|X|0_lenB) ^ CRC(A|U|B)
+ //
+ // and CRC(D|E) = CRC_COMBINE(D, E), so
+ // = CRC_COMBINE(CRC(0_lenA), CRC_COMBINE(CRC(X), CRC(0_lenB)) ^ CRC(A|U|B)
+ //
+ // and CRC(0) = 0, so
+ // = CRC_COMBINE(0, CRC_COMBINE(CRC(X), CRC(0_lenB)) ^ CRC(A|U|B)
+ //
+ // and CRC(0|B) = CRC(B), so
+ // = CRC_COMBINE(CRC(X), CRC(0_lenB)) ^ CRC(A|U|B)
+ //
+ // For more details, see this post by Mark Adler, one of the authors of zlib:
+ // https://stackoverflow.com/questions/23122312/crc-calculation-of-a-mostly-static-data-stream/23126768#23126768
+
+ uint32_t update_crc = UpdateCrc32(0, xored_str);
+ update_crc = crc32_combine(update_crc, 0,
+ full_data_size - (position + xored_str.length()));
+ crc_ ^= update_crc;
+ return crc_;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/crc32.h b/icing/util/crc32.h
new file mode 100644
index 0000000..5f7a71b
--- /dev/null
+++ b/icing/util/crc32.h
@@ -0,0 +1,108 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_CRC32_H_
+#define ICING_UTIL_CRC32_H_
+
+#include <cstdint>
+#include <string_view>
+
+#include "utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+// Efficient mechanism to incrementally compute checksum of a file and keep it
+// updated when its content changes. Internally uses zlib based crc32()
+// implementation.
+//
+// See https://www.zlib.net/manual.html#Checksum for more details.
+//
+// TODO (samzheng): investigate/benchmark swapping zlib crc32 with
+// util/hash/crc32c.h. Regarding util/hash/crc32c.h, CRC32C::Extend crashes as
+// described in b/145837799.
+class Crc32 {
+ public:
+ // Default to the checksum of an empty string, that is "0".
+ Crc32() : crc_(0) {}
+
+ explicit Crc32(uint32_t init_crc) : crc_(init_crc) {}
+
+ inline bool operator==(const Crc32& other) const {
+ return crc_ == other.Get();
+ }
+
+ // Returns the checksum of all the data that has been processed so far.
+ uint32_t Get() const;
+
+ // Incrementally update the current checksum to reflect the fact that the
+ // underlying data has been appended with 'str'. It calculates a new crc32
+ // based on the current crc value and the newly appended string.
+ //
+ // NOTE: As this method accepts incremental appends, all three of these
+ // produce the same checksum:
+ // 1) crc32.Append("AAA"); crc32.Append("BBB");
+ // 2) crc32.Append("AAABBB");
+ // 3) crc32.Append("AA"); crc32.Append("AB"); crc32.Append("BB");
+ //
+ // NOTE: While this class internally uses zlib's crc32(),
+ // Crc32(base_crc).Append(str) is not the same as zlib::crc32(base_crc, str);
+ uint32_t Append(std::string_view str);
+
+ // Update a string's rolling crc when some content is modified in the middle
+ // at an offset. We need the xored_str, which is the new value xored with the
+ // original value.
+ //
+ // Original string:
+ // string(original_start | original_mid | original_end)
+ // -------------------------------------------> full_data_size
+ // ^ offset position
+ //
+ // Modified string:
+ // string(original_start | changed_mid | original_end)
+ // ^ offset position
+ //
+ // And where
+ // xored_str = changed_mid ^ original_mid
+ // xored_len = length(xored_str)
+ // full_data_size = the length of all the strings that have been Appended to
+ // generate the current checksum
+ //
+ // REQUIRES: offset position + xored_len <= full_data_size.
+ //
+ // E.g.
+ // Old data: ABCDEF; New data: ABXYZF
+ //
+ // Crc32 crc32; crc32.Append("ABCDEF");
+ // crc32.UpdateWithXor("CDE" xor "XYZ", 6, 2);
+ //
+ // This is the same as
+ // Crc32 crc32; crc32.Append("ABXYZF");
+ //
+ // See .cc file for implementation notes.
+ //
+ // Returns:
+ // Updated crc on success
+ // INVALID_ARGUMENT if offset position + xored_len > full_data_size
+ libtextclassifier3::StatusOr<uint32_t> UpdateWithXor(
+ std::string_view xored_str, int full_data_size, int position);
+
+ private:
+ uint32_t crc_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_CRC32_H_
diff --git a/icing/util/crc32_test.cc b/icing/util/crc32_test.cc
new file mode 100644
index 0000000..ab8582a
--- /dev/null
+++ b/icing/util/crc32_test.cc
@@ -0,0 +1,108 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/crc32.h"
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/portable/zlib.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+
+void UpdateAtRandomOffset(std::string* buf, uint32_t* update_xor, int* offset) {
+ // rand() only produces values in [0, 2^31 - 1], while uint32_t can hold up
+ // to 2^32 - 1 (4294967295), so rand() * rand() is used to spread values
+ // across the full uint32_t range (unsigned wrap-around here is intended).
+ *offset = (static_cast<uint32_t>(rand()) *
+ static_cast<uint32_t>(rand())) % // NOLINT
+ (buf->size() - sizeof(uint32_t));
+ *update_xor =
+ static_cast<uint32_t>(rand()) * static_cast<uint32_t>(rand()); // NOLINT
+ const unsigned char* update_xor_buf =
+ reinterpret_cast<const unsigned char*>(update_xor);
+
+ // XOR update_xor at offset.
+ for (size_t j = 0; j < sizeof(*update_xor); j++) {
+ (*buf)[*offset + j] ^= update_xor_buf[j];
+ }
+}
+
+TEST(Crc32Test, Get) {
+ Crc32 crc32_test{10};
+ Crc32 crc32_test_empty{};
+ EXPECT_THAT(crc32_test.Get(), Eq(10));
+ EXPECT_THAT(crc32_test_empty.Get(), Eq(0));
+}
+
+TEST(Crc32Test, Append) {
+ // Test the complement logic inside Append()
+ const uLong kCrcInitZero = crc32(0L, nullptr, 0);
+ uint32_t foo_crc =
+ crc32(kCrcInitZero, reinterpret_cast<const Bytef*>("foo"), 3);
+ uint32_t foobar_crc =
+ crc32(kCrcInitZero, reinterpret_cast<const Bytef*>("foobar"), 6);
+
+ Crc32 crc32_test(~foo_crc);
+ ASSERT_THAT(~crc32_test.Append("bar"), Eq(foobar_crc));
+
+ // Test Append() that appending things separately should be the same as
+ // appending in one shot
+ Crc32 crc32_foobar{};
+ crc32_foobar.Append("foobar");
+ Crc32 crc32_foo_and_bar{};
+ crc32_foo_and_bar.Append("foo");
+ crc32_foo_and_bar.Append("bar");
+
+ EXPECT_THAT(crc32_foo_and_bar.Get(), Eq(crc32_foobar.Get()));
+}
+
+TEST(Crc32Test, UpdateAtPosition) {
+ std::string buf;
+ buf.resize(1000);
+ for (size_t i = 0; i < buf.size(); i++) {
+ buf[i] = i * 2;
+ }
+ Crc32 crc32_test{};
+ crc32_test.Append(buf);
+
+ for (int i = 0; i < 100; i++) {
+ uint32_t update_xor;
+ int offset;
+ UpdateAtRandomOffset(&buf, &update_xor, &offset);
+
+ // Compute crc from scratch and compare against update.
+ uint32_t new_crc =
+ ~crc32(~0, reinterpret_cast<const Bytef*>(buf.data()), buf.size());
+ const std::string_view xored_str(reinterpret_cast<const char*>(&update_xor),
+ sizeof(update_xor));
+ EXPECT_THAT(crc32_test.UpdateWithXor(xored_str, buf.size(), offset),
+ IsOkAndHolds(new_crc));
+ }
+
+ // Wrong string length
+ EXPECT_THAT(crc32_test.UpdateWithXor("12345", buf.size(), buf.size() - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc
new file mode 100644
index 0000000..6e1bf8b
--- /dev/null
+++ b/icing/util/document-validator.cc
@@ -0,0 +1,177 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/document-validator.h"
+
+#include <cstdint>
+#include <unordered_set>
+
+#include "utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/status_macros.h"
+#include "icing/schema/schema-util.h"
+
+namespace icing {
+namespace lib {
+
+using PropertyConfigMap =
+ std::unordered_map<std::string_view, const PropertyConfigProto*>;
+
+// Does not take ownership of schema_store; the pointer is dereferenced in
+// Validate(), so the store must outlive this instance.
+DocumentValidator::DocumentValidator(const SchemaStore* schema_store)
+    : schema_store_(schema_store) {}
+
+// Validates `document` against the schema held by schema_store_. Cheap
+// top-level field checks run first, then per-property checks against the
+// type's property configs, and finally a recursive pass over nested
+// documents (the most expensive step). See the header for the full list of
+// checks and returned error codes.
+libtextclassifier3::Status DocumentValidator::Validate(
+    const DocumentProto& document) {
+  if (document.namespace_().empty()) {
+    return absl_ports::InvalidArgumentError("Field 'namespace' is empty.");
+  }
+
+  if (document.uri().empty()) {
+    return absl_ports::InvalidArgumentError("Field 'uri' is empty.");
+  }
+
+  if (document.schema().empty()) {
+    return absl_ports::InvalidArgumentError(
+        absl_ports::StrCat("Field 'schema' is empty for key: (",
+                           document.namespace_(), ", ", document.uri(), ")."));
+  }
+
+  if (document.score() < 0) {
+    return absl_ports::InvalidArgumentError("Field 'score' is negative.");
+  }
+
+  if (document.creation_timestamp_secs() < 0) {
+    return absl_ports::InvalidArgumentError(
+        "Field 'creation_timestamp_secs' is negative.");
+  }
+
+  if (document.ttl_secs() < 0) {
+    return absl_ports::InvalidArgumentError("Field 'ttl_secs' is negative.");
+  }
+
+  // TODO(b/144458732): Implement a more robust version of
+  // ICING_ASSIGN_OR_RETURN that can support error logging.
+  auto type_config_or = schema_store_->GetSchemaTypeConfig(document.schema());
+  if (!type_config_or.ok()) {
+    // ": " separates the schema store's error message from our annotation so
+    // the two strings don't run together in the log.
+    ICING_LOG(ERROR) << type_config_or.status().error_message()
+                     << ": Error while validating document ("
+                     << document.namespace_() << ", " << document.uri() << ")";
+    return type_config_or.status();
+  }
+  const SchemaTypeConfigProto* type_config =
+      std::move(type_config_or).ValueOrDie();
+
+  // Counts of REQUIRED-cardinality properties: how many the schema declares
+  // vs. how many this document actually provides.
+  int32_t num_required_properties_expected = 0;
+  int32_t num_required_properties_actual = 0;
+  PropertyConfigMap property_config_map;
+  SchemaUtil::BuildPropertyConfigMap(*type_config, &property_config_map,
+                                     &num_required_properties_expected);
+  // Tracks property names seen so far so duplicates can be rejected.
+  std::unordered_set<std::string_view> unique_properties;
+
+  for (const PropertyProto& property : document.properties()) {
+    if (property.name().empty()) {
+      return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+          "Field 'name' is empty in PropertyProto for key: (",
+          document.namespace_(), ", ", document.uri(), ")."));
+    }
+
+    if (!unique_properties.insert(property.name()).second) {
+      // Failed to insert because of duplicate property name
+      return absl_ports::AlreadyExistsError(absl_ports::StrCat(
+          "Property name '", property.name(), "' already exists for key: (",
+          document.namespace_(), ", ", document.uri(), ")."));
+    }
+
+    const auto property_iter = property_config_map.find(property.name());
+    if (property_iter == property_config_map.end()) {
+      return absl_ports::NotFoundError(absl_ports::StrCat(
+          "Property config '", property.name(), "' not found for key: (",
+          document.namespace_(), ", ", document.uri(), ")."));
+    }
+    const PropertyConfigProto& property_config = *property_iter->second;
+
+    // Get the property value size according to data type.
+    // TODO (samzheng): make sure values of other data types are empty.
+    int value_size = 0;
+    if (property_config.data_type() == PropertyConfigProto::DataType::STRING) {
+      value_size = property.string_values_size();
+    } else if (property_config.data_type() ==
+               PropertyConfigProto::DataType::INT64) {
+      value_size = property.int64_values_size();
+    } else if (property_config.data_type() ==
+               PropertyConfigProto::DataType::DOUBLE) {
+      value_size = property.double_values_size();
+    } else if (property_config.data_type() ==
+               PropertyConfigProto::DataType::BOOLEAN) {
+      value_size = property.boolean_values_size();
+    } else if (property_config.data_type() ==
+               PropertyConfigProto::DataType::BYTES) {
+      value_size = property.bytes_values_size();
+    } else if (property_config.data_type() ==
+               PropertyConfigProto::DataType::DOCUMENT) {
+      value_size = property.document_values_size();
+    }
+
+    // OPTIONAL allows 0 or 1 value; REQUIRED allows exactly 1. REPEATED has
+    // no size constraint.
+    if (property_config.cardinality() ==
+        PropertyConfigProto::Cardinality::OPTIONAL) {
+      if (value_size != 0 && value_size != 1) {
+        return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+            "Property '%s' is optional but %d elements are "
+            "found for key: (%s, %s).",
+            property.name().c_str(), value_size, document.namespace_().c_str(),
+            document.uri().c_str()));
+      }
+    } else if (property_config.cardinality() ==
+               PropertyConfigProto::Cardinality::REQUIRED) {
+      if (value_size != 1) {
+        return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+            "Property '%s' with only 1 value is required but "
+            "%d elements are found for key: (%s, %s).",
+            property.name().c_str(), value_size, document.namespace_().c_str(),
+            document.uri().c_str()));
+      }
+      num_required_properties_actual++;
+    }
+
+    // We put the validation for nested DocumentProto at last separately
+    // because it takes longer time to run. If any of the previous validations
+    // fail, we don't need to validate the extra documents.
+    if (property_config.data_type() ==
+        PropertyConfigProto::DataType::DOCUMENT) {
+      const std::string_view nested_type_expected =
+          property_config.schema_type();
+      for (const DocumentProto& nested_document : property.document_values()) {
+        if (nested_type_expected.compare(nested_document.schema()) != 0) {
+          return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+              "Property '", property.name(), "' should have type '",
+              nested_type_expected,
+              "' but actual "
+              "value has type '",
+              nested_document.schema(), "' for key: (", document.namespace_(),
+              ", ", document.uri(), ")."));
+        }
+        // Nested documents are validated with the same rules, recursively.
+        ICING_RETURN_IF_ERROR(Validate(nested_document));
+      }
+    }
+  }
+  if (num_required_properties_actual < num_required_properties_expected) {
+    return absl_ports::InvalidArgumentError(
+        absl_ports::StrCat("One or more required fields missing for key: (",
+                           document.namespace_(), ", ", document.uri(), ")."));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/document-validator.h b/icing/util/document-validator.h
new file mode 100644
index 0000000..c684cb5
--- /dev/null
+++ b/icing/util/document-validator.h
@@ -0,0 +1,79 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_DOCUMENT_VALIDATOR_H_
+#define ICING_UTIL_DOCUMENT_VALIDATOR_H_
+
+#include "utils/base/status.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/schema-store.h"
+
+namespace icing {
+namespace lib {
+
+// This class validates DocumentProto based on the corresponding
+// SchemaTypeConfigProto in the given type config map.
+// This class validates DocumentProto based on the corresponding
+// SchemaTypeConfigProto in the given type config map.
+class DocumentValidator {
+ public:
+  // Does not take ownership of schema_store, which must outlive this
+  // instance.
+  explicit DocumentValidator(const SchemaStore* schema_store);
+  DocumentValidator() = delete;
+
+  // This function validates:
+  //  1. DocumentProto.namespace is not empty
+  //  2. DocumentProto.uri is not empty
+  //  3. DocumentProto.schema is not empty
+  //  4. DocumentProto.schema matches one of SchemaTypeConfigProto.schema_type
+  //     in the given SchemaProto in constructor
+  //  5. Each PropertyProto.name in DocumentProto.properties is not empty
+  //  6. Each PropertyProto.name is unique
+  //  7. Each PropertyProto.name matches one of
+  //     PropertyConfigProto.property_name in the given SchemaProto in
+  //     constructor
+  //  8. For each PropertyProto, the size of repeated value field matches
+  //     PropertyConfigProto.cardinality defined in the given SchemaProto in
+  //     constructor (e.g. OPTIONAL means 0 or 1, REQUIRED means 1)
+  //  9. For each PropertyProto with nested DocumentProto,
+  //     DocumentProto.schema (nested) matches the current
+  //     PropertyConfigProto.schema_type
+  // 10. All PropertyProto with REQUIRED cardinality in the corresponding
+  //     PropertyConfigProto present in the DocumentProto
+  // 11. DocumentProto.score is not negative
+  // 12. DocumentProto.creation_timestamp_secs is not negative
+  // 13. DocumentProto.ttl_secs is not negative
+  //
+  // In addition, all nested DocumentProto will also be validated towards the
+  // requirements above.
+  //
+  // DocumentProto.custom_properties are not validated.
+  //
+  // Returns:
+  //   OK on success
+  //   INVALID_ARGUMENT if any of case 1, 2, 3, 5, 8, 9, 10, 11, 12, 13 fails
+  //   NOT_FOUND if case 4 or 7 fails
+  //   ALREADY_EXISTS if case 6 fails
+  libtextclassifier3::Status Validate(const DocumentProto& document);
+
+  // Points the validator at a different SchemaStore (e.g. after a schema
+  // update). Does not take ownership; schema_store must outlive this
+  // instance.
+  void UpdateSchemaStore(const SchemaStore* schema_store) {
+    schema_store_ = schema_store;
+  }
+
+ private:
+  // Not owned.
+  const SchemaStore* schema_store_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_DOCUMENT_VALIDATOR_H_
diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc
new file mode 100644
index 0000000..7a43f6b
--- /dev/null
+++ b/icing/util/document-validator_test.cc
@@ -0,0 +1,449 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/document-validator.h"
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::HasSubstr;
+
+// type and property names of EmailMessage
+constexpr char kTypeEmail[] = "EmailMessage";
+constexpr char kPropertySubject[] = "subject";
+constexpr char kPropertyText[] = "text";
+constexpr char kPropertyRecipients[] = "recipients";
+// type and property names of Conversation
+constexpr char kTypeConversation[] = "Conversation";
+constexpr char kPropertyName[] = "name";
+constexpr char kPropertyEmails[] = "emails";
+// Other values
+constexpr char kDefaultNamespace[] = "icing";
+constexpr char kDefaultString[] = "This is a string.";
+
+// Test fixture that registers two schema types in a fresh SchemaStore:
+// "EmailMessage" (string properties only) and "Conversation" (a string name
+// plus repeated nested EmailMessage documents), then builds a validator over
+// that store.
+class DocumentValidatorTest : public ::testing::Test {
+ protected:
+  DocumentValidatorTest() {}
+
+  void SetUp() override {
+    SchemaProto schema;
+    auto type_config = schema.add_types();
+    CreateEmailTypeConfig(type_config);
+
+    type_config = schema.add_types();
+    CreateConversationTypeConfig(type_config);
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        schema_store_, SchemaStore::Create(&filesystem_, GetTestTempDir()));
+    ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+    document_validator_ =
+        std::make_unique<DocumentValidator>(schema_store_.get());
+  }
+
+  // EmailMessage: required 'subject', optional 'text', repeated 'recipients'
+  // (all strings).
+  static void CreateEmailTypeConfig(SchemaTypeConfigProto* type_config) {
+    type_config->set_schema_type(kTypeEmail);
+
+    auto subject = type_config->add_properties();
+    subject->set_property_name(kPropertySubject);
+    subject->set_data_type(PropertyConfigProto::DataType::STRING);
+    subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+    auto text = type_config->add_properties();
+    text->set_property_name(kPropertyText);
+    text->set_data_type(PropertyConfigProto::DataType::STRING);
+    text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+    auto recipients = type_config->add_properties();
+    recipients->set_property_name(kPropertyRecipients);
+    recipients->set_data_type(PropertyConfigProto::DataType::STRING);
+    recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+  }
+
+  // Returns a builder whose Build() yields a document that satisfies the
+  // EmailMessage schema; tests mutate it to trigger specific failures.
+  static DocumentBuilder SimpleEmailBuilder() {
+    return DocumentBuilder()
+        .SetKey(kDefaultNamespace, "email/1")
+        .SetSchema(kTypeEmail)
+        .AddStringProperty(kPropertySubject, kDefaultString)
+        .AddStringProperty(kPropertyText, kDefaultString)
+        .AddStringProperty(kPropertyRecipients, kDefaultString, kDefaultString,
+                           kDefaultString);
+  }
+
+  // Conversation: required 'name' (string), repeated nested 'emails' of type
+  // EmailMessage.
+  static void CreateConversationTypeConfig(SchemaTypeConfigProto* type_config) {
+    type_config->set_schema_type(kTypeConversation);
+
+    auto name = type_config->add_properties();
+    name->set_property_name(kPropertyName);
+    name->set_data_type(PropertyConfigProto::DataType::STRING);
+    name->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+    auto emails = type_config->add_properties();
+    emails->set_property_name(kPropertyEmails);
+    emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+    emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+    emails->set_schema_type(kTypeEmail);
+  }
+
+  // Returns a builder whose Build() yields a valid Conversation document
+  // containing three valid nested EmailMessage documents.
+  static DocumentBuilder SimpleConversationBuilder() {
+    return DocumentBuilder()
+        .SetKey(kDefaultNamespace, "conversation/1")
+        .SetSchema(kTypeConversation)
+        .AddStringProperty(kPropertyName, kDefaultString)
+        .AddDocumentProperty(kPropertyEmails, SimpleEmailBuilder().Build(),
+                             SimpleEmailBuilder().Build(),
+                             SimpleEmailBuilder().Build());
+  }
+
+  std::unique_ptr<DocumentValidator> document_validator_;
+  std::unique_ptr<SchemaStore> schema_store_;
+  Filesystem filesystem_;
+};
+
+// Baseline: the unmodified builder outputs pass validation for both types.
+TEST_F(DocumentValidatorTest, ValidateSimpleSchemasOk) {
+  DocumentProto email = SimpleEmailBuilder().Build();
+  EXPECT_THAT(document_validator_->Validate(email), IsOk());
+
+  DocumentProto conversation = SimpleConversationBuilder().Build();
+  EXPECT_THAT(document_validator_->Validate(conversation), IsOk());
+}
+
+// Empty top-level key fields are rejected as INVALID_ARGUMENT.
+TEST_F(DocumentValidatorTest, ValidateEmptyNamespaceInvalid) {
+  DocumentProto email = SimpleEmailBuilder().SetNamespace("").Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'namespace' is empty")));
+}
+
+TEST_F(DocumentValidatorTest, ValidateEmptyUriInvalid) {
+  DocumentProto email = SimpleEmailBuilder().SetUri("").Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'uri' is empty")));
+}
+
+TEST_F(DocumentValidatorTest, ValidateEmptySchemaInvalid) {
+  DocumentProto email = SimpleEmailBuilder().SetSchema("").Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'schema' is empty")));
+}
+
+// A schema type not registered in the SchemaStore yields NOT_FOUND.
+TEST_F(DocumentValidatorTest, ValidateNonexistentSchemaNotFound) {
+  DocumentProto email =
+      SimpleEmailBuilder().SetSchema("WrongEmailType").Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+                       HasSubstr("'WrongEmailType' not found")));
+}
+
+// A property with an empty name is rejected as INVALID_ARGUMENT.
+TEST_F(DocumentValidatorTest, ValidateEmptyPropertyInvalid) {
+  DocumentProto email =
+      SimpleEmailBuilder().AddStringProperty("", kDefaultString).Build();
+
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'name' is empty")));
+}
+
+// The same property name appearing twice in one document yields
+// ALREADY_EXISTS.
+TEST_F(DocumentValidatorTest, ValidateDuplicatePropertyAlreadyExists) {
+  DocumentProto email = SimpleEmailBuilder()
+                            .ClearProperties()
+                            .AddStringProperty(kPropertySubject, kDefaultString)
+                            .AddStringProperty(kPropertySubject, kDefaultString)
+                            .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::ALREADY_EXISTS,
+                       HasSubstr("'subject' already exists")));
+}
+
+// A property name with no matching PropertyConfigProto yields NOT_FOUND.
+TEST_F(DocumentValidatorTest, ValidateNonexistentPropertyNotFound) {
+  DocumentProto email =
+      SimpleEmailBuilder()
+          .AddStringProperty("WrongPropertyName", kDefaultString)
+          .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+                       HasSubstr("'WrongPropertyName' not found")));
+}
+
+// Custom properties are exempt from validation: entries that would fail the
+// name/type checks as regular properties are accepted here.
+TEST_F(DocumentValidatorTest, ValidateAllCustomPropertyOk) {
+  DocumentProto email =
+      SimpleEmailBuilder()
+          // A nonexistent property, would've triggered a NotFound message
+          .AddCustomStringProperty("WrongPropertyName", kDefaultString)
+          // 'subject' property should've been a string according to the schema
+          .AddCustomBooleanProperty(kPropertySubject, false, true)
+          .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email), IsOk());
+}
+
+TEST_F(DocumentValidatorTest, ValidateExactlyOneRequiredValueOk) {
+  // Required property should have exactly 1 value
+  DocumentProto email =
+      SimpleEmailBuilder()
+          .ClearProperties()
+          .AddStringProperty(kPropertySubject, kDefaultString)  // 1 value
+          .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email), IsOk());
+}
+
+// REQUIRED cardinality rejects both 0 and more-than-1 values.
+TEST_F(DocumentValidatorTest, ValidateInvalidNumberOfRequiredValues) {
+  // Required property should have exactly 1 value
+  DocumentProto email = SimpleEmailBuilder()
+                            .ClearProperties()
+                            .AddStringProperty(kPropertySubject)  // 0 values
+                            .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'subject' with only 1 value is required "
+                                 "but 0 elements are found")));
+
+  email =
+      SimpleEmailBuilder()
+          .ClearProperties()
+          .AddStringProperty(kPropertySubject, kDefaultString, kDefaultString)
+          .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'subject' with only 1 value is required "
+                                 "but 2 elements are found")));
+}
+
+// OPTIONAL cardinality accepts 0 or 1 values.
+TEST_F(DocumentValidatorTest, ValidateZeroOrOneOptionalValueOk) {
+  DocumentProto email = SimpleEmailBuilder()
+                            .ClearProperties()
+                            .AddStringProperty(kPropertySubject, kDefaultString)
+                            .AddStringProperty(kPropertyText)  // 0 values
+                            .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email), IsOk());
+
+  email = SimpleEmailBuilder()
+              .ClearProperties()
+              .AddStringProperty(kPropertySubject, kDefaultString)
+              .AddStringProperty(kPropertyText, kDefaultString)  // 1 value
+              .Build();
+
+  EXPECT_THAT(document_validator_->Validate(email), IsOk());
+}
+
+// OPTIONAL cardinality rejects more than 1 value.
+TEST_F(DocumentValidatorTest, ValidateInvalidNumberOfOptionalValues) {
+  DocumentProto email =
+      SimpleEmailBuilder()
+          .ClearProperties()
+          .AddStringProperty(kPropertySubject, kDefaultString)
+          .AddStringProperty(kPropertyText, kDefaultString, kDefaultString)
+          .Build();
+
+  EXPECT_THAT(
+      document_validator_->Validate(email),
+      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+               HasSubstr("'text' is optional but 2 elements are found")));
+}
+
+TEST_F(DocumentValidatorTest, ValidateMissingRequiredPropertyInvalid) {
+  // All required properties should be present in document
+  DocumentProto email = SimpleEmailBuilder()
+                            .ClearProperties()
+                            .AddStringProperty(kPropertyText, kDefaultString)
+                            .Build();
+
+  // The required property 'subject' isn't added in email.
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("One or more required fields missing")));
+}
+
+TEST_F(DocumentValidatorTest,
+       ValidateNestedPropertyDoesntMatchSchemaTypeInvalid) {
+  // Nested DocumentProto should have the expected schema type
+  DocumentProto conversation =
+      SimpleConversationBuilder()
+          .ClearProperties()
+          .AddStringProperty(kPropertyName, kDefaultString)
+          .AddDocumentProperty(
+              kPropertyEmails, SimpleEmailBuilder().Build(),
+              SimpleConversationBuilder().Build(),  // Wrong document type
+              SimpleEmailBuilder().Build())
+          .Build();
+
+  EXPECT_THAT(document_validator_->Validate(conversation),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'emails' should have type 'EmailMessage' but "
+                                 "actual value has type 'Conversation'")));
+}
+
+// Validation recurses into nested documents, so a defect inside a nested
+// EmailMessage fails the enclosing Conversation.
+TEST_F(DocumentValidatorTest, ValidateNestedPropertyInvalid) {
+  // Issues in nested DocumentProto should be detected
+  DocumentProto conversation =
+      SimpleConversationBuilder()
+          .ClearProperties()
+          .AddStringProperty(kPropertyName, kDefaultString)
+          .AddDocumentProperty(kPropertyEmails,
+                               SimpleEmailBuilder()
+                                   .SetNamespace("")
+                                   .Build())  // Bad nested document
+          .Build();
+
+  EXPECT_THAT(document_validator_->Validate(conversation),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("'namespace' is empty")));
+}
+
+// Verifies the validator picks up schema changes made to the SchemaStore it
+// points at, without being reconstructed.
+TEST_F(DocumentValidatorTest, HandleTypeConfigMapChangesOk) {
+  SchemaProto email_schema;
+  auto type_config = email_schema.add_types();
+  CreateEmailTypeConfig(type_config);
+
+  // Create a custom directory so we don't collide with the test's preset schema
+  // in SetUp
+  const std::string custom_schema_dir = GetTestTempDir() + "/custom_schema";
+  filesystem_.DeleteDirectoryRecursively(custom_schema_dir.c_str());
+  filesystem_.CreateDirectoryRecursively(custom_schema_dir.c_str());
+
+  // Set a schema with only the 'Email' type
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaStore> schema_store,
+      SchemaStore::Create(&filesystem_, custom_schema_dir));
+  ASSERT_THAT(schema_store->SetSchema(email_schema), IsOk());
+
+  DocumentValidator document_validator(schema_store.get());
+
+  DocumentProto conversation = SimpleConversationBuilder().Build();
+
+  // Schema doesn't know about the 'Conversation' type yet
+  EXPECT_THAT(document_validator.Validate(conversation),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+                       HasSubstr("'Conversation' not found")));
+
+  // Add the 'Conversation' type
+  SchemaProto email_and_conversation_schema = email_schema;
+  type_config = email_and_conversation_schema.add_types();
+  CreateConversationTypeConfig(type_config);
+
+  // DocumentValidator should be able to handle the SchemaStore getting updated
+  // separately
+  ASSERT_THAT(schema_store->SetSchema(email_and_conversation_schema), IsOk());
+
+  ICING_EXPECT_OK(document_validator.Validate(conversation));
+}
+
+// Score: any non-negative value is accepted, up to int32 max.
+TEST_F(DocumentValidatorTest, PositiveDocumentScoreOk) {
+  DocumentProto email = SimpleEmailBuilder().SetScore(1).Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+
+  email = SimpleEmailBuilder()
+              .SetScore(std::numeric_limits<int32_t>::max())
+              .Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+}
+
+// Score: any negative value, down to int32 min, is rejected.
+TEST_F(DocumentValidatorTest, NegativeDocumentScoreInvalid) {
+  DocumentProto email = SimpleEmailBuilder().SetScore(-1).Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("is negative")));
+
+  email = SimpleEmailBuilder()
+              .SetScore(std::numeric_limits<int32_t>::min())
+              .Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("is negative")));
+}
+
+// Creation timestamp: positive values up to int32 max are accepted.
+TEST_F(DocumentValidatorTest, PositiveDocumentCreationTimestampSecsOk) {
+  DocumentProto email =
+      SimpleEmailBuilder().SetCreationTimestampSecs(1).Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+
+  email = SimpleEmailBuilder()
+              .SetCreationTimestampSecs(std::numeric_limits<int32_t>::max())
+              .Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+}
+
+// Zero is a valid creation timestamp; only negative values are rejected.
+// (Fixes the "Secss" typo in the original test name.)
+TEST_F(DocumentValidatorTest, ZeroDocumentCreationTimestampSecsOk) {
+  DocumentProto email =
+      SimpleEmailBuilder().SetCreationTimestampSecs(0).Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+}
+
+// Creation timestamp: any negative value, down to int32 min, is rejected.
+TEST_F(DocumentValidatorTest, NegativeDocumentCreationTimestampSecsInvalid) {
+  DocumentProto email =
+      SimpleEmailBuilder().SetCreationTimestampSecs(-1).Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("is negative")));
+
+  email = SimpleEmailBuilder()
+              .SetCreationTimestampSecs(std::numeric_limits<int32_t>::min())
+              .Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("is negative")));
+}
+
+// TTL: positive values up to int32 max are accepted.
+TEST_F(DocumentValidatorTest, PositiveDocumentTtlSecsOk) {
+  DocumentProto email = SimpleEmailBuilder().SetTtlSecs(1).Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+
+  email = SimpleEmailBuilder()
+              .SetTtlSecs(std::numeric_limits<int32_t>::max())
+              .Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+}
+
+// TTL: zero (meaning no expiration-related rejection here) is accepted.
+TEST_F(DocumentValidatorTest, ZeroDocumentTtlSecsOk) {
+  DocumentProto email = SimpleEmailBuilder().SetTtlSecs(0).Build();
+  ICING_EXPECT_OK(document_validator_->Validate(email));
+}
+
+// TTL: any negative value, down to int32 min, is rejected.
+TEST_F(DocumentValidatorTest, NegativeDocumentTtlSecsInvalid) {
+  DocumentProto email = SimpleEmailBuilder().SetTtlSecs(-1).Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("is negative")));
+
+  email = SimpleEmailBuilder()
+              .SetTtlSecs(std::numeric_limits<int32_t>::min())
+              .Build();
+  EXPECT_THAT(document_validator_->Validate(email),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+                       HasSubstr("is negative")));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
new file mode 100644
index 0000000..e8b109a
--- /dev/null
+++ b/icing/util/i18n-utils.cc
@@ -0,0 +1,160 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/i18n-utils.h"
+
+#include <sys/types.h>
+
+#include <cctype>
+#include <string>
+#include <string_view>
+
+#include "utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "unicode/umachine.h"
+#include "unicode/unorm2.h"
+#include "unicode/ustring.h"
+#include "unicode/utf8.h"
+
+namespace icing {
+namespace lib {
+namespace i18n_utils {
+
+// Converts a UTF-16 string to UTF-8 via ICU's u_strToUTF8.
+// Returns the converted string, or INTERNAL if ICU reports a failure.
+libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
+    const std::u16string& utf16_string) {
+  std::string utf8_string;
+  // Allocates the maximum possible UTF8 string length:
+  // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL.
+  //
+  // NOTE: we need to call resize() but not reserve() because values can't be
+  // set at positions after length().
+  utf8_string.resize(utf16_string.length() * 3 + 1);
+
+  int result_length = 0;
+  UErrorCode status = U_ZERO_ERROR;
+  u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length,
+              utf16_string.data(), utf16_string.length(), &status);
+  // Corrects the length
+  // (u_strToUTF8 reports the number of UTF-8 bytes actually produced via
+  // result_length).
+  utf8_string.resize(result_length);
+
+  if (U_FAILURE(status)) {
+    return absl_ports::InternalError("Failed to convert UTF16 string to UTF8");
+  }
+  return utf8_string;
+}
+
+// Converts a UTF-8 string to UTF-16 via ICU's u_strFromUTF8.
+// Returns the converted string, or INTERNAL if ICU reports a failure.
+libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
+    std::string_view utf8_string) {
+  std::u16string utf16_result;
+  // The UTF16 string won't be longer than its UTF8 format
+  // (every UTF-8 byte maps to at most one UTF-16 code unit).
+  //
+  // NOTE: we need to call resize() but not reserve() because values can't be
+  // set at positions after length().
+  utf16_result.resize(utf8_string.length());
+
+  int result_length = 0;
+  UErrorCode status = U_ZERO_ERROR;
+  u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length,
+                utf8_string.data(), utf8_string.length(), &status);
+  // Corrects the length
+  // (u_strFromUTF8 reports the number of UTF-16 code units actually produced
+  // via result_length).
+  utf16_result.resize(result_length);
+
+  if (U_FAILURE(status)) {
+    return absl_ports::InternalError(absl_ports::StrCat(
+        "Failed to convert UTF8 string '", utf8_string, "' to UTF16"));
+  }
+  return utf16_result;
+}
+
+// Returns the Unicode code point starting at byte offset `position`.
+// U8_NEXT_OR_FFFD yields U+FFFD (the replacement character) for ill-formed
+// byte sequences rather than signaling an error.
+UChar32 GetUChar32At(const char* data, int length, int position) {
+  UChar32 uchar32;
+  U8_NEXT_OR_FFFD(data, position, length, uchar32);
+  return uchar32;
+}
+
+// Truncates *str in place to at most truncate_to_length bytes without
+// splitting a multi-byte UTF-8 character: if the byte at the cut point is a
+// continuation byte, the cut moves left to the start of that character.
+//
+// A null str, a negative truncate_to_length, or a truncate_to_length >=
+// str->length() leaves the string unchanged. The explicit negative check
+// replaces the original's implicit signed/unsigned conversion (a negative int
+// compared against size_t converts to a huge unsigned value); the observable
+// behavior is identical, without the -Wsign-compare hazard.
+void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
+  if (str == nullptr || truncate_to_length < 0 ||
+      static_cast<size_t>(truncate_to_length) >= str->length()) {
+    return;
+  }
+
+  while (truncate_to_length > 0) {
+    if (IsLeadUtf8Byte(str->at(truncate_to_length))) {
+      str->resize(truncate_to_length);
+      return;
+    }
+    truncate_to_length--;
+  }
+
+  // No character boundary found at or before the cut point; truncates to an
+  // empty string
+  str->resize(0);
+}
+
+// A byte is ASCII iff it is a complete single-byte UTF-8 sequence.
+// NOTE(review): `u_int8_t` is the BSD/POSIX spelling from <sys/types.h>;
+// consider the standard `uint8_t` — confirm portability requirements.
+bool IsAscii(char c) { return U8_IS_SINGLE((u_int8_t)c); }
+
+// A code point is ASCII iff it encodes to exactly one UTF-8 byte.
+bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
+
+// Number of bytes in the UTF-8 encoding of code point c (0 if invalid).
+int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
+
+// A byte starts a UTF-8 character iff it is ASCII or a multi-byte lead byte,
+// i.e. it is not a continuation byte.
+bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((u_int8_t)c); }
+
+// Reports whether the character starting at byte offset `position` is
+// punctuation: std::ispunct for single-byte (ASCII) characters, ICU's
+// u_ispunct otherwise. If char_len_out is non-null it receives the
+// character's UTF-8 length in bytes.
+// NOTE(review): `position` is assumed to be a valid in-bounds character
+// boundary within `input` — confirm callers guarantee this; there is no
+// bounds check here.
+bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
+  if (IsAscii(input[position])) {
+    if (char_len_out != nullptr) {
+      *char_len_out = 1;
+    }
+    // Safe to pass to std::ispunct: an ASCII byte is non-negative.
+    return std::ispunct(input[position]);
+  }
+  UChar32 c = GetUChar32At(input.data(), input.length(), position);
+  if (char_len_out != nullptr) {
+    *char_len_out = U8_LENGTH(c);
+  }
+  return u_ispunct(c);
+}
+
+// Attempts to reduce a character with diacritics to its ASCII base character
+// via its canonical Unicode decomposition (whose first element is the base
+// character when one exists). Returns true and writes the ASCII char to
+// *char_out (if non-null) on success; returns false if the character has no
+// decomposition starting with an ASCII char.
+bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
+                          char* char_out) {
+  if (IsAscii(uchar32_in)) {
+    // The Unicode character is within ASCII range
+    if (char_out != nullptr) {
+      *char_out = uchar32_in;
+    }
+    return true;
+  }
+
+  // Maximum number of pieces a Unicode character can be decomposed into.
+  // TODO(samzheng) figure out if this number is proper.
+  constexpr int kDecompositionBufferCapacity = 5;
+
+  // A buffer used to store Unicode decomposition mappings of only one
+  // character.
+  UChar decomposition_buffer[kDecompositionBufferCapacity];
+
+  // Decomposes the Unicode character, trying to get an ASCII char and some
+  // diacritic chars.
+  UErrorCode status = U_ZERO_ERROR;
+  if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
+                              kDecompositionBufferCapacity, &status) > 0 &&
+      !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
+    if (char_out != nullptr) {
+      *char_out = decomposition_buffer[0];
+    }
+    return true;
+  }
+  return false;
+}
+
+} // namespace i18n_utils
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
new file mode 100644
index 0000000..04edbc7
--- /dev/null
+++ b/icing/util/i18n-utils.h
@@ -0,0 +1,88 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_I18N_UTILS_H_
+#define ICING_UTIL_I18N_UTILS_H_
+
+#include <string>
+#include <string_view>
+
+#include "utils/base/statusor.h"
+#include "unicode/umachine.h"
+#include "unicode/unorm2.h"
+
+namespace icing {
+namespace lib {
+namespace i18n_utils {
+
+// Converts a UTF16 string to a UTF8 string.
+//
+// Returns:
+// A UTF8 string on success
+// INTERNAL_ERROR on any failures
+libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
+ const std::u16string& utf16_string);
+
+// Converts a UTF8 string to a UTF16 string.
+//
+// Returns:
+// A UTF16 string on success
+// INTERNAL_ERROR on any failures
+libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
+ std::string_view utf8_string);
+
+// Returns the Unicode char at the given position. If anything goes wrong, the
+// Unicode replacement character 0xFFFD (U+FFFD) is returned.
+UChar32 GetUChar32At(const char* data, int length, int position);
+
+// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
+// in the middle. The string will be truncated in place.
+void SafeTruncateUtf8(std::string* str, int truncate_to_length);
+
+// Checks if the single char is within ASCII range.
+bool IsAscii(char c);
+
+// Checks if the Unicode char is within ASCII range.
+bool IsAscii(UChar32 c);
+
+// Returns how many code units (bytes) are used for the UTF-8 encoding of this
+// Unicode character. Returns 0 if not valid.
+int GetUtf8Length(UChar32 c);
+
+// Checks if the single char is the first byte of a UTF8 character, note
+// that a single ASCII char is also considered a lead byte.
+bool IsLeadUtf8Byte(char c);
+
+// Checks if the character at position is punctuation. Assigns the length of the
+// character at position to *char_len_out if the character at position is valid
+// punctuation and char_len_out is not null.
+bool IsPunctuationAt(std::string_view input, int position,
+ int* char_len_out = nullptr);
+
+// Transforms a Unicode character with diacritics to its counterpart in ASCII
+// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
+// the transformation is successful.
+//
+// NOTE: According to our convention this function should have returned
+// StatusOr<char>. However, this function is performance-sensitive because it
+// could be called on every Latin character in normalization, so we make it
+// return a bool here to save a bit more time and memory.
+bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
+ char* char_out);
+
+} // namespace i18n_utils
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_I18N_UTILS_H_
diff --git a/icing/util/logging.h b/icing/util/logging.h
new file mode 100644
index 0000000..1916e53
--- /dev/null
+++ b/icing/util/logging.h
@@ -0,0 +1,30 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_LOGGING_H_
+#define ICING_UTIL_LOGGING_H_
+
+#include "base/logging.h"
+
+namespace icing {
+namespace lib {
+
+// TODO(samzheng): Change to TC3_VLOG and TC3_LOG
+// NOTE: These are preprocessor macros, so the surrounding namespaces have no
+// scoping effect on them — they are visible file-wide to every includer.
+#define ICING_VLOG(severity) VLOG(severity)
+#define ICING_LOG(severity) LOG(severity)
+
+}  // namespace lib
+}  // namespace icing
+
+#endif // ICING_UTIL_LOGGING_H_
diff --git a/icing/util/math-util.h b/icing/util/math-util.h
new file mode 100644
index 0000000..fc11a09
--- /dev/null
+++ b/icing/util/math-util.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_MATH_UTIL_H_
+#define ICING_UTIL_MATH_UTIL_H_
+
+#include <limits>
+
+namespace icing {
+namespace lib {
+
+namespace math_util {
+
+// Divides first by second, but returns +infinity instead of performing a
+// division by zero when second == 0 (for any value of first, including 0
+// and negatives).
+inline double SafeDivide(double first, double second) {
+  return (second == 0) ? std::numeric_limits<double>::infinity()
+                       : first / second;
+}
+
+// Returns the maximum integer value which is a multiple of rounding_value,
+// and less than or equal to input_value.
+//
+// Returns 0 if input_value is less than or equal to zero, or if
+// rounding_value is less than or equal to zero (which also guards the
+// division below against dividing by zero).
+template <typename IntType>
+static IntType RoundDownTo(IntType input_value, IntType rounding_value) {
+  // Fixed copy-paste in the assertion message: it previously named
+  // RoundUpTo().
+  static_assert(std::numeric_limits<IntType>::is_integer,
+                "RoundDownTo() operation type is not integer");
+
+  if (input_value <= 0) {
+    return 0;
+  }
+
+  if (rounding_value <= 0) {
+    return 0;
+  }
+
+  // Integer division truncates toward zero, which for positive operands is
+  // exactly "round down".
+  return (input_value / rounding_value) * rounding_value;
+}
+
+// Returns the minimum integer value which is a multiple of rounding_value,
+// and greater than or equal to input_value.
+//
+// Returns 0 if input_value is less than or equal to zero, or if
+// rounding_value is less than or equal to zero.
+//
+// NOTE(review): input_value - remainder + rounding_value can overflow when
+// input_value is near std::numeric_limits<IntType>::max() — confirm callers
+// stay in range.
+template <typename IntType>
+static IntType RoundUpTo(IntType input_value, IntType rounding_value) {
+  static_assert(std::numeric_limits<IntType>::is_integer,
+                "RoundUpTo() operation type is not integer");
+
+  if (input_value <= 0 || rounding_value <= 0) {
+    return 0;
+  }
+
+  const IntType remainder = input_value % rounding_value;
+  if (remainder == 0) {
+    // Already an exact multiple; nothing to round.
+    return input_value;
+  }
+  return input_value - remainder + rounding_value;
+}
+
+} // namespace math_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_MATH_UTIL_H_