Merge remote-tracking branch 'goog/androidx-platform-dev' into master
* goog/androidx-platform-dev:
Pull upstream changes.
Pull upstream changes.
Move compilation of libicing.so into appsearch:appsearch.
Move icing lib build.gradle to root of repo.
Build the external/icing project in a self-contained manner.
Hide classes from public API surface.
Pull upstream changes.
Uncomment CMakeLists.txt to start building icing native lib.
Update from upstream, including proto location changes.
Copy over changes made to Google3 codebase in Icing.
Update CMakeLists.txt to use precompiled protoc.
Modify external/icing CMake to build against libandroidicu.
Pull upstream changes.
Use add_custom_command instead of execute_process for building protos.
Implement jarjar for icing protos.
Split icing project into two parts: protos/java, and native lib.
Modify external/icing CMake to link against libprotobuf.
Copy over changes made to Google3 codebase in Icing.
Test: m -j
Change-Id: I69b719ca9b0ca9032eaca5ba23c62dfa03b185f8
diff --git a/Android.bp b/Android.bp
index 9f80856..fb14778 100644
--- a/Android.bp
+++ b/Android.bp
@@ -12,17 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-java_library {
- name: "icing-java-proto-lite",
- proto: {
- type: "lite",
- include_dirs: ["external/protobuf/src"],
- canonical_path_from_root: false,
- },
- srcs: ["icing/proto/*.proto"],
- sdk_version: "core_current",
-}
-
cc_defaults {
name: "libicing_defaults",
@@ -43,6 +32,7 @@
"-Wno-undefined-var-template",
"-Wno-unused-function",
"-Wno-unused-parameter",
+ "-Wno-unused-private-field",
"-Wno-extern-c-compat",
"-funsigned-char",
@@ -50,34 +40,23 @@
],
}
-cc_library_static {
- name: "icing-c-proto",
- defaults: ["libicing_defaults"],
- proto: {
- type: "lite",
- // Find protos relative from where they're specified (useful for external protos)
- canonical_path_from_root: false,
- // Need to be able to see the .pb.h files that are generated
- export_proto_headers: true,
- },
- srcs: ["icing/**/*.proto"],
-}
-
cc_library_shared {
- name: "libicing_jni",
+ name: "libicing",
defaults: ["libicing_defaults"],
srcs: [
"icing/**/*.cc",
],
exclude_srcs: [
- // Tests
+ "icing/**/*-test-*",
+ "icing/**/*-test.*",
"icing/**/*_test.cc",
- // Benchmarks
"icing/**/*_benchmark.cc",
- // Test-only related files (i.e. utils)
"icing/testing/**/*",
- // Tools for manual debugging/investigating
+ "icing/tokenization/reverse_jni/**/*",
+ "icing/tokenization/simple/**/*",
"icing/tools/**/*",
+ "icing/transform/map/**/*",
+ "icing/transform/simple/**/*",
],
header_libs: ["jni_headers"],
static_libs: [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 556150a..0830783 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,75 +14,66 @@
cmake_minimum_required(VERSION 3.10.2)
-# Build protoc with a host configuration. We need to run it on the host to create our proto
-# files.
-set(CMAKE_HOST_ARGS
- -DBUILD_SHARED_LIBS:BOOL=${BUILD_SHARED_LIBS}
- -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
- -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
- -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
- -DCMAKE_GENERATOR:STRING=${CMAKE_GENERATOR}
- -DCMAKE_MAKE_PROGRAM:FILEPATH=${CMAKE_MAKE_PROGRAM})
+add_definitions("-DICING_REVERSE_JNI_SEGMENTATION=1")
+set(
+ Protobuf_PREBUILTS_DIR
+ "${CMAKE_CURRENT_SOURCE_DIR}/../../prebuilts/protobuf")
set(Protobuf_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../protobuf")
-set(Protobuf_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/protobuf-host")
-# Run another cmake invocation to configure the protobuf project
-execute_process(
- COMMAND "${CMAKE_COMMAND}"
- ${CMAKE_HOST_ARGS}
- -H${Protobuf_SOURCE_DIR}/cmake
- -B${Protobuf_BINARY_DIR}
- -Dprotobuf_BUILD_TESTS:BOOL=OFF
- RESULT_VARIABLE exec_value
- OUTPUT_VARIABLE exec_output
- ERROR_VARIABLE exec_output
-)
-message("Result of proto configuration: ${exec_value}. Output: ${exec_output}")
+set(Protobuf_TARGET_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/protobuf-target")
+set(Icing_PROTO_GEN_DIR "${CMAKE_CURRENT_BINARY_DIR}/icing-protobuf-gen")
-# Run the actual build tool (ninja) to compile protoc for the host
-execute_process(
- COMMAND "${CMAKE_MAKE_PROGRAM}" protoc
- WORKING_DIRECTORY ${Protobuf_BINARY_DIR}
- RESULT_VARIABLE exec_value
- OUTPUT_VARIABLE exec_output
- ERROR_VARIABLE exec_output
-)
-message("Result of proto build: ${exec_value}. Output: ${exec_output}")
+## Configure libprotobuf ##
+# Find the right protoc to compile our proto files
+if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Darwin")
+ set(Protobuf_PROTOC_PATH "${Protobuf_PREBUILTS_DIR}/darwin-x86_64/protoc")
+elseif(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux")
+ set(Protobuf_PROTOC_PATH "${Protobuf_PREBUILTS_DIR}/linux-x86_64/protoc")
+elseif(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Windows")
+ set(Protobuf_PROTOC_PATH "${Protobuf_PREBUILTS_DIR}/windows-x86/protoc.exe")
+else()
+ message(
+ FATAL_ERROR
+ "No protoc prebuilt found for host OS ${CMAKE_HOST_SYSTEM_NAME}")
+endif()
+message(STATUS "Using prebuilt protoc at: ${Protobuf_PROTOC_PATH}")
-# Glob Icing proto sources
+# Compile libprotobuf
+set(protobuf_BUILD_TESTS OFF CACHE BOOL "")
+add_subdirectory("${Protobuf_SOURCE_DIR}/cmake" ${Protobuf_TARGET_BINARY_DIR})
+
+# Compile libandroidicu
+set(ICU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../icu/libandroidicu")
+set(ICU_TARGET_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/icu-target")
+add_subdirectory(${ICU_SOURCE_DIR} ${ICU_TARGET_BINARY_DIR})
+
+# Glob Icing proto sources. Results look like this: icing/proto/document.proto
file(
GLOB_RECURSE
Icing_PROTO_FILES
- RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- "icing/*.proto")
+ RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/proto"
+ "*.proto")
message(STATUS "Icing_PROTO_FILES=${Icing_PROTO_FILES}")
# Run protoc on Icing_PROTO_FILES to generate pb.cc and pb.h files
-set(Icing_PROTO_GEN_DIR "${PROTO_GENERATED_FILES_BASE_DIR}/cpp")
file(MAKE_DIRECTORY ${Icing_PROTO_GEN_DIR})
foreach(FILE ${Icing_PROTO_FILES})
# Find the name of the proto file without the .proto extension
string(REGEX REPLACE "\.proto$" "" FILE_NOEXT ${FILE})
- execute_process(
- COMMAND "${Protobuf_BINARY_DIR}/protoc"
- --proto_path ${CMAKE_CURRENT_SOURCE_DIR}
- --cpp_out ${Icing_PROTO_GEN_DIR}
- ${FILE}
+ list(APPEND Icing_PROTO_SOURCES
+ "${Icing_PROTO_GEN_DIR}/${FILE_NOEXT}.pb.cc"
+ "${Icing_PROTO_GEN_DIR}/${FILE_NOEXT}.pb.h")
+ add_custom_command(
+ OUTPUT "${Icing_PROTO_GEN_DIR}/${FILE_NOEXT}.pb.cc"
+ "${Icing_PROTO_GEN_DIR}/${FILE_NOEXT}.pb.h"
+ COMMAND ${Protobuf_PROTOC_PATH}
+ --proto_path "${CMAKE_CURRENT_SOURCE_DIR}/proto"
+ --cpp_out ${Icing_PROTO_GEN_DIR}
+ ${FILE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- RESULT_VARIABLE exec_value
- OUTPUT_VARIABLE exec_output
- ERROR_VARIABLE exec_output
+ DEPENDS ${Protobuf_PROTOC_PATH}
)
- message("Result of protoc ${FILE}: ${exec_value}. Output: ${exec_output}")
endforeach()
-
-# Glob generated source files from running protoc
-file(
- GLOB_RECURSE
- Icing_PROTO_SOURCES
- "${Icing_PROTO_GEN_DIR}/*.pb.cc"
- "${Icing_PROTO_GEN_DIR}/*.pb.h"
-)
message(STATUS "Icing_PROTO_SOURCES=${Icing_PROTO_SOURCES}")
# Glob Icing C++ sources
@@ -99,10 +90,15 @@
icing/*.cc icing/*.h
)
# Exclude the same types of files as Android.bp. See the comments there.
-list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/.*_test\.cc$")
+list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/.*[^a-zA-Z0-9]test[^a-zA-Z0-9].*$")
list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/.*_benchmark\.cc$")
+list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/helpers/icu/.*$")
list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/testing/.*$")
+list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/tokenization/icu/.*$")
+list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/tokenization/simple/.*$")
list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/tools/.*$")
+list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/transform/icu/.*$")
+list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/transform/simple/.*$")
message(STATUS "Icing_CC_SOURCES=${Icing_CC_SOURCES}")
add_library(
@@ -119,3 +115,5 @@
target_include_directories(icing PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(icing PRIVATE ${Icing_PROTO_GEN_DIR})
target_include_directories(icing PRIVATE "${Protobuf_SOURCE_DIR}/src")
+target_include_directories(icing PRIVATE "${ICU_SOURCE_DIR}/include")
+target_link_libraries(icing protobuf::libprotobuf libandroidicu log)
diff --git a/build.gradle b/build.gradle
index 30265cd..3901078 100644
--- a/build.gradle
+++ b/build.gradle
@@ -14,63 +14,59 @@
* limitations under the License.
*/
+import org.anarres.gradle.plugin.jarjar.JarjarTask
+
+import static androidx.build.SupportConfig.*
+import static androidx.build.dependencies.DependenciesKt.*
+
buildscript {
- boolean unbundleBuild = (new File('unbundled-build')).exists()
- repositories {
- maven { url '../../prebuilts/androidx/external' }
- if (unbundleBuild) {
- jcenter()
- }
- }
dependencies {
classpath('gradle.plugin.com.google.protobuf:protobuf-gradle-plugin:0.8.8')
+ classpath('org.anarres.jarjar:jarjar-gradle:1.0.1')
}
}
-apply plugin: 'AndroidXPlugin'
-apply plugin: 'com.android.library'
-apply plugin: 'com.google.protobuf'
-apply plugin: 'idea'
-
-def protoGeneratedFilesBaseDir = "${project.buildDir}/generated/source/proto"
+plugins {
+ id('com.android.library')
+ id('com.google.protobuf')
+ id('org.anarres.jarjar')
+}
android {
+ buildToolsVersion BUILD_TOOLS_VERSION
+ compileSdkVersion COMPILE_SDK_VERSION
defaultConfig {
- externalNativeBuild {
- cmake {
- arguments "-DPROTO_GENERATED_FILES_BASE_DIR=${protoGeneratedFilesBaseDir}"
- cppFlags "-std=c++17"
- arguments "-DCMAKE_VERBOSE_MAKEFILE=ON"
- targets "icing"
- }
- }
+ minSdkVersion DEFAULT_MIN_SDK_VERSION
+ targetSdkVersion TARGET_SDK_VERSION
+ testInstrumentationRunner INSTRUMENTATION_RUNNER
}
-
+ compileOptions {
+ sourceCompatibility = JavaVersion.VERSION_1_8
+ targetCompatibility = JavaVersion.VERSION_1_8
+ }
sourceSets {
main {
+ java.srcDir 'java/src/'
manifest.srcFile 'AndroidManifest.xml'
- proto {
- srcDir '.'
- include '**/*.proto'
- }
+ proto.srcDir 'proto/'
}
- }
-
- externalNativeBuild {
- cmake {
- version '3.10.2'
- // TODO(b/149853706): Uncomment this line once the lib fully compiles under cmake
- //path 'CMakeLists.txt'
- }
+ // TODO(b/161205849): Re-enable this test once icing nativeLib is no longer being built
+ // inside appsearch:appsearch.
+ //androidTest.java.srcDir 'java/tests/instrumentation/'
}
}
dependencies {
- api('com.google.protobuf:protobuf-javalite:3.10.0')
+ api('androidx.annotation:annotation:1.1.0')
+
+ implementation('com.google.protobuf:protobuf-javalite:3.10.0')
+
+ androidTestImplementation(ANDROIDX_TEST_CORE)
+ androidTestImplementation(ANDROIDX_TEST_RULES)
+ androidTestImplementation(TRUTH)
}
protobuf {
- generatedFilesBaseDir = protoGeneratedFilesBaseDir
protoc {
artifact = 'com.google.protobuf:protoc:3.10.0'
}
@@ -85,3 +81,23 @@
}
}
}
+
+// Create jarjar artifact for all variants (debug/release)
+android.libraryVariants.all { variant ->
+ def variantName = variant.name
+ def suffix = variantName.capitalize()
+ def jarjarTask = tasks.create("jarjar${suffix}", JarjarTask) {
+ destinationName "icing-java-${variantName}-jarjar.jar"
+ from 'com.google.protobuf:protobuf-javalite:3.10.0'
+ from files(variant.javaCompileProvider.get().destinationDir)
+ dependsOn variant.javaCompileProvider.get()
+ classRename 'com.google.protobuf.**', 'com.google.android.icing.protobuf.@1'
+ }
+
+ def jarjarConf = configurations.register("jarjar${suffix}")
+ artifacts.add("${jarjarConf.name}", jarjarTask.destinationPath) {
+ name "icing-java-${variantName}-jarjar"
+ type 'jar'
+ builtBy jarjarTask
+ }
+}
diff --git a/icing/absl_ports/mutex.h b/icing/absl_ports/mutex.h
index c49b1e1..382a100 100644
--- a/icing/absl_ports/mutex.h
+++ b/icing/absl_ports/mutex.h
@@ -26,17 +26,19 @@
// Simple wrapper around std::shared_mutex with annotations to allow thread
// annotation checks.
-class LOCKABLE shared_mutex {
+class ICING_LOCKABLE shared_mutex {
public:
- void lock() EXCLUSIVE_LOCK_FUNCTION() { m_.lock(); }
- bool try_lock() EXCLUSIVE_TRYLOCK_FUNCTION(true) { return m_.try_lock(); }
- void unlock() UNLOCK_FUNCTION() { m_.unlock(); }
+ void lock() ICING_EXCLUSIVE_LOCK_FUNCTION() { m_.lock(); }
+ bool try_lock() ICING_EXCLUSIVE_TRYLOCK_FUNCTION(true) {
+ return m_.try_lock();
+ }
+ void unlock() ICING_UNLOCK_FUNCTION() { m_.unlock(); }
- void lock_shared() SHARED_LOCK_FUNCTION() { m_.lock_shared(); }
- bool try_lock_shared() SHARED_TRYLOCK_FUNCTION(true) {
+ void lock_shared() ICING_SHARED_LOCK_FUNCTION() { m_.lock_shared(); }
+ bool try_lock_shared() ICING_SHARED_TRYLOCK_FUNCTION(true) {
return m_.try_lock_shared();
}
- void unlock_shared() UNLOCK_FUNCTION() { m_.unlock_shared(); }
+ void unlock_shared() ICING_UNLOCK_FUNCTION() { m_.unlock_shared(); }
private:
std::shared_mutex m_;
@@ -44,11 +46,11 @@
// Simple wrapper around std::unique_lock with annotations to allow thread
// annotation checks.
-class SCOPED_LOCKABLE unique_lock {
+class ICING_SCOPED_LOCKABLE unique_lock {
public:
- explicit unique_lock(shared_mutex* mu) EXCLUSIVE_LOCK_FUNCTION(mu)
+ explicit unique_lock(shared_mutex* mu) ICING_EXCLUSIVE_LOCK_FUNCTION(mu)
: lock_(*mu) {}
- ~unique_lock() UNLOCK_FUNCTION() = default;
+ ~unique_lock() ICING_UNLOCK_FUNCTION() = default;
private:
std::unique_lock<shared_mutex> lock_;
@@ -56,11 +58,11 @@
// Simple wrapper around std::shared_lock with annotations to allow thread
// annotation checks.
-class SCOPED_LOCKABLE shared_lock {
+class ICING_SCOPED_LOCKABLE shared_lock {
public:
- explicit shared_lock(shared_mutex* mu) SHARED_LOCK_FUNCTION(mu)
+ explicit shared_lock(shared_mutex* mu) ICING_SHARED_LOCK_FUNCTION(mu)
: lock_(*mu) {}
- ~shared_lock() UNLOCK_FUNCTION() = default;
+ ~shared_lock() ICING_UNLOCK_FUNCTION() = default;
private:
std::shared_lock<shared_mutex> lock_;
diff --git a/icing/absl_ports/thread_annotations.h b/icing/absl_ports/thread_annotations.h
index f5de7b7..a60895d 100644
--- a/icing/absl_ports/thread_annotations.h
+++ b/icing/absl_ports/thread_annotations.h
@@ -33,31 +33,32 @@
#define ICING_ABSL_PORTS_THREAD_ANNOTATIONS_H_
#if defined(__clang__)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#define ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(x) __attribute__((x))
#else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
-#endif // defined(__clang__)
+#define ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(x) // no-op
+#endif
-// GUARDED_BY()
+// ICING_GUARDED_BY()
//
// Documents if a shared field or global variable needs to be protected by a
-// mutex. GUARDED_BY() allows the user to specify a particular mutex that
+// mutex. ICING_GUARDED_BY() allows the user to specify a particular mutex that
// should be held when accessing the annotated variable.
//
-// Although this annotation (and PT_GUARDED_BY, below) cannot be applied to
-// local variables, a local variable and its associated mutex can often be
+// Although this annotation (and ICING_PT_GUARDED_BY, below) cannot be applied
+// to local variables, a local variable and its associated mutex can often be
// combined into a small class or struct, thereby allowing the annotation.
//
// Example:
//
// class Foo {
// Mutex mu_;
-// int p1_ GUARDED_BY(mu_);
+// int p1_ ICING_GUARDED_BY(mu_);
// ...
// };
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define ICING_GUARDED_BY(x) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(guarded_by(x))
-// PT_GUARDED_BY()
+// ICING_PT_GUARDED_BY()
//
// Documents if the memory location pointed to by a pointer should be guarded
// by a mutex when dereferencing the pointer.
@@ -65,7 +66,7 @@
// Example:
// class Foo {
// Mutex mu_;
-// int *p1_ PT_GUARDED_BY(mu_);
+// int *p1_ ICING_PT_GUARDED_BY(mu_);
// ...
// };
//
@@ -76,31 +77,32 @@
//
// // `q_`, guarded by `mu1_`, points to a shared memory location that is
// // guarded by `mu2_`:
-// int *q_ GUARDED_BY(mu1_) PT_GUARDED_BY(mu2_);
-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+// int *q_ ICING_GUARDED_BY(mu1_) ICING_PT_GUARDED_BY(mu2_);
+#define ICING_PT_GUARDED_BY(x) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(pt_guarded_by(x))
-// ACQUIRED_AFTER() / ACQUIRED_BEFORE()
+// ICING_ACQUIRED_AFTER() / ICING_ACQUIRED_BEFORE()
//
// Documents the acquisition order between locks that can be held
// simultaneously by a thread. For any two locks that need to be annotated
// to establish an acquisition order, only one of them needs the annotation.
-// (i.e. You don't have to annotate both locks with both ACQUIRED_AFTER
-// and ACQUIRED_BEFORE.)
+// (i.e. You don't have to annotate both locks with both ICING_ACQUIRED_AFTER
+// and ICING_ACQUIRED_BEFORE.)
//
-// As with GUARDED_BY, this is only applicable to mutexes that are shared
+// As with ICING_GUARDED_BY, this is only applicable to mutexes that are shared
// fields or global variables.
//
// Example:
//
// Mutex m1_;
-// Mutex m2_ ACQUIRED_AFTER(m1_);
-#define ACQUIRED_AFTER(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+// Mutex m2_ ICING_ACQUIRED_AFTER(m1_);
+#define ICING_ACQUIRED_AFTER(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(acquired_after(__VA_ARGS__))
-#define ACQUIRED_BEFORE(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+#define ICING_ACQUIRED_BEFORE(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(acquired_before(__VA_ARGS__))
-// EXCLUSIVE_LOCKS_REQUIRED() / SHARED_LOCKS_REQUIRED()
+// ICING_EXCLUSIVE_LOCKS_REQUIRED() / ICING_SHARED_LOCKS_REQUIRED()
//
// Documents a function that expects a mutex to be held prior to entry.
// The mutex is expected to be held both on entry to, and exit from, the
@@ -112,76 +114,78 @@
// concurrently.
//
// Generally, non-const methods should be annotated with
-// EXCLUSIVE_LOCKS_REQUIRED, while const methods should be annotated with
-// SHARED_LOCKS_REQUIRED.
+// ICING_EXCLUSIVE_LOCKS_REQUIRED, while const methods should be annotated with
+// ICING_SHARED_LOCKS_REQUIRED.
//
// Example:
//
// Mutex mu1, mu2;
-// int a GUARDED_BY(mu1);
-// int b GUARDED_BY(mu2);
+// int a ICING_GUARDED_BY(mu1);
+// int b ICING_GUARDED_BY(mu2);
//
-// void foo() EXCLUSIVE_LOCKS_REQUIRED(mu1, mu2) { ... }
-// void bar() const SHARED_LOCKS_REQUIRED(mu1, mu2) { ... }
-#define EXCLUSIVE_LOCKS_REQUIRED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(exclusive_locks_required(__VA_ARGS__))
+// void foo() ICING_EXCLUSIVE_LOCKS_REQUIRED(mu1, mu2) { ... }
+// void bar() const ICING_SHARED_LOCKS_REQUIRED(mu1, mu2) { ... }
+#define ICING_EXCLUSIVE_LOCKS_REQUIRED(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE( \
+ exclusive_locks_required(__VA_ARGS__))
-#define SHARED_LOCKS_REQUIRED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(shared_locks_required(__VA_ARGS__))
+#define ICING_SHARED_LOCKS_REQUIRED(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(shared_locks_required(__VA_ARGS__))
-// LOCKS_EXCLUDED()
+// ICING_LOCKS_EXCLUDED()
//
// Documents the locks acquired in the body of the function. These locks
// cannot be held when calling this function.
-#define LOCKS_EXCLUDED(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define ICING_LOCKS_EXCLUDED(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(locks_excluded(__VA_ARGS__))
-// LOCK_RETURNED()
+// ICING_LOCK_RETURNED()
//
// Documents a function that returns a mutex without acquiring it. For example,
// a public getter method that returns a pointer to a private mutex should
-// be annotated with LOCK_RETURNED.
-#define LOCK_RETURNED(x) \
- THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+// be annotated with ICING_LOCK_RETURNED.
+#define ICING_LOCK_RETURNED(x) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(lock_returned(x))
-// LOCKABLE
+// ICING_LOCKABLE
//
// Documents if a class/type is a lockable type.
-#define LOCKABLE \
- THREAD_ANNOTATION_ATTRIBUTE__(lockable)
+#define ICING_LOCKABLE \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(lockable)
-// SCOPED_LOCKABLE
+// ICING_SCOPED_LOCKABLE
//
// Documents if a class does RAII locking.
-// The constructor should use `LOCK_FUNCTION()` to specify the mutex that is
-// acquired, and the destructor should use `UNLOCK_FUNCTION()` with no
+// The constructor should use `ICING_LOCK_FUNCTION()` to specify the mutex that
+// is acquired, and the destructor should use `ICING_UNLOCK_FUNCTION()` with no
// arguments; the analysis will assume that the destructor unlocks whatever the
// constructor locked.
-#define SCOPED_LOCKABLE \
- THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define ICING_SCOPED_LOCKABLE \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(scoped_lockable)
-// EXCLUSIVE_LOCK_FUNCTION()
+// ICING_EXCLUSIVE_LOCK_FUNCTION()
//
// Documents functions that acquire a lock in the body of a function, and do
// not release it.
-#define EXCLUSIVE_LOCK_FUNCTION(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock_function(__VA_ARGS__))
+#define ICING_EXCLUSIVE_LOCK_FUNCTION(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE( \
+ exclusive_lock_function(__VA_ARGS__))
-// SHARED_LOCK_FUNCTION()
+// ICING_SHARED_LOCK_FUNCTION()
//
// Documents functions that acquire a shared (reader) lock in the body of a
// function, and do not release it.
-#define SHARED_LOCK_FUNCTION(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(shared_lock_function(__VA_ARGS__))
+#define ICING_SHARED_LOCK_FUNCTION(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(shared_lock_function(__VA_ARGS__))
-// UNLOCK_FUNCTION()
+// ICING_UNLOCK_FUNCTION()
//
// Documents functions that expect a lock to be held on entry to the function,
// and release it in the body of the function.
-#define UNLOCK_FUNCTION(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(unlock_function(__VA_ARGS__))
+#define ICING_UNLOCK_FUNCTION(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(unlock_function(__VA_ARGS__))
-// EXCLUSIVE_TRYLOCK_FUNCTION() / SHARED_TRYLOCK_FUNCTION()
+// ICING_EXCLUSIVE_TRYLOCK_FUNCTION() / ICING_SHARED_TRYLOCK_FUNCTION()
//
// Documents functions that try to acquire a lock, and return success or failure
// (or a non-boolean value that can be interpreted as a boolean).
@@ -189,20 +193,22 @@
// success, or `false` for functions that return `false` on success. The second
// argument specifies the mutex that is locked on success. If unspecified, this
// mutex is assumed to be `this`.
-#define EXCLUSIVE_TRYLOCK_FUNCTION(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock_function(__VA_ARGS__))
+#define ICING_EXCLUSIVE_TRYLOCK_FUNCTION(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE( \
+ exclusive_trylock_function(__VA_ARGS__))
-#define SHARED_TRYLOCK_FUNCTION(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock_function(__VA_ARGS__))
+#define ICING_SHARED_TRYLOCK_FUNCTION(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE( \
+ shared_trylock_function(__VA_ARGS__))
-// ASSERT_EXCLUSIVE_LOCK() / ASSERT_SHARED_LOCK()
+// ICING_ASSERT_EXCLUSIVE_LOCK() / ICING_ASSERT_SHARED_LOCK()
//
// Documents functions that dynamically check to see if a lock is held, and fail
// if it is not held.
-#define ASSERT_EXCLUSIVE_LOCK(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(assert_exclusive_lock(__VA_ARGS__))
+#define ICING_ASSERT_EXCLUSIVE_LOCK(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(assert_exclusive_lock(__VA_ARGS__))
-#define ASSERT_SHARED_LOCK(...) \
- THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_lock(__VA_ARGS__))
+#define ICING_ASSERT_SHARED_LOCK(...) \
+ ICING_INTERNAL_THREAD_ANNOTATION_ATTRIBUTE(assert_shared_lock(__VA_ARGS__))
#endif // ICING_ABSL_PORTS_THREAD_ANNOTATIONS_H_
diff --git a/icing/document-builder.h b/icing/document-builder.h
index de3c412..4c95b89 100644
--- a/icing/document-builder.h
+++ b/icing/document-builder.h
@@ -16,6 +16,7 @@
#define ICING_DOCUMENT_BUILDER_H_
#include <cstdint>
+#include <initializer_list>
#include <string>
#include <string_view>
#include <utility>
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 0b36e18..62943b8 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -210,13 +210,23 @@
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+ // Returns the file size of all the elements held in the log. File size is in
+ // bytes. This excludes the size of any internal metadata of the log, e.g. the
+ // log's header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
// An iterator helping to find offsets of all the protos in file.
// Example usage:
//
@@ -736,6 +746,17 @@
}
template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
+ int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (total_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+        "Failed to get file size of elements in the proto log");
+ }
+ return total_file_size - sizeof(Header);
+}
+
+template <typename ProtoT>
FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
const std::string& file_path,
int64_t initial_offset)
diff --git a/icing/file/file-backed-proto.h b/icing/file/file-backed-proto.h
index b7cc5e1..aede8de 100644
--- a/icing/file/file-backed-proto.h
+++ b/icing/file/file-backed-proto.h
@@ -73,7 +73,7 @@
// Returns NOT_FOUND if the file was empty or never written to.
// Returns INTERNAL_ERROR if an IO error or a corruption was encountered.
libtextclassifier3::StatusOr<const ProtoT*> Read() const
- LOCKS_EXCLUDED(mutex_);
+ ICING_LOCKS_EXCLUDED(mutex_);
// Writes the new version of the proto provided through to disk.
// Successful Write() invalidates any previously read version of the proto.
@@ -85,7 +85,7 @@
// We should write to a tmp file first and rename the file to fix this.
// TODO(samzheng) Change to Write(ProtoT&& proto)
libtextclassifier3::Status Write(std::unique_ptr<ProtoT> proto)
- LOCKS_EXCLUDED(mutex_);
+ ICING_LOCKS_EXCLUDED(mutex_);
// Disallow copy and assign.
FileBackedProto(const FileBackedProto&) = delete;
@@ -101,7 +101,7 @@
const Filesystem* const filesystem_;
const std::string file_path_;
- mutable std::unique_ptr<ProtoT> cached_proto_ GUARDED_BY(mutex_);
+ mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_);
};
template <typename ProtoT>
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index f13b67b..e4ec0cd 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -181,7 +181,9 @@
// OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size
libtextclassifier3::Status Set(int32_t idx, const T& value);
- // Resizes to first len elements. The crc is not updated on truncation.
+ // Resizes to first len elements. The crc is cleared on truncation and will be
+ // updated on destruction, or once the client calls ComputeChecksum() or
+ // PersistToDisk().
//
// Returns:
// OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
@@ -194,13 +196,23 @@
// INTERNAL on I/O error
libtextclassifier3::Status PersistToDisk();
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+  // Returns the file size of all the elements held in the vector. File size
+ // is in bytes. This excludes the size of any internal metadata of the vector,
+ // e.g. the vector's header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
// Accessors.
const T* array() const {
return reinterpret_cast<const T*>(mmapped_file_->region());
@@ -567,6 +579,13 @@
new_num_elements, header_->num_elements));
}
+ ICING_VLOG(2)
+ << "FileBackedVector truncating, need to recalculate entire checksum";
+ changes_.clear();
+ saved_original_buffer_.clear();
+ changes_end_ = 0;
+ header_->vector_checksum = 0;
+
header_->num_elements = new_num_elements;
return libtextclassifier3::Status::OK;
}
@@ -643,6 +662,7 @@
}
cur_offset += sizeof(T);
}
+
if (!changes_.empty()) {
ICING_VLOG(2) << IcingStringUtil::StringPrintf(
"Array update partial crcs %d truncated %d overlapped %d duplicate %d",
@@ -705,6 +725,17 @@
return size;
}
+template <typename T>
+libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
+ const {
+ int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (total_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get file size of elements in the file-backed vector");
+ }
+ return total_file_size - sizeof(Header);
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc
index 7561b57..6c3b931 100644
--- a/icing/file/file-backed-vector_test.cc
+++ b/icing/file/file-backed-vector_test.cc
@@ -158,8 +158,8 @@
// Truncate the content
ICING_EXPECT_OK(vector->TruncateTo(0));
- // We don't automatically update the crc when we truncate.
- EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
+ // Crc is cleared after truncation and reset to 0.
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
EXPECT_EQ(0u, vector->num_elements());
}
@@ -409,10 +409,10 @@
EXPECT_EQ(1, vector->num_elements());
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(31158534)));
- // Truncating doesn't cause the checksum to be updated.
+ // Truncating clears the checksum and resets it to 0
ICING_EXPECT_OK(vector->TruncateTo(0));
EXPECT_EQ(0, vector->num_elements());
- EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(31158534)));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
// Can't truncate past end.
EXPECT_THAT(vector->TruncateTo(100),
@@ -423,6 +423,46 @@
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
+TEST_F(FileBackedVectorTest, TruncateAndReReadFile) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<float>> vector,
+ FileBackedVector<float>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ ICING_ASSERT_OK(vector->Set(0, 1.0));
+ ICING_ASSERT_OK(vector->Set(1, 2.0));
+ ICING_ASSERT_OK(vector->Set(2, 2.0));
+ ICING_ASSERT_OK(vector->Set(3, 2.0));
+ ICING_ASSERT_OK(vector->Set(4, 2.0));
+ } // Destroying the vector should trigger a checksum of the 5 elements
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<float>> vector,
+ FileBackedVector<float>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_EQ(5, vector->num_elements());
+ ICING_EXPECT_OK(vector->TruncateTo(4));
+ EXPECT_EQ(4, vector->num_elements());
+ } // Destroying the vector should update the checksum to 4 elements
+
+ // Creating again should double check that our checksum of 4 elements matches
+ // what was previously saved.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<float>> vector,
+ FileBackedVector<float>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_EQ(vector->num_elements(), 4);
+ }
+}
+
} // namespace
} // namespace lib
diff --git a/icing/file/memory-mapped-file-leak_test.cc b/icing/file/memory-mapped-file-leak_test.cc
new file mode 100644
index 0000000..598fb61
--- /dev/null
+++ b/icing/file/memory-mapped-file-leak_test.cc
@@ -0,0 +1,72 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/memory-mapped-file.h"
+
+#include "perftools/profiles/collector/heap/alloc_recorder.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/recorder-test-utils.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+namespace heap_profile = ::perftools::profiles::collector::heap;
+
+using testing::Le;
+
+TEST(MemoryMappedFileTest, MMapMemoryLeak) {
+ std::string test_dir = GetTestTempDir();
+ std::string recorder_dir = test_dir + "/recorder";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(recorder_dir.c_str()));
+
+ ASSERT_TRUE(heap_profile::AllocRecorderStartWithMmapTracking(recorder_dir));
+ {
+ std::string mmfile_dir = test_dir + "/file";
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(mmfile_dir.c_str()));
+ MemoryMappedFile mmfile(filesystem, mmfile_dir + "/mmfile",
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+ // How this works:
+ // We request a 500-byte mapping starting at the 101st byte of the file.
+ // But(!), mmap only accepts offsets that are multiples of page size. So
+ // instead mmfile will create a 600-byte mapping starting at the 1st byte of
+ // file and then return the address of the 101st byte within that mapping.
+ // For this reason, total bytes and peak bytes will be 600 bytes.
+ //
+ // When mmfile goes out of scope it needs to munmap the mapping that it
+ // created. But, remember that the mapping is larger (600 bytes) than what
+ // we requested (500 bytes)! So mmfile needs to remember the actual size of
+ // the mapping, NOT the requested size. Calling munmap with the correct size
+ // will ensure that total_inuse_bytes is 0 after mmfile goes out of scope.
+ // Calling munmap with the requested size would still keep 100 bytes of the
+ // mapping around!
+ mmfile.Remap(100, 500);
+ }
+ heap_profile::AllocRecorderStop();
+
+ // Mmap only affects bytes measurements.
+ ProfileInfo profile_info = SummarizeProfileProto(recorder_dir + ".0.pb.gz");
+ EXPECT_THAT(profile_info.total_alloc_bytes, Le(600));
+ EXPECT_THAT(profile_info.peak_bytes, Le(600));
+ EXPECT_THAT(profile_info.inuse_bytes, Le(0));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/memory-mapped-file.cc b/icing/file/memory-mapped-file.cc
index 0479043..bda01f2 100644
--- a/icing/file/memory-mapped-file.cc
+++ b/icing/file/memory-mapped-file.cc
@@ -37,11 +37,28 @@
file_path_(file_path),
strategy_(mmap_strategy) {}
+MemoryMappedFile::MemoryMappedFile(MemoryMappedFile&& other)
+ // Make sure that mmap_result_ is a nullptr before we call Swap. We don't
+ // care what values the remaining members hold before we swap into other,
+ // but if mmap_result_ holds a non-NULL value before we initialized anything
+ // then other will try to free memory at that address when it's destroyed!
+ : mmap_result_(nullptr) {
+ Swap(&other);
+}
+
+MemoryMappedFile& MemoryMappedFile::operator=(MemoryMappedFile&& other) {
+ // Swap all of our members with other's. This ensures that this object now
+ // holds other's previous resources and that this object's previous resources
+ // are properly freed when other is destroyed at the end of this function.
+ Swap(&other);
+ return *this;
+}
+
MemoryMappedFile::~MemoryMappedFile() { Unmap(); }
void MemoryMappedFile::MemoryMappedFile::Unmap() {
if (mmap_result_ != nullptr) {
- munmap(mmap_result_, region_size_);
+ munmap(mmap_result_, adjusted_mmap_size_);
mmap_result_ = nullptr;
}
@@ -167,5 +184,16 @@
return libtextclassifier3::Status::OK;
}
+void MemoryMappedFile::Swap(MemoryMappedFile* other) {
+ std::swap(filesystem_, other->filesystem_);
+ std::swap(file_path_, other->file_path_);
+ std::swap(strategy_, other->strategy_);
+ std::swap(file_offset_, other->file_offset_);
+ std::swap(region_, other->region_);
+ std::swap(region_size_, other->region_size_);
+ std::swap(adjusted_mmap_size_, other->adjusted_mmap_size_);
+ std::swap(mmap_result_, other->mmap_result_);
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/file/memory-mapped-file.h b/icing/file/memory-mapped-file.h
index 5a52368..1d31a42 100644
--- a/icing/file/memory-mapped-file.h
+++ b/icing/file/memory-mapped-file.h
@@ -79,7 +79,10 @@
// file_path : Full path of the file that needs to be memory-mapped.
MemoryMappedFile(const Filesystem& filesystem, std::string_view file_path,
Strategy mmap_strategy);
-
+ MemoryMappedFile(const MemoryMappedFile& other) = delete;
+ MemoryMappedFile(MemoryMappedFile&& other);
+ MemoryMappedFile& operator=(const MemoryMappedFile& other) = delete;
+ MemoryMappedFile& operator=(MemoryMappedFile&& other);
// Frees any region that is still memory-mapped region.
~MemoryMappedFile();
@@ -134,10 +137,13 @@
Strategy strategy() const { return strategy_; }
private:
+ // Swaps the contents of this with other.
+ void Swap(MemoryMappedFile* other);
+
// Cached constructor params.
- const Filesystem* const filesystem_;
- const std::string file_path_;
- const Strategy strategy_;
+ const Filesystem* filesystem_;
+ std::string file_path_;
+ Strategy strategy_;
// Offset within the file at which the current memory-mapped region starts.
size_t file_offset_ = 0;
diff --git a/icing/file/mock-filesystem.h b/icing/file/mock-filesystem.h
index a82f253..b89295e 100644
--- a/icing/file/mock-filesystem.h
+++ b/icing/file/mock-filesystem.h
@@ -67,22 +67,24 @@
ON_CALL(*this, ListDirectory(_, _))
.WillByDefault(
- [this](const char* dir_name, std::vector<string>* entries) {
+ [this](const char* dir_name, std::vector<std::string>* entries) {
return real_filesystem_.ListDirectory(dir_name, entries);
});
ON_CALL(*this, ListDirectory(_, _, _, _))
.WillByDefault([this](const char* dir_name,
- const std::unordered_set<string>& exclude,
- bool recursive, std::vector<string>* entries) {
+ const std::unordered_set<std::string>& exclude,
+ bool recursive,
+ std::vector<std::string>* entries) {
return real_filesystem_.ListDirectory(dir_name, exclude, recursive,
entries);
});
ON_CALL(*this, GetMatchingFiles)
- .WillByDefault([this](const char* glob, std::vector<string>* matches) {
- return real_filesystem_.GetMatchingFiles(glob, matches);
- });
+ .WillByDefault(
+ [this](const char* glob, std::vector<std::string>* matches) {
+ return real_filesystem_.GetMatchingFiles(glob, matches);
+ });
ON_CALL(*this, OpenForWrite).WillByDefault([this](const char* file_name) {
return real_filesystem_.OpenForWrite(file_name);
diff --git a/icing/helpers/icu/icu-data-file-helper.cc b/icing/helpers/icu/icu-data-file-helper.cc
new file mode 100644
index 0000000..5cf6a1d
--- /dev/null
+++ b/icing/helpers/icu/icu-data-file-helper.cc
@@ -0,0 +1,74 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/helpers/icu/icu-data-file-helper.h"
+
+#include <sys/mman.h>
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "unicode/udata.h"
+#include "unicode/utypes.h"
+
+namespace icing {
+namespace lib {
+
+namespace icu_data_file_helper {
+
+// ICU data file needs to be set up once, it can be shared between different
+// Icing instances. Setting up the file repeatedly may cause out-of-memory
+// errors or segmentation faults.
+bool has_set_up_icu_data_file = false;
+
+libtextclassifier3::Status SetUpICUDataFile(
+ const std::string& icu_data_file_absolute_path) {
+ if (has_set_up_icu_data_file) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ Filesystem filesystem;
+ int64_t file_size =
+ filesystem.GetFileSize(icu_data_file_absolute_path.c_str());
+ ScopedFd fd(filesystem.OpenForRead(icu_data_file_absolute_path.c_str()));
+ if (!fd.is_valid()) {
+ return absl_ports::InternalError("Unable to open file at provided path");
+ }
+
+ // TODO(samzheng): figure out why icing::MemoryMappedFile causes
+ // segmentation fault here.
+ const void* data =
+ mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd.get(), 0);
+
+ UErrorCode status = U_ZERO_ERROR;
+ udata_setCommonData(data, &status);
+
+ if (U_FAILURE(status)) {
+ return absl_ports::InternalError(
+ "Failed to set up ICU data, please check if you have the data file at "
+ "the given path.");
+ }
+
+ has_set_up_icu_data_file = true;
+
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace icu_data_file_helper
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/helpers/icu/icu-data-file-helper.h b/icing/helpers/icu/icu-data-file-helper.h
new file mode 100644
index 0000000..90f5bc7
--- /dev/null
+++ b/icing/helpers/icu/icu-data-file-helper.h
@@ -0,0 +1,43 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+
+namespace icing {
+namespace lib {
+
+namespace icu_data_file_helper {
+
+// The library binary doesn't contain any ICU data files, so we generate a .dat
+// file at compile time and here make ICU use that file.
+//
+// NOTE: This target does NOT contain the ICU .dat file. To use this helper
+// function, the calling target must include a data dependency on the .dat file
+// and provide a valid path to that file.
+//
+// Returns:
+// Ok on success
+// INTERNAL_ERROR if failed on any errors
+libtextclassifier3::Status SetUpICUDataFile(
+ const std::string& icu_data_file_absolute_path);
+
+} // namespace icu_data_file_helper
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc
new file mode 100644
index 0000000..32ac9e6
--- /dev/null
+++ b/icing/icing-search-engine-with-icu-file_test.cc
@@ -0,0 +1,121 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/icing-search-engine.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Eq;
+
+std::string GetTestBaseDir() {
+ return GetTestTempDir() + "/icing_with_icu_files";
+}
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ return icing_options;
+}
+
+DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
+ return DocumentBuilder()
+ .SetKey(std::move(name_space), std::move(uri))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(1)
+ .Build();
+}
+
+SchemaProto CreateMessageSchema() {
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
+ body->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ return schema;
+}
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+TEST(IcingSearchEngineWithIcuFileTest, ShouldInitialize) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+}
+
+TEST(IcingSearchEngineWithIcuFileTest, ShouldIndexAndSearch) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
+ ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
+ ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document_two;
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.status().code(), Eq(StatusProto::OK));
+ // The token is a random number so we don't verify it.
+ expected_search_result_proto.set_next_page_token(
+ search_result_proto.next_page_token());
+ EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 2b07205..c973885 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -37,6 +37,7 @@
#include "icing/proto/initialize.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/status.pb.h"
@@ -45,11 +46,14 @@
#include "icing/schema/schema-store.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
+#include "icing/scoring/ranker.h"
#include "icing/scoring/scored-document-hit.h"
#include "icing/scoring/scoring-processor.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
@@ -81,9 +85,21 @@
libtextclassifier3::Status ValidateResultSpec(
const ResultSpecProto& result_spec) {
- if (result_spec.num_to_retrieve() < 0) {
+ if (result_spec.num_per_page() < 0) {
return absl_ports::InvalidArgumentError(
- "ResultSpecProto.num_to_retrieve cannot be negative.");
+ "ResultSpecProto.num_per_page cannot be negative.");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status ValidateSearchSpec(
+ const SearchSpecProto& search_spec,
+ const PerformanceConfiguration& configuration) {
+ if (search_spec.query().size() > configuration.max_query_length) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("SearchSpecProto.query is longer than the maximum "
+ "allowed query length: ",
+ std::to_string(configuration.max_query_length)));
}
return libtextclassifier3::Status::OK;
}
@@ -130,37 +146,6 @@
return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
}
-// Helper function to wrap results in ScoredDocumentHit without changing the
-// order.
-std::vector<ScoredDocumentHit> WrapResults(
- std::unique_ptr<DocHitInfoIterator> result_iterator, int num_to_return) {
- std::vector<ScoredDocumentHit> document_hits;
- while (result_iterator->Advance().ok() && num_to_return-- > 0) {
- const DocHitInfo& doc_hit_info = result_iterator->doc_hit_info();
- // Score is just a placeholder here and has no meaning.
- document_hits.emplace_back(doc_hit_info.document_id(),
- doc_hit_info.hit_section_ids_mask(),
- /*score=*/0);
- }
- return document_hits;
-}
-
-libtextclassifier3::StatusOr<std::vector<ScoredDocumentHit>> RunScoring(
- std::unique_ptr<DocHitInfoIterator> result_iterator,
- const ScoringSpecProto& scoring_spec, int num_to_return,
- const DocumentStore* document_store) {
- if (scoring_spec.rank_by() == ScoringSpecProto::RankingStrategy::NONE) {
- // No scoring needed, return in original order
- return WrapResults(std::move(result_iterator), num_to_return);
- }
-
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store));
- return scoring_processor->ScoreAndRank(std::move(result_iterator),
- num_to_return);
-}
-
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
switch (internal_status.CanonicalCode()) {
@@ -199,8 +184,9 @@
default:
// Other internal status codes aren't supported externally yet. If it
// should be supported, add another switch-case above.
- ICING_LOG(FATAL)
- << "Internal status code not supported in the external API";
+ ICING_LOG(FATAL) << IcingStringUtil::StringPrintf(
+ "Internal status code %d not supported in the external API",
+ internal_status.error_code());
break;
}
@@ -209,17 +195,22 @@
} // namespace
-IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options)
+IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const JniCache> jni_cache)
: IcingSearchEngine(options, std::make_unique<Filesystem>(),
- std::make_unique<Clock>()) {}
+ std::make_unique<Clock>(), std::move(jni_cache)) {}
IcingSearchEngine::IcingSearchEngine(
IcingSearchEngineOptions options,
- std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock)
+ std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock,
+ std::unique_ptr<const JniCache> jni_cache)
: options_(std::move(options)),
filesystem_(std::move(filesystem)),
icing_filesystem_(std::make_unique<IcingFilesystem>()),
- clock_(std::move(clock)) {
+ clock_(std::move(clock)),
+ result_state_manager_(performance_configuration_.max_num_hits_per_query,
+ performance_configuration_.max_num_cache_results),
+ jni_cache_(std::move(jni_cache)) {
ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
}
@@ -233,22 +224,27 @@
}
InitializeResultProto IcingSearchEngine::Initialize() {
+ // This method does both read and write so we need a writer lock. Using two
+ // locks (reader and writer) has the chance to be interrupted during
+ // switching.
+ absl_ports::unique_lock l(&mutex_);
+ return InternalInitialize();
+}
+
+InitializeResultProto IcingSearchEngine::InternalInitialize() {
ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
<< options_.base_dir();
InitializeResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
-
if (initialized_) {
// Already initialized.
result_status->set_code(StatusProto::OK);
return result_proto;
}
- // This method does both read and write so we need a writer lock. Using two
- // locks (reader and writer) has the chance to be interrupted during
- // switching.
- absl_ports::unique_lock l(&mutex_);
+ // Releases result / query cache if any
+ result_state_manager_.InvalidateAllResultStates();
libtextclassifier3::Status status = InitializeMembers();
if (!status.ok()) {
@@ -279,10 +275,14 @@
ICING_RETURN_IF_ERROR(InitializeSchemaStore());
ICING_RETURN_IF_ERROR(InitializeDocumentStore());
- TC3_ASSIGN_OR_RETURN(language_segmenter_, LanguageSegmenter::Create());
+ // TODO(b/156383798) : Resolve how to specify the locale.
+ language_segmenter_factory::SegmenterOptions segmenter_options(
+ ULOC_US, jni_cache_.get());
+ TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create(
+ std::move(segmenter_options)));
TC3_ASSIGN_OR_RETURN(normalizer_,
- Normalizer::Create(options_.max_token_length()));
+ normalizer_factory::Create(options_.max_token_length()));
ICING_RETURN_IF_ERROR(InitializeIndex());
@@ -422,14 +422,19 @@
SetSchemaResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
libtextclassifier3::Status status = SchemaUtil::Validate(new_schema);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
- absl_ports::unique_lock l(&mutex_);
-
auto lost_previous_schema_or = LostPreviousSchema();
if (!lost_previous_schema_or.ok()) {
TransformStatus(lost_previous_schema_or.status(), result_status);
@@ -504,6 +509,11 @@
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto schema_or = schema_store_->GetSchema();
if (!schema_or.ok()) {
@@ -522,6 +532,11 @@
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type);
if (!type_config_or.ok()) {
@@ -548,6 +563,11 @@
// the schema file to validate, and the schema could be changed in
// SetSchema() which is protected by the same mutex.
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto document_id_or = document_store_->Put(document);
if (!document_id_or.ok()) {
@@ -582,6 +602,11 @@
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto document_or = document_store_->Get(name_space, uri);
if (!document_or.ok()) {
@@ -594,6 +619,22 @@
return result_proto;
}
+GetAllNamespacesResultProto IcingSearchEngine::GetAllNamespaces() {
+ GetAllNamespacesResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::shared_lock l(&mutex_);
+
+ std::vector<std::string> namespaces = document_store_->GetAllNamespaces();
+
+ for (const std::string& namespace_ : namespaces) {
+ result_proto.add_namespaces(namespace_);
+ }
+
+ result_status->set_code(StatusProto::OK);
+ return result_proto;
+}
+
DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
const std::string_view uri) {
ICING_VLOG(1) << "Deleting document from doc store";
@@ -602,6 +643,11 @@
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
// TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
@@ -622,14 +668,20 @@
const std::string_view name_space) {
ICING_VLOG(1) << "Deleting namespace from doc store";
+ DeleteByNamespaceResultProto delete_result;
+ StatusProto* result_status = delete_result.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return delete_result;
+ }
// TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteByNamespace(name_space);
- DeleteByNamespaceResultProto delete_result;
- TransformStatus(status, delete_result.mutable_status());
+ TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete Namespace: " << name_space;
@@ -642,14 +694,20 @@
const std::string_view schema_type) {
ICING_VLOG(1) << "Deleting type from doc store";
+ DeleteBySchemaTypeResultProto delete_result;
+ StatusProto* result_status = delete_result.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return delete_result;
+ }
// TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteBySchemaType(schema_type);
- DeleteBySchemaTypeResultProto delete_result;
- TransformStatus(status, delete_result.mutable_status());
+ TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete SchemaType: " << schema_type;
@@ -665,6 +723,11 @@
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto status = InternalPersistToDisk();
TransformStatus(status, result_status);
@@ -684,6 +747,14 @@
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ // Releases result / query cache if any
+ result_state_manager_.InvalidateAllResultStates();
// Flushes data to disk before doing optimization
auto status = InternalPersistToDisk();
@@ -732,6 +803,54 @@
return result_proto;
}
+GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
+ ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine";
+
+ GetOptimizeInfoResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ // Get stats from DocumentStore
+ auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
+ if (!doc_store_optimize_info_or.ok()) {
+ TransformStatus(doc_store_optimize_info_or.status(), result_status);
+ return result_proto;
+ }
+ DocumentStore::OptimizeInfo doc_store_optimize_info =
+ doc_store_optimize_info_or.ValueOrDie();
+ result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs);
+
+ if (doc_store_optimize_info.optimizable_docs == 0) {
+ // Can return early since there's nothing to calculate on the index side
+ result_proto.set_estimated_optimizable_bytes(0);
+ result_status->set_code(StatusProto::OK);
+ return result_proto;
+ }
+
+ // Get stats from Index.
+ auto index_elements_size_or = index_->GetElementsSize();
+ if (!index_elements_size_or.ok()) {
+ TransformStatus(index_elements_size_or.status(), result_status);
+ return result_proto;
+ }
+ int64_t index_elements_size = index_elements_size_or.ValueOrDie();
+
+ // Sum up the optimizable sizes from DocumentStore and Index
+ result_proto.set_estimated_optimizable_bytes(
+ index_elements_size * doc_store_optimize_info.optimizable_docs /
+ doc_store_optimize_info.total_docs +
+ doc_store_optimize_info.estimated_optimizable_bytes);
+
+ result_status->set_code(StatusProto::OK);
+ return result_proto;
+}
+
libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk() {
ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
ICING_RETURN_IF_ERROR(document_store_->PersistToDisk());
@@ -811,15 +930,24 @@
const ResultSpecProto& result_spec) {
SearchResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ // TODO(b/146008613) Explore ideas to make this function read-only.
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
libtextclassifier3::Status status = ValidateResultSpec(result_spec);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
-
- // TODO(b/146008613) Explore ideas to make this function read-only.
- absl_ports::unique_lock l(&mutex_);
+ status = ValidateSearchSpec(search_spec, performance_configuration_);
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
// Gets unordered results from query processor
auto query_processor_or = QueryProcessor::Create(
@@ -840,31 +968,56 @@
QueryProcessor::QueryResults query_results =
std::move(query_results_or).ValueOrDie();
- // Generates the final list of document hits
- auto result_document_hits_or =
- RunScoring(std::move(query_results.root_iterator), scoring_spec,
- result_spec.num_to_retrieve(), document_store_.get());
- if (!result_document_hits_or.ok()) {
- TransformStatus(result_document_hits_or.status(), result_status);
+ // Scores but does not rank the results.
+ libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
+ scoring_processor_or =
+ ScoringProcessor::Create(scoring_spec, document_store_.get());
+ if (!scoring_processor_or.ok()) {
+ TransformStatus(scoring_processor_or.status(), result_status);
return result_proto;
}
+ std::unique_ptr<ScoringProcessor> scoring_processor =
+ std::move(scoring_processor_or).ValueOrDie();
std::vector<ScoredDocumentHit> result_document_hits =
- std::move(result_document_hits_or).ValueOrDie();
+ scoring_processor->Score(std::move(query_results.root_iterator),
+ performance_configuration_.num_to_score);
+
+ // Returns early for empty result
+ if (result_document_hits.empty()) {
+ result_status->set_code(StatusProto::OK);
+ return result_proto;
+ }
+
+ // Ranks and paginates results
+ libtextclassifier3::StatusOr<PageResultState> page_result_state_or =
+ result_state_manager_.RankAndPaginate(ResultState(
+ std::move(result_document_hits), std::move(query_results.query_terms),
+ search_spec, scoring_spec, result_spec));
+ if (!page_result_state_or.ok()) {
+ TransformStatus(page_result_state_or.status(), result_status);
+ return result_proto;
+ }
+ PageResultState page_result_state =
+ std::move(page_result_state_or).ValueOrDie();
// Retrieves the document protos and snippets if requested
- auto result_retriever_or = ResultRetriever::Create(
- document_store_.get(), schema_store_.get(), language_segmenter_.get());
+ auto result_retriever_or =
+ ResultRetriever::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get());
if (!result_retriever_or.ok()) {
+ result_state_manager_.InvalidateResultState(
+ page_result_state.next_page_token);
TransformStatus(result_retriever_or.status(), result_status);
return result_proto;
}
std::unique_ptr<ResultRetriever> result_retriever =
std::move(result_retriever_or).ValueOrDie();
- auto results_or = result_retriever->RetrieveResults(
- result_spec, query_results.query_terms, search_spec.term_match_type(),
- result_document_hits);
+ libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
+ results_or = result_retriever->RetrieveResults(page_result_state);
if (!results_or.ok()) {
+ result_state_manager_.InvalidateResultState(
+ page_result_state.next_page_token);
TransformStatus(results_or.status(), result_status);
return result_proto;
}
@@ -877,9 +1030,84 @@
result_proto.mutable_results()->Add(std::move(result));
}
result_status->set_code(StatusProto::OK);
+ if (page_result_state.next_page_token != kInvalidNextPageToken) {
+ result_proto.set_next_page_token(page_result_state.next_page_token);
+ }
return result_proto;
}
+SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
+ SearchResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ // ResultStateManager has its own writer lock, so here we only need a reader
+ // lock for other components.
+ absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ libtextclassifier3::StatusOr<PageResultState> page_result_state_or =
+ result_state_manager_.GetNextPage(next_page_token);
+
+ if (!page_result_state_or.ok()) {
+ if (absl_ports::IsNotFound(page_result_state_or.status())) {
+ // NOT_FOUND means an empty result.
+ result_status->set_code(StatusProto::OK);
+ } else {
+ // Real error, pass up.
+ TransformStatus(page_result_state_or.status(), result_status);
+ }
+ return result_proto;
+ }
+
+ PageResultState page_result_state =
+ std::move(page_result_state_or).ValueOrDie();
+
+ // Retrieves the document protos.
+ auto result_retriever_or =
+ ResultRetriever::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get());
+ if (!result_retriever_or.ok()) {
+ TransformStatus(result_retriever_or.status(), result_status);
+ return result_proto;
+ }
+ std::unique_ptr<ResultRetriever> result_retriever =
+ std::move(result_retriever_or).ValueOrDie();
+
+ libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
+ results_or = result_retriever->RetrieveResults(page_result_state);
+ if (!results_or.ok()) {
+ TransformStatus(results_or.status(), result_status);
+ return result_proto;
+ }
+ std::vector<SearchResultProto::ResultProto> results =
+ std::move(results_or).ValueOrDie();
+
+ // Assembles the final search result proto
+ result_proto.mutable_results()->Reserve(results.size());
+ for (SearchResultProto::ResultProto& result : results) {
+ result_proto.mutable_results()->Add(std::move(result));
+ }
+
+ result_status->set_code(StatusProto::OK);
+ if (result_proto.results_size() > 0) {
+ result_proto.set_next_page_token(next_page_token);
+ }
+ return result_proto;
+}
+
+void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
+ absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
+ return;
+ }
+ result_state_manager_.InvalidateResultState(next_page_token);
+}
+
libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
// Gets the current directory path and an empty tmp directory path for
// document store optimization.
@@ -1024,5 +1252,49 @@
return document_store_->last_added_document_id() != kInvalidDocumentId;
}
+ResetResultProto IcingSearchEngine::Reset() {
+ ICING_VLOG(1) << "Resetting IcingSearchEngine";
+
+ ResetResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
+
+ if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
+ int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
+ if (after_size != before_size) {
+ // Our filesystem doesn't atomically delete. If we have a discrepancy in
+ // size, then that means we may have deleted some files, but not others.
+ // So our data is in an invalid state now.
+ result_status->set_code(StatusProto::INTERNAL);
+ return result_proto;
+ }
+
+ result_status->set_code(StatusProto::ABORTED);
+ return result_proto;
+ }
+
+ absl_ports::unique_lock l(&mutex_);
+ initialized_ = false;
+ if (InternalInitialize().status().code() != StatusProto::OK) {
+ // We shouldn't hit the following Initialize errors:
+ // NOT_FOUND: all data was cleared, we aren't expecting anything
+ // DATA_LOSS: all data was cleared, we aren't expecting anything
+ // RESOURCE_EXHAUSTED: just deleted files, shouldn't run out of space
+ //
+ // We can't tell if Initialize failed and left Icing in an inconsistent
+ // state or if it was a temporary I/O error. Group everything under INTERNAL
+ // to be safe.
+ //
+ // TODO(b/147699081): Once Initialize returns the proper ABORTED/INTERNAL
+ // status code, we can just propagate it up from here.
+ result_status->set_code(StatusProto::INTERNAL);
+ return result_proto;
+ }
+
+ result_status->set_code(StatusProto::OK);
+ return result_proto;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 281db30..6ae76d7 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -20,6 +20,7 @@
#include <string>
#include <string_view>
+#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/mutex.h"
@@ -27,13 +28,16 @@
#include "icing/file/filesystem.h"
#include "icing/index/index.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/performance-configuration.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
+#include "icing/result/result-state-manager.h"
#include "icing/schema/schema-store.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
@@ -57,7 +61,12 @@
uint32_t checksum;
};
- explicit IcingSearchEngine(const IcingSearchEngineOptions& options);
+ // Note: It is only required to provide a pointer to a valid instance of
+ // JniCache if this instance needs to perform reverse-jni calls. Users on
+ // Linux and iOS should always provide a nullptr.
+ explicit IcingSearchEngine(
+ const IcingSearchEngineOptions& options,
+ std::unique_ptr<const JniCache> jni_cache = nullptr);
// Calculates integrity checks and persists files to disk.
~IcingSearchEngine();
@@ -85,7 +94,7 @@
// IcingSearchEngineOptions.base_dir and reinitialize.
// RESOURCE_EXHAUSTED if not enough storage space
// NOT_FOUND if missing some internal data
- InitializeResultProto Initialize() LOCKS_EXCLUDED(mutex_);
+ InitializeResultProto Initialize() ICING_LOCKS_EXCLUDED(mutex_);
// Specifies the schema to be applied on all Documents that are already
// stored as well as future documents. A schema can be 'invalid' and/or
@@ -114,14 +123,17 @@
// So, callers should only have to call this if the schema changed.
// However, calling it multiple times with the same schema is a no-op.
//
- // On any error, Icing will keep using the older schema.
+ // On some errors, Icing will keep using the older schema, but on
+ // INTERNAL_ERROR, it is undefined to continue using Icing.
//
// Returns:
// OK on success
// INVALID_ARGUMENT if 'new_schema' is invalid
- // FAILED_PRECONDITION if 'new_schema' is incompatible
+ // FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
+ // has not been initialized yet.
// INTERNAL_ERROR if Icing failed to store the new schema or upgrade
- // existing data based on the new schema.
+ // existing data based on the new schema. Using Icing beyond this error is
+ // undefined and may cause crashes.
//
// TODO(cassiewang) Figure out, document (and maybe even enforce) the best
// way ordering of calls between Initialize() and SetSchema(), both when
@@ -129,7 +141,7 @@
// time and when the caller is reinitializing an existing index on disk.
SetSchemaResultProto SetSchema(
SchemaProto&& new_schema, bool ignore_errors_and_delete_documents = false)
- LOCKS_EXCLUDED(mutex_);
+ ICING_LOCKS_EXCLUDED(mutex_);
// This function makes a copy of the schema and calls SetSchema(SchemaProto&&
// new_schema, bool ignore_errors_and_delete_documents)
@@ -137,27 +149,30 @@
// NOTE: It's recommended to call SetSchema(SchemaProto&& new_schema, bool
// ignore_errors_and_delete_documents) directly to avoid a copy if the caller
// can make an rvalue SchemaProto.
- SetSchemaResultProto SetSchema(
- const SchemaProto& new_schema,
- bool ignore_errors_and_delete_documents = false) LOCKS_EXCLUDED(mutex_);
+ SetSchemaResultProto SetSchema(const SchemaProto& new_schema,
+ bool ignore_errors_and_delete_documents =
+ false) ICING_LOCKS_EXCLUDED(mutex_);
// Get Icing's current copy of the schema.
//
// Returns:
// SchemaProto on success
// NOT_FOUND if a schema has not been set yet
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet.
// INTERNAL_ERROR on IO error
- GetSchemaResultProto GetSchema() LOCKS_EXCLUDED(mutex_);
+ GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_);
// Get Icing's copy of the SchemaTypeConfigProto of name schema_type
//
// Returns:
// SchemaTypeConfigProto on success
- // NOT_FOUND if a schema has not been set yet or if there is no
- // SchemaTypeConfig of schema_type in the SchemaProto
+ // FAILED_PRECONDITION if a schema has not been set yet, or IcingSearchEngine
+ // has not been initialized yet.
+ // NOT_FOUND if there is no SchemaTypeConfig of schema_type in the
+ // SchemaProto
// INTERNAL_ERROR on IO error
GetSchemaTypeResultProto GetSchemaType(std::string_view schema_type)
- LOCKS_EXCLUDED(mutex_);
+ ICING_LOCKS_EXCLUDED(mutex_);
// Puts the document into icing search engine so that it's stored and
// indexed. Documents are automatically written to disk, callers can also
@@ -165,15 +180,20 @@
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if a schema has not been set yet, or IcingSearchEngine
+ // has not been initialized yet.
+ // NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches
+ // the document's schema
// INTERNAL_ERROR on IO error
- PutResultProto Put(DocumentProto&& document) LOCKS_EXCLUDED(mutex_);
+ PutResultProto Put(DocumentProto&& document) ICING_LOCKS_EXCLUDED(mutex_);
// This function makes a copy of document and calls Put(DocumentProto&&
// document).
//
// NOTE: It's recommended to call Put(DocumentProto&& document) directly to
// avoid a copy if the caller can make an rvalue DocumentProto.
- PutResultProto Put(const DocumentProto& document) LOCKS_EXCLUDED(mutex_);
+ PutResultProto Put(const DocumentProto& document)
+ ICING_LOCKS_EXCLUDED(mutex_);
// Finds and returns the document identified by the given key (namespace +
// uri)
@@ -181,50 +201,94 @@
// Returns:
// The document found on success
// NOT_FOUND if the key doesn't exist or doc has been deleted
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
GetResultProto Get(std::string_view name_space, std::string_view uri);
+ // Returns all the namespaces that have at least one valid document in it.
+ //
+ // Returns:
+ // All namespaces on success
+ GetAllNamespacesResultProto GetAllNamespaces();
+
// Deletes the Document specified by the given namespace / uri pair from the
// search engine. Delete changes are automatically applied to disk, callers
// can also call PersistToDisk() to flush changes immediately.
//
+ // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
// Returns:
// OK on success
+ // NOT_FOUND if no document exists with namespace, uri
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
DeleteResultProto Delete(std::string_view name_space, std::string_view uri)
- LOCKS_EXCLUDED(mutex_);
+ ICING_LOCKS_EXCLUDED(mutex_);
// Deletes all Documents belonging to the specified namespace from the search
// engine. Delete changes are automatically applied to disk, callers can also
// call PersistToDisk() to flush changes immediately.
//
+ // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
// Returns:
// OK on success
+ // NOT_FOUND if namespace doesn't exist
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space)
- LOCKS_EXCLUDED(mutex_);
+ ICING_LOCKS_EXCLUDED(mutex_);
// Deletes all Documents belonging to the specified type from the search
// engine. Delete changes are automatically applied to disk, callers can also
// call PersistToDisk() to flush changes immediately.
//
- // Returns:
- // OK on success
- // INTERNAL_ERROR on IO error
- DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
- LOCKS_EXCLUDED(mutex_);
-
- // Retrieves, scores, ranks, and returns the results according to the specs.
- // Please refer to each proto file for spec definitions.
+ // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
- // A SearchResultProto on success
+ // OK on success
+ // NOT_FOUND if schema type doesn't exist
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on IO error
+ DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Retrieves, scores, ranks, and returns the results according to the specs.
+ // Results can be empty. If there are multiple pages of results,
+ // SearchResultProto.next_page_token will be populated and that can be used to
+ // fetch more pages via GetNextPage() method. Clients should call
+ // InvalidateNextPageToken() after they get the pages they need to release
+ // result cache in memory. Please refer to each proto file for spec
+ // definitions.
+ //
+ // Returns a SearchResultProto with status:
+ // OK with results on success
// INVALID_ARGUMENT if any of specs is invalid
+ // ABORTED if failed to perform search but existing data is not affected
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on any other errors
SearchResultProto Search(const SearchSpecProto& search_spec,
const ScoringSpecProto& scoring_spec,
const ResultSpecProto& result_spec)
- LOCKS_EXCLUDED(mutex_);
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Fetches the next page of results of a previously executed query. Results
+ // can be empty if next-page token is invalid or all pages have been returned.
+ //
+ // Returns a SearchResultProto with status:
+ // OK with results on success
+ // ABORTED if failed to get results but existing data is not affected
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on any other errors
+ SearchResultProto GetNextPage(uint64_t next_page_token)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Invalidates the next-page token so that no more results of the related
+ // query can be returned.
+ void InvalidateNextPageToken(uint64_t next_page_token);
// Makes sure that every update/delete received till this point is flushed
// to disk. If the app crashes after a call to PersistToDisk(), Icing
@@ -236,32 +300,52 @@
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
// INTERNAL on I/O error
- PersistToDiskResultProto PersistToDisk() LOCKS_EXCLUDED(mutex_);
+ PersistToDiskResultProto PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_);
// Allows Icing to run tasks that are too expensive and/or unnecessary to be
// executed in real-time, but are useful to keep it fast and be
// resource-efficient. This method purely optimizes the internal files and
// has no functional impact on what gets accepted/returned.
//
- // NOTE: This method should be called about once every 24 hours when the
- // device is idle and charging. It can also be called when the system needs
- // to free up extra disk-space.
- //
// WARNING: This method is CPU and IO intensive and depending on the
// contents stored, it can take from a few seconds to a few minutes.
// This call also blocks all read/write operations on Icing.
//
+ // SUGGESTION: Assuming the client has no restrictions on their side, it's
+ // recommended to call this method about once every 24 hours when the
+ // device is idle and charging. It can also be called when the system needs
+ // to free up extra disk-space.
+ //
// Returns:
// OK on success
// ABORTED_ERROR if optimization is aborted due to non-fatal errors before
// actual modifications are made.
// DATA_LOSS_ERROR on errors that could potentially cause data loss,
// IcingSearchEngine is still functioning.
- // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Icing
- // could be in an inconsistent state and might not be usable.
+ // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued
+ // use of Icing is undefined.
// Clients could clear and reinitialize IcingSearchEngine.
- OptimizeResultProto Optimize() LOCKS_EXCLUDED(mutex_);
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
+ OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Returns potential size and document savings if Optimize were called.
+ //
+ // Returns:
+ // OK on success
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on IO error
+ GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Clears all data from Icing and re-initializes. Clients DO NOT need to call
+ // Initialize again.
+ //
+ // Returns:
+ // OK on success
+ // ABORTED_ERROR if failed to delete underlying files
+ // INTERNAL_ERROR if internal state is no longer consistent
+ ResetResultProto Reset() ICING_LOCKS_EXCLUDED(mutex_);
// Disallow copy and move.
IcingSearchEngine(const IcingSearchEngine&) = delete;
@@ -270,40 +354,58 @@
protected:
IcingSearchEngine(IcingSearchEngineOptions options,
std::unique_ptr<const Filesystem> filesystem,
- std::unique_ptr<Clock> clock);
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<const JniCache> jni_cache = nullptr);
private:
const IcingSearchEngineOptions options_;
const std::unique_ptr<const Filesystem> filesystem_;
const std::unique_ptr<const IcingFilesystem> icing_filesystem_;
- bool initialized_ = false;
+ bool initialized_ ICING_GUARDED_BY(mutex_) = false;
// Abstraction for accessing time values.
std::unique_ptr<Clock> clock_;
+ // Provides key thresholds that affects the running time and memory of major
+ // components in Icing search engine.
+ PerformanceConfiguration performance_configuration_;
+
+ // Used to manage pagination state of query results. A lock is not needed here
+ // because ResultStateManager has its own reader-writer lock.
+ ResultStateManager result_state_manager_;
+
// Used to provide reader and writer locks
absl_ports::shared_mutex mutex_;
// Stores and processes the schema
- std::unique_ptr<SchemaStore> schema_store_ GUARDED_BY(mutex_);
+ std::unique_ptr<SchemaStore> schema_store_ ICING_GUARDED_BY(mutex_);
// Used to store all valid documents
- std::unique_ptr<DocumentStore> document_store_ GUARDED_BY(mutex_);
+ std::unique_ptr<DocumentStore> document_store_ ICING_GUARDED_BY(mutex_);
std::unique_ptr<const LanguageSegmenter> language_segmenter_
- GUARDED_BY(mutex_);
+ ICING_GUARDED_BY(mutex_);
- std::unique_ptr<const Normalizer> normalizer_ GUARDED_BY(mutex_);
+ std::unique_ptr<const Normalizer> normalizer_ ICING_GUARDED_BY(mutex_);
// Storage for all hits of content from the document store.
- std::unique_ptr<Index> index_ GUARDED_BY(mutex_);
+ std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_);
+
+ // Pointer to JNI class references
+ const std::unique_ptr<const JniCache> jni_cache_;
// Helper method to do the actual work to persist data to disk. We need this
// separate method so that other public methods don't need to call
// PersistToDisk(). Public methods calling each other may cause deadlock
// issues.
libtextclassifier3::Status InternalPersistToDisk()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to do the actual work to Initialize. We need this separate
+ // method so that other public methods don't need to call Initialize(). Public
+ // methods calling each other may cause deadlock issues.
+ InitializeResultProto InternalInitialize()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Helper method to initialize member variables.
//
@@ -313,7 +415,7 @@
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL on any I/O errors
libtextclassifier3::Status InitializeMembers()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any validation/setup required for the given IcingSearchEngineOptions
//
@@ -322,7 +424,7 @@
// INVALID_ARGUMENT if options has invalid values
// INTERNAL on I/O error
libtextclassifier3::Status InitializeOptions()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any initialization/recovery necessary to create a SchemaStore instance.
//
@@ -330,7 +432,7 @@
// OK on success
// INTERNAL on I/O error
libtextclassifier3::Status InitializeSchemaStore()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any initialization/recovery necessary to create a DocumentStore
// instance.
@@ -339,7 +441,7 @@
// OK on success
// INTERNAL on I/O error
libtextclassifier3::Status InitializeDocumentStore()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any initialization/recovery necessary to create a DocumentStore
// instance.
@@ -349,7 +451,8 @@
// RESOURCE_EXHAUSTED if the index runs out of storage
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL on I/O error
- libtextclassifier3::Status InitializeIndex() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ libtextclassifier3::Status InitializeIndex()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Many of the internal components rely on other components' derived data.
// Check that everything is consistent with each other so that we're not
@@ -364,7 +467,7 @@
// NOT_FOUND if missing header file
// INTERNAL_ERROR on any IO errors or if header is inconsistent
libtextclassifier3::Status CheckConsistency()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Repopulates derived data off our ground truths.
//
@@ -372,7 +475,7 @@
// OK on success
// INTERNAL_ERROR on any IO errors
libtextclassifier3::Status RegenerateDerivedFiles()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Optimizes the DocumentStore by removing any unneeded documents (i.e.
// deleted, expired, etc.) from the filesystem storage.
@@ -389,7 +492,7 @@
// INTERNAL_ERROR on any IO errors or other errors that we can't recover
// from
libtextclassifier3::Status OptimizeDocumentStore()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Helper method to restore missing document data in index_. All documents
// will be reindexed. This does not clear the index, so it is recommended to
@@ -400,7 +503,8 @@
// RESOURCE_EXHAUSTED if the index fills up before finishing indexing
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL_ERROR on any IO errors
- libtextclassifier3::Status RestoreIndex() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ libtextclassifier3::Status RestoreIndex()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Computes the combined checksum of the IcingSearchEngine - includes all its
// subcomponents
@@ -409,16 +513,16 @@
// Combined checksum on success
// INTERNAL_ERROR on compute error
libtextclassifier3::StatusOr<Crc32> ComputeChecksum()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Checks if the header exists already. This does not create the header file
// if it doesn't exist.
- bool HeaderExists() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ bool HeaderExists() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Update and replace the header file. Creates the header file if it doesn't
// exist.
libtextclassifier3::Status UpdateHeader(const Crc32& checksum)
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// If we lost the schema during a previous failure, it may "look" the same as
// not having a schema set before: we don't have a schema proto file. So do
@@ -430,7 +534,7 @@
// bool indicating if we had a schema and unintentionally lost it
// INTERNAL_ERROR on I/O error
libtextclassifier3::StatusOr<bool> LostPreviousSchema()
- EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
};
} // namespace lib
diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc
new file mode 100644
index 0000000..a6d96e0
--- /dev/null
+++ b/icing/icing-search-engine_benchmark.cc
@@ -0,0 +1,313 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/document-generator.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/recorder-test-utils.h"
+#include "icing/testing/schema-generator.h"
+#include "icing/testing/tmp-directory.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing:icing-search-engine_benchmark
+//
+// $ blaze-bin/icing/icing-search-engine_benchmark
+// --benchmarks=all --benchmark_memory_usage
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing:icing-search-engine_benchmark
+//
+// $ adb push blaze-bin/icing/icing-search-engine_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/icing-search-engine_benchmark --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+
+// Icing GMSCore has, on average, 17 corpora on a device and 30 corpora at the
+// 95th pct. Most clients use a single type. This is a function of Icing's
+// constrained type offering. Assume that each package will use 3 types on
+// average.
+constexpr int kAvgNumNamespaces = 10;
+constexpr int kAvgNumTypes = 3;
+
+// ASSUME: Properties will have at most ten properties. Types will be created
+// with [1, 10] properties.
+constexpr int kMaxNumProperties = 10;
+
+// Based on logs from Icing GMSCore.
+constexpr int kAvgDocumentSize = 300;
+
+// ASSUME: ~75% of the document's size comes from it's content.
+constexpr float kContentSizePct = 0.7;
+
+// Average length of word in English is 4.7 characters.
+constexpr int kAvgTokenLen = 5;
+// Made up value. This results in a fairly reasonable language - the majority of
+// generated words are 3-9 characters, ~3% of words are >=20 chars, and the
+// longest ones are 27 chars, (roughly consistent with the longest,
+// non-contrived English words
+// https://en.wikipedia.org/wiki/Longest_word_in_English)
+constexpr int kTokenStdDev = 7;
+constexpr int kLanguageSize = 1000;
+
+// Lite Index size required to fit 128k docs, each doc requires ~64 bytes of
+// space in the lite index.
+constexpr int kIcingFullIndexSize = 1024 * 1024 * 8;
+
+// Query params
+constexpr int kNumPerPage = 10;
+constexpr int kNumToSnippet = 10000;
+constexpr int kMatchesPerProperty = 1;
+
+std::vector<std::string> CreateNamespaces(int num_namespaces) {
+ std::vector<std::string> namespaces;
+ while (--num_namespaces >= 0) {
+ namespaces.push_back("comgooglepackage" + std::to_string(num_namespaces));
+ }
+ return namespaces;
+}
+
+// Creates a vector containing num_words randomly-generated words for use by
+// documents.
+template <typename Rand>
+std::vector<std::string> CreateLanguage(int num_words, Rand* r) {
+ std::vector<std::string> language;
+ std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
+ while (--num_words >= 0) {
+ int word_length = 0;
+ while (word_length < 1) {
+ word_length = std::round(norm_dist(*r));
+ }
+ language.push_back(RandomString(kAlNumAlphabet, word_length, r));
+ }
+ return language;
+}
+
+SearchSpecProto CreateSearchSpec(const std::string& query,
+ const std::vector<std::string>& namespaces,
+ TermMatchType::Code match_type) {
+ SearchSpecProto search_spec;
+ search_spec.set_query(query);
+ for (const std::string& name_space : namespaces) {
+ search_spec.add_namespace_filters(name_space);
+ }
+ search_spec.set_term_match_type(match_type);
+ return search_spec;
+}
+
+ResultSpecProto CreateResultSpec(int num_per_page, int num_to_snippet,
+ int matches_per_property) {
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(num_per_page);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(num_to_snippet);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(
+ matches_per_property);
+ return result_spec;
+}
+
+ScoringSpecProto CreateScoringSpec(
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy) {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ranking_strategy);
+ return scoring_spec;
+}
+
+class DestructibleDirectory {
+ public:
+ explicit DestructibleDirectory(const Filesystem& filesystem,
+ const std::string& dir)
+ : filesystem_(filesystem), dir_(dir) {
+ filesystem_.CreateDirectoryRecursively(dir_.c_str());
+ }
+ ~DestructibleDirectory() {
+ filesystem_.DeleteDirectoryRecursively(dir_.c_str());
+ }
+
+ private:
+ Filesystem filesystem_;
+ std::string dir_;
+};
+
+void BM_MutlipleIndices(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ std::default_random_engine random;
+ int num_types = kAvgNumNamespaces * kAvgNumTypes;
+ ExactStringPropertyGenerator property_generator;
+ RandomSchemaGenerator<std::default_random_engine,
+ ExactStringPropertyGenerator>
+ schema_generator(&random, &property_generator);
+ SchemaProto schema =
+ schema_generator.GenerateSchema(num_types, kMaxNumProperties);
+ EvenDistributionTypeSelector type_selector(schema);
+
+ // Create the indices.
+ std::vector<std::unique_ptr<IcingSearchEngine>> icings;
+ int num_indices = state.range(0);
+ for (int i = 0; i < num_indices; ++i) {
+ IcingSearchEngineOptions options;
+ std::string base_dir = test_dir + "/" + std::to_string(i);
+ options.set_base_dir(base_dir);
+ options.set_index_merge_size(kIcingFullIndexSize / num_indices);
+ auto icing = std::make_unique<IcingSearchEngine>(options);
+
+ InitializeResultProto init_result = icing->Initialize();
+ ASSERT_THAT(init_result.status().code(), Eq(StatusProto::OK));
+
+ SetSchemaResultProto schema_result = icing->SetSchema(schema);
+ ASSERT_THAT(schema_result.status().code(), Eq(StatusProto::OK));
+ icings.push_back(std::move(icing));
+ }
+
+ // Setup namespace info and language
+ std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces);
+ EvenDistributionNamespaceSelector namespace_selector(namespaces);
+
+ std::vector<std::string> language = CreateLanguage(kLanguageSize, &random);
+ UniformDistributionLanguageTokenGenerator<std::default_random_engine>
+ token_generator(language, &random);
+
+ // Fill the index.
+ DocumentGenerator<
+ EvenDistributionNamespaceSelector, EvenDistributionTypeSelector,
+ UniformDistributionLanguageTokenGenerator<std::default_random_engine>>
+ generator(&namespace_selector, &type_selector, &token_generator,
+ kAvgDocumentSize * kContentSizePct);
+ for (int i = 0; i < state.range(1); ++i) {
+ DocumentProto doc = generator.generateDoc();
+ PutResultProto put_result;
+ if (icings.empty()) {
+ ASSERT_THAT(put_result.status().code(), Eq(StatusProto::UNKNOWN));
+ continue;
+ }
+ put_result = icings.at(i % icings.size())->Put(doc);
+ ASSERT_THAT(put_result.status().code(), Eq(StatusProto::OK));
+ }
+
+ // QUERY!
+ // Every document has its own namespace as a token. This query that should
+ // match 1/kAvgNumNamespace% of all documents.
+ const std::string& name_space = namespaces.at(0);
+ SearchSpecProto search_spec = CreateSearchSpec(
+ /*query=*/name_space, {name_space}, TermMatchType::EXACT_ONLY);
+ ResultSpecProto result_spec =
+ CreateResultSpec(kNumPerPage, kNumToSnippet, kMatchesPerProperty);
+ ScoringSpecProto scoring_spec =
+ CreateScoringSpec(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ int num_results = 0;
+ for (auto _ : state) {
+ num_results = 0;
+ SearchResultProto result;
+ if (icings.empty()) {
+ ASSERT_THAT(result.status().code(), Eq(StatusProto::UNKNOWN));
+ continue;
+ }
+ result = icings.at(0)->Search(search_spec, scoring_spec, result_spec);
+ ASSERT_THAT(result.status().code(), Eq(StatusProto::OK));
+ while (!result.results().empty()) {
+ num_results += result.results_size();
+ if (!icings.empty()) {
+ result = icings.at(0)->GetNextPage(result.next_page_token());
+ }
+ ASSERT_THAT(result.status().code(), Eq(StatusProto::OK));
+ }
+ }
+
+ // Measure size.
+ int64_t disk_usage = filesystem.GetDiskUsage(test_dir.c_str());
+ std::cout << "Num results:\t" << num_results << "\t\tDisk Use:\t"
+ << disk_usage / 1024.0 << std::endl;
+}
+BENCHMARK(BM_MutlipleIndices)
+ // First argument: num_indices, Second argument: num_total_documents
+ // So each index will contain (num_total_documents / num_indices) documents.
+ ->ArgPair(0, 0)
+ ->ArgPair(0, 1024)
+ ->ArgPair(0, 131072)
+ ->ArgPair(1, 0)
+ ->ArgPair(1, 1)
+ ->ArgPair(1, 2)
+ ->ArgPair(1, 8)
+ ->ArgPair(1, 32)
+ ->ArgPair(1, 128)
+ ->ArgPair(1, 1024)
+ ->ArgPair(1, 8192)
+ ->ArgPair(1, 32768)
+ ->ArgPair(1, 131072)
+ ->ArgPair(2, 0)
+ ->ArgPair(2, 1)
+ ->ArgPair(2, 2)
+ ->ArgPair(2, 8)
+ ->ArgPair(2, 32)
+ ->ArgPair(2, 128)
+ ->ArgPair(2, 1024)
+ ->ArgPair(2, 8192)
+ ->ArgPair(2, 32768)
+ ->ArgPair(2, 131072)
+ ->ArgPair(10, 0)
+ ->ArgPair(10, 1)
+ ->ArgPair(10, 2)
+ ->ArgPair(10, 8)
+ ->ArgPair(10, 32)
+ ->ArgPair(10, 128)
+ ->ArgPair(10, 1024)
+ ->ArgPair(10, 8192)
+ ->ArgPair(10, 32768)
+ ->ArgPair(10, 131072);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index 86cc8c8..d31f836 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -18,6 +18,7 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/document-builder.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/icing-search-engine.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
@@ -31,8 +32,6 @@
IcingSearchEngineOptions Setup() {
IcingSearchEngineOptions icing_options;
- libtextclassifier3::Status status =
- SetUpICUDataFile("icing/icu.dat");
icing_options.set_base_dir(GetTestTempDir() + "/icing");
return icing_options;
}
@@ -54,7 +53,7 @@
DocumentProto MakeDocument(const uint8_t* data, size_t size) {
// TODO (sidchhabra): Added more optimized fuzzing techniques.
DocumentProto document;
- string string_prop(reinterpret_cast<const char*>(data), size);
+ std::string string_prop(reinterpret_cast<const char*>(data), size);
return DocumentBuilder()
.SetKey("namespace", "uri1")
.SetSchema("Message")
@@ -66,7 +65,7 @@
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::PREFIX);
// TODO (sidchhabra): Added more optimized fuzzing techniques.
- string query_string(reinterpret_cast<const char*>(data), size);
+ std::string query_string(reinterpret_cast<const char*>(data), size);
search_spec.set_query(query_string);
return search_spec;
}
@@ -74,6 +73,10 @@
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
// Initialize
IcingSearchEngineOptions icing_options = Setup();
+ std::string icu_data_file_path = GetTestFilePath("icing/icu.dat");
+ if (!icu_data_file_helper::SetUpICUDataFile(icu_data_file_path).ok()) {
+ return 1;
+ }
IcingSearchEngine icing(icing_options);
const Filesystem filesystem_;
// TODO (b/145758378): Deleting directory should not be required.
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 57aeac2..b0946c9 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -26,10 +26,10 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
-#include "icing/proto/schema.proto.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
@@ -46,14 +46,19 @@
namespace lib {
namespace {
+
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::_;
using ::testing::Eq;
+using ::testing::Gt;
using ::testing::HasSubstr;
using ::testing::IsEmpty;
using ::testing::Lt;
+using ::testing::Matcher;
using ::testing::Return;
using ::testing::SizeIs;
+using ::testing::StrEq;
+using ::testing::UnorderedElementsAre;
// For mocking purpose, we allow tests to provide a custom Filesystem.
class TestIcingSearchEngine : public IcingSearchEngine {
@@ -69,9 +74,10 @@
class IcingSearchEngineTest : public testing::Test {
protected:
void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
}
@@ -106,16 +112,16 @@
return icing_options;
}
-DocumentProto GetDefaultDocument() {
+DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
return DocumentBuilder()
- .SetKey("namespace", "uri")
+ .SetKey(std::move(name_space), std::move(uri))
.SetSchema("Message")
.AddStringProperty("body", "message body")
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
}
-SchemaProto GetDefaultSchema() {
+SchemaProto CreateMessageSchema() {
SchemaProto schema;
auto type = schema.add_types();
type->set_schema_type("Message");
@@ -131,6 +137,29 @@
return schema;
}
+SchemaProto CreateEmailSchema() {
+ SchemaProto schema;
+ auto* type = schema.add_types();
+ type->set_schema_type("Email");
+
+ auto* body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
+ body->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+ auto* subj = type->add_properties();
+ subj->set_property_name("subject");
+ subj->set_data_type(PropertyConfigProto::DataType::STRING);
+ subj->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ subj->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
+ subj->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+
+ return schema;
+}
+
ScoringSpecProto GetDefaultScoringSpec() {
ScoringSpecProto scoring_spec;
scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
@@ -140,10 +169,10 @@
TEST_F(IcingSearchEngineTest, SimpleInitialization) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- DocumentProto document = GetDefaultDocument();
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
ASSERT_THAT(icing.Put(DocumentProto(document)).status().code(),
Eq(StatusProto::OK));
@@ -152,10 +181,10 @@
TEST_F(IcingSearchEngineTest, InitializingAgainSavesNonPersistedData) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- DocumentProto document = GetDefaultDocument();
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
GetResultProto expected_get_result_proto;
@@ -256,16 +285,10 @@
options.set_max_token_length(1);
IcingSearchEngine icing(options);
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- DocumentProto document =
- DocumentBuilder()
- .SetKey("namespace", "uri")
- .SetSchema("Message")
- .AddStringProperty("body", "message")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
EXPECT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
// "message" should have been truncated to "m"
@@ -276,7 +299,7 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document;
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
@@ -306,7 +329,7 @@
options.set_max_token_length(std::numeric_limits<int32_t>::max());
IcingSearchEngine icing(options);
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Add a document that just barely fits under the max document limit.
@@ -386,20 +409,15 @@
Eq(StatusProto::INVALID_ARGUMENT));
}
-TEST_F(IcingSearchEngineTest, NoSchemaSet) {
+TEST_F(IcingSearchEngineTest, PutWithoutSchemaFailedPrecondition) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- DocumentProto document = GetDefaultDocument();
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
PutResultProto put_result_proto = icing.Put(document);
- EXPECT_THAT(put_result_proto.status().code(), Eq(StatusProto::NOT_FOUND));
- EXPECT_THAT(put_result_proto.status().message(),
- HasSubstr("'Message' not found"));
-
- put_result_proto = icing.Put(document);
- EXPECT_THAT(put_result_proto.status().code(), Eq(StatusProto::NOT_FOUND));
- EXPECT_THAT(put_result_proto.status().message(),
- HasSubstr("'Message' not found"));
+ EXPECT_THAT(put_result_proto.status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(put_result_proto.status().message(), HasSubstr("Schema not set"));
}
TEST_F(IcingSearchEngineTest, FailToReadSchema) {
@@ -409,7 +427,7 @@
// Successfully initialize and set a schema
IcingSearchEngine icing(icing_options);
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
}
@@ -445,7 +463,7 @@
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
SetSchemaResultProto set_schema_result_proto =
- icing.SetSchema(GetDefaultSchema());
+ icing.SetSchema(CreateMessageSchema());
EXPECT_THAT(set_schema_result_proto.status().code(),
Eq(StatusProto::INTERNAL));
EXPECT_THAT(set_schema_result_proto.status().message(),
@@ -456,9 +474,9 @@
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- auto message_document = GetDefaultDocument();
+ auto message_document = CreateMessageDocument("namespace", "uri");
- auto schema_with_message = GetDefaultSchema();
+ auto schema_with_message = CreateMessageSchema();
SchemaProto schema_with_email;
SchemaTypeConfigProto* type = schema_with_email.add_types();
@@ -519,7 +537,7 @@
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- SchemaProto schema_with_no_indexed_property = GetDefaultSchema();
+ SchemaProto schema_with_no_indexed_property = CreateMessageSchema();
schema_with_no_indexed_property.mutable_types(0)
->mutable_properties(0)
->clear_indexing_config();
@@ -527,8 +545,9 @@
EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status().code(),
Eq(StatusProto::OK));
// Nothing will be index and Search() won't return anything.
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
SearchSpecProto search_spec;
search_spec.set_query("message");
@@ -541,7 +560,7 @@
ResultSpecProto::default_instance()),
EqualsProto(empty_result));
- SchemaProto schema_with_indexed_property = GetDefaultSchema();
+ SchemaProto schema_with_indexed_property = CreateMessageSchema();
// Index restoration should be triggered here because new schema requires more
// properties to be indexed.
EXPECT_THAT(icing.SetSchema(schema_with_indexed_property).status().code(),
@@ -549,8 +568,8 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
EqualsProto(expected_search_result_proto));
@@ -716,52 +735,58 @@
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
GetSchemaResultProto expected_get_schema_result_proto;
expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_schema_result_proto.mutable_schema() = GetDefaultSchema();
+ *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
}
-TEST_F(IcingSearchEngineTest, GetSchemaTypeNotFound) {
+TEST_F(IcingSearchEngineTest, GetSchemaTypeFailedPrecondition) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.GetSchemaType("nonexistent_schema").status().code(),
- Eq(StatusProto::NOT_FOUND));
+ GetSchemaTypeResultProto get_schema_type_result_proto =
+ icing.GetSchemaType("nonexistent_schema");
+ EXPECT_THAT(get_schema_type_result_proto.status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(get_schema_type_result_proto.status().message(),
+ HasSubstr("Schema not set"));
}
TEST_F(IcingSearchEngineTest, GetSchemaTypeOk) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
GetSchemaTypeResultProto expected_get_schema_type_result_proto;
expected_get_schema_type_result_proto.mutable_status()->set_code(
StatusProto::OK);
*expected_get_schema_type_result_proto.mutable_schema_type_config() =
- GetDefaultSchema().types(0);
- EXPECT_THAT(icing.GetSchemaType(GetDefaultSchema().types(0).schema_type()),
+ CreateMessageSchema().types(0);
+ EXPECT_THAT(icing.GetSchemaType(CreateMessageSchema().types(0).schema_type()),
EqualsProto(expected_get_schema_type_result_proto));
}
TEST_F(IcingSearchEngineTest, GetDocument) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Simple put and get
- ASSERT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
ASSERT_THAT(icing.Get("namespace", "uri"),
EqualsProto(expected_get_result_proto));
@@ -784,24 +809,13 @@
TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- DocumentProto document_one =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
- DocumentProto document_two =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
SearchSpecProto search_spec;
@@ -848,24 +862,13 @@
TEST_F(IcingSearchEngineTest, SearchReturnsOneResult) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- DocumentProto document_one =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
- DocumentProto document_two =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
SearchSpecProto search_spec;
@@ -873,14 +876,20 @@
search_spec.set_query("message");
ResultSpecProto result_spec;
- result_spec.set_num_to_retrieve(1);
+ result_spec.set_num_per_page(1);
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document_two;
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), result_spec),
- EqualsProto(expected_search_result_proto));
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.status().code(), Eq(StatusProto::OK));
+ // The token is a random number so we don't verify it.
+ expected_search_result_proto.set_next_page_token(
+ search_result_proto.next_page_token());
+ EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest, SearchZeroResultLimitReturnsEmptyResults) {
@@ -892,7 +901,7 @@
search_spec.set_query("");
ResultSpecProto result_spec;
- result_spec.set_num_to_retrieve(0);
+ result_spec.set_num_per_page(0);
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
@@ -909,13 +918,13 @@
search_spec.set_query("");
ResultSpecProto result_spec;
- result_spec.set_num_to_retrieve(-5);
+ result_spec.set_num_per_page(-5);
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(
StatusProto::INVALID_ARGUMENT);
expected_search_result_proto.mutable_status()->set_message(
- "ResultSpecProto.num_to_retrieve cannot be negative.");
+ "ResultSpecProto.num_per_page cannot be negative.");
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), result_spec),
EqualsProto(expected_search_result_proto));
}
@@ -927,7 +936,7 @@
// Set the schema up beforehand.
IcingSearchEngine icing(icing_options);
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Schema will be persisted to disk when icing goes out of scope.
}
@@ -938,8 +947,9 @@
IcingSearchEngine icing(icing_options);
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
// The index and document store will be persisted to disk when icing goes
// out of scope.
}
@@ -956,9 +966,8 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()
- ->Add()
- ->mutable_document()) = GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
@@ -974,14 +983,330 @@
}
}
+TEST_F(IcingSearchEngineTest, SearchShouldReturnEmpty) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ // Empty result, no next-page token
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+
+ EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, SearchShouldReturnMultiplePages) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ // Creates and inserts 5 documents
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document4).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document5).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(2);
+
+ // Searches and gets the first page, 2 results
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document5;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document4;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+ // Since the token is a random number, we don't need to verify
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
+
+ // Second page, 2 results
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+
+ // Third page, 1 result
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+
+ // No more results
+ expected_search_result_proto.clear_results();
+ expected_search_result_proto.clear_next_page_token();
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, SearchWithNoScoringShouldReturnMultiplePages) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ // Creates and inserts 5 documents
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document4).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document5).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(2);
+
+ // Searches and gets the first page, 2 results
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document5;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document4;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+ // Since the token is a random number, we don't need to verify
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
+
+ // Second page, 2 results
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+
+ // Third page, 1 result
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+
+ // No more results
+ expected_search_result_proto.clear_results();
+ expected_search_result_proto.clear_next_page_token();
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ // Creates and inserts 5 documents
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document4).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document5).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(2);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
+
+ // Searches and gets the first page, 2 results with 2 snippets
+ SearchResultProto search_result =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ ASSERT_THAT(search_result.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(search_result.results(), SizeIs(2));
+ ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
+
+ EXPECT_THAT(search_result.results(0).document(), EqualsProto(document5));
+ EXPECT_THAT(GetMatch(search_result.results(0).document(),
+ search_result.results(0).snippet(), "body",
+ /*snippet_index=*/0),
+ Eq("message"));
+ EXPECT_THAT(GetWindow(search_result.results(0).document(),
+ search_result.results(0).snippet(), "body",
+ /*snippet_index=*/0),
+ Eq("message body"));
+ EXPECT_THAT(search_result.results(1).document(), EqualsProto(document4));
+ EXPECT_THAT(GetMatch(search_result.results(1).document(),
+ search_result.results(1).snippet(), "body",
+ /*snippet_index=*/0),
+ Eq("message"));
+ EXPECT_THAT(GetWindow(search_result.results(1).document(),
+ search_result.results(1).snippet(), "body",
+ /*snippet_index=*/0),
+ Eq("message body"));
+
+ // Second page, 2 result with 1 snippet
+ search_result = icing.GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(search_result.results(), SizeIs(2));
+ ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
+
+ EXPECT_THAT(search_result.results(0).document(), EqualsProto(document3));
+ EXPECT_THAT(GetMatch(search_result.results(0).document(),
+ search_result.results(0).snippet(), "body",
+ /*snippet_index=*/0),
+ Eq("message"));
+ EXPECT_THAT(GetWindow(search_result.results(0).document(),
+ search_result.results(0).snippet(), "body",
+ /*snippet_index=*/0),
+ Eq("message body"));
+ EXPECT_THAT(search_result.results(1).document(), EqualsProto(document2));
+ EXPECT_THAT(search_result.results(1).snippet().entries_size(), Eq(0));
+
+ // Third page, 1 result with 0 snippets
+ search_result = icing.GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(search_result.results(), SizeIs(1));
+ ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
+
+ EXPECT_THAT(search_result.results(0).document(), EqualsProto(document1));
+ EXPECT_THAT(search_result.results(0).snippet().entries_size(), Eq(0));
+}
+
+TEST_F(IcingSearchEngineTest, ShouldInvalidateNextPageToken) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ // Searches and gets the first page, 1 result
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+ // Since the token is a random number, we don't need to verify
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
+ // Now document1 is still to be fetched.
+
+ // Invalidates token
+ icing.InvalidateNextPageToken(next_page_token);
+
+ // Tries to fetch the second page, no result since it's invalidated
+ expected_search_result_proto.clear_results();
+ expected_search_result_proto.clear_next_page_token();
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest,
+ AllPageTokensShouldBeInvalidatedAfterOptimization) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ // Searches and gets the first page, 1 result
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+ // Since the token is a random number, we don't need to verify
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
+ // Now document1 is still to be fetched.
+
+ OptimizeResultProto optimize_result_proto;
+ optimize_result_proto.mutable_status()->set_code(StatusProto::OK);
+ optimize_result_proto.mutable_status()->set_message("");
+ ASSERT_THAT(icing.Optimize(), EqualsProto(optimize_result_proto));
+
+ // Tries to fetch the second page, no results since all tokens have been
+ // invalidated during Optimize()
+ expected_search_result_proto.clear_results();
+ expected_search_result_proto.clear_next_page_token();
+ EXPECT_THAT(icing.GetNextPage(next_page_token),
+ EqualsProto(expected_search_result_proto));
+}
+
TEST_F(IcingSearchEngineTest, OptimizationShouldRemoveDeletedDocs) {
IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "message body1")
- .Build();
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
@@ -990,7 +1315,7 @@
{
IcingSearchEngine icing(icing_options);
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
@@ -1022,7 +1347,7 @@
IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
IcingSearchEngine icing(icing_options);
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Create a tmp dir that will be used in Optimize() to swap files,
@@ -1043,28 +1368,76 @@
EXPECT_FALSE(filesystem()->FileExists(tmp_file.c_str()));
}
+TEST_F(IcingSearchEngineTest, GetOptimizeInfoHasCorrectStats) {
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(1000);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(fake_clock));
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Just initialized, nothing is optimizable yet.
+ GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+
+ // Only have active documents, nothing is optimizable yet.
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+
+ // Deletes document1
+ ASSERT_THAT(icing.Delete("namespace", "uri1").status().code(),
+ Eq(StatusProto::OK));
+
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0));
+ int64_t first_estimated_optimizable_bytes =
+ optimize_info.estimated_optimizable_bytes();
+
+ // Add a second document, but it'll be expired since the time (1000) is
+ // greater than the document's creation timestamp (100) + the document's ttl
+ // (500)
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(),
+ Gt(first_estimated_optimizable_bytes));
+
+ // Optimize
+ ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
+
+ // Nothing is optimizable now that everything has been optimized away.
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+}
+
TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) {
- DocumentProto document1 =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "message body1")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document2 =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body2")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document3 =
- DocumentBuilder()
- .SetKey("namespace", "uri3")
- .SetSchema("Message")
- .AddStringProperty("body", "message body3")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
@@ -1073,7 +1446,7 @@
{
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
@@ -1098,24 +1471,12 @@
}
TEST_F(IcingSearchEngineTest, DeleteShouldWorkAfterOptimization) {
- DocumentProto document1 =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "message body1")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document2 =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body2")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
{
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
@@ -1238,9 +1599,10 @@
.AddStringProperty("body", "message body2")
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
+
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
@@ -1276,7 +1638,7 @@
TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) {
// Creates 3 test schemas
- SchemaProto schema1 = SchemaProto(GetDefaultSchema());
+ SchemaProto schema1 = SchemaProto(CreateMessageSchema());
SchemaProto schema2 = SchemaProto(schema1);
auto new_property2 = schema2.mutable_types(0)->add_properties();
@@ -1314,25 +1676,19 @@
}
TEST_F(IcingSearchEngineTest, SearchShouldWorkAfterOptimization) {
- DocumentProto document =
- DocumentBuilder()
- .SetKey("namespace", "uri")
- .SetSchema("Message")
- .AddStringProperty("body", "message")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::PREFIX);
search_spec.set_query("m");
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document;
{
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
@@ -1351,14 +1707,14 @@
}
TEST_F(IcingSearchEngineTest, IcingShouldWorkFineIfOptimizationIsAborted) {
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
{
// Initializes a normal icing to create files needed
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
}
// Creates a mock filesystem in which DeleteDirectoryRecursively() always
@@ -1372,7 +1728,6 @@
std::move(mock_filesystem),
std::make_unique<FakeClock>());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
EXPECT_THAT(icing.Optimize().status().code(), Eq(StatusProto::ABORTED));
// Now optimization is aborted, we verify that document-related functions
@@ -1380,19 +1735,13 @@
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
- EXPECT_THAT(icing.Get("namespace", "uri"),
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
EqualsProto(expected_get_result_proto));
- DocumentProto new_document =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "new body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- EXPECT_THAT(icing.Put(new_document).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
SearchSpecProto search_spec;
search_spec.set_query("m");
@@ -1400,8 +1749,10 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
@@ -1425,10 +1776,11 @@
std::make_unique<FakeClock>());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
// Optimize() fails due to filesystem error
EXPECT_THAT(icing.Optimize().status().code(),
@@ -1467,7 +1819,7 @@
search_spec.set_query("n");
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
new_document;
// Searching new content returns the new document
@@ -1494,10 +1846,11 @@
std::make_unique<FakeClock>());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
// Optimize() fails due to filesystem error
EXPECT_THAT(icing.Optimize().status().code(),
@@ -1535,7 +1888,7 @@
search_spec.set_query("n");
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
new_document;
// Searching new content returns the new document
@@ -1571,11 +1924,11 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document;
// Time just has to be less than the document's creation timestamp (100) + the
- // schema's ttl (500)
+ // document's ttl (500)
auto fake_clock = std::make_unique<FakeClock>();
fake_clock->SetSystemTimeMilliseconds(400);
@@ -1622,7 +1975,7 @@
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
// Time just has to be greater than the document's creation timestamp (100) +
- // the schema's ttl (500)
+ // the document's ttl (500)
auto fake_clock = std::make_unique<FakeClock>();
fake_clock->SetSystemTimeMilliseconds(700);
@@ -1680,7 +2033,7 @@
search_spec.Clear();
search_spec.add_schema_type_filters("message");
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
message_document;
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
@@ -1726,21 +2079,23 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
{
// Basic initialization/setup
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
EXPECT_THAT(icing.Get("namespace", "uri"),
EqualsProto(expected_get_result_proto));
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
@@ -1764,8 +2119,9 @@
EqualsProto(expected_search_result_proto));
// Checks that Schema is still since it'll be needed to validate the document
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
}
TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderMagic) {
@@ -1775,21 +2131,23 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
{
// Basic initialization/setup
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
EXPECT_THAT(icing.Get("namespace", "uri"),
EqualsProto(expected_get_result_proto));
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
@@ -1817,8 +2175,9 @@
EqualsProto(expected_search_result_proto));
// Checks that Schema is still since it'll be needed to validate the document
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
}
TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderChecksum) {
@@ -1828,21 +2187,23 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
{
// Basic initialization/setup
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
EXPECT_THAT(icing.Get("namespace", "uri"),
EqualsProto(expected_get_result_proto));
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
@@ -1871,8 +2232,9 @@
EqualsProto(expected_search_result_proto));
// Checks that Schema is still since it'll be needed to validate the document
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
}
TEST_F(IcingSearchEngineTest, UnableToRecoverFromCorruptSchema) {
@@ -1880,14 +2242,16 @@
// Basic initialization/setup
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
EXPECT_THAT(icing.Get("namespace", "uri"),
EqualsProto(expected_get_result_proto));
@@ -1908,14 +2272,16 @@
// Basic initialization/setup
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
EXPECT_THAT(icing.Get("namespace", "uri"),
EqualsProto(expected_get_result_proto));
@@ -1932,7 +2298,8 @@
}
TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) {
- DocumentProto document2 =
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2_with_additional_property =
DocumentBuilder()
.SetKey("namespace", "uri2")
.SetSchema("Message")
@@ -1940,6 +2307,7 @@
.AddStringProperty("body", "message body")
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
+
{
// Initializes folder and schema
IcingSearchEngine icing(GetDefaultIcingOptions());
@@ -1964,9 +2332,9 @@
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2_with_additional_property).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
// Won't get us anything because "additional" isn't marked as an indexed
// property in the schema
@@ -2052,8 +2420,8 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2_with_additional_property;
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
@@ -2061,29 +2429,23 @@
}
TEST_F(IcingSearchEngineTest, RecoverFromInconsistentDocumentStore) {
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+
{
// Initializes folder and schema, index one document
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
} // This should shut down IcingSearchEngine and persist anything it needs to
- DocumentProto document2 =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
-
{
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
SchemaStore::Create(filesystem(), GetSchemaDir()));
- ICING_EXPECT_OK(schema_store->SetSchema(GetDefaultSchema()));
+ ICING_EXPECT_OK(schema_store->SetSchema(CreateMessageSchema()));
// Puts a second document into DocumentStore but doesn't index it.
FakeClock fake_clock;
@@ -2101,10 +2463,10 @@
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() = document1;
// DocumentStore kept the additional document
- EXPECT_THAT(icing.Get("namespace", "uri"),
+ EXPECT_THAT(icing.Get("namespace", "uri1"),
EqualsProto(expected_get_result_proto));
*expected_get_result_proto.mutable_document() = document2;
@@ -2118,10 +2480,11 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document2;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
@@ -2135,17 +2498,18 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
{
// Initializes folder and schema, index one document
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
EqualsProto(expected_search_result_proto));
@@ -2171,17 +2535,18 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
- GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
{
// Initializes folder and schema, index one document
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
EqualsProto(expected_search_result_proto));
@@ -2205,7 +2570,7 @@
TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByDocumentScore) {
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Creates 3 documents and ensures the relationship in terms of document
@@ -2249,11 +2614,11 @@
// Result should be in descending score order
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document3;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document2;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document1;
ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
@@ -2266,7 +2631,7 @@
TEST_F(IcingSearchEngineTest, SearchShouldAllowNoScoring) {
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Creates 3 documents and ensures the relationship of them is:
@@ -2306,11 +2671,11 @@
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document2;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document1;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document3;
// Results should not be ranked by score but returned in reverse insertion
@@ -2325,7 +2690,7 @@
TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByCreationTimestamp) {
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Creates 3 documents and ensures the relationship in terms of creation
@@ -2363,11 +2728,11 @@
// Result should be in descending timestamp order
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document3;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document2;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document1;
ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
@@ -2381,7 +2746,7 @@
TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedAscendingly) {
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
// Creates 3 documents and ensures the relationship in terms of document
@@ -2425,11 +2790,11 @@
// Result should be in ascending score order
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document1;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document2;
- (*expected_search_result_proto.mutable_results()->Add()->mutable_document()) =
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document3;
ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
@@ -2499,7 +2864,7 @@
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- DocumentProto document = GetDefaultDocument();
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
// Can retrieve by namespace/uri
@@ -2513,9 +2878,8 @@
// Can search for it
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- (*expected_search_result_proto.mutable_results()
- ->Add()
- ->mutable_document()) = GetDefaultDocument();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
ASSERT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance()),
EqualsProto(expected_search_result_proto));
@@ -2549,15 +2913,17 @@
TEST_F(IcingSearchEngineTest, PersistToDisk) {
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = GetDefaultDocument();
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
{
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(GetDefaultSchema()).status().code(),
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(GetDefaultDocument()).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
// Persisting shouldn't affect anything
EXPECT_THAT(icing.PersistToDisk().status().code(), Eq(StatusProto::OK));
@@ -2572,6 +2938,430 @@
EqualsProto(expected_get_result_proto));
}
+TEST_F(IcingSearchEngineTest, ResetOk) {
+ SchemaProto message_schema = CreateMessageSchema();
+ SchemaProto empty_schema = SchemaProto(message_schema);
+ empty_schema.clear_types();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(message_schema).status().code(),
+ Eq(StatusProto::OK));
+
+ int64_t empty_state_size =
+ filesystem()->GetFileDiskUsage(GetTestBaseDir().c_str());
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
+
+ // Check that things have been added
+ EXPECT_THAT(filesystem()->GetDiskUsage(GetTestBaseDir().c_str()),
+ Gt(empty_state_size));
+
+ EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::OK));
+
+ // Check that we're back to an empty state
+ EXPECT_EQ(filesystem()->GetFileDiskUsage(GetTestBaseDir().c_str()),
+ empty_state_size);
+
+ // Sanity check that we can still call other APIs. If things aren't cleared,
+ // then this should raise an error since the empty schema is incompatible with
+ // the old message_schema.
+ EXPECT_THAT(icing.SetSchema(empty_schema).status().code(),
+ Eq(StatusProto::OK));
+}
+
+TEST_F(IcingSearchEngineTest, ResetAbortedError) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+
+ // This fails IcingSearchEngine::Reset(). But since we didn't actually delete
+ // anything, we'll be able to consider this just an ABORTED call.
+ ON_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(StrEq(GetTestBaseDir().c_str())))
+ .WillByDefault(Return(false));
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::ABORTED));
+
+ // Everything is still intact.
+ // Can get old data.
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document;
+ EXPECT_THAT(icing.Get(document.namespace_(), document.uri()),
+ EqualsProto(expected_get_result_proto));
+
+ // Can add new data.
+ EXPECT_THAT(
+ icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
+ Eq(StatusProto::OK));
+}
+
+TEST_F(IcingSearchEngineTest, ResetInternalError) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+
+ // Let all other calls succeed.
+ EXPECT_CALL(*mock_filesystem, Write(Matcher<const char*>(_), _, _))
+ .WillRepeatedly(Return(true));
+
+ // This prevents IcingSearchEngine from creating a DocumentStore instance on
+ // reinitialization
+ const std::string document_log_path =
+ GetTestBaseDir() + "/document_dir/document_log";
+ EXPECT_CALL(
+ *mock_filesystem,
+ Write(Matcher<const char*>(StrEq(document_log_path.c_str())), _, _))
+ .WillOnce(Return(true))
+ .WillOnce(Return(false));
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::INTERNAL));
+}
+
+TEST_F(IcingSearchEngineTest, SnippetNormalization) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "MDI zurich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "mdi Zürich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("mdi Zürich");
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
+
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(results.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(results.results(), SizeIs(2));
+ const DocumentProto& result_document_1 = results.results(0).document();
+ const SnippetProto& result_snippet_1 = results.results(0).snippet();
+ EXPECT_THAT(result_document_1, EqualsProto(document_two));
+ EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/0),
+ Eq("mdi"));
+ EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/0),
+ Eq("mdi Zürich Team Meeting"));
+ EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/1),
+ Eq("Zürich"));
+ EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/1),
+ Eq("mdi Zürich Team Meeting"));
+
+ const DocumentProto& result_document_2 = results.results(1).document();
+ const SnippetProto& result_snippet_2 = results.results(1).snippet();
+ EXPECT_THAT(result_document_2, EqualsProto(document_one));
+ EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/0),
+ Eq("MDI"));
+ EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/0),
+ Eq("MDI zurich Team Meeting"));
+ EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/1),
+ Eq("zurich"));
+ EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/1),
+ Eq("MDI zurich Team Meeting"));
+}
+
+TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "MDI zurich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "mdi Zürich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("md Zür");
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
+
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(results.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(results.results(), SizeIs(2));
+ const DocumentProto& result_document_1 = results.results(0).document();
+ const SnippetProto& result_snippet_1 = results.results(0).snippet();
+ EXPECT_THAT(result_document_1, EqualsProto(document_two));
+ EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/0),
+ Eq("mdi"));
+ EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/0),
+ Eq("mdi Zürich Team Meeting"));
+ EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/1),
+ Eq("Zürich"));
+ EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
+ /*snippet_index=*/1),
+ Eq("mdi Zürich Team Meeting"));
+
+ const DocumentProto& result_document_2 = results.results(1).document();
+ const SnippetProto& result_snippet_2 = results.results(1).snippet();
+ EXPECT_THAT(result_document_2, EqualsProto(document_one));
+ EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/0),
+ Eq("MDI"));
+ EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/0),
+ Eq("MDI zurich Team Meeting"));
+ EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/1),
+ Eq("zurich"));
+ EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
+ /*snippet_index=*/1),
+ Eq("MDI zurich Team Meeting"));
+}
+
+TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "MDI zurich Team Meeting")
+ .AddStringProperty("body", "MDI zurich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("body:Zür");
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(10);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(10);
+
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(results.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(results.results(), SizeIs(1));
+ const DocumentProto& result_document = results.results(0).document();
+ const SnippetProto& result_snippet = results.results(0).snippet();
+ EXPECT_THAT(result_document, EqualsProto(document_one));
+ EXPECT_THAT(
+ GetMatch(result_document, result_snippet, "body", /*snippet_index=*/0),
+ Eq("zurich"));
+ EXPECT_THAT(
+ GetWindow(result_document, result_snippet, "body", /*snippet_index=*/0),
+ Eq("MDI zurich Team Meeting"));
+ EXPECT_THAT(
+ GetMatch(result_document, result_snippet, "subject", /*snippet_index=*/0),
+ IsEmpty());
+ EXPECT_THAT(GetWindow(result_document, result_snippet, "subject",
+ /*snippet_index=*/0),
+ IsEmpty());
+}
+
+TEST_F(IcingSearchEngineTest, UninitializedInstanceFailsSafely) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+
+ SchemaProto email_schema = CreateMessageSchema();
+ EXPECT_THAT(icing.SetSchema(email_schema).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.GetSchema().status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.GetSchemaType(email_schema.types(0).schema_type()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ DocumentProto doc = CreateMessageDocument("namespace", "uri");
+ EXPECT_THAT(icing.Put(doc).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Get(doc.namespace_(), doc.uri()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Delete(doc.namespace_(), doc.uri()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.DeleteByNamespace(doc.namespace_()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.DeleteBySchemaType(email_schema.types(0).schema_type())
+ .status()
+ .code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ SearchSpecProto search_spec = SearchSpecProto::default_instance();
+ ScoringSpecProto scoring_spec = ScoringSpecProto::default_instance();
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ constexpr int kSomePageToken = 12;
+ EXPECT_THAT(icing.GetNextPage(kSomePageToken).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ icing.InvalidateNextPageToken(kSomePageToken); // Verify this doesn't crash.
+
+ EXPECT_THAT(icing.PersistToDisk().status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Optimize().status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+}
+
+TEST_F(IcingSearchEngineTest, GetAllNamespaces) {
+ DocumentProto namespace1 = DocumentBuilder()
+ .SetKey("namespace1", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+ DocumentProto namespace2_uri1 = DocumentBuilder()
+ .SetKey("namespace2", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+ DocumentProto namespace2_uri2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+
+ DocumentProto namespace3 = DocumentBuilder()
+ .SetKey("namespace3", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .Build();
+ {
+ // Some arbitrary time that's less than all the document's creation time +
+ // ttl
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(500);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(fake_clock));
+
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+
+ // No namespaces exist yet
+ GetAllNamespacesResultProto result = icing.GetAllNamespaces();
+ EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.namespaces(), IsEmpty());
+
+ ASSERT_THAT(icing.Put(namespace1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(namespace2_uri1).status().code(),
+ Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(namespace2_uri2).status().code(),
+ Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(namespace3).status().code(), Eq(StatusProto::OK));
+
+ // All namespaces should exist now
+ result = icing.GetAllNamespaces();
+ EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.namespaces(),
+ UnorderedElementsAre("namespace1", "namespace2", "namespace3"));
+
+ // After deleting namespace2_uri1 document, we still have namespace2_uri2 in
+ // "namespace2" so it should still show up
+ ASSERT_THAT(icing.Delete("namespace2", "uri1").status().code(),
+ Eq(StatusProto::OK));
+
+ result = icing.GetAllNamespaces();
+ EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.namespaces(),
+ UnorderedElementsAre("namespace1", "namespace2", "namespace3"));
+
+ // After deleting namespace2_uri2 document, we no longer have any documents
+ // in "namespace2"
+ ASSERT_THAT(icing.Delete("namespace2", "uri2").status().code(),
+ Eq(StatusProto::OK));
+
+ result = icing.GetAllNamespaces();
+ EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.namespaces(),
+ UnorderedElementsAre("namespace1", "namespace3"));
+ }
+
+ // We reinitialize here so we can feed in a fake clock this time
+ {
+ // Time needs to be past namespace3's creation time (100) + ttl (500) for it
+ // to count as "expired"
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(1000);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(fake_clock));
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Only valid document left is the one in "namespace1"
+ GetAllNamespacesResultProto result = icing.GetAllNamespaces();
+ EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.namespaces(), UnorderedElementsAre("namespace1"));
+ }
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 8488d4e..7076257 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -68,8 +68,10 @@
uint32_t num_tokens = 0;
libtextclassifier3::Status overall_status;
for (const Section& section : sections) {
- Index::Editor editor = index_->Edit(document_id, section.metadata.id,
- section.metadata.term_match_type);
+ // TODO(b/152934343): pass real namespace ids in
+ Index::Editor editor =
+ index_->Edit(document_id, section.metadata.id,
+ section.metadata.term_match_type, /*namespace_id=*/0);
for (std::string_view subcontent : section.content) {
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 30544fd..00d116f 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,6 +16,7 @@
#include "gmock/gmock.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
#include "icing/legacy/core/icing-string-util.h"
@@ -25,7 +26,9 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "icing/util/logging.h"
@@ -136,7 +139,8 @@
}
std::unique_ptr<Normalizer> CreateNormalizer() {
- return Normalizer::Create(
+ return normalizer_factory::Create(
+
/*max_term_byte_size=*/std::numeric_limits<int>::max())
.ValueOrDie();
}
@@ -178,7 +182,8 @@
void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem filesystem;
@@ -188,7 +193,7 @@
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -224,7 +229,8 @@
void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem filesystem;
@@ -234,7 +240,7 @@
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -271,7 +277,8 @@
void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem filesystem;
@@ -281,7 +288,7 @@
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -318,7 +325,8 @@
void BM_IndexDocumentWithHiragana(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem filesystem;
@@ -328,7 +336,7 @@
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 3fb7f2b..8dfb9c2 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -27,6 +27,7 @@
#include "icing/absl_ports/str_cat.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -42,7 +43,9 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
namespace icing {
@@ -80,18 +83,21 @@
void SetUp() override {
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
index_dir_ = GetTestTempDir() + "/index_test/";
Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
ICING_ASSERT_OK_AND_ASSIGN(index_,
Index::Create(options, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_, LanguageSegmenter::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
normalizer_,
- Normalizer::Create(
+ normalizer_factory::Create(
+
/*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -409,7 +415,8 @@
options.max_tokens_per_document = 1000;
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer,
- Normalizer::Create(/*max_token_length=*/4));
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/4));
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
diff --git a/icing/index/index.cc b/icing/index/index.cc
index a5eb429..d4a2508 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -26,8 +26,9 @@
#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator-term.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
-#include "icing/index/lite-index.h"
+#include "icing/index/lite/lite-index.h"
#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
@@ -61,6 +62,21 @@
return IcingDynamicTrie::Options();
}
+// Helper function to check if a term is in the given namespaces.
+// TODO(samzheng): Implement a method PropertyReadersAll.HasAnyProperty().
+bool IsTermInNamespaces(
+ const IcingDynamicTrie::PropertyReadersAll& property_reader,
+ uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
+ for (NamespaceId namespace_id : namespace_ids) {
+ if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
+ value_index)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
} // namespace
libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
@@ -98,6 +114,47 @@
}
}
+libtextclassifier3::StatusOr<std::vector<TermMetadata>>
+Index::FindTermsByPrefix(const std::string& prefix,
+ const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return) {
+ std::vector<TermMetadata> term_metadata_list;
+ if (num_to_return <= 0) {
+ return term_metadata_list;
+ }
+
+ // Finds all the terms that start with the given prefix in the lexicon.
+ IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(),
+ prefix.c_str());
+
+ // A property reader to help check if a term has some property.
+ IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon());
+
+ while (term_iterator.IsValid() && term_metadata_list.size() < num_to_return) {
+ uint32_t term_value_index = term_iterator.GetValueIndex();
+
+ // Skips the terms that don't exist in the given namespaces. We won't skip
+ // any terms if namespace_ids is empty.
+ if (!namespace_ids.empty() &&
+ !IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
+ term_iterator.Advance();
+ continue;
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t term_id,
+ term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
+ absl_ports::InternalError("Failed to access terms in lexicon."));
+
+ term_metadata_list.emplace_back(term_iterator.GetKey(),
+ lite_index_->CountHits(term_id));
+
+ term_iterator.Advance();
+ }
+
+ return term_metadata_list;
+}
+
libtextclassifier3::Status Index::Editor::AddHit(const char* term,
Hit::Score score) {
// Step 1: See if this term is already in the lexicon
@@ -110,12 +167,13 @@
<< " is already present in lexicon. Updating.";
tvi = tvi_or.ValueOrDie();
// Already in the lexicon. Just update the properties.
- ICING_RETURN_IF_ERROR(lite_index_->UpdateTerm(tvi, term_match_type_));
+ ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
+ tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_));
} else {
ICING_VLOG(1) << "Term " << term << " is not in lexicon. Inserting.";
// Haven't seen this term before. Add it to the lexicon.
- ICING_ASSIGN_OR_RETURN(tvi,
- lite_index_->InsertTerm(term, term_match_type_));
+ ICING_ASSIGN_OR_RETURN(
+ tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_));
}
// Step 3: Add the hit itself
diff --git a/icing/index/index.h b/icing/index/index.h
index c8bfe65..d8e409c 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -25,12 +25,14 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
-#include "icing/index/lite-index.h"
+#include "icing/index/lite/lite-index.h"
#include "icing/index/term-id-codec.h"
+#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
#include "icing/util/crc32.h"
namespace icing {
@@ -111,6 +113,17 @@
lite_index_->GetDebugInfo(verbosity, out);
}
+ // Returns the byte size of the all the elements held in the index. This
+ // excludes the size of any internal metadata of the index, e.g. the index's
+ // header.
+ //
+ // Returns:
+ // Byte size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const {
+ return lite_index_->GetElementsSize();
+ }
+
// Create an iterator to iterate through all doc hit infos in the index that
// match the term. section_id_mask can be set to ignore hits from sections not
// listed in the mask. Eg. section_id_mask = 1U << 3; would only return hits
@@ -123,6 +136,19 @@
const std::string& term, SectionIdMask section_id_mask,
TermMatchType::Code term_match_type);
+ // Finds terms with the given prefix in the given namespaces. If
+ // 'namespace_ids' is empty, returns results from all the namespaces. The
+ // input prefix must be normalized, otherwise inaccurate results may be
+ // returned. Results are not sorted specifically and are in their original
+ // order. Number of results are no more than 'num_to_return'.
+ //
+ // Returns:
+ // A list of TermMetadata on success
+ // INTERNAL_ERROR if failed to access term data.
+ libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
+ const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
+ int num_to_return);
+
// A class that can be used to add hits to the index.
//
// An editor groups hits from a particular section within a document together
@@ -136,11 +162,12 @@
// TODO(b/141180665): Add nullptr checks for the raw pointers
Editor(const TermIdCodec* term_id_codec, LiteIndex* lite_index,
DocumentId document_id, SectionId section_id,
- TermMatchType::Code term_match_type)
+ TermMatchType::Code term_match_type, NamespaceId namespace_id)
: term_id_codec_(term_id_codec),
lite_index_(lite_index),
document_id_(document_id),
term_match_type_(term_match_type),
+ namespace_id_(namespace_id),
section_id_(section_id) {}
libtextclassifier3::Status AddHit(const char* term,
@@ -155,12 +182,13 @@
LiteIndex* lite_index_;
DocumentId document_id_;
TermMatchType::Code term_match_type_;
+ NamespaceId namespace_id_;
SectionId section_id_;
};
Editor Edit(DocumentId document_id, SectionId section_id,
- TermMatchType::Code term_match_type) {
+ TermMatchType::Code term_match_type, NamespaceId namespace_id) {
return Editor(term_id_codec_.get(), lite_index_.get(), document_id,
- section_id, term_match_type);
+ section_id, term_match_type, namespace_id);
}
private:
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index b1214e3..070e82a 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -45,9 +45,14 @@
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::IsEmpty;
using ::testing::IsTrue;
using ::testing::NiceMock;
+using ::testing::Not;
+using ::testing::SizeIs;
using ::testing::Test;
+using ::testing::UnorderedElementsAre;
class IndexTest : public Test {
protected:
@@ -94,6 +99,15 @@
actual.hit_section_ids_mask() == section_mask;
}
+MATCHER_P2(EqualsTermMetadata, content, hit_count, "") {
+ const TermMetadata& actual = arg;
+ *result_listener << "actual is {content=" << actual.content
+ << ", hit_count=" << actual.hit_count
+ << "}, but expected was {content=" << content
+ << ", hit_count=" << hit_count << "}.";
+ return actual.content == content && actual.hit_count == hit_count;
+}
+
TEST_F(IndexTest, CreationWithNullPointerShouldFail) {
Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
EXPECT_THAT(Index::Create(options, /*filesystem=*/nullptr),
@@ -119,8 +133,8 @@
TEST_F(IndexTest, AdvancePastEnd) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -143,8 +157,8 @@
TEST_F(IndexTest, SingleHitSingleTermIndex) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
// Assert
@@ -160,8 +174,8 @@
TEST_F(IndexTest, SingleHitMultiTermIndex) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
EXPECT_THAT(edit.AddHit("bar"), IsOk());
@@ -178,8 +192,8 @@
TEST_F(IndexTest, NoHitMultiTermIndex) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
EXPECT_THAT(edit.AddHit("bar"), IsOk());
@@ -194,14 +208,16 @@
TEST_F(IndexTest, MultiHitMultiTermIndex) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
- edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY);
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("bar"), IsOk());
- edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY);
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
// Assert
@@ -218,11 +234,12 @@
TEST_F(IndexTest, MultiHitSectionRestrict) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
- edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY);
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
// Assert
@@ -239,8 +256,8 @@
TEST_F(IndexTest, SingleHitDedupeIndex) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
EXPECT_THAT(edit.AddHit("foo"), IsOk());
@@ -257,8 +274,8 @@
TEST_F(IndexTest, PrefixHit) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("fool"), IsOk());
// Assert
@@ -274,11 +291,12 @@
TEST_F(IndexTest, MultiPrefixHit) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("fool"), IsOk());
- edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY);
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
// Assert
@@ -296,11 +314,12 @@
TEST_F(IndexTest, NoExactHitInPrefixQuery) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY);
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("fool"), IsOk());
- edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX);
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
// Assert
@@ -315,8 +334,8 @@
TEST_F(IndexTest, PrefixHitDedupe) {
// Act
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
ASSERT_THAT(edit.AddHit("fool"), IsOk());
@@ -365,8 +384,8 @@
}
TEST_F(IndexTest, NonAsciiTerms) {
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("こんにちは"), IsOk());
ASSERT_THAT(edit.AddHit("あなた"), IsOk());
@@ -397,7 +416,8 @@
while (status.ok()) {
for (int i = 0; i < 100; ++i) {
Index::Editor edit =
- index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY);
+ index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
std::string term = RandomString(kAlNumAlphabet, kTokenSize, &random);
status = edit.AddHit(term.c_str());
if (i % 50 == 0) {
@@ -414,7 +434,8 @@
// Assert
// Adding more hits should fail.
Index::Editor edit =
- index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY);
+ index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(edit.AddHit("bar"),
@@ -445,8 +466,8 @@
TEST_F(IndexTest, IndexCreateCorruptionFailure) {
// Add some content to the index
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
ASSERT_THAT(edit.AddHit("bar"), IsOk());
@@ -474,8 +495,8 @@
TEST_F(IndexTest, IndexPersistence) {
// Add some content to the index
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
ASSERT_THAT(edit.AddHit("bar"), IsOk());
EXPECT_THAT(index_->PersistToDisk(), IsOk());
@@ -507,48 +528,186 @@
TEST_F(IndexTest, ComputeChecksumSameBetweenCalls) {
// Add some content to the index.
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
- Crc32 foo_checksum(757666244U);
- EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
-
+ Crc32 checksum = index_->ComputeChecksum();
// Calling it again shouldn't change the checksum
- EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+ EXPECT_THAT(index_->ComputeChecksum(), Eq(checksum));
}
TEST_F(IndexTest, ComputeChecksumSameAcrossInstances) {
// Add some content to the index.
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
- Crc32 foo_checksum(757666244U);
- EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+ Crc32 checksum = index_->ComputeChecksum();
// Recreate the index, checksum should still be the same across instances
index_.reset();
Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
- EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+ EXPECT_THAT(index_->ComputeChecksum(), Eq(checksum));
}
TEST_F(IndexTest, ComputeChecksumChangesOnModification) {
// Add some content to the index.
- Index::Editor edit =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX);
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
ASSERT_THAT(edit.AddHit("foo"), IsOk());
- Crc32 foo_checksum(757666244U);
- EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_checksum));
+ Crc32 checksum = index_->ComputeChecksum();
// Modifying the index changes the checksum;
EXPECT_THAT(edit.AddHit("bar"), IsOk());
- Crc32 foo_bar_checksum(1228959551U);
- EXPECT_THAT(index_->ComputeChecksum(), Eq(foo_bar_checksum));
+ EXPECT_THAT(index_->ComputeChecksum(), Not(Eq(checksum)));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) {
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ EXPECT_THAT(edit.AddHit("fool"), IsOk());
+
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
+ /*num_to_return=*/0),
+ IsOkAndHolds(IsEmpty()));
+
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
+ /*num_to_return=*/-1),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.AddHit("bar"), IsOk());
+
+ // "b" should only match "bar" but not "foo".
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.AddHit("fo"), IsOk());
+ EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.AddHit("fool"), IsOk());
+
+ // We have 3 results but only 2 should be returned.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/2),
+ IsOkAndHolds(SizeIs(2)));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) {
+ Index::Editor edit1 =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit1.AddHit("fo"), IsOk());
+ EXPECT_THAT(edit1.AddHit("foo"), IsOk());
+
+ Index::Editor edit2 =
+ index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/1);
+ EXPECT_THAT(edit2.AddHit("fool"), IsOk());
+
+ // namespace with id 0 has 2 results.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
+ EqualsTermMetadata("foo", 1))));
+
+ // namespace with id 1 has 1 result.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1))));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) {
+ Index::Editor edit1 =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit1.AddHit("fo"), IsOk());
+
+ Index::Editor edit2 =
+ index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/1);
+ EXPECT_THAT(edit2.AddHit("foo"), IsOk());
+
+ Index::Editor edit3 =
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/2);
+ EXPECT_THAT(edit3.AddHit("fool"), IsOk());
+
+ // Should return "foo" and "fool" which are in namespaces with ids 1 and 2.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
+ Index::Editor edit1 =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit1.AddHit("fo"), IsOk());
+
+ Index::Editor edit2 =
+ index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/1);
+ EXPECT_THAT(edit2.AddHit("foo"), IsOk());
+
+ Index::Editor edit3 =
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/2);
+ EXPECT_THAT(edit3.AddHit("fool"), IsOk());
+
+ // Should return "fo", "foo" and "fool" across all namespaces.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
+ Index::Editor edit1 =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit1.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit1.AddHit("fool"), IsOk());
+
+ Index::Editor edit2 =
+ index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit2.AddHit("fool"), IsOk());
+
+ // 'foo' has 1 hit, 'fool' has 2 hits.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 2))));
+}
+
+TEST_F(IndexTest, GetElementsSize) {
+ // Check empty index.
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Eq(0)));
+
+ // Add an element.
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Gt(0)));
}
} // namespace
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index a2e67b6..bf027e4 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -25,8 +25,8 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/schema/schema-store.h"
-#include "icing/store/document-filter-data.h"
#include "icing/store/document-store.h"
+#include "icing/store/namespace-id.h"
#include "icing/util/clock.h"
namespace icing {
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.h b/icing/index/iterator/doc-hit-info-iterator-term.h
index 7d02fc2..21d1dd6 100644
--- a/icing/index/iterator/doc-hit-info-iterator-term.h
+++ b/icing/index/iterator/doc-hit-info-iterator-term.h
@@ -21,7 +21,7 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
-#include "icing/index/lite-index.h"
+#include "icing/index/lite/lite-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/schema/section.h"
diff --git a/icing/index/lite-index.cc b/icing/index/lite/lite-index.cc
similarity index 88%
rename from icing/index/lite-index.cc
rename to icing/index/lite/lite-index.cc
index 450768a..a72402e 100644
--- a/icing/index/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/lite-index.h"
+#include "icing/index/lite/lite-index.h"
#include <inttypes.h>
#include <stddef.h>
@@ -34,6 +34,7 @@
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
+#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
#include "icing/legacy/index/icing-array-storage.h"
@@ -276,26 +277,31 @@
}
libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
- const std::string& term, TermMatchType::Code term_match_type) {
+ const std::string& term, TermMatchType::Code term_match_type,
+ NamespaceId namespace_id) {
uint32_t tvi;
if (!lexicon_.Insert(term.c_str(), "", &tvi, false)) {
return absl_ports::ResourceExhaustedError(
absl_ports::StrCat("Unable to add term ", term, " to lexicon!"));
}
- ICING_RETURN_IF_ERROR(UpdateTerm(tvi, term_match_type));
+ ICING_RETURN_IF_ERROR(UpdateTermProperties(
+ tvi, term_match_type == TermMatchType::PREFIX, namespace_id));
return tvi;
}
-libtextclassifier3::Status LiteIndex::UpdateTerm(
- uint32_t tvi, TermMatchType::Code term_match_type) {
- if (term_match_type != TermMatchType::PREFIX) {
- return libtextclassifier3::Status::OK;
+libtextclassifier3::Status LiteIndex::UpdateTermProperties(
+ uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
+ if (hasPrefixHits &&
+ !lexicon_.SetProperty(tvi, GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::ResourceExhaustedError(
+ "Insufficient disk space to create prefix property!");
}
- if (!lexicon_.SetProperty(tvi, kHasHitsInPrefixSection)) {
+ if (!lexicon_.SetProperty(tvi, GetNamespacePropertyId(namespace_id))) {
return absl_ports::ResourceExhaustedError(
- "Insufficient disk space to create property!");
+ "Insufficient disk space to create namespace property!");
}
+
return libtextclassifier3::Status::OK;
}
@@ -363,6 +369,12 @@
return count;
}
+uint32_t LiteIndex::CountHits(uint32_t term_id) {
+ return AppendHits(term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ /*hits_out=*/nullptr);
+}
+
bool LiteIndex::is_full() const {
return (header_->cur_size() == options_.hit_buffer_size ||
lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
@@ -379,6 +391,29 @@
lexicon_.GetDebugInfo(verbosity, out);
}
+libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
+ int64_t header_and_hit_buffer_file_size =
+ filesystem_->GetFileSize(hit_buffer_fd_.get());
+
+ if (header_and_hit_buffer_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get element size of the LiteIndex's header and hit buffer");
+ }
+
+ int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
+ if (lexicon_disk_usage == IcingFilesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get element size of LiteIndex's lexicon");
+ }
+
+ // On initialization, we grow the file to a padded size first. So this size
+ // won't count towards the size taken up by elements.
+ size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
+
+ return header_and_hit_buffer_file_size - header_padded_size +
+ lexicon_disk_usage;
+}
+
uint32_t LiteIndex::Seek(uint32_t term_id) {
// Make searchable by sorting by hit buffer.
uint32_t sort_len = header_->cur_size() - header_->searchable_end();
diff --git a/icing/index/lite-index.h b/icing/index/lite/lite-index.h
similarity index 82%
rename from icing/index/lite-index.h
rename to icing/index/lite/lite-index.h
index 5431c65..b60a947 100644
--- a/icing/index/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -39,16 +39,13 @@
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
#include "icing/util/bit-util.h"
#include "icing/util/crc32.h"
namespace icing {
namespace lib {
-enum TermPropertyId {
- kHasHitsInPrefixSection = 0,
-};
-
class LiteIndex {
public:
// An entry in the hit buffer.
@@ -153,14 +150,24 @@
return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str()));
}
- // Insert a term. Returns non-OK if lexicon is full.
+ // Inserts a term with its properties.
+ //
+ // Returns:
+ // A value index on success
+ // RESOURCE_EXHAUSTED if lexicon is full or no disk space is available
libtextclassifier3::StatusOr<uint32_t> InsertTerm(
- const std::string& term, TermMatchType::Code term_match_type);
+ const std::string& term, TermMatchType::Code term_match_type,
+ NamespaceId namespace_id);
- // Updates term properties by setting the bit for has_hits_in_prefix_section
- // only if term_match_type == PREFIX. Otherwise, this does nothing.
- libtextclassifier3::Status UpdateTerm(uint32_t tvi,
- TermMatchType::Code term_match_type);
+ // Updates term properties by setting hasPrefixHits and namespace id of the
+ // term.
+ //
+ // Returns:
+ // OK on success
+ // RESOURCE_EXHAUSTED if no disk space is available
+ libtextclassifier3::Status UpdateTermProperties(uint32_t tvi,
+ bool hasPrefixHits,
+ NamespaceId namespace_id);
// Append hit to buffer. term_id must be encoded using the same term_id_codec
// supplied to the index constructor. Returns non-OK if hit cannot be added
@@ -174,6 +181,9 @@
bool only_from_prefix_sections,
std::vector<DocHitInfo>* hits_out);
+ // Returns the hit count of the term.
+ uint32_t CountHits(uint32_t term_id);
+
// Check if buffer has reached its capacity.
bool is_full() const;
@@ -188,11 +198,21 @@
return header_->last_added_docid();
}
+ const IcingDynamicTrie& lexicon() const { return lexicon_; }
+
// Returns debug information for the index in out.
// verbosity <= 0, simplest debug information - size of lexicon, hit buffer
// verbosity > 0, more detailed debug information from the lexicon.
void GetDebugInfo(int verbosity, std::string* out) const;
+ // Returns the byte size of all the elements held in the index. This excludes
+ // the size of any internal metadata of the index, e.g. the index's header.
+ //
+ // Returns:
+ // Byte size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
private:
static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
@@ -216,15 +236,29 @@
// hit buffer if term_id is not present.
uint32_t Seek(uint32_t term_id);
+ // File descriptor that points to where the header and hit buffer are written
+ // to.
ScopedFd hit_buffer_fd_;
+ // Mmapped region past the header that stores the hits.
IcingArrayStorage hit_buffer_;
+
+ // Crc checksum of the hits, excludes the header.
uint32_t hit_buffer_crc_;
+
+ // Trie that maps indexed terms to their term id
IcingDynamicTrie lexicon_;
+
// TODO(b/140437260): Port over to MemoryMappedFile
+ // Memory mapped region of the underlying file that reflects the header.
IcingMMapper header_mmap_;
+
+ // Wrapper around the mmapped header that contains stats on the lite index.
std::unique_ptr<IcingLiteIndex_Header> header_;
+
+ // Options used to initialize the LiteIndex.
const Options options_;
+
// TODO(b/139087650) Move to icing::Filesystem
const IcingFilesystem* const filesystem_;
};
diff --git a/icing/index/main/index-block.cc b/icing/index/main/index-block.cc
new file mode 100644
index 0000000..9d7df3c
--- /dev/null
+++ b/icing/index/main/index-block.cc
@@ -0,0 +1,201 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/index-block.h"
+
+#include <inttypes.h>
+
+#include <algorithm>
+#include <limits>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/math-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+libtextclassifier3::Status ValidatePostingListBytes(uint32_t posting_list_bytes,
+ uint32_t block_size) {
+ if (posting_list_bytes >
+ IndexBlock::CalculateMaxPostingListBytes(block_size) ||
+ !posting_list_utils::IsValidPostingListSize(posting_list_bytes)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d is illegal for a flash block with max "
+ "posting list size of %d",
+ posting_list_bytes,
+ IndexBlock::CalculateMaxPostingListBytes(block_size)));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace
+
+uint32_t IndexBlock::ApproximateFullPostingListHitsForBlock(
+ uint32_t block_size, int posting_list_index_bits) {
+ // Assume 50% compressed and most don't have scores.
+ uint32_t bytes_per_hit = sizeof(Hit::Value) / 2;
+ return (block_size - sizeof(BlockHeader)) /
+ ((1u << posting_list_index_bits) * bytes_per_hit);
+}
+
+libtextclassifier3::StatusOr<IndexBlock>
+IndexBlock::CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem,
+ std::string_view file_path,
+ off_t offset,
+ uint32_t block_size) {
+ if (block_size < sizeof(BlockHeader)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Provided block_size %d is too small to fit even the BlockHeader!",
+ block_size));
+ }
+ MemoryMappedFile mmapped_file(
+ filesystem, file_path, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(offset, block_size));
+ IndexBlock block(std::move(mmapped_file));
+ ICING_RETURN_IF_ERROR(
+ ValidatePostingListBytes(block.get_posting_list_bytes(), block_size));
+ return block;
+}
+
+libtextclassifier3::StatusOr<IndexBlock>
+IndexBlock::CreateFromUninitializedRegion(const Filesystem& filesystem,
+ std::string_view file_path,
+ off_t offset, uint32_t block_size,
+ uint32_t posting_list_bytes) {
+ if (block_size < sizeof(BlockHeader)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Provided block_size %d is too small to fit even the BlockHeader!",
+ block_size));
+ }
+ ICING_RETURN_IF_ERROR(
+ ValidatePostingListBytes(posting_list_bytes, block_size));
+ MemoryMappedFile mmapped_file(
+ filesystem, file_path, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(offset, block_size));
+ IndexBlock block(std::move(mmapped_file));
+ // Safe to ignore the return value of Reset. Reset returns an error if
+ // posting_list_bytes is invalid, but this function ensures that
+ // posting_list_bytes is valid thanks to the call to ValidatePostingListBytes
+ // above.
+ block.Reset(posting_list_bytes);
+ return block;
+}
+
+IndexBlock::IndexBlock(MemoryMappedFile mmapped_block)
+ : header_(reinterpret_cast<BlockHeader*>(mmapped_block.mutable_region())),
+ posting_lists_start_ptr_(mmapped_block.mutable_region() +
+ sizeof(BlockHeader)),
+ block_size_in_bytes_(mmapped_block.region_size()),
+ mmapped_block_(std::move(mmapped_block)) {}
+
+libtextclassifier3::Status IndexBlock::Reset(int posting_list_bytes) {
+ ICING_RETURN_IF_ERROR(ValidatePostingListBytes(posting_list_bytes,
+ mmapped_block_.region_size()));
+ header_->free_list_posting_list_index = kInvalidPostingListIndex;
+ header_->next_block_index = kInvalidBlockIndex;
+ header_->posting_list_bytes = posting_list_bytes;
+
+ // Starting with the last posting list, prepend each posting list to the free
+ // list. At the end, the beginning of the free list should be the first
+ // posting list.
+ for (PostingListIndex posting_list_index = max_num_posting_lists() - 1;
+ posting_list_index >= 0; --posting_list_index) {
+ // Adding the posting list at posting_list_index to the free list will
+ // modify both the posting list and also
+ // header_->free_list_posting_list_index.
+ FreePostingList(posting_list_index);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<PostingListUsed>
+IndexBlock::GetAllocatedPostingList(PostingListIndex posting_list_index) {
+ if (posting_list_index >= max_num_posting_lists() || posting_list_index < 0) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Cannot get posting list with index %d in IndexBlock with only %d "
+ "posting lists.",
+ posting_list_index, max_num_posting_lists()));
+ }
+ return PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
+ get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+}
+
+libtextclassifier3::StatusOr<PostingListIndex>
+IndexBlock::AllocatePostingList() {
+ if (!has_free_posting_lists()) {
+ return absl_ports::ResourceExhaustedError(
+ "No available posting lists to allocate.");
+ }
+
+ // Pull one off the free list.
+ PostingListIndex posting_list_index = header_->free_list_posting_list_index;
+
+ // We know at this point that posting_list_bytes will return a valid pl size
+ // (because an already initialized IndexBlock instance can't have an invalid
+ // posting_list_bytes). So CreateFromPreexistingPostingListFreeRegion will
+ // always return OK and ValueOrDie is safe to call.
+ auto posting_list_or =
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+ PostingListFree plfree = std::move(posting_list_or).ValueOrDie();
+
+ header_->free_list_posting_list_index = plfree.get_next_posting_list_index();
+ if (header_->free_list_posting_list_index != kInvalidPostingListIndex &&
+ header_->free_list_posting_list_index >= max_num_posting_lists()) {
+ ICING_LOG(ERROR)
+ << "Free Posting List points to an invalid posting list index!";
+ header_->free_list_posting_list_index = kInvalidPostingListIndex;
+ }
+
+ // Make it a used posting list.
+ PostingListUsed::CreateFromUnitializedRegion(
+ get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+ return posting_list_index;
+}
+
+void IndexBlock::FreePostingList(PostingListIndex posting_list_index) {
+ if (posting_list_index >= max_num_posting_lists() || posting_list_index < 0) {
+ ICING_LOG(ERROR) << "Cannot free posting list with index "
+ << posting_list_index << " in IndexBlock with only "
+ << max_num_posting_lists() << " posting lists.";
+ return;
+ }
+
+ // We know at this point that posting_list_bytes will return a valid pl size.
+ // So CreateFromUninitializedRegion will always return OK and ValueOrDie is
+ // safe to call.
+ auto posting_list_or = PostingListFree::CreateFromUnitializedRegion(
+ get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+ PostingListFree plfree = std::move(posting_list_or).ValueOrDie();
+
+ // Put at the head of the list.
+ plfree.set_next_posting_list_index(header_->free_list_posting_list_index);
+ header_->free_list_posting_list_index = posting_list_index;
+}
+
+char* IndexBlock::get_posting_list_ptr(PostingListIndex posting_list_index) {
+ return posting_lists_start_ptr_ +
+ get_posting_list_bytes() * posting_list_index;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/index-block.h b/icing/index/main/index-block.h
new file mode 100644
index 0000000..1d17e34
--- /dev/null
+++ b/icing/index/main/index-block.h
@@ -0,0 +1,215 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_INDEX_BLOCK_H_
+#define ICING_INDEX_MAIN_INDEX_BLOCK_H_
+
+#include <string.h>
+#include <sys/mman.h>
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/legacy/index/icing-bit-util.h"
+
+namespace icing {
+namespace lib {
+
+inline constexpr uint32_t kInvalidBlockIndex = 0;
+
+// This class is used to manage I/O to a single flash block and to manage the
+// division of that flash block into PostingLists. It provides an interface to
+// allocate, free and read posting lists.
+//
+// An IndexBlock contains a small header and an array of fixed-size posting list
+// buffers. Initially, all posting lists are chained in a singly-linked free
+// list.
+//
+// When we want to get a new PostingList from an IndexBlock, we just
+// pull one off the free list. When the user wants to return the
+// PostingList to the free pool, we prepend it to the free list.
+class IndexBlock {
+ public:
+ // Returns the maximum posting list size in bytes that can be stored in
+ // this block.
+ static uint32_t CalculateMaxPostingListBytes(uint32_t block_size_in_bytes) {
+ return (block_size_in_bytes - sizeof(BlockHeader)) / sizeof(Hit) *
+ sizeof(Hit);
+ }
+
+ // For a given min number of bits needed to store PostingListIndex for a
+ // block of "block_size", return the approximate number of hits that a full
+ // posting list in this block could accommodate.
+ static uint32_t ApproximateFullPostingListHitsForBlock(
+ uint32_t block_size, int posting_list_index_bits);
+
+ // Create an IndexBlock to reference the previously used region of the
+ // mmapped_file starting at offset with size block_size
+ //
+ // RETURNS:
+ // - a valid IndexBlock on success
+ // - INVALID_ARGUMENT if size is too small for even just the BlockHeader or
+ // if the posting list size stored in the region is not a valid posting
+ // list size or it exceeds max_posting_list_bytes(size).
+ // - INTERNAL_ERROR if unable to mmap the region [offset, offset+block_size)
+ static libtextclassifier3::StatusOr<IndexBlock>
+ CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem,
+ std::string_view file_path,
+ off_t offset, uint32_t block_size);
+
+ // Create an IndexBlock to reference an uninitialized region of the
+ // mmapped_file starting at offset with size block_size. The IndexBlock will
+ // initialize the region to be an empty IndexBlock with posting lists of size
+ // posting_list_bytes.
+ //
+ // RETURNS:
+ // - a valid IndexBlock on success
+ // - INVALID_ARGUMENT if size is too small for even just the BlockHeader or
+ // if posting_list_bytes is not a valid posting list size or it exceeds
+ // max_posting_list_bytes(size).
+ // - INTERNAL_ERROR if unable to mmap the region [offset, offset+block_size)
+ static libtextclassifier3::StatusOr<IndexBlock> CreateFromUninitializedRegion(
+ const Filesystem& filesystem, std::string_view file_path, off_t offset,
+ uint32_t block_size, uint32_t posting_list_bytes);
+
+ IndexBlock(const IndexBlock&) = delete;
+ IndexBlock& operator=(const IndexBlock&) = delete;
+ IndexBlock(IndexBlock&&) = default;
+ IndexBlock& operator=(IndexBlock&&) = default;
+
+ // Instantiate a PostingListUsed at posting_list_index with the existing
+ // content in the IndexBlock.
+ //
+ // RETURNS:
+ // - a valid PostingListUsed on success
+ // - INVALID_ARGUMENT if posting_list_index >= max_num_posting_lists()
+ libtextclassifier3::StatusOr<PostingListUsed> GetAllocatedPostingList(
+ PostingListIndex posting_list_index);
+
+ // Allocates a PostingListUsed in the IndexBlock, if possible.
+ //
+ // RETURNS:
+ // - a valid PostingListIndex that can be used to retrieve the allocated
+ // PostingListUsed via a call to GetAllocatedPostingList
+ // - RESOURCE_EXHAUSTED if !has_free_posting_lists()
+ libtextclassifier3::StatusOr<PostingListIndex> AllocatePostingList();
+
+ // Free posting list at posting_list_index.
+ //
+ // It is considered an error to "double-free" a posting list. You should never
+ // call FreePostingList(index) with the same index twice, unless that index
+ // was returned by an intervening AllocatePostingList() call.
+ //
+ // Ex.
+ // PostingListIndex index = block.AllocatePostingList();
+ // DoSomething(block.GetAllocatedPostingList(index));
+ // block.FreePostingList(index);
+ // block.FreePostingList(index); // Argh! What are you doing?!
+ // ...
+ // PostingListIndex index = block.AllocatePostingList();
+ // DoSomething(block.GetAllocatedPostingList(index));
+ // block.FreePostingList(index);
+ // index = block.AllocatePostingList();
+ // DoSomethingElse(block.GetAllocatedPostingList(index));
+ // // A-Ok! We called AllocatePostingList() since the last FreePostingList() call.
+ // block.FreePostingList(index);
+ //
+ // Has no effect if posting_list_index >= max_num_posting_lists().
+ void FreePostingList(PostingListIndex posting_list_index);
+
+ // Blocks can be chained. The interpretation of the chaining is up
+ // to the caller.
+ uint32_t next_block_index() const { return header_->next_block_index; }
+
+ void set_next_block_index(uint32_t next_block_index) {
+ header_->next_block_index = next_block_index;
+ }
+
+ // Retrieves the size (in bytes) of the posting lists in this IndexBlock.
+ uint32_t get_posting_list_bytes() const {
+ return header_->posting_list_bytes;
+ }
+
+ // Maximum number of posting lists in the block.
+ uint32_t max_num_posting_lists() const {
+ return total_posting_lists_bytes() / get_posting_list_bytes();
+ }
+
+ // Number of bits required to store the largest PostingListIndex in this
+ // block.
+ int posting_list_index_bits() const {
+ return BitsToStore(max_num_posting_lists());
+ }
+
+ // Returns whether or not there are available posting lists in the free list.
+ bool has_free_posting_lists() const {
+ return header_->free_list_posting_list_index != kInvalidPostingListIndex;
+ }
+
+ private:
+ // Assumes that mmapped_file already has established a valid mapping to the
+ // requested block.
+ IndexBlock(MemoryMappedFile mmapped_block);
+
+ // Resets IndexBlock to hold posting lists of posting_list_bytes size and adds
+ // all posting lists to the free list.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if posting_list_bytes is not a valid posting list size.
+ libtextclassifier3::Status Reset(int posting_list_bytes);
+
+ char* get_posting_list_ptr(PostingListIndex posting_list_index);
+
+ // Bytes in the block available for posting lists (minus header,
+ // alignment, etc.).
+ uint32_t total_posting_lists_bytes() const {
+ return block_size_in_bytes_ - sizeof(BlockHeader);
+ }
+
+ struct BlockHeader {
+ // Index of the next block if this block is being chained or part of a free
+ // list.
+ uint32_t next_block_index;
+
+ // Index to the first PostingListFree in the IndexBlock. This is the start
+ // of the free list.
+ PostingListIndex free_list_posting_list_index;
+
+ // The size of each posting list in the IndexBlock.
+ uint32_t posting_list_bytes;
+ };
+ // Pointer to the header of this block. The header is used to store info about
+ // this block and its posting lists.
+ BlockHeader* header_;
+ // Pointer to the beginning of the posting lists region - the area of the
+ // block after the header.
+ char* posting_lists_start_ptr_;
+ uint32_t block_size_in_bytes_;
+
+ // MemoryMappedFile used to interact with the underlying flash block.
+ MemoryMappedFile mmapped_block_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_INDEX_BLOCK_H_
diff --git a/icing/index/main/index-block_test.cc b/icing/index/main/index-block_test.cc
new file mode 100644
index 0000000..08ba57d
--- /dev/null
+++ b/icing/index/main/index-block_test.cc
@@ -0,0 +1,354 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/index-block.h"
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+static constexpr int kBlockSize = 4096;
+
+bool CreateFileWithSize(const Filesystem& filesystem, const std::string& file,
+ int size) {
+ size_t parent_dir_end = file.find_last_of('/');
+ if (parent_dir_end == std::string::npos) {
+ return false;
+ }
+ std::string file_dir = file.substr(0, parent_dir_end);
+ return filesystem.CreateDirectoryRecursively(file_dir.c_str()) &&
+ filesystem.Grow(file.c_str(), size);
+}
+
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+
+TEST(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) {
+ constexpr int kPostingListBytes = 20;
+
+ Filesystem filesystem;
+ std::string flash_file = GetTestTempDir() + "/flash/0";
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
+
+ {
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize,
+ kPostingListBytes));
+ EXPECT_TRUE(block.has_free_posting_lists());
+ }
+}
+
+TEST(IndexBlockTest, SizeAccessorsWorkCorrectly) {
+ constexpr int kPostingListBytes1 = 20;
+
+ Filesystem filesystem;
+ std::string flash_file = GetTestTempDir() + "/flash/0";
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
+
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize,
+ kPostingListBytes1));
+ EXPECT_THAT(block.get_posting_list_bytes(), Eq(kPostingListBytes1));
+ // There should be (4096 - 12) / 20 = 204 posting lists
+ // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 203 in only 8
+ // bits.
+ EXPECT_THAT(block.max_num_posting_lists(), Eq(204));
+ EXPECT_THAT(block.posting_list_index_bits(), Eq(8));
+
+ constexpr int kPostingListBytes2 = 200;
+
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file, /*offset=*/0,
+ kBlockSize, kPostingListBytes2));
+ EXPECT_THAT(block.get_posting_list_bytes(), Eq(kPostingListBytes2));
+ // There should be (4096 - 12) / 200 = 20 posting lists
+ // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 19 in only 5
+ // bits.
+ EXPECT_THAT(block.max_num_posting_lists(), Eq(20));
+ EXPECT_THAT(block.posting_list_index_bits(), Eq(5));
+}
+
+TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
+ constexpr int kPostingListBytes = 2000;
+
+ Filesystem filesystem;
+ std::string flash_file = GetTestTempDir() + "/flash/0";
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
+
+ std::vector<Hit> test_hits{
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*score=*/99),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*score=*/17),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kMaxHitScore),
+ };
+ PostingListIndex allocated_index;
+ {
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file,
+ /*offset=*/0,
+ /*block_size=*/kBlockSize, kPostingListBytes));
+ // Add hits to the first posting list.
+ ICING_ASSERT_OK_AND_ASSIGN(allocated_index, block.AllocatePostingList());
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
+ block.GetAllocatedPostingList(allocated_index));
+ for (const Hit& hit : test_hits) {
+ ICING_ASSERT_OK(pl_used.PrependHit(hit));
+ }
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(
+ test_hits.rbegin(), test_hits.rend())));
+ }
+ {
+ // Create an IndexBlock from the previously allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block,
+ IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
+ block.GetAllocatedPostingList(allocated_index));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(
+ test_hits.rbegin(), test_hits.rend())));
+ EXPECT_TRUE(block.has_free_posting_lists());
+ }
+}
+
+TEST(IndexBlockTest, IndexBlockMultiplePostingLists) {
+ constexpr int kPostingListBytes = 2000;
+
+ Filesystem filesystem;
+ std::string flash_file = GetTestTempDir() + "/flash/0";
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
+
+ std::vector<Hit> hits_in_posting_list1{
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*score=*/99),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*score=*/17),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kMaxHitScore),
+ };
+ std::vector<Hit> hits_in_posting_list2{
+ Hit(/*section_id=*/12, /*document_id=*/220, /*score=*/88),
+ Hit(/*section_id=*/17, /*document_id=*/265, Hit::kMaxHitScore),
+ Hit(/*section_id=*/0, /*document_id=*/287, /*score=*/2),
+ Hit(/*section_id=*/11, /*document_id=*/306, /*score=*/12),
+ Hit(/*section_id=*/10, /*document_id=*/306, Hit::kMaxHitScore),
+ };
+ PostingListIndex allocated_index_1;
+ PostingListIndex allocated_index_2;
+ {
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize,
+ kPostingListBytes));
+
+ // Add hits to the first posting list.
+ ICING_ASSERT_OK_AND_ASSIGN(allocated_index_1, block.AllocatePostingList());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used_1,
+ block.GetAllocatedPostingList(allocated_index_1));
+ for (const Hit& hit : hits_in_posting_list1) {
+ ICING_ASSERT_OK(pl_used_1.PrependHit(hit));
+ }
+ EXPECT_THAT(pl_used_1.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
+ hits_in_posting_list1.rend())));
+
+ // Add hits to the second posting list.
+ ICING_ASSERT_OK_AND_ASSIGN(allocated_index_2, block.AllocatePostingList());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used_2,
+ block.GetAllocatedPostingList(allocated_index_2));
+ for (const Hit& hit : hits_in_posting_list2) {
+ ICING_ASSERT_OK(pl_used_2.PrependHit(hit));
+ }
+ EXPECT_THAT(pl_used_2.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
+ hits_in_posting_list2.rend())));
+
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_FALSE(block.has_free_posting_lists());
+ }
+ {
+ // Create an IndexBlock from the previously allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block,
+ IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used_1,
+ block.GetAllocatedPostingList(allocated_index_1));
+ EXPECT_THAT(pl_used_1.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
+ hits_in_posting_list1.rend())));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used_2,
+ block.GetAllocatedPostingList(allocated_index_2));
+ EXPECT_THAT(pl_used_2.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
+ hits_in_posting_list2.rend())));
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_FALSE(block.has_free_posting_lists());
+ }
+}
+
+TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) {
+ constexpr int kPostingListBytes = 2000;
+
+ Filesystem filesystem;
+ std::string flash_file = GetTestTempDir() + "/flash/0";
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
+
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block,
+ IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize, kPostingListBytes));
+
+ // Add hits to the first posting list.
+ std::vector<Hit> hits_in_posting_list1{
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*score=*/99),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*score=*/17),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kMaxHitScore),
+ };
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListIndex allocated_index_1,
+ block.AllocatePostingList());
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used_1,
+ block.GetAllocatedPostingList(allocated_index_1));
+ for (const Hit& hit : hits_in_posting_list1) {
+ ICING_ASSERT_OK(pl_used_1.PrependHit(hit));
+ }
+ EXPECT_THAT(pl_used_1.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
+ hits_in_posting_list1.rend())));
+
+ // Add hits to the second posting list.
+ std::vector<Hit> hits_in_posting_list2{
+ Hit(/*section_id=*/12, /*document_id=*/220, /*score=*/88),
+ Hit(/*section_id=*/17, /*document_id=*/265, Hit::kMaxHitScore),
+ Hit(/*section_id=*/0, /*document_id=*/287, /*score=*/2),
+ Hit(/*section_id=*/11, /*document_id=*/306, /*score=*/12),
+ Hit(/*section_id=*/10, /*document_id=*/306, Hit::kMaxHitScore),
+ };
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListIndex allocated_index_2,
+ block.AllocatePostingList());
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used_2,
+ block.GetAllocatedPostingList(allocated_index_2));
+ for (const Hit& hit : hits_in_posting_list2) {
+ ICING_ASSERT_OK(pl_used_2.PrependHit(hit));
+ }
+ EXPECT_THAT(pl_used_2.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
+ hits_in_posting_list2.rend())));
+
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_FALSE(block.has_free_posting_lists());
+
+ // Now free the first posting list. Then, reallocate it and fill it with a
+ // different set of hits.
+ block.FreePostingList(allocated_index_1);
+ EXPECT_TRUE(block.has_free_posting_lists());
+
+ std::vector<Hit> hits_in_posting_list3{
+ Hit(/*section_id=*/12, /*document_id=*/0, /*score=*/88),
+ Hit(/*section_id=*/17, /*document_id=*/1, Hit::kMaxHitScore),
+ Hit(/*section_id=*/0, /*document_id=*/2, /*score=*/2),
+ };
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListIndex allocated_index_3,
+ block.AllocatePostingList());
+ EXPECT_THAT(allocated_index_3, Eq(allocated_index_1));
+ ICING_ASSERT_OK_AND_ASSIGN(pl_used_1,
+ block.GetAllocatedPostingList(allocated_index_3));
+ for (const Hit& hit : hits_in_posting_list3) {
+ ICING_ASSERT_OK(pl_used_1.PrependHit(hit));
+ }
+ EXPECT_THAT(pl_used_1.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list3.rbegin(),
+ hits_in_posting_list3.rend())));
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_FALSE(block.has_free_posting_lists());
+}
+
+TEST(IndexBlockTest, IndexBlockNextBlockIndex) {
+ constexpr int kPostingListBytes = 2000;
+ constexpr int kSomeBlockIndex = 22;
+
+ Filesystem filesystem;
+ std::string flash_file = GetTestTempDir() + "/flash/0";
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
+
+ {
+ // Create an IndexBlock from this newly allocated file block and set the
+ // next block index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize,
+ kPostingListBytes));
+ EXPECT_THAT(block.next_block_index(), Eq(kInvalidBlockIndex));
+ block.set_next_block_index(kSomeBlockIndex);
+ EXPECT_THAT(block.next_block_index(), Eq(kSomeBlockIndex));
+ }
+ {
+ // Create an IndexBlock from this previously allocated file block and make
+ // sure that next_block_index is still set properly.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block,
+ IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize));
+ EXPECT_THAT(block.next_block_index(), Eq(kSomeBlockIndex));
+ }
+ {
+    // Create an IndexBlock, treating this file block as uninitialized. This
+    // resets the next_block_index to kInvalidBlockIndex.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem, flash_file, /*offset=*/0, kBlockSize,
+ kPostingListBytes));
+ EXPECT_THAT(block.next_block_index(), Eq(kInvalidBlockIndex));
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-free.h b/icing/index/main/posting-list-free.h
new file mode 100644
index 0000000..4b27401
--- /dev/null
+++ b/icing/index/main/posting-list-free.h
@@ -0,0 +1,129 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
+#define ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
+
+#include <string.h>
+#include <sys/mman.h>
+
+#include <cstdint>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// A FlashIndexBlock can contain multiple posting lists. This specifies which
+// PostingList in the FlashIndexBlock we want to refer to.
+using PostingListIndex = int32_t;
+inline constexpr PostingListIndex kInvalidPostingListIndex = ~0U;
+
+// A posting list in the index block's free list.
+//
+// We re-use the first sizeof(PostingListIndex) bytes of the posting list
+// buffer to store a next index for chaining.
+class PostingListFree {
+ public:
+ // Creates a PostingListFree that points to a buffer of size_in_bytes bytes.
+ // 'Preexisting' means that posting_list_buffer was previously modified by
+ // another instance of PostingListFree.
+ //
+ // Caller owns the posting_list_buffer and must not free it while using
+ // a PostingListFree.
+ //
+ // RETURNS:
+ // - A valid PostingListFree on success
+ // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
+ // || size_in_bytes % sizeof(Hit) != 0.
+ // - FAILED_PRECONDITION if posting_list_buffer is null
+ static libtextclassifier3::StatusOr<PostingListFree>
+ CreateFromPreexistingPostingListFreeRegion(void *posting_list_buffer,
+ uint32_t size_in_bytes) {
+ ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
+ if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d is invalid!", size_in_bytes));
+ }
+ return PostingListFree(posting_list_buffer, size_in_bytes);
+ }
+
+ // Creates a PostingListFree that points to a buffer of size_in_bytes bytes
+ // and initializes the content of the buffer so that the returned
+ // PostingListFree is empty.
+ //
+ // Caller owns the posting_list_buffer buffer and must not free it while using
+ // a PostingListFree.
+ //
+ // RETURNS:
+ // - A valid PostingListFree on success
+  //   - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
+  //       || size_in_bytes % sizeof(Hit) != 0.
+ // - FAILED_PRECONDITION if posting_list_buffer is null
+ static libtextclassifier3::StatusOr<PostingListFree>
+ CreateFromUnitializedRegion(void *posting_list_buffer,
+ uint32_t size_in_bytes) {
+ ICING_ASSIGN_OR_RETURN(PostingListFree posting_list_free,
+ CreateFromPreexistingPostingListFreeRegion(
+ posting_list_buffer, size_in_bytes));
+ posting_list_free.Clear();
+ return posting_list_free;
+ }
+
+ // Used to store/access the index of the next free posting list in this
+ // index block.
+ PostingListIndex get_next_posting_list_index() const {
+ PostingListIndex posting_list_index;
+ memcpy(&posting_list_index, posting_list_buffer_,
+ sizeof(posting_list_index));
+ return posting_list_index;
+ }
+ void set_next_posting_list_index(PostingListIndex posting_list_index) {
+ memcpy(posting_list_buffer_, &posting_list_index,
+ sizeof(posting_list_index));
+ }
+
+ private:
+ PostingListFree(void *posting_list_buffer, uint32_t size_in_bytes)
+ : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
+ size_in_bytes_(size_in_bytes) {}
+
+ // Reset the current free posting list as unchained free posting list so that
+ // there's no next posting list index. This *must* be called if the
+ // posting_list_buffer_ region was never used for a previous instance of
+ // PostingListFree.
+ void Clear() { set_next_posting_list_index(kInvalidPostingListIndex); }
+
+  // A byte array of size size_in_bytes_. The first sizeof(PostingListIndex)
+  // bytes store the next posting list index; the rest are unused and can
+  // hold anything.
+ uint8_t *posting_list_buffer_;
+ uint32_t size_in_bytes_;
+
+ static_assert(sizeof(PostingListIndex) <=
+ posting_list_utils::min_posting_list_size(),
+ "PostingListIndex must be small enough to fit in a "
+ "minimum-sized Posting List.");
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
diff --git a/icing/index/main/posting-list-free_test.cc b/icing/index/main/posting-list-free_test.cc
new file mode 100644
index 0000000..a152934
--- /dev/null
+++ b/icing/index/main/posting-list-free_test.cc
@@ -0,0 +1,131 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-free.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gtest/gtest.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+TEST(PostingListTest, PostingListFree) {
+ static const size_t kHitsSize = 2551 * sizeof(Hit);
+
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListFree pl_free,
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+ EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex);
+}
+
+TEST(PostingListTest, PostingListTooSmallInvalidArgument) {
+ static const size_t kHitSizeTooSmall =
+ posting_list_utils::min_posting_list_size() - sizeof(Hit);
+
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitSizeTooSmall);
+ EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeTooSmall),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeTooSmall),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListTest, PostingListNotAlignedInvalidArgument) {
+ static const size_t kHitSizeNotAligned =
+ posting_list_utils::min_posting_list_size() + 1;
+
+ std::unique_ptr<char[]> hits_buf =
+ std::make_unique<char[]>(kHitSizeNotAligned);
+ EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeNotAligned),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeNotAligned),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListTest, PostingListNullBufferFailedPrecondition) {
+ static const size_t kHitSize = posting_list_utils::min_posting_list_size();
+ EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
+ /*posting_list_buffer=*/nullptr, kHitSize),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ /*posting_list_buffer=*/nullptr, kHitSize),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST(PostingListTest, PostingListFreePreexistingRegion) {
+ constexpr PostingListIndex kOtherPostingListIndex = 12;
+ static const size_t kHitsSize = 2551 * sizeof(Hit);
+
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
+ {
+ // Set posting list index to kOtherPostingListIndex
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListFree pl_free,
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+ pl_free.set_next_posting_list_index(kOtherPostingListIndex);
+ EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
+ }
+ {
+ // We should still be able to retrieve kOtherPostingListIndex when we create
+ // a posting list from the existing region.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListFree pl_free,
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+ EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
+ }
+}
+
+TEST(PostingListTest, PostingListFreeUninitializedRegion) {
+ constexpr PostingListIndex kOtherPostingListIndex = 12;
+ static const size_t kHitsSize = 2551 * sizeof(Hit);
+
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
+ {
+ // Set posting list index to kOtherPostingListIndex
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListFree pl_free,
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+ pl_free.set_next_posting_list_index(kOtherPostingListIndex);
+ EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
+ }
+ {
+ // Creating from an uninitialized region should cause the posting list to
+ // override kOtherPostingListIndex with kInvalidPostingListIndex.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListFree pl_free,
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+ EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex);
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-used.cc b/icing/index/main/posting-list-used.cc
new file mode 100644
index 0000000..a439c45
--- /dev/null
+++ b/icing/index/main/posting-list-used.cc
@@ -0,0 +1,675 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-used.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
+#include <limits>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+uint32_t GetScoreByteSize(const Hit &hit) {
+ return hit.has_score() ? sizeof(Hit::Score) : 0;
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<PostingListUsed>
+PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
+ void *posting_list_buffer, uint32_t size_in_bytes) {
+ ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
+ if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d is invalid!", size_in_bytes));
+ }
+ return PostingListUsed(posting_list_buffer, size_in_bytes);
+}
+
+libtextclassifier3::StatusOr<PostingListUsed>
+PostingListUsed::CreateFromUnitializedRegion(void *posting_list_buffer,
+ uint32_t size_in_bytes) {
+ ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used,
+ CreateFromPreexistingPostingListUsedRegion(
+ posting_list_buffer, size_in_bytes));
+ posting_list_used.Clear();
+ return posting_list_used;
+}
+
+void PostingListUsed::Clear() {
+  // Safe to ignore the return value because size_in_bytes_ is a valid argument.
+ set_start_byte_offset(size_in_bytes_);
+}
+
+libtextclassifier3::Status PostingListUsed::MoveFrom(PostingListUsed *other) {
+ ICING_RETURN_ERROR_IF_NULL(other);
+ if (other->MinPostingListSizeToFit() > size_in_bytes_) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "other->MinPostingListSizeToFit %d must be larger than size %d.",
+ other->MinPostingListSizeToFit(), size_in_bytes_));
+ }
+
+ if (!IsPostingListValid()) {
+ return absl_ports::FailedPreconditionError(
+ "This posting list is in an invalid state and can't be used!");
+ }
+ if (!other->IsPostingListValid()) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot MoveFrom an invalid posting list!");
+ }
+
+ // Pop just enough hits that all of other's compressed hits fit in
+ // this posting_list's compressed area. Then we can memcpy that area.
+ std::vector<Hit> hits;
+ while (other->full() || other->almost_full() ||
+ (size_in_bytes_ - posting_list_utils::kSpecialHitsSize <
+ other->BytesUsed())) {
+ if (!other->GetHitsInternal(/*limit=*/1, /*pop=*/true, &hits).ok()) {
+ return absl_ports::AbortedError(
+ "Unable to retrieve hits from other posting list.");
+ }
+ }
+
+ // memcpy the area and set up start byte offset.
+ Clear();
+ memcpy(posting_list_buffer_ + size_in_bytes_ - other->BytesUsed(),
+ other->posting_list_buffer_ + other->get_start_byte_offset(),
+ other->BytesUsed());
+ // Because we popped all hits from other outside of the compressed area and we
+ // guaranteed that other->BytesUsed is less than size_in_bytes_ -
+  // kSpecialHitsSize. This is guaranteed to be a valid byte offset for the
+ // NOT_FULL state, so ignoring the value is safe.
+ set_start_byte_offset(size_in_bytes_ - other->BytesUsed());
+
+ // Put back remaining hits.
+ for (size_t i = 0; i < hits.size(); i++) {
+ const Hit &hit = hits[hits.size() - i - 1];
+ // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
+ // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
+ // should be impossible because we've already assured that there is enough
+ // room above.
+ ICING_RETURN_IF_ERROR(PrependHit(hit));
+ }
+
+ other->Clear();
+ return libtextclassifier3::Status::OK;
+}
+
+uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const {
+ Hit::Value pad;
+ uint32_t pad_end = offset;
+ while (pad_end < size_in_bytes_) {
+ size_t pad_len = VarInt::Decode(posting_list_buffer_ + pad_end, &pad);
+ if (pad != 0) {
+ // No longer a pad.
+ break;
+ }
+ pad_end += pad_len;
+ }
+ return pad_end;
+}
+
+bool PostingListUsed::PadToEnd(uint32_t start, uint32_t end) {
+ if (end > size_in_bytes_) {
+ ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
+ return false;
+ }
+ // In VarInt a value of 0 encodes to 0.
+ memset(posting_list_buffer_ + start, 0, end - start);
+ return true;
+}
+
+libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull(
+ const Hit &hit) {
+ // Get delta between first hit and the new hit. Try to fit delta
+ // in the padded area and put new hit at the special position 1.
+ // Calling ValueOrDie is safe here because 1 < kNumSpecialHits.
+ Hit cur = get_special_hit(1).ValueOrDie();
+ if (cur.value() <= hit.value()) {
+ return absl_ports::InvalidArgumentError(
+ "Hit being prepended must be strictly less than the most recent Hit");
+ }
+ uint64_t delta = cur.value() - hit.value();
+ uint8_t delta_buf[VarInt::kMaxEncodedLen64];
+ size_t delta_len = VarInt::Encode(delta, delta_buf);
+ uint32_t cur_score_bytes = GetScoreByteSize(cur);
+
+ uint32_t pad_end = GetPadEnd(posting_list_utils::kSpecialHitsSize);
+
+ if (pad_end >=
+ posting_list_utils::kSpecialHitsSize + delta_len + cur_score_bytes) {
+ // Pad area has enough space for delta and score of existing hit
+ // (cur). Write delta at pad_end - delta_len - cur_score_bytes.
+ uint8_t *delta_offset =
+ posting_list_buffer_ + pad_end - delta_len - cur_score_bytes;
+ memcpy(delta_offset, delta_buf, delta_len);
+ // Now copy score.
+ Hit::Score score = cur.score();
+ uint8_t *score_offset = delta_offset + delta_len;
+ memcpy(score_offset, &score, cur_score_bytes);
+
+ // Now first hit is the new hit, at special position 1. Safe to ignore the
+ // return value because 1 < kNumSpecialHits.
+ set_special_hit(1, hit);
+ // Safe to ignore the return value because sizeof(Hit) is a valid argument.
+ set_start_byte_offset(sizeof(Hit));
+ } else {
+    // No space for delta. We put the new hit at special position 0
+    // and go to the full state. Safe to ignore the return value because 0 <
+    // kNumSpecialHits.
+ set_special_hit(0, hit);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+void PostingListUsed::PrependHitToEmpty(const Hit &hit) {
+ // First hit to be added. Just add verbatim, no compression.
+ if (size_in_bytes_ == posting_list_utils::kSpecialHitsSize) {
+ // Safe to ignore the return value because 1 < kNumSpecialHits
+ set_special_hit(1, hit);
+ // Safe to ignore the return value because sizeof(Hit) is a valid argument.
+ set_start_byte_offset(sizeof(Hit));
+ } else {
+ // Since this is the first hit, size != kSpecialHitsSize and
+ // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
+ // the compressed region, so ValueOrDie is safe.
+ uint32_t offset = PrependHitUncompressed(hit, size_in_bytes_).ValueOrDie();
+ // Safe to ignore the return value because PrependHitUncompressed is
+ // guaranteed to return a valid offset.
+ set_start_byte_offset(offset);
+ }
+}
+
+libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
+ const Hit &hit, uint32_t offset) {
+ // First hit in compressed area. It is uncompressed. See if delta
+ // between the first hit and new hit will still fit in the
+ // compressed area.
+ if (offset + sizeof(Hit::Value) > size_in_bytes_) {
+ // The first hit in the compressed region *should* be uncompressed, but
+ // somehow there isn't enough room between offset and the end of the
+ // compressed area to fit an uncompressed hit. This should NEVER happen.
+ return absl_ports::FailedPreconditionError(
+ "Posting list is in an invalid state.");
+ }
+ Hit::Value cur_value;
+ memcpy(&cur_value, posting_list_buffer_ + offset, sizeof(Hit::Value));
+ if (cur_value <= hit.value()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Hit %d being prepended must be strictly less than the most recent "
+ "Hit %d",
+ hit.value(), cur_value));
+ }
+ uint64_t delta = cur_value - hit.value();
+ uint8_t delta_buf[VarInt::kMaxEncodedLen64];
+ size_t delta_len = VarInt::Encode(delta, delta_buf);
+ uint32_t hit_score_bytes = GetScoreByteSize(hit);
+
+ // offset now points to one past the end of the first hit.
+ offset += sizeof(Hit::Value);
+ if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
+ hit_score_bytes <=
+ offset) {
+ // Enough space for delta in compressed area.
+
+ // Prepend delta.
+ offset -= delta_len;
+ memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
+
+ // Prepend new hit with (possibly) its score. We know that there is room
+ // for 'hit' because of the if statement above, so calling ValueOrDie is
+ // safe.
+ offset = PrependHitUncompressed(hit, offset).ValueOrDie();
+ // offset is guaranteed to be valid here. So it's safe to ignore the return
+    // value. The if above will guarantee that offset >= kSpecialHitsSize and <
+ // size_in_bytes_ because the if ensures that there is enough room between
+ // offset and kSpecialHitSize to fit the delta of the previous hit, any
+ // score and the uncompressed hit.
+ set_start_byte_offset(offset);
+ } else if (posting_list_utils::kSpecialHitsSize + delta_len <= offset) {
+ // Only have space for delta. The new hit must be put in special
+ // position 1.
+
+ // Prepend delta.
+ offset -= delta_len;
+ memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
+
+ // Prepend pad. Safe to ignore the return value of PadToEnd because offset
+ // must be less than size_in_bytes_. Otherwise, this function already would
+ // have returned FAILED_PRECONDITION.
+ PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
+
+ // Put new hit in special position 1. Safe to ignore return value because 1
+ // < kNumSpecialHits.
+ set_special_hit(1, hit);
+
+ // State almost_full. Safe to ignore the return value because sizeof(Hit) is
+ // a valid argument.
+ set_start_byte_offset(sizeof(Hit));
+ } else {
+ // Very rare case where delta is larger than sizeof(Hit::Value)
+ // (i.e. varint delta encoding expanded required storage). We
+ // move first hit to special position 1 and put new hit in
+ // special position 0.
+ Hit cur(cur_value);
+ if (cur.has_score()) {
+ // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
+      // Therefore, offset must be less than kSpecialHitsSize + 5. Since posting
+ // list size must be divisible by sizeof(Hit) (5), it is guaranteed that
+      // offset < size_in_bytes_, so it is safe to call ValueOrDie here.
+ cur = Hit(cur_value, ReadScore(offset).ValueOrDie());
+ offset += sizeof(Hit::Score);
+ }
+ // Safe to ignore the return value of PadToEnd because offset must be less
+ // than size_in_bytes_. Otherwise, this function already would have returned
+ // FAILED_PRECONDITION.
+ PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
+ // Safe to ignore the return value here because 0 and 1 < kNumSpecialHits.
+ set_special_hit(1, cur);
+ set_special_hit(0, hit);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PostingListUsed::PrependHit(const Hit &hit) {
+ static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
+ "Hit::Value cannot be larger than 8 bytes because the delta "
+ "must be able to fit in 8 bytes.");
+ if (!hit.is_valid()) {
+ return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
+ }
+ if (!IsPostingListValid()) {
+ return absl_ports::FailedPreconditionError(
+ "This PostingListUsed is in an invalid state and can't add any hits!");
+ }
+
+ if (full()) {
+ // State full: no space left.
+ return absl_ports::ResourceExhaustedError("No more room for hits");
+ } else if (almost_full()) {
+ return PrependHitToAlmostFull(hit);
+ } else if (empty()) {
+ PrependHitToEmpty(hit);
+ return libtextclassifier3::Status::OK;
+ } else {
+ uint32_t offset = get_start_byte_offset();
+ return PrependHitToNotFull(hit, offset);
+ }
+}
+
+libtextclassifier3::StatusOr<std::vector<Hit>> PostingListUsed::GetHits()
+ const {
+ std::vector<Hit> hits_out;
+ ICING_RETURN_IF_ERROR(GetHits(&hits_out));
+ return hits_out;
+}
+
+libtextclassifier3::Status PostingListUsed::GetHits(
+ std::vector<Hit> *hits_out) const {
+ return GetHitsInternal(/*limit=*/std::numeric_limits<uint32_t>::max(),
+ /*pop=*/false, hits_out);
+}
+
+libtextclassifier3::Status PostingListUsed::PopFrontHits(uint32_t num_hits) {
+ if (num_hits == 1 && full()) {
+ // The PL is in full status which means that we save 2 uncompressed hits in
+    // the 2 special positions. But full status may be reached by 2 different
+ // statuses.
+ // (1) In "almost full" status
+ // +-----------------+----------------+-------+-----------------+
+ // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ // When we prepend another hit, we can only put it at the special
+ // position 0. And we get a full PL
+ // +-----------------+----------------+-------+-----------------+
+ // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ // (2) In "not full" status
+ // +-----------------+----------------+------+-------+------------------+
+ // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
+ // +-----------------+----------------+------+-------+------------------+
+ // When we prepend another hit, we can reach any of the 3 following
+ // scenarios:
+ // (2.1) not full
+ // if the space of pad and original 1st hit can accommodate the new 1st hit
+ // and the encoded delta value.
+ // +-----------------+----------------+------+-----------+-----------------+
+ // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
+ // +-----------------+----------------+------+-----------+-----------------+
+ // (2.2) almost full
+ // If the space of pad and original 1st hit cannot accommodate the new 1st
+ // hit and the encoded delta value but can accommodate the encoded delta
+ // value only. We can put the new 1st hit at special position 1.
+ // +-----------------+----------------+-------+-----------------+
+ // |Hit::kInvalidVal |new 1st hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ // (2.3) full
+ // In very rare case, it cannot even accommodate only the encoded delta
+ // value. we can move the original 1st hit into special position 1 and the
+ // new 1st hit into special position 0. This may happen because we use
+ // VarInt encoding method which may make the encoded value longer (about
+ // 4/3 times of original)
+ // +-----------------+----------------+-------+-----------------+
+ // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ // Suppose now the PL is full. But we don't know whether it arrived to
+ // this status from "not full" like (2.3) or from "almost full" like (1).
+ // We'll return to "almost full" status like (1) if we simply pop the new
+ // 1st hit but we want to make the prepending operation "reversible". So
+ // there should be some way to return to "not full" if possible. A simple
+ // way to do it is to pop 2 hits out of the PL to status "almost full" or
+ // "not full". And add the original 1st hit back. We can return to the
+ // correct original statuses of (2.1) or (1). This makes our prepending
+ // operation reversible.
+ std::vector<Hit> out;
+
+ // Popping 2 hits should never fail because we've just ensured that the
+ // posting list is in the FULL state.
+ ICING_RETURN_IF_ERROR(GetHitsInternal(/*limit=*/2, /*pop=*/true, &out));
+
+ // PrependHit should never fail because out[1] is a valid hit less than
+ // previous hits in the posting list and because there's no way that the
+ // posting list could run out of room because it previously stored this hit
+ // AND another hit.
+ PrependHit(out[1]);
+ } else if (num_hits > 0) {
+ return GetHitsInternal(/*limit=*/num_hits, /*pop=*/true, nullptr);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PostingListUsed::GetHitsInternal(
+ uint32_t limit, bool pop, std::vector<Hit> *out) const {
+ // Put current uncompressed val here.
+ Hit::Value val = Hit::kInvalidValue;
+ uint32_t offset = get_start_byte_offset();
+ uint32_t count = 0;
+
+ // First traverse the first two special positions.
+ while (count < limit && offset < posting_list_utils::kSpecialHitsSize) {
+ // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
+ // kNumSpecialHits because of the check above.
+ Hit hit = get_special_hit(offset / sizeof(Hit)).ValueOrDie();
+ val = hit.value();
+ if (out != nullptr) {
+ out->push_back(hit);
+ }
+ offset += sizeof(Hit);
+ count++;
+ }
+
+ // If special position 1 was set then we need to skip padding.
+ if (val != Hit::kInvalidValue &&
+ offset == posting_list_utils::kSpecialHitsSize) {
+ offset = GetPadEnd(offset);
+ }
+
+ while (count < limit && offset < size_in_bytes_) {
+ if (val == Hit::kInvalidValue) {
+ // First hit is in compressed area. Put that in val.
+ memcpy(&val, posting_list_buffer_ + offset, sizeof(Hit::Value));
+ offset += sizeof(Hit::Value);
+ } else {
+ // Now we have delta encoded subsequent hits. Decode and push.
+ uint64_t delta;
+ offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
+ val += delta;
+ }
+ Hit hit(val);
+ if (hit.has_score()) {
+ auto score_or = ReadScore(offset);
+ if (!score_or.ok()) {
+ // This posting list has been corrupted somehow. The first hit of the
+ // posting list claims to have a score, but there's no more room in the
+ // posting list for that score to exist. Clear out and return an
+ // INTERNAL error to indicate corruption.
+ out->clear();
+ return absl_ports::InternalError("Posting list has been corrupted!");
+ }
+ hit = Hit(val, score_or.ValueOrDie());
+ offset += sizeof(Hit::Score);
+ }
+ if (out != nullptr) {
+ out->push_back(hit);
+ }
+ count++;
+ }
+
+ if (pop) {
+ PostingListUsed *mutable_this = const_cast<PostingListUsed *>(this);
+ // Modify the posting list so that we pop all hits actually
+ // traversed.
+ if (offset >= posting_list_utils::kSpecialHitsSize &&
+ offset < size_in_bytes_) {
+ // In the compressed area. Pop and reconstruct. offset/val is
+ // the last traversed hit, which we must discard. So move one
+ // more forward.
+ uint64_t delta;
+ offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
+ val += delta;
+
+ // Now val is the first hit of the new posting list.
+ if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
+ // val fits in compressed area. Simply copy.
+ offset -= sizeof(Hit::Value);
+ memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
+ } else {
+ // val won't fit in compressed area. Also see if there is a score.
+ Hit hit(val);
+ if (hit.has_score()) {
+ auto score_or = ReadScore(offset);
+ if (!score_or.ok()) {
+ // This posting list has been corrupted somehow. The first hit of
+ // the posting list claims to have a score, but there's no more room
+ // in the posting list for that score to exist. Clear out and
+ // return an INTERNAL error to indicate corruption. Do not pop
+ // anything.
+ out->clear();
+ return absl_ports::InternalError(
+ "Posting list has been corrupted!");
+ }
+ hit = Hit(val, score_or.ValueOrDie());
+ }
+ // Okay to ignore the return value here because 1 < kNumSpecialHits.
+ mutable_this->set_special_hit(1, hit);
+
+ // Prepend pad. Safe to ignore the return value of PadToEnd because
+ // offset must be less than size_in_bytes_ thanks to the if above.
+ mutable_this->PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
+ offset = sizeof(Hit);
+ }
+ }
+ // offset is guaranteed to be valid so ignoring the return value of
+ // set_start_byte_offset is safe. It falls into one of four scenarios:
+ // Scenario 1: the above if was false because offset is not < size_in_bytes_
+ // In this case, offset must be == size_in_bytes_ because we reached
+ // offset by unwinding hits on the posting list.
+ // Scenario 2: offset is < kSpecialHitsSize
+ // In this case, offset is guaranteed to be either 0 or sizeof(Hit)
+ // because offset is incremented by sizeof(Hit) within the first while
+ // loop.
+ // Scenario 3: offset is within the compressed region and the new first hit
+ // in the posting list (the value that 'val' holds) will fit as an
+ // uncompressed hit in the compressed region. The resulting offset from
+ // decompressing val must be >= kSpecialHitsSize because otherwise we'd be
+ // in Scenario 4
+ // Scenario 4: offset is within the compressed region, but the new first hit
+ // in the posting list is too large to fit as an uncompressed hit in the
+ // compressed region. Therefore, it must be stored in a special hit
+ // and offset will be sizeof(Hit).
+ mutable_this->set_start_byte_offset(offset);
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<Hit> PostingListUsed::get_special_hit(
+ uint32_t index) const {
+ static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
+ if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
+ return absl_ports::InvalidArgumentError(
+ "Special hits only exist at indices 0 and 1");
+ }
+ Hit val;
+ memcpy(&val, posting_list_buffer_ + index * sizeof(val), sizeof(val));
+ return val;
+}
+
+bool PostingListUsed::set_special_hit(uint32_t index, const Hit &val) {
+ if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
+ ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
+ return false;
+ }
+ memcpy(posting_list_buffer_ + index * sizeof(val), &val, sizeof(val));
+ return true;
+}
+
+uint32_t PostingListUsed::BytesUsed() const {
+ // The special hits will be included if they represent actual hits. If they
+ // represent the hit offset or the invalid hit sentinel, they are not
+ // included.
+ return size_in_bytes_ - get_start_byte_offset();
+}
+
+uint32_t PostingListUsed::MinPostingListSizeToFit() const {
+ if (full() || almost_full()) {
+ // If in either the FULL state or ALMOST_FULL state, this posting list *is*
+ // the minimum size posting list that can fit these hits. So just return the
+ // size of the posting list.
+ return size_in_bytes_;
+ }
+
+ // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
+ // posting list that would be guaranteed to fit these hits would be
+ // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
+ // special_hit(1) and the n compressed hits in the compressed region.
+ // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
+ // fitting these hits into a posting list would require BytesUsed plus one
+ // extra hit.
+ return BytesUsed() + sizeof(Hit);
+}
+
+bool PostingListUsed::IsPostingListValid() const {
+ if (almost_full()) {
+ // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
+ // know that 1 < kNumSpecialHits.
+ if (!get_special_hit(1).ValueOrDie().is_valid()) {
+ ICING_LOG(ERROR)
+ << "Both special hits cannot be invalid at the same time.";
+ return false;
+ }
+ } else if (!full()) {
+ // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
+ // safe because we know that 0 < kNumSpecialHits.
+ if (get_special_hit(0).ValueOrDie().value() > size_in_bytes_ ||
+ get_special_hit(0).ValueOrDie().value() <
+ posting_list_utils::kSpecialHitsSize) {
+ ICING_LOG(ERROR) << "Hit: " << get_special_hit(0).ValueOrDie().value()
+ << " size: " << size_in_bytes_
+ << " sp size: " << posting_list_utils::kSpecialHitsSize;
+ return false;
+ }
+ }
+ return true;
+}
+
+uint32_t PostingListUsed::get_start_byte_offset() const {
+ if (full()) {
+ return 0;
+ } else if (almost_full()) {
+ return sizeof(Hit);
+ } else {
+ // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
+ // kNumSpecialHits.
+ return get_special_hit(0).ValueOrDie().value();
+ }
+}
+
+bool PostingListUsed::set_start_byte_offset(uint32_t offset) {
+ if (offset > size_in_bytes_) {
+ ICING_LOG(ERROR) << "offset cannot be a value greater than size "
+ << size_in_bytes_ << ". offset is " << offset << ".";
+ return false;
+ }
+ if (offset < posting_list_utils::kSpecialHitsSize && offset > sizeof(Hit)) {
+ ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
+ << ", " << posting_list_utils::kSpecialHitsSize
+ << "). offset is " << offset << ".";
+ return false;
+ }
+ if (offset < sizeof(Hit) && offset != 0) {
+ ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
+ << "). offset is " << offset << ".";
+ return false;
+ }
+ if (offset >= posting_list_utils::kSpecialHitsSize) {
+ // not_full state. Safe to ignore the return value because 0 and 1 are both
+ // < kNumSpecialHits.
+ set_special_hit(0, Hit(offset));
+ set_special_hit(1, Hit());
+ } else if (offset == sizeof(Hit)) {
+ // almost_full state. Safe to ignore the return value because 1 is <
+ // kNumSpecialHits.
+ set_special_hit(0, Hit());
+ }
+ // Nothing to do for the FULL state - the offset isn't actually stored
+ // anywhere and both special hits hold valid hits.
+ return true;
+}
+
+libtextclassifier3::StatusOr<uint32_t> PostingListUsed::PrependHitUncompressed(
+ const Hit &hit, uint32_t offset) {
+ if (hit.has_score()) {
+ if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Not enough room to prepend Hit at offset %d.", offset));
+ }
+ offset -= sizeof(Hit);
+ memcpy(posting_list_buffer_ + offset, &hit, sizeof(Hit));
+ } else {
+ if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Not enough room to prepend Hit::Value at offset %d.", offset));
+ }
+ offset -= sizeof(Hit::Value);
+ Hit::Value val = hit.value();
+ memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
+ }
+ return offset;
+}
+
+libtextclassifier3::StatusOr<Hit::Score> PostingListUsed::ReadScore(
+ uint32_t offset) const {
+ if (offset + sizeof(Hit::Score) > size_in_bytes_) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "offset %d must not point past the end of the posting list of size %d.",
+ offset, size_in_bytes_));
+ }
+ Hit::Score score;
+ memcpy(&score, posting_list_buffer_ + offset, sizeof(Hit::Score));
+ return score;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-used.h b/icing/index/main/posting-list-used.h
new file mode 100644
index 0000000..8bc9c8d
--- /dev/null
+++ b/icing/index/main/posting-list-used.h
@@ -0,0 +1,347 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_H_
+#define ICING_INDEX_MAIN_POSTING_LIST_USED_H_
+
+#include <string.h>
+#include <sys/mman.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+// A posting list with hits in it. Layout described in comments in
+// posting-list-used.cc.
+class PostingListUsed {
+ public:
+ // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes.
+ // 'Preexisting' means that posting_list_buffer was previously modified by
+ // another instance of PostingListUsed.
+ //
+ // Caller owns the hits buffer and must not free it while using a
+ // PostingListUsed.
+ //
+ // RETURNS:
+ // - A valid PostingListUsed if successful
+ // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
+ // || size_in_bytes % sizeof(Hit) != 0.
+ // - FAILED_PRECONDITION if posting_list_buffer is null
+ static libtextclassifier3::StatusOr<PostingListUsed>
+ CreateFromPreexistingPostingListUsedRegion(void *posting_list_buffer,
+ uint32_t size_in_bytes);
+
+ // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes
+ // and initializes the content of the buffer so that the returned
+ // PostingListUsed is empty.
+ //
+ // Caller owns the posting_list_buffer buffer and must not free it while using
+ // a PostingListUsed.
+ //
+ // RETURNS:
+ // - A valid PostingListUsed if successful
+ // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
+ // || size_in_bytes % sizeof(Hit) != 0.
+ // - FAILED_PRECONDITION if posting_list_buffer is null
+ static libtextclassifier3::StatusOr<PostingListUsed>
+ CreateFromUnitializedRegion(void *posting_list_buffer,
+ uint32_t size_in_bytes);
+
+ // Move contents from another posting list. Clears other.
+ //
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if 'other' is not valid or 'other' is too large to fit
+ // in 'this'.
+ // - FAILED_PRECONDITION if 'this' posting list is in a corrupted state.
+ libtextclassifier3::Status MoveFrom(PostingListUsed *other);
+
+ // Min size of posting list that can fit these used bytes. (See
+ // MoveFrom.)
+ uint32_t MinPostingListSizeToFit() const;
+
+ // Prepend a hit to the posting list.
+ // RETURNS:
+ // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
+ // previously added hit.
+ // - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
+ // list.
+ libtextclassifier3::Status PrependHit(const Hit &hit);
+
+ // Prepend hits to the posting list. Hits should be sorted in
+ // descending order (as defined by the less than operator for Hit)
+ //
+ // Returns the number of hits that could be prepended to the posting list. If
+ // keep_prepended is true, whatever could be prepended is kept, otherwise the
+ // posting list is left in its original state.
+ template <class T, Hit (*GetHit)(const T &)>
+ uint32_t PrependHitArray(const T *array, uint32_t num_hits,
+ bool keep_prepended);
+
+ // Retrieves the hits stored in the posting list.
+ //
+ // RETURNS:
+ // - On success, a vector of hits sorted by the reverse order of prepending.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<Hit>> GetHits() const;
+
+ // Same as GetHits but appends hits to hits_out.
+ //
+ // RETURNS:
+ // - On success, a vector of hits sorted by the reverse order of prepending.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetHits(std::vector<Hit> *hits_out) const;
+
+ // Undo the last num_hits hits prepended. If num_hits > number of
+ // hits we clear all hits.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status PopFrontHits(uint32_t num_hits);
+
+ // Returns bytes used by actual hits.
+ uint32_t BytesUsed() const;
+
+ private:
+ // Posting list layout formats:
+ //
+ // not_full
+ //
+ // +-----------------+----------------+-------+-----------------+
+ // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ //
+ // almost_full
+ //
+ // +-----------------+----------------+-------+-----------------+
+ // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ //
+ // full()
+ //
+ // +-----------------+----------------+-------+-----------------+
+ // |1st hit |2nd hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ //
+ // The first two uncompressed hits also implicitly encode information about
+ // the size of the compressed hits region.
+ //
+ // 1. If the posting list is NOT_FULL, then
+ // posting_list_buffer_[0] contains the byte offset of the start of the
+ // compressed hits - and, thus, the size of the compressed hits region is
+ // size_in_bytes - posting_list_buffer_[0].
+ //
+ // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
+ // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
+ // - 1] and ends at size_in_bytes - 1.
+ //
+ // Hit scores are stored after the hit value, compressed or
+ // uncompressed. For the first two special hits, we always have a
+ // space for the score. For hits in the compressed area, we only have
+ // the score following the hit value if hit.has_score() is true. This
+ // allows good compression in the common case where hits don't have a
+ // specific score.
+ //
+ // EXAMPLE
+ // Posting list storage. Posting list size: 20 bytes
+ // EMPTY!
+ // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
+ // | 20 |Hit::kInvalidVal| 0x000 |
+ // +-------------+----------------+----------------+-----------------------+
+ //
+ // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
+ // NOT FULL!
+ // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
+ // | 16 |Hit::kInvalidVal| 0x000 | 0x07FFF998 |
+ // +-------------+----------------+-----------------+----------------------+
+ //
+ // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4, Score=125)
+ // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
+ // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
+ // | 13 |Hit::kInvalidVal| 0x000 | 0x07FFF684| 125 | 788 |
+ // +-------------+----------------+-----------+-----------+------+-----------+
+ //
+ // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
+ // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
+ // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
+ // | 9 |Hit::kInvVal| 0x00 |0x07FFF4D2| 434 | 125 | 788 |
+ // +-------------+------------+--------+----------+---------+------+---------+
+ //
+ // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6, Score = 87)
+ // (Hit 0x07FFF4D2 - Hit 0x07FFF40E = 196)
+ // ALMOST FULL!
+ // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
+ // |Hit::kInvVal|0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 |
+ // +-------------+------------+---------+---------+---------+------+---------+
+ //
+ // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
+ // FULL!
+ // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20 -+
+ // | 0x07FFF320 |0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 |
+ // +-------------+-------------+---------+----------+---------+------+---------+
+ //
+ PostingListUsed(void *posting_list_buffer, uint32_t size_in_bytes)
+ : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
+ size_in_bytes_(size_in_bytes) {}
+
+ // Helpers to determine what state the posting list is in.
+ bool full() const {
+ return get_special_hit(0).ValueOrDie().is_valid() &&
+ get_special_hit(1).ValueOrDie().is_valid();
+ }
+ bool almost_full() const {
+ return !get_special_hit(0).ValueOrDie().is_valid();
+ }
+ bool empty() const {
+ return get_special_hit(0).ValueOrDie().value() == size_in_bytes_ &&
+ !get_special_hit(1).ValueOrDie().is_valid();
+ }
+
+ // Returns false if both special hits are invalid or if the offset value
+ // stored in the special hit is less than kSpecialHitsSize or greater than
+ // size_in_bytes_. Returns true, otherwise.
+ bool IsPostingListValid() const;
+
+ // Prepend hit to a posting list that is in the ALMOST_FULL state.
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if hit is not less than the previously added hit.
+ libtextclassifier3::Status PrependHitToAlmostFull(const Hit &hit);
+
+ // Prepend hit to a posting list that is in the EMPTY state. This will always
+ // succeed because there are no pre-existing hits and no validly constructed
+ // posting list could fail to fit one hit.
+ void PrependHitToEmpty(const Hit &hit);
+
+ // Prepend hit to a posting list that is in the NOT_FULL state.
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if hit is not less than the previously added hit.
+ libtextclassifier3::Status PrependHitToNotFull(const Hit &hit,
+ uint32_t offset);
+
+ // Reset contents to an empty posting list. This *must* be called if the
+ // posting_list_buffer_ region is uninitialized.
+ void Clear();
+
+ // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
+ // a byte offset between kSpecialHitsSize and size_in_bytes_ (inclusive)
+ // (not_full state).
+ uint32_t get_start_byte_offset() const;
+
+ // Sets the special hits to properly reflect what offset is (see layout
+ // comment for further details).
+ //
+ // Returns false if offset > size_in_bytes_ or offset is in (sizeof(Hit),
+ // kSpecialHitsSize) or in (0, sizeof(Hit)). True, otherwise.
+ bool set_start_byte_offset(uint32_t offset);
+
+ // Manipulate padded areas. We never store the same hit value twice
+ // so a delta of 0 is a pad byte.
+
+ // Returns offset of first non-pad byte.
+ uint32_t GetPadEnd(uint32_t offset) const;
+
+ // Fill padding between offset start and offset end with 0s.
+ // Returns false if end > size_in_bytes_. True, otherwise.
+ bool PadToEnd(uint32_t start, uint32_t end);
+
+ // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
+ // hits in the posting list if the posting list contains less than limit
+ // number of hits. out can be NULL.
+ //
+ // NOTE: If called with limit=1, pop=true on a posting list that transitioned
+ // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
+ // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
+ // ALMOST_FULL.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetHitsInternal(uint32_t limit, bool pop,
+ std::vector<Hit> *out) const;
+
+ // Retrieves the value stored in the index-th special hit.
+ //
+ // RETURNS:
+ // - A valid Hit, on success
+ // - INVALID_ARGUMENT if index is not less than kNumSpecialHits
+ libtextclassifier3::StatusOr<Hit> get_special_hit(uint32_t index) const;
+
+ // Sets the value stored in the index-th special hit to val. If index is not
+ // less than kNumSpecialHits, this has no effect and returns false.
+ bool set_special_hit(uint32_t index, const Hit &val);
+
+ // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
+ // returns the new beginning of the padded region.
+ //
+ // RETURNS:
+ // - The new beginning of the padded region, if successful.
+ // - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
+ // kSpecialHitsSize
+ libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
+ const Hit &hit, uint32_t offset);
+
+ // Reads the score located at offset and returns it. Callers are responsible
+ // for ensuring that the bytes starting at offset actually represent a score.
+ //
+ // RETURNS:
+ // - The score located at offset, if successful
+ // - INVALID_ARGUMENT if offset + sizeof(Hit::Score) > size_in_bytes_
+ libtextclassifier3::StatusOr<Hit::Score> ReadScore(uint32_t offset) const;
+
+ // A byte array of size size_in_bytes_ containing encoded hits for this
+ // posting list.
+ uint8_t *posting_list_buffer_; // does not own!
+ uint32_t size_in_bytes_;
+};
+
+// Inlined functions. Implementation details below. Avert eyes!
+template <class T, Hit (*GetHit)(const T &)>
+uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits,
+ bool keep_prepended) {
+ if (!IsPostingListValid()) {
+ return 0;
+ }
+
+ // Prepend hits working backwards from array[num_hits - 1].
+ uint32_t i;
+ for (i = 0; i < num_hits; ++i) {
+ if (!PrependHit(GetHit(array[num_hits - i - 1])).ok()) {
+ break;
+ }
+ }
+ if (i != num_hits && !keep_prepended) {
+ // Didn't fit. Undo everything and check that we have the same offset as
+ // before. PopFrontHits guarantees that it will remove all 'i' hits so long
+ // as there are at least 'i' hits in the posting list, which we know there
+ // are.
+ PopFrontHits(i);
+ }
+ return i;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_POSTING_LIST_USED_H_
diff --git a/icing/index/main/posting-list-used_test.cc b/icing/index/main/posting-list-used_test.cc
new file mode 100644
index 0000000..eb62aeb
--- /dev/null
+++ b/icing/index/main/posting-list-used_test.cc
@@ -0,0 +1,658 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-used.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <iterator>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/hit-test-utils.h"
+
+using std::reverse;
+using std::vector;
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::Eq;
+using testing::IsEmpty;
+using testing::Le;
+using testing::Lt;
+
+namespace icing {
+namespace lib {
+
+struct HitElt {
+ HitElt() = default;
+ explicit HitElt(const Hit &hit_in) : hit(hit_in) {}
+
+ static Hit get_hit(const HitElt &hit_elt) {
+ return hit_elt.hit;
+ }
+
+ Hit hit;
+};
+
+TEST(PostingListTest, PostingListUsedPrependHitNotFull) {
+ static const int kNumHits = 2551;
+ static const size_t kHitsSize = kNumHits * sizeof(Hit);
+
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+
+ // Make used.
+ Hit hit0(/*section_id=*/0, 0, /*score=*/56);
+ pl_used.PrependHit(hit0);
+ // Size = sizeof(uncompressed hit0)
+ int expected_size = sizeof(Hit);
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit0)));
+
+ Hit hit1(/*section_id=*/0, 1, Hit::kMaxHitScore);
+ pl_used.PrependHit(hit1);
+ // Size = sizeof(uncompressed hit1)
+ // + sizeof(hit0-hit1) + sizeof(hit0::score)
+ expected_size += 2 + sizeof(Hit::Score);
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0)));
+
+ Hit hit2(/*section_id=*/0, 2, /*score=*/56);
+ pl_used.PrependHit(hit2);
+ // Size = sizeof(uncompressed hit2)
+ // + sizeof(hit1-hit2)
+ // + sizeof(hit0-hit1) + sizeof(hit0::score)
+ expected_size += 2;
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
+
+ Hit hit3(/*section_id=*/0, 3, Hit::kMaxHitScore);
+ pl_used.PrependHit(hit3);
+ // Size = sizeof(uncompressed hit3)
+ // + sizeof(hit2-hit3) + sizeof(hit2::score)
+ // + sizeof(hit1-hit2)
+ // + sizeof(hit0-hit1) + sizeof(hit0::score)
+ expected_size += 2 + sizeof(Hit::Score);
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(),
+ IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
+}
+
+TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) {
+ constexpr int kHitsSize = 2 * posting_list_utils::min_posting_list_size();
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+
+ // Fill up the compressed region.
+ // Transitions:
+ // Adding hit0: EMPTY -> NOT_FULL
+ // Adding hit1: NOT_FULL -> NOT_FULL
+ // Adding hit2: NOT_FULL -> NOT_FULL
+ Hit hit0(/*section_id=*/0, 0, Hit::kMaxHitScore);
+ Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(pl_used.PrependHit(hit0));
+ ICING_EXPECT_OK(pl_used.PrependHit(hit1));
+ ICING_EXPECT_OK(pl_used.PrependHit(hit2));
+ // Size used will be 2+2+4=8 bytes
+ int expected_size = sizeof(Hit::Value) + 2 + 2;
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
+
+ // Add one more hit to transition NOT_FULL -> ALMOST_FULL
+ Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/3);
+ ICING_EXPECT_OK(pl_used.PrependHit(hit3));
+ // Compressed region would be 2+2+3+4=11 bytes, but the compressed region is
+ // only 10 bytes. So instead, the posting list will transition to ALMOST_FULL.
+ // The in-use compressed region will actually shrink from 8 bytes to 7 bytes
+ // because the uncompressed version of hit2 will be overwritten with the
+ // compressed delta of hit2. hit3 will be written to one of the special hits.
+ // Because we're in ALMOST_FULL, the expected size is the size of the pl minus
+ // the one hit used to mark the posting list as ALMOST_FULL.
+ expected_size = kHitsSize - sizeof(Hit);
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(),
+ IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
+
+ // Add one more hit to transition ALMOST_FULL -> ALMOST_FULL
+ Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(pl_used.PrependHit(hit4));
+ // There are currently 7 bytes in use in the compressed region. hit3 will have
+ // a 2-byte delta. That delta will fit in the compressed region (which will
+ // now have 9 bytes in use), hit4 will be placed in one of the special hits
+ // and the posting list will remain in ALMOST_FULL.
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(),
+ IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0)));
+
+ // Add one more hit to transition ALMOST_FULL -> FULL
+ Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(pl_used.PrependHit(hit5));
+ // There are currently 9 bytes in use in the compressed region. hit4 will have
+ // a 2-byte delta which will not fit in the compressed region. So hit4 will
+ // remain in one of the special hits and hit5 will occupy the other, making
+ // the posting list FULL.
+ EXPECT_THAT(pl_used.BytesUsed(), Le(kHitsSize));
+ EXPECT_THAT(pl_used.GetHits(),
+ IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0)));
+
+ // The posting list is FULL. Adding another hit should fail.
+ Hit hit6 = CreateHit(hit5, /*desired_byte_length=*/1);
+ EXPECT_THAT(pl_used.PrependHit(hit6),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListTest, PostingListUsedMinSize) {
+ std::unique_ptr<char[]> hits_buf =
+ std::make_unique<char[]>(posting_list_utils::min_posting_list_size());
+
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()),
+ posting_list_utils::min_posting_list_size()));
+ // PL State: EMPTY
+ EXPECT_THAT(pl_used.BytesUsed(), Eq(0));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty()));
+
+ // Add a hit, PL should shift to ALMOST_FULL state
+ Hit hit0(/*section_id=*/0, 0, /*score=*/0, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/true);
+ ICING_EXPECT_OK(pl_used.PrependHit(hit0));
+ // Size = sizeof(uncompressed hit0)
+ int expected_size = sizeof(Hit);
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit0)));
+
+ // Add the smallest hit possible - no score and a delta of 1. PL should shift
+ // to FULL state.
+ Hit hit1(/*section_id=*/0, 0, /*score=*/0, /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/false);
+ ICING_EXPECT_OK(pl_used.PrependHit(hit1));
+ // Size = sizeof(uncompressed hit1) + sizeof(uncompressed hit0)
+ expected_size += sizeof(Hit);
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0)));
+
+ // Try to add the smallest hit possible. Should fail
+ Hit hit2(/*section_id=*/0, 0, /*score=*/0, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
+ EXPECT_THAT(pl_used.PrependHit(hit2),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0)));
+}
+
+TEST(PostingListTest, PostingListPrependHitArrayMinSizePostingList) {
+ constexpr int kFinalSize = 1025;
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kFinalSize);
+
+ // Min Size = 10
+ int size = posting_list_utils::min_posting_list_size();
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), size));
+
+ std::vector<HitElt> hits_in;
+ hits_in.emplace_back(Hit(1, 0, Hit::kMaxHitScore));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ std::reverse(hits_in.begin(), hits_in.end());
+
+ // Add five hits. The PL is in the empty state and an empty min size PL can
+ // only fit two hits. So PrependHitArray should fail.
+ uint32_t num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ &hits_in[0], hits_in.size(), false);
+ EXPECT_THAT(num_can_prepend, Eq(2));
+
+ int can_fit_hits = num_can_prepend;
+ // The PL has room for 2 hits. We should be able to add them without any
+ // problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL
+ const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2);
+ num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ hits_in_ptr, can_fit_hits, false);
+ EXPECT_THAT(num_can_prepend, Eq(can_fit_hits));
+ EXPECT_THAT(size, Eq(pl_used.BytesUsed()));
+ std::deque<Hit> hits_pushed;
+ std::transform(hits_in.rbegin(),
+ hits_in.rend() - hits_in.size() + can_fit_hits,
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+}
+
+TEST(PostingListTest, PostingListPrependHitArrayPostingList) {
+ // Size = 30
+ int size = 3 * posting_list_utils::min_posting_list_size();
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), size));
+
+ std::vector<HitElt> hits_in;
+ hits_in.emplace_back(Hit(1, 0, Hit::kMaxHitScore));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ std::reverse(hits_in.begin(), hits_in.end());
+ // The last hit is uncompressed and the four before it should only take one
+ // byte. Total use = 8 bytes.
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25-22 Hit #5
+ // 21-10 <unused>
+ // 9-5 kSpecialHit
+ // 4-0 Offset=22
+ // ----------------------
+ int byte_size = sizeof(Hit::Value) + hits_in.size() - 1;
+
+ // Add five hits. The PL is in the empty state and should be able to fit all
+ // five hits without issue, transitioning the PL from EMPTY -> NOT_FULL.
+ uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ &hits_in[0], hits_in.size(), false);
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed()));
+ std::deque<Hit> hits_pushed;
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ Hit first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
+ hits_in.clear();
+ hits_in.emplace_back(first_hit);
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ std::reverse(hits_in.begin(), hits_in.end());
+ // Size increased by the deltas of these hits (1+2+1+2+3+2) = 11 bytes
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25 delta(Hit #5)
+ // 24-23 delta(Hit #6)
+ // 22 delta(Hit #7)
+ // 21-20 delta(Hit #8)
+ // 19-17 delta(Hit #9)
+ // 16-15 delta(Hit #10)
+ // 14-11 Hit #11
+ // 10 <unused>
+ // 9-5 kSpecialHit
+ // 4-0 Offset=22
+ // ----------------------
+ byte_size += 11;
+
+ // Add these 6 hits. The PL is currently in the NOT_FULL state and should
+ // remain in the NOT_FULL state.
+ num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ &hits_in[0], hits_in.size(), false);
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed()));
+ // All hits from hits_in were added.
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/3);
+ hits_in.clear();
+ hits_in.emplace_back(first_hit);
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25 delta(Hit #5)
+ // 24-23 delta(Hit #6)
+ // 22 delta(Hit #7)
+ // 21-20 delta(Hit #8)
+ // 19-17 delta(Hit #9)
+ // 16-15 delta(Hit #10)
+ // 14-12 delta(Hit #11)
+ // 11-10 <unused>
+ // 9-5 Hit #12
+ // 4-0 kSpecialHit
+ // ----------------------
+ byte_size = 25;
+
+ // Add this 1 hit. The PL is currently in the NOT_FULL state and should
+ // transition to the ALMOST_FULL state - even though there is still some
+ // unused space.
+ num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ &hits_in[0], hits_in.size(), false);
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed()));
+ // All hits from hits_in were added.
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
+ hits_in.clear();
+ hits_in.emplace_back(first_hit);
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ std::reverse(hits_in.begin(), hits_in.end());
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25 delta(Hit #5)
+ // 24-23 delta(Hit #6)
+ // 22 delta(Hit #7)
+ // 21-20 delta(Hit #8)
+ // 19-17 delta(Hit #9)
+ // 16-15 delta(Hit #10)
+ // 14-12 delta(Hit #11)
+ // 11 delta(Hit #12)
+ // 10 <unused>
+ // 9-5 Hit #13
+ // 4-0 Hit #14
+ // ----------------------
+
+ // Add these 2 hits. The PL is currently in the ALMOST_FULL state. Adding the
+ // first hit should keep the PL in ALMOST_FULL because the delta between Hit
+ // #12 and Hit #13 (1 byte) can fit in the unused area (2 bytes). Adding the
+  // second hit should transition to the FULL state because the delta between
+ // Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area
+ // (1 byte).
+ num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ &hits_in[0], hits_in.size(), false);
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(size, Eq(pl_used.BytesUsed()));
+ // All hits from hits_in were added.
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+}
+
+TEST(PostingListTest, PostingListPrependHitArrayTooManyHits) {
+ static constexpr int kNumHits = 128;
+ static constexpr int kDeltaSize = 1;
+ static constexpr int kScoreSize = 1;
+ static constexpr size_t kHitsSize =
+ ((kNumHits * (kDeltaSize + kScoreSize)) / 5) * 5;
+
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
+
+ // Create an array with one too many hits
+ vector<Hit> hits_in_too_many =
+ CreateHits(kNumHits + 1, /*desired_byte_length=*/1);
+ vector<HitElt> hit_elts_in_too_many;
+ for (const Hit &hit : hits_in_too_many) {
+ hit_elts_in_too_many.emplace_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()),
+ posting_list_utils::min_posting_list_size()));
+
+ // PrependHitArray should fail because hit_elts_in_too_many is far too large
+ // for the minimum size pl.
+ uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false);
+ ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
+ ASSERT_THAT(pl_used.BytesUsed(), Eq(0));
+ ASSERT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_used, PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitsSize));
+ // PrependHitArray should fail because hit_elts_in_too_many is one hit too
+ // large for this pl.
+ num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
+ &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false);
+ ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
+ ASSERT_THAT(pl_used.BytesUsed(), Eq(0));
+ ASSERT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PostingListTest, PostingListStatusJumpFromNotFullToFullAndBack) {
+ const uint32_t pl_size = 3 * sizeof(Hit);
+ char hits_buf[pl_size];
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl,
+ PostingListUsed::CreateFromUnitializedRegion(hits_buf, pl_size));
+ ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue - 1, 0)));
+ uint32_t bytes_used = pl.BytesUsed();
+ // Status not full.
+ ASSERT_THAT(bytes_used, Le(pl_size - posting_list_utils::kSpecialHitsSize));
+ ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue >> 2, 0)));
+ // Status should jump to full directly.
+ ASSERT_THAT(pl.BytesUsed(), Eq(pl_size));
+ pl.PopFrontHits(1);
+ // Status should return to not full as before.
+ ASSERT_THAT(pl.BytesUsed(), Eq(bytes_used));
+}
+
+TEST(PostingListTest, DeltaOverflow) {
+ char hits_buf[1000];
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl,
+ PostingListUsed::CreateFromUnitializedRegion(hits_buf, 4 * sizeof(Hit)));
+
+ static const Hit::Value kOverflow[4] = {
+ Hit::kInvalidValue >> 2,
+ (Hit::kInvalidValue >> 2) * 2,
+ (Hit::kInvalidValue >> 2) * 3,
+ Hit::kInvalidValue - 1,
+ };
+
+ // Fit at least 4 ordinary values.
+ for (Hit::Value v = 0; v < 4; v++) {
+ ICING_EXPECT_OK(pl.PrependHit(Hit(4 - v)));
+ }
+
+ // Cannot fit 4 overflow values.
+ ICING_ASSERT_OK_AND_ASSIGN(pl, PostingListUsed::CreateFromUnitializedRegion(
+ hits_buf, 4 * sizeof(Hit)));
+ ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[3])));
+ ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[2])));
+
+ // Can fit only one more.
+ ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[1])));
+ EXPECT_THAT(pl.PrependHit(Hit(kOverflow[0])),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListTest, MoveFrom) {
+ int size = 3 * posting_list_utils::min_posting_list_size();
+ std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf1.get()), size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ }
+
+ std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf2.get()), size));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ }
+
+ ICING_ASSERT_OK(pl_used2.MoveFrom(&pl_used1));
+ EXPECT_THAT(pl_used2.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(pl_used1.GetHits(), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PostingListTest, MoveFromNullArgumentReturnsInvalidArgument) {
+ int size = 3 * posting_list_utils::min_posting_list_size();
+ std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf1.get()), size));
+ std::vector<Hit> hits = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits) {
+ ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ }
+
+ EXPECT_THAT(pl_used1.MoveFrom(/*other=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(pl_used1.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits.rbegin(), hits.rend())));
+}
+
+TEST(PostingListTest, MoveFromInvalidPostingListReturnsInvalidArgument) {
+ int size = 3 * posting_list_utils::min_posting_list_size();
+ std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf1.get()), size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ }
+
+ std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf2.get()), size));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ }
+
+ // Write invalid hits to the beginning of pl_used1 to make it invalid.
+ Hit invalid_hit;
+ Hit *first_hit = reinterpret_cast<Hit *>(hits_buf1.get());
+ *first_hit = invalid_hit;
+ ++first_hit;
+ *first_hit = invalid_hit;
+ EXPECT_THAT(pl_used2.MoveFrom(&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(pl_used2.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+}
+
+TEST(PostingListTest, MoveToInvalidPostingListReturnsInvalidArgument) {
+ int size = 3 * posting_list_utils::min_posting_list_size();
+ std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf1.get()), size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ }
+
+ std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf2.get()), size));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ }
+
+ // Write invalid hits to the beginning of pl_used2 to make it invalid.
+ Hit invalid_hit;
+ Hit *first_hit = reinterpret_cast<Hit *>(hits_buf2.get());
+ *first_hit = invalid_hit;
+ ++first_hit;
+ *first_hit = invalid_hit;
+ EXPECT_THAT(pl_used2.MoveFrom(&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(pl_used1.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+}
+
+TEST(PostingListTest, MoveToPostingListTooSmall) {
+ int size = 3 * posting_list_utils::min_posting_list_size();
+ std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf1.get()), size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ }
+
+ std::unique_ptr<char[]> hits_buf2 =
+ std::make_unique<char[]>(posting_list_utils::min_posting_list_size());
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf2.get()),
+ posting_list_utils::min_posting_list_size()));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/1, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ }
+
+ EXPECT_THAT(pl_used2.MoveFrom(&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(pl_used1.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(pl_used2.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-utils.cc b/icing/index/main/posting-list-utils.cc
new file mode 100644
index 0000000..b734767
--- /dev/null
+++ b/icing/index/main/posting-list-utils.cc
@@ -0,0 +1,55 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-utils.h"
+
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace posting_list_utils {
+
+bool IsValidPostingListSize(uint32_t size_in_bytes) {
+ // size must be sizeof(Hit) aligned. Otherwise, we can have serious
+ // wasted space in the worst case.
+ if (size_in_bytes % sizeof(Hit) != 0) {
+ ICING_LOG(ERROR) << "Size " << size_in_bytes << " hit " << sizeof(Hit);
+ return false;
+ }
+
+ // Must be able to store the min information.
+ if (size_in_bytes < min_posting_list_size()) {
+ ICING_LOG(ERROR) << "Size " << size_in_bytes << " is less than min size "
+ << min_posting_list_size();
+ return false;
+ }
+
+ // We re-use the first two hits as pointers into the posting list
+ // so the posting list size must fit in sizeof(Hit).
+ if (BitsToStore(size_in_bytes) > sizeof(Hit::Value) * 8) {
+ ICING_LOG(ERROR)
+ << "Posting list size must be small enough to store the offset in "
+ << sizeof(Hit::Value) * 8 << " bytes.";
+ return false;
+ }
+
+ return true;
+}
+
+} // namespace posting_list_utils
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-utils.h b/icing/index/main/posting-list-utils.h
new file mode 100644
index 0000000..77537a7
--- /dev/null
+++ b/icing/index/main/posting-list-utils.h
@@ -0,0 +1,45 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_
+#define ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_
+
+#include <cstdint>
+
+#include "icing/index/hit/hit.h"
+
+namespace icing {
+namespace lib {
+
+namespace posting_list_utils {
+
+// Represents the byte length of the two special hits described
+// in the private section of posting-list-used.h.
+inline constexpr uint32_t kNumSpecialHits = 2;
+inline constexpr uint32_t kSpecialHitsSize = sizeof(Hit) * kNumSpecialHits;
+
+constexpr uint32_t min_posting_list_size() { return kSpecialHitsSize; }
+
+// For a posting list size to be valid, it must:
+// 1) be sizeof(Hit) aligned
+// 2) be equal to or larger than min_posting_list_size
+// 3) be small enough to be encoded within a single Hit (5 bytes)
+bool IsValidPostingListSize(uint32_t size_in_bytes);
+
+} // namespace posting_list_utils
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_
diff --git a/icing/testing/i18n-test-utils.h b/icing/index/term-metadata.h
similarity index 61%
copy from icing/testing/i18n-test-utils.h
copy to icing/index/term-metadata.h
index 4e8a3b8..c1c1564 100644
--- a/icing/testing/i18n-test-utils.h
+++ b/icing/index/term-metadata.h
@@ -12,19 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TESTING_I18N_TEST_UTILS_H_
-#define ICING_TESTING_I18N_TEST_UTILS_H_
+#ifndef ICING_INDEX_TERM_METADATA_H_
+#define ICING_INDEX_TERM_METADATA_H_
#include <string>
-#include "unicode/umachine.h"
-
namespace icing {
namespace lib {
-std::string UcharToString(UChar32 uchar);
+// A POD struct storing metadata of a term.
+struct TermMetadata {
+ TermMetadata(std::string content_in, int hit_count_in)
+ : content(std::move(content_in)), hit_count(hit_count_in) {}
+
+ // Content of the term.
+ std::string content;
+
+ // Number of document hits associated with the term.
+ int hit_count;
+};
} // namespace lib
} // namespace icing
-#endif // ICING_TESTING_I18N_TEST_UTILS_H_
+#endif // ICING_INDEX_TERM_METADATA_H_
diff --git a/icing/index/term-property-id.cc b/icing/index/term-property-id.cc
new file mode 100644
index 0000000..6bdcbbd
--- /dev/null
+++ b/icing/index/term-property-id.cc
@@ -0,0 +1,49 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/term-property-id.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+enum TermPropertyId {
+ // Property id for whether the term has hits in prefix sections
+ kHasHitsInPrefixSection = 0,
+
+ // Property id for whether the term has no exact hits. No exact hits means
+ // that the term is for prefix matching only. This kind of term is used only
+ // in main index.
+ kHasNoExactHits = 1,
+
+ // The smallest property id for namespace. One namespace id will be mapped
+ // to one property id, so that all the property ids that are greater than
+ // this will all be property ids for namespaces.
+ kNamespace = 2,
+};
+
+} // namespace
+
+uint32_t GetHasHitsInPrefixSectionPropertyId() {
+ return kHasHitsInPrefixSection;
+}
+
+uint32_t GetHasNoExactHitsPropertyId() { return kHasNoExactHits; }
+
+uint32_t GetNamespacePropertyId(NamespaceId namespace_id) {
+ return kNamespace + namespace_id;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/term-property-id.h b/icing/index/term-property-id.h
new file mode 100644
index 0000000..ac92f80
--- /dev/null
+++ b/icing/index/term-property-id.h
@@ -0,0 +1,47 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_TERM_PROPERTY_ID_H_
+#define ICING_INDEX_TERM_PROPERTY_ID_H_
+
+#include <cstdint>
+
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/store/namespace-id.h"
+
+namespace icing {
+namespace lib {
+
+// A property in IcingDynamicTrie represents some attribute of the terms, e.g.
+// whether the terms have prefix hit sections. A property id is something
+// defined by the index to keep track of what properties the terms have. This
+// class provides functions to get all the property ids we use in the index
+// lexicon.
+
+// Returns the id of term property 'HasPrefixHits'.
+uint32_t GetHasHitsInPrefixSectionPropertyId();
+
+// Returns the id of term property 'HasNoExactHits'.
+uint32_t GetHasNoExactHitsPropertyId();
+
+// Returns the id of term property 'HasNamespaceX'. We have one property id for
+// each namespace id.
+// TODO(b/153091745): This approach requires one property bitmap per namespace
+// id, which doesn't scale well. Try to optimize the space usage.
+uint32_t GetNamespacePropertyId(NamespaceId namespace_id);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_TERM_PROPERTY_ID_H_
diff --git a/icing/icing-search-engine_jni.cc b/icing/jni/icing-search-engine-jni.cc
similarity index 70%
rename from icing/icing-search-engine_jni.cc
rename to icing/jni/icing-search-engine-jni.cc
index 263c4f9..4396007 100644
--- a/icing/icing-search-engine_jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2020 The Android Open Source Project
+// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <google/protobuf/message_lite.h>
#include <jni.h>
#include <string>
+#include "icing/jni/jni-cache.h"
+#include <google/protobuf/message_lite.h>
#include "icing/absl_ports/status_imports.h"
#include "icing/icing-search-engine.h"
#include "icing/proto/document.pb.h"
@@ -24,11 +25,11 @@
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
-#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
namespace {
-
bool ParseProtoFromJniByteArray(JNIEnv* env, jbyteArray bytes,
google::protobuf::MessageLite* protobuf) {
int bytes_size = env->GetArrayLength(bytes);
@@ -77,7 +78,7 @@
JNIEXPORT jlong JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeCreate(
- JNIEnv* env, jobject obj, jbyteArray icing_search_engine_options_bytes) {
+ JNIEnv* env, jclass clazz, jbyteArray icing_search_engine_options_bytes) {
icing::lib::IcingSearchEngineOptions options;
if (!ParseProtoFromJniByteArray(env, icing_search_engine_options_bytes,
&options)) {
@@ -86,14 +87,18 @@
return 0;
}
+ std::unique_ptr<const icing::lib::JniCache> jni_cache;
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+ ICING_ASSIGN_OR_RETURN(jni_cache, icing::lib::JniCache::Create(env), 0);
+#endif // ICING_REVERSE_JNI_SEGMENTATION
icing::lib::IcingSearchEngine* icing =
- new icing::lib::IcingSearchEngine(options);
+ new icing::lib::IcingSearchEngine(options, std::move(jni_cache));
return reinterpret_cast<jlong>(icing);
}
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeInitialize(
- JNIEnv* env, jobject obj, jlong native_pointer) {
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -105,7 +110,7 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeSetSchema(
- JNIEnv* env, jobject obj, jlong native_pointer, jbyteArray schema_bytes,
+ JNIEnv* env, jclass clazz, jlong native_pointer, jbyteArray schema_bytes,
jboolean ignore_errors_and_delete_documents) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -116,15 +121,15 @@
return nullptr;
}
- icing::lib::SetSchemaResultProto set_schema_result_proto =
- icing->SetSchema(schema_proto, ignore_errors_and_delete_documents);
+ icing::lib::SetSchemaResultProto set_schema_result_proto = icing->SetSchema(
+ std::move(schema_proto), ignore_errors_and_delete_documents);
return SerializeProtoToJniByteArray(env, set_schema_result_proto);
}
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeGetSchema(
- JNIEnv* env, jobject obj, jlong native_pointer) {
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -135,7 +140,7 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeGetSchemaType(
- JNIEnv* env, jobject obj, jlong native_pointer, jstring schema_type) {
+ JNIEnv* env, jclass clazz, jlong native_pointer, jstring schema_type) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -149,7 +154,8 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativePut(
- JNIEnv* env, jobject obj, jlong native_pointer, jbyteArray document_bytes) {
+ JNIEnv* env, jclass clazz, jlong native_pointer,
+ jbyteArray document_bytes) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -159,17 +165,16 @@
return nullptr;
}
- icing::lib::PutResultProto put_result_proto = icing->Put(document_proto);
+ icing::lib::PutResultProto put_result_proto =
+ icing->Put(std::move(document_proto));
return SerializeProtoToJniByteArray(env, put_result_proto);
}
JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeGet(JNIEnv* env,
- jobject obj,
- jlong native_pointer,
- jstring name_space,
- jstring uri) {
+Java_com_google_android_icing_IcingSearchEngine_nativeGet(
+ JNIEnv* env, jclass clazz, jlong native_pointer, jstring name_space,
+ jstring uri) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -183,8 +188,45 @@
}
JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeGetAllNamespaces(
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::GetAllNamespacesResultProto get_all_namespaces_result_proto =
+ icing->GetAllNamespaces();
+
+ return SerializeProtoToJniByteArray(env, get_all_namespaces_result_proto);
+}
+
+JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeGetNextPage(
+ JNIEnv* env, jclass clazz, jlong native_pointer,
+ jlong next_page_token) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::SearchResultProto next_page_result_proto =
+ icing->GetNextPage(next_page_token);
+
+ return SerializeProtoToJniByteArray(env, next_page_result_proto);
+}
+
+JNIEXPORT void JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeInvalidateNextPageToken(
+ JNIEnv* env, jclass clazz, jlong native_pointer,
+ jlong next_page_token) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing->InvalidateNextPageToken(next_page_token);
+
+ return;
+}
+
+JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeSearch(
- JNIEnv* env, jobject obj, jlong native_pointer,
+ JNIEnv* env, jclass clazz, jlong native_pointer,
jbyteArray search_spec_bytes, jbyteArray scoring_spec_bytes,
jbyteArray result_spec_bytes) {
icing::lib::IcingSearchEngine* icing =
@@ -217,7 +259,7 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeDelete(
- JNIEnv* env, jobject obj, jlong native_pointer, jstring name_space,
+ JNIEnv* env, jclass clazz, jlong native_pointer, jstring name_space,
jstring uri) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -233,7 +275,7 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByNamespace(
- JNIEnv* env, jobject obj, jlong native_pointer, jstring name_space) {
+ JNIEnv* env, jclass clazz, jlong native_pointer, jstring name_space) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -247,7 +289,7 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType(
- JNIEnv* env, jobject obj, jlong native_pointer, jstring schema_type) {
+ JNIEnv* env, jclass clazz, jlong native_pointer, jstring schema_type) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -261,7 +303,7 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk(
- JNIEnv* env, jobject obj, jlong native_pointer) {
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -273,7 +315,7 @@
JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeOptimize(
- JNIEnv* env, jobject obj, jlong native_pointer) {
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
GetIcingSearchEnginePointer(native_pointer);
@@ -282,4 +324,27 @@
return SerializeProtoToJniByteArray(env, optimize_result_proto);
}
+JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo(
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::GetOptimizeInfoResultProto get_optimize_info_result_proto =
+ icing->GetOptimizeInfo();
+
+ return SerializeProtoToJniByteArray(env, get_optimize_info_result_proto);
+}
+
+JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeReset(
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::ResetResultProto reset_result_proto = icing->Reset();
+
+ return SerializeProtoToJniByteArray(env, reset_result_proto);
+}
+
} // extern "C"
diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc
new file mode 100644
index 0000000..58eb8bf
--- /dev/null
+++ b/icing/jni/jni-cache.cc
@@ -0,0 +1,216 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/jni-cache.h"
+
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+JniCache::JniCache(JavaVM* jvm)
+ : jvm(jvm),
+ string_class(nullptr, jvm),
+ string_utf8(nullptr, jvm),
+ locale_class(nullptr, jvm),
+ locale_us(nullptr, jvm),
+ breakiterator_class(nullptr, jvm) {}
+
+// The macros below are intended to reduce the boilerplate in Create and avoid
+// easily introduced copy/paste errors.
+#define ICING_GET_CLASS_OR_RETURN_NULL(FIELD, NAME) \
+ { \
+ ICING_ASSIGN_OR_RETURN( \
+ libtextclassifier3::ScopedLocalRef<jclass> clazz, \
+ libtextclassifier3::JniHelper::FindClass(env, NAME), nullptr); \
+ result->FIELD##_class = \
+ libtextclassifier3::MakeGlobalRef(clazz.get(), env, jvm); \
+ if (result->FIELD##_class == nullptr) { \
+ ICING_LOG(ERROR) << "Error finding class: " << NAME; \
+ return nullptr; \
+ } \
+ }
+
+#define ICING_GET_OPTIONAL_CLASS(FIELD, NAME) \
+ { \
+ libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jclass>> \
+ status_or_class = libtextclassifier3::JniHelper::FindClass(env, NAME); \
+ if (status_or_class.ok()) { \
+ result->FIELD##_class = libtextclassifier3::MakeGlobalRef( \
+ std::move(status_or_class).ValueOrDie().get(), env, jvm); \
+ } \
+ }
+
+#define ICING_GET_METHOD(CLASS, FIELD, NAME, SIGNATURE) \
+ result->CLASS##_##FIELD = \
+ env->GetMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ if (!result->CLASS##_##FIELD) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding method: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java method."); \
+ }
+
+#define ICING_GET_OPTIONAL_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \
+ if (result->CLASS##_class != nullptr) { \
+ result->CLASS##_##FIELD = \
+ env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ env->ExceptionClear(); \
+ }
+
+#define ICING_GET_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \
+ result->CLASS##_##FIELD = \
+ env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ if (!result->CLASS##_##FIELD) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding method: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java static method."); \
+ }
+
+#define ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(CLASS, FIELD, NAME, \
+ SIGNATURE) \
+ { \
+ const jfieldID CLASS##_##FIELD##_field = \
+ env->GetStaticFieldID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ if (!CLASS##_##FIELD##_field) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding field id: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java field id."); \
+ } \
+ ICING_ASSIGN_OR_RETURN( \
+ libtextclassifier3::ScopedLocalRef<jobject> static_object, \
+ libtextclassifier3::JniHelper::GetStaticObjectField( \
+ env, result->CLASS##_class.get(), CLASS##_##FIELD##_field), \
+ nullptr); \
+ result->CLASS##_##FIELD = \
+ libtextclassifier3::MakeGlobalRef(static_object.get(), env, jvm); \
+ if (result->CLASS##_##FIELD == nullptr) { \
+ ICING_LOG(ERROR) << "Error finding field: " << NAME; \
+ return nullptr; \
+ } \
+ }
+
+#define ICING_GET_STATIC_INT_FIELD(CLASS, FIELD, NAME) \
+ const jfieldID CLASS##_##FIELD##_field = \
+ env->GetStaticFieldID(result->CLASS##_class.get(), NAME, "I"); \
+ if (!CLASS##_##FIELD##_field) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding field id: " << NAME; \
+ return absl_ports::AbortedError( \
+ "Unable to get Java static int field id."); \
+ } \
+ result->CLASS##_##FIELD = env->GetStaticIntField( \
+ result->CLASS##_class.get(), CLASS##_##FIELD##_field); \
+ if (!result->CLASS##_##FIELD) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding field: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java static int field."); \
+ }
+
+libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> JniCache::Create(
+ JNIEnv* env) {
+ if (env == nullptr) {
+ return nullptr;
+ }
+ JavaVM* jvm = nullptr;
+ if (JNI_OK != env->GetJavaVM(&jvm) || jvm == nullptr) {
+ return nullptr;
+ }
+ std::unique_ptr<JniCache> result(new JniCache(jvm));
+
+ // String
+ ICING_GET_CLASS_OR_RETURN_NULL(string, "java/lang/String");
+ ICING_GET_METHOD(string, constructor, "<init>", "([BLjava/lang/String;)V");
+ ICING_GET_METHOD(string, code_point_count, "codePointCount", "(II)I");
+ ICING_GET_METHOD(string, length, "length", "()I");
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> result_string,
+ libtextclassifier3::JniHelper::NewStringUTF(env, "UTF-8"), nullptr);
+ result->string_utf8 =
+ libtextclassifier3::MakeGlobalRef(result_string.get(), env, jvm);
+ if (result->string_utf8 == nullptr) {
+ return nullptr;
+ }
+
+ // Locale
+ ICING_GET_CLASS_OR_RETURN_NULL(locale, "java/util/Locale");
+ ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(locale, us, "US",
+ "Ljava/util/Locale;");
+ ICING_GET_METHOD(locale, constructor, "<init>", "(Ljava/lang/String;)V");
+ ICING_GET_OPTIONAL_STATIC_METHOD(locale, for_language_tag, "forLanguageTag",
+ "(Ljava/lang/String;)Ljava/util/Locale;");
+
+ // BreakIteratorBatcher
+ ICING_GET_CLASS_OR_RETURN_NULL(
+ breakiterator,
+ "com/google/android/icing/BreakIteratorBatcher");
+ ICING_GET_METHOD(breakiterator, constructor, "<init>",
+ "(Ljava/util/Locale;)V");
+ ICING_GET_METHOD(breakiterator, settext, "setText", "(Ljava/lang/String;)V");
+ ICING_GET_METHOD(breakiterator, next, "next", "(I)[I");
+ ICING_GET_METHOD(breakiterator, first, "first", "()I");
+ ICING_GET_METHOD(breakiterator, following, "following", "(I)I");
+ ICING_GET_METHOD(breakiterator, preceding, "preceding", "(I)I");
+
+ return result;
+}
+
+#undef ICING_GET_STATIC_INT_FIELD
+#undef ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL
+#undef ICING_GET_STATIC_METHOD
+#undef ICING_GET_METHOD
+#undef ICING_GET_CLASS_OR_RETURN_NULL
+#undef ICING_GET_OPTIONAL_CLASS
+
+JNIEnv* JniCache::GetEnv() const {
+ void* env;
+ if (JNI_OK == jvm->GetEnv(&env, JNI_VERSION_1_4)) {
+ return reinterpret_cast<JNIEnv*>(env);
+ } else {
+ ICING_LOG(ERROR) << "Icing JniCache used on unattached thread";
+ return nullptr;
+ }
+}
+
+bool JniCache::ExceptionCheckAndClear() const {
+ return libtextclassifier3::JniExceptionCheckAndClear(GetEnv());
+}
+
+libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>>
+JniCache::ConvertToJavaString(const char* utf8_text,
+ const int utf8_text_size_bytes) const {
+ // Create java byte array.
+ JNIEnv* jenv = GetEnv();
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jbyteArray> text_java_utf8,
+ libtextclassifier3::JniHelper::NewByteArray(jenv, utf8_text_size_bytes));
+
+ jenv->SetByteArrayRegion(text_java_utf8.get(), 0, utf8_text_size_bytes,
+ reinterpret_cast<const jbyte*>(utf8_text));
+
+ // Create the string with a UTF-8 charset.
+ ICING_ASSIGN_OR_RETURN(libtextclassifier3::ScopedLocalRef<jstring> result,
+ libtextclassifier3::JniHelper::NewObject<jstring>(
+ jenv, string_class.get(), string_constructor,
+ text_java_utf8.get(), string_utf8.get()));
+
+ return result;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h
new file mode 100644
index 0000000..a5f16c7
--- /dev/null
+++ b/icing/jni/jni-cache.h
@@ -0,0 +1,78 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JNI_JNI_CACHE_H_
+#define ICING_JNI_JNI_CACHE_H_
+
+#include <jni.h>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+namespace icing {
+namespace lib {
+
+// A helper class to cache class and method pointers for calls from JNI to Java.
+// (for implementations such as Java ICU that need to make calls from C++ to
+// Java)
+struct JniCache {
+ static libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> Create(
+ JNIEnv* env);
+
+ // Returns the correct JNIEnv of the current thread. This allows multiple
+ // threads, each accessing the same instance of JniCache, to retrieve their
+ // unique JNIEnv pointers.
+ JNIEnv* GetEnv() const;
+
+ // Returns true if there are any pending exceptions from the execution of JNI
+ // calls. Also clears the exception if any existed.
+ bool ExceptionCheckAndClear() const;
+
+ JavaVM* jvm = nullptr;
+
+ // java.lang.String
+ libtextclassifier3::ScopedGlobalRef<jclass> string_class;
+ jmethodID string_constructor = nullptr;
+ jmethodID string_code_point_count = nullptr;
+ jmethodID string_length = nullptr;
+ libtextclassifier3::ScopedGlobalRef<jstring> string_utf8;
+
+ // java.util.Locale
+ libtextclassifier3::ScopedGlobalRef<jclass> locale_class;
+ libtextclassifier3::ScopedGlobalRef<jobject> locale_us;
+ jmethodID locale_constructor = nullptr;
+ jmethodID locale_for_language_tag = nullptr;
+
+ // BreakIteratorBatcher
+ libtextclassifier3::ScopedGlobalRef<jclass> breakiterator_class;
+ jmethodID breakiterator_constructor = nullptr;
+ jmethodID breakiterator_settext = nullptr;
+ jmethodID breakiterator_next = nullptr;
+ jmethodID breakiterator_first = nullptr;
+ jmethodID breakiterator_following = nullptr;
+ jmethodID breakiterator_preceding = nullptr;
+
+ // Helper to convert lib3 UnicodeText to Java strings.
+ libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>>
+ ConvertToJavaString(const char* utf8_text,
+ const int utf8_text_size_bytes) const;
+
+ private:
+ explicit JniCache(JavaVM* jvm);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JNI_JNI_CACHE_H_
diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/jni/reverse-jni-break-iterator.cc
new file mode 100644
index 0000000..1a8a799
--- /dev/null
+++ b/icing/jni/reverse-jni-break-iterator.cc
@@ -0,0 +1,187 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/reverse-jni-break-iterator.h"
+
+#include <jni.h>
+#include <math.h>
+
+#include <cassert>
+#include <cctype>
+#include <map>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Chosen based on results in go/reverse-jni-benchmarks
+static constexpr int kBatchSize = 100;
+} // namespace
+
+// -----------------------------------------------------------------------------
+// Implementations that call out to JVM. Behold the beauty.
+// -----------------------------------------------------------------------------
+libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+ReverseJniBreakIterator::Create(const JniCache* jni_cache,
+ std::string_view text,
+ std::string_view locale) {
+ if (jni_cache == nullptr) {
+ return absl_ports::InvalidArgumentError(
+ "Create must be called with a valid JniCache pointer!");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> java_text,
+ jni_cache->ConvertToJavaString(text.data(), text.length()));
+ if (java_text.get() == nullptr) {
+ return absl_ports::AbortedError("Failed to create Java String from input.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> java_locale_string,
+ jni_cache->ConvertToJavaString(locale.data(), locale.length()));
+ if (java_locale_string.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java String from locale.");
+ }
+
+ JNIEnv* jenv = jni_cache->GetEnv();
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jobject> java_locale,
+ libtextclassifier3::JniHelper::NewObject(
+ jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor,
+ java_locale_string.get()));
+ if (java_locale.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java Locale from locale.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher,
+ libtextclassifier3::JniHelper::NewObject(
+ jenv, jni_cache->breakiterator_class.get(),
+ jni_cache->breakiterator_constructor, java_locale.get()));
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher =
+ libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv,
+ jni_cache->jvm);
+ if (iterator_batcher.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java BreakIteratorBatcher.");
+ }
+
+ ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod(
+ jenv, iterator_batcher.get(), jni_cache->breakiterator_settext,
+ java_text.get()));
+ return std::unique_ptr<ReverseJniBreakIterator>(
+ new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher)));
+}
+
+ReverseJniBreakIterator::ReverseJniBreakIterator(
+ const JniCache* jni_cache,
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher)
+ : jni_cache_(jni_cache),
+ iterator_batcher_(std::move(iterator_batcher)),
+ is_done_(false),
+ is_almost_done_(false) {}
+
+int ReverseJniBreakIterator::Next() {
+ if (is_done_) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ if (break_indices_cache_.empty()) {
+ if (FetchNextBatch() == ReverseJniBreakIterator::kDone) {
+ // Either there were no more results or an error occurred. Either way,
+ // mark ourselves as done and return.
+ is_done_ = true;
+ return ReverseJniBreakIterator::kDone;
+ }
+ is_almost_done_ = break_indices_cache_.size() < kBatchSize;
+ }
+ int break_index = break_indices_cache_.front();
+ break_indices_cache_.pop();
+ is_done_ = is_almost_done_ && break_indices_cache_.empty();
+ return break_index;
+}
+
+int ReverseJniBreakIterator::First() {
+ const int first_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_first);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return first_index;
+}
+
+int ReverseJniBreakIterator::Preceding(int offset) {
+ const int preceding_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return preceding_index;
+}
+
+int ReverseJniBreakIterator::Following(int offset) {
+ const int following_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_following, offset);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return following_index;
+}
+
+int ReverseJniBreakIterator::FetchNextBatch() {
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jintArray> break_indices,
+ libtextclassifier3::JniHelper::CallObjectMethod<jintArray>(
+ jni_cache_->GetEnv(), iterator_batcher_.get(),
+ jni_cache_->breakiterator_next, kBatchSize),
+ ReverseJniBreakIterator::kDone);
+ if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get());
+ if (num_indices == 0) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ jint* break_indices_arr =
+ static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical(
+ break_indices.get(), nullptr));
+ for (int i = 0; i < num_indices; ++i) {
+ break_indices_cache_.push(break_indices_arr[i]);
+ }
+ jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(),
+ break_indices_arr,
+ /*mode=*/0);
+ return num_indices;
+}
+
+void ReverseJniBreakIterator::ClearCache() {
+ break_indices_cache_ = std::queue<int>();
+ is_done_ = false;
+ is_almost_done_ = false;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/jni/reverse-jni-break-iterator.h
new file mode 100644
index 0000000..c1f05f4
--- /dev/null
+++ b/icing/jni/reverse-jni-break-iterator.h
@@ -0,0 +1,124 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+
+#include <jni.h>
+
+#include <queue>
+#include <string>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+namespace icing {
+namespace lib {
+
+// A class that handles the cross-JNI interactions with BreakIteratorBatcher and
+// hides the batching element to provide an interface akin to
+// java.text.BreakIterator.
+//
+// Example:
+// std::string text = "我每天走路去上班。";
+// ASSERT_THAT(text, SizeIs(27));
+// std::unique_ptr<ReverseJniBreakIterator> itr =
+// ReverseJniBreakIterator::Create(jni_cache, text, locale);
+// std::vector<int> nexts;
+// int next = itr->Next();
+// while (next != ReverseJniBreakIterator::kDone) {
+// nexts.push_back(next);
+// next = itr->Next();
+// }
+// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8));
+class ReverseJniBreakIterator {
+ public:
+ static constexpr int kDone = -1;
+
+ // Creates a ReverseJniBreakiterator with the given text and locale.
+ //
+ // Returns:
+ // A ReverseJniBreakIterator on success
+ // INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer
+ // INTERNAL if unable to create any of the required Java objects
+ static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+ Create(const JniCache* jni_cache, std::string_view text,
+ std::string_view locale);
+
+ // Returns the UTF-16 boundary following the current boundary. If the current
+ // boundary is the last text boundary, it returns
+ // ReverseJniBreakIterator::kDONE.
+ //
+ // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8
+ // boundary. Callers interested in the UTF-8 boundary are required to maintain
+ // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries.
+ int Next();
+
+ // Returns the first UTF-16 boundary. The iterator's current position is set
+ // to the first text boundary and any cached data is cleared.
+ int First();
+
+ // Returns the position of the first UTF-16 boundary preceding the UTF-16
+ // offset. If there is no boundary preceding the specified offset, then
+ // ReverseJniBreakIterator::kDone is returned.
+ //
+ // The iterator's current position is set to the segment whose boundary was
+ // returned and any cached data is cleared.
+ int Preceding(int offset);
+
+ // Returns the position of the first UTF-16 boundary following the UTF-16
+ // offset. If there is no boundary following the specified offset, then
+ // ReverseJniBreakIterator::kDone is returned.
+ //
+ // The iterator's current position is set to the segment whose boundary
+ // was returned and any cached data is cleared.
+ int Following(int offset);
+
+ private:
+ ReverseJniBreakIterator(
+ const JniCache* jni_cache,
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher);
+
+ // Fetches the results of up to kBatchSize next calls and stores them in
+ // break_indices_cache_. Returns the number of results or kDone if no more
+ // results could be fetched.
+ int FetchNextBatch();
+
+ // Empties the cache and sets is_done_ and is_almost_done_ to false.
+ void ClearCache();
+
+ // Keeps track of references to Java classes and methods. Does NOT own.
+ const JniCache* jni_cache_;
+
+ // The reference to the actual instance of BreakIteratorBatcher that
+ // this class interacts with.
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_;
+
+ // The cache holding the most recent batch of return values from
+ // BreakIteratorBatcher#next.
+ std::queue<int> break_indices_cache_;
+
+ bool is_done_;
+
+ // The last batch was incomplete (< kBatchSize results were returned). The
+ // next call to BreakIteratorBatcher#next is guaranteed to return an
+ // empty array. Once the results from the last batch are evicted from
+ // break_indices_cache, ReverseJniBreakIterator will transition to is_done_.
+ bool is_almost_done_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h
index 01c17f1..4ea93ec 100644
--- a/icing/legacy/core/icing-string-util.h
+++ b/icing/legacy/core/icing-string-util.h
@@ -37,8 +37,6 @@
return (static_cast<uint8_t>(byte) & 0xC0) == 0x80;
}
- static bool IsAsciiChar(char c) { return static_cast<signed char>(c) >= 0; }
-
// Update a rolling crc32. This undoes the one's complement
// pre-conditioning and post-conditioning of zlib's
// crc32. Therefore, UpdateCrc32(0, str, len) != HashCrc32(str,
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index 888a4ab..ee3d3a2 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -11,9 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
//
// We store the trie in three areas: nodes, nexts and suffixes.
//
@@ -84,6 +81,7 @@
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-flash-bitmap.h"
#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "icing/util/math-util.h"
@@ -249,6 +247,11 @@
const IcingFilesystem &filesystem);
bool Sync();
uint64_t GetDiskUsage() const;
+
+ // Returns the size of the elements held in the trie. This excludes the size
+ // of any internal metadata of the trie, e.g. the trie's header.
+ uint64_t GetElementsFileSize() const;
+
void Warm();
void Clear();
@@ -695,6 +698,18 @@
return total;
}
+uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetElementsFileSize()
+ const {
+ // Trie files themselves, exclude size of the header. These arrays are dense,
+ // not sparse, so use file size for more accurate numbers.
+ uint64_t total = 0;
+ for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
+ IcingFilesystem::IncrementByOrSetInvalid(
+ filesystem_->GetFileSize(array_fds_[i].get()), &total);
+ }
+ return total;
+}
+
IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() {
if (nodes_left() == 0) {
ICING_LOG(FATAL) << "No allocated nodes left";
@@ -1153,6 +1168,30 @@
return total;
}
+uint64_t IcingDynamicTrie::GetElementsSize() const {
+ uint64_t total = 0;
+
+ // Bitmaps are sparsely populated, so disk usage is more accurate for those.
+ // Property bitmaps.
+ IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(),
+ &total);
+ // The deleted bitmap is always initially grown to kGrowSize, whether there
+ // are elements or not. So even if there are no elements in the trie, we'll
+ // still have the bitmap of size kGrowSize, so subtract that from the size of
+ // the trie's elements.
+ total -= IcingFlashBitmap::kGrowSize;
+
+ for (auto &bitmap : property_bitmaps_) {
+ if (bitmap == nullptr) continue;
+ IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total);
+ }
+
+ // Storage. We can use file size here since the storage files aren't sparse.
+ IcingFilesystem::IncrementByOrSetInvalid(storage_->GetElementsFileSize(),
+ &total);
+ return total;
+}
+
std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap(
const std::string &filename, bool verify,
const IcingFilesystem *filesystem) {
@@ -1867,18 +1906,18 @@
// If we start with non-ascii, take all left branches while there is
// a continuation byte.
- if (!IcingStringUtil::IsAsciiChar(cur_[cur_len_ - 1])) {
+ if (!i18n_utils::IsAscii(cur_[cur_len_ - 1])) {
while (!node->is_leaf()) {
- if (cur_len_ >= UTFmax) break;
+ if (cur_len_ >= U8_MAX_LENGTH) break;
InitBranch(branch_end_, node, 0);
// When we are looking to complete a utf8 char, skip 0s.
if (branch_end_->child->val() == 0) {
// Check if we already have a valid cur_.
cur_[cur_len_] = 0;
- Rune rune;
- chartorune(&rune, cur_);
- if (rune == Runeerror && node->log2_num_children() > 0) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(cur_, cur_len_, 0);
+ if (uchar32 == i18n_utils::kInvalidUChar32 &&
+ node->log2_num_children() > 0) {
branch_end_->child++;
} else {
// Good termination. Just break.
@@ -1914,8 +1953,8 @@
void IcingDynamicTrie::Utf8Iterator::GoIntoSuffix(const Node *node) {
const char *suffix = trie_.storage_->GetSuffix(node->next_index());
const char *cur_suffix;
- for (cur_suffix = suffix;
- cur_len_ < UTFmax && IcingStringUtil::IsContinuationByte(*cur_suffix);
+ for (cur_suffix = suffix; cur_len_ < U8_MAX_LENGTH &&
+ IcingStringUtil::IsContinuationByte(*cur_suffix);
cur_suffix++) {
cur_[cur_len_++] = *cur_suffix;
}
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 2e93ef1..c33be96 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -48,7 +48,8 @@
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/legacy/index/icing-storage.h"
#include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
-#include "utf.h"
+#include "icing/util/i18n-utils.h"
+#include "unicode/utf8.h"
namespace icing {
namespace lib {
@@ -265,6 +266,10 @@
bool Remove() override;
uint64_t GetDiskUsage() const override;
+ // Returns the size of the elements held in the trie. This excludes the size
+ // of any internal metadata of the trie, e.g. the trie's header.
+ uint64_t GetElementsSize() const;
+
// REQUIRED: For all functions below is_initialized() == true.
// Number of keys in trie.
@@ -549,11 +554,11 @@
void InitBranch(Branch *branch, const Node *start, char key_char);
void GoIntoSuffix(const Node *node);
- char cur_[UTFmax + 1]; // NULL-terminated
+ char cur_[U8_MAX_LENGTH + 1]; // NULL-terminated
int cur_len_;
LogicalNode cur_logical_node_;
- Branch branch_stack_[UTFmax];
+ Branch branch_stack_[U8_MAX_LENGTH];
Branch *branch_end_;
const IcingDynamicTrie &trie_;
@@ -564,6 +569,7 @@
class CandidateSet;
// For testing only.
+ friend class IcingDynamicTrieTest_TrieShouldRespectLimits_Test;
friend class IcingDynamicTrieTest_SyncErrorRecovery_Test;
friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test;
void GetHeader(IcingDynamicTrieHeader *hdr) const;
diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc
new file mode 100644
index 0000000..4fae52a
--- /dev/null
+++ b/icing/legacy/index/icing-dynamic-trie_test.cc
@@ -0,0 +1,921 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-dynamic-trie.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/hash/farmhash.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/testing/tmp-directory.h"
+
+using testing::ElementsAre;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr std::string_view kKeys[] = {
+ "", "ab", "ac", "abd", "bac", "bb", "bacd", "abbb", "abcdefg",
+};
+constexpr uint32_t kNumKeys = ABSL_ARRAYSIZE(kKeys);
+
+class IcingDynamicTrieTest : public ::testing::Test {
+ protected:
+ class NewValueMap : public IcingDynamicTrie::NewValueMap {
+ public:
+ const void* GetNewValue(uint32_t old_value_index) const override {
+ const_cast<NewValueMap*>(this)->buf_ = old_value_index;
+ return &buf_;
+ }
+
+ private:
+ uint32_t buf_;
+ };
+
+ // Output trie stats to stderr.
+ static void StatsDump(const IcingDynamicTrie& trie) {
+ IcingDynamicTrie::Stats stats;
+ trie.CollectStats(&stats);
+ DLOG(INFO) << "Stats:\n" << stats.DumpStats(true);
+ }
+
+ static void AddToTrie(IcingDynamicTrie* trie, uint32_t num_keys) {
+ std::string key;
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ key.clear();
+ IcingStringUtil::SStringAppendF(&key, 0, "%u+%010u", i % 2, i);
+ bool inserted = trie->Insert(key.c_str(), &i);
+ ASSERT_TRUE(inserted);
+ }
+ }
+
+ static void CheckTrie(const IcingDynamicTrie& trie, uint32_t num_keys) {
+ std::string key;
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ key.clear();
+ IcingStringUtil::SStringAppendF(&key, 0, "%u+%010u", i % 2, i);
+ uint32_t val;
+ bool found = trie.Find(key.c_str(), &val);
+ EXPECT_TRUE(found);
+ EXPECT_EQ(i, val);
+ }
+ }
+
+ static void PrintTrie(const IcingDynamicTrie& trie) {
+ std::vector<std::string> keys;
+ std::ostringstream os;
+ DLOG(INFO) << "Trie:\n";
+ trie.DumpTrie(&os, &keys);
+ DLOG(INFO) << os.str();
+ }
+
+ void SetUp() override {
+ trie_files_dir_ = GetTestTempDir() + "/trie_files";
+ trie_files_prefix_ = trie_files_dir_ + "/test_file_";
+ }
+
+ void TearDown() override {
+ IcingFilesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(trie_files_dir_.c_str());
+ }
+
+ std::string trie_files_dir_;
+ std::string trie_files_prefix_;
+};
+
+constexpr std::string_view kCommonEnglishWords[] = {
+ "that", "was", "for", "on", "are", "with", "they", "be", "at",
+ "one", "have", "this", "from", "word", "but", "what", "some", "you",
+ "had", "the", "and", "can", "out", "other", "were", "which", "their",
+ "time", "will", "how", "said", "each", "tell", "may", "three"};
+constexpr uint32_t kCommonEnglishWordArrayLen =
+ sizeof(kCommonEnglishWords) / sizeof(std::string_view);
+
+TEST_F(IcingDynamicTrieTest, Simple) {
+ // Test simple key insertions.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_TRUE(trie.Insert(kKeys[i].data(), &i));
+
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ EXPECT_EQ(trie.size(), kNumKeys);
+
+ StatsDump(trie);
+ std::vector<std::string> keys;
+ std::ostringstream os;
+ DLOG(INFO) << "Trie:\n";
+ trie.DumpTrie(&os, &keys);
+ DLOG(INFO) << os.str();
+ EXPECT_EQ(keys.size(), kNumKeys);
+}
+
+TEST_F(IcingDynamicTrieTest, Init) {
+ // Test create/init behavior.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ EXPECT_FALSE(trie.is_initialized());
+ EXPECT_FALSE(trie.Init());
+
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ EXPECT_TRUE(trie.Init());
+ EXPECT_TRUE(trie.is_initialized());
+}
+
+TEST_F(IcingDynamicTrieTest, Iterator) {
+ // Test iterator.
+ IcingFilesystem filesystem;
+ uint32_t val;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_TRUE(trie.Insert(kKeys[i].data(), &i));
+ }
+
+ // We try everything twice to test that Reset also works.
+
+ // Should get the entire trie.
+ IcingDynamicTrie::Iterator it_all(trie, "");
+ for (int i = 0; i < 2; i++) {
+ uint32_t count = 0;
+ for (; it_all.IsValid(); it_all.Advance()) {
+ uint32_t val_idx = it_all.GetValueIndex();
+ EXPECT_EQ(it_all.GetValue(), trie.GetValueAtIndex(val_idx));
+ count++;
+ }
+ EXPECT_EQ(count, kNumKeys);
+ it_all.Reset();
+ }
+
+ // Get everything under "a".
+ IcingDynamicTrie::Iterator it1(trie, "a");
+ for (int i = 0; i < 2; i++) {
+ ASSERT_TRUE(it1.IsValid());
+ EXPECT_STREQ(it1.GetKey(), "ab");
+ static const uint32_t kOne = 1;
+ ASSERT_TRUE(it1.GetValue() != nullptr);
+ EXPECT_TRUE(!memcmp(it1.GetValue(), &kOne, sizeof(kOne)));
+
+ ASSERT_TRUE(it1.Advance());
+ ASSERT_TRUE(it1.IsValid());
+ EXPECT_STREQ(it1.GetKey(), "abbb");
+
+ ASSERT_TRUE(it1.Advance());
+ ASSERT_TRUE(it1.IsValid());
+ EXPECT_STREQ(it1.GetKey(), "abcdefg");
+
+ ASSERT_TRUE(it1.Advance());
+ ASSERT_TRUE(it1.IsValid());
+ EXPECT_STREQ(it1.GetKey(), "abd");
+
+ ASSERT_TRUE(it1.Advance());
+ ASSERT_TRUE(it1.IsValid());
+ EXPECT_STREQ(it1.GetKey(), "ac");
+
+ EXPECT_FALSE(it1.Advance());
+ EXPECT_FALSE(it1.IsValid());
+
+ it1.Reset();
+ }
+
+ // Now "b".
+ IcingDynamicTrie::Iterator it2(trie, "b");
+ for (int i = 0; i < 2; i++) {
+ ASSERT_TRUE(it2.IsValid());
+ EXPECT_STREQ(it2.GetKey(), "bac");
+ val = 1;
+ ASSERT_TRUE(it1.GetValue() != nullptr);
+ EXPECT_TRUE(!memcmp(it1.GetValue(), &val, sizeof(val)));
+ val = 4;
+ ASSERT_TRUE(it2.GetValue() != nullptr);
+ EXPECT_TRUE(!memcmp(it2.GetValue(), &val, sizeof(val)));
+
+ ASSERT_TRUE(it2.Advance());
+ ASSERT_TRUE(it2.IsValid());
+ EXPECT_STREQ(it2.GetKey(), "bacd");
+
+ ASSERT_TRUE(it2.Advance());
+ ASSERT_TRUE(it2.IsValid());
+ EXPECT_STREQ(it2.GetKey(), "bb");
+
+ EXPECT_FALSE(it2.Advance());
+ EXPECT_FALSE(it2.IsValid());
+
+ it2.Reset();
+ }
+
+ // Get everything under "ab".
+ IcingDynamicTrie::Iterator it3(trie, "ab");
+ for (int i = 0; i < 2; i++) {
+ ASSERT_TRUE(it3.IsValid());
+ EXPECT_STREQ(it3.GetKey(), "ab");
+ val = 1;
+ ASSERT_TRUE(it3.GetValue() != nullptr);
+ EXPECT_TRUE(!memcmp(it3.GetValue(), &val, sizeof(val)));
+
+ ASSERT_TRUE(it3.Advance());
+ ASSERT_TRUE(it3.IsValid());
+ EXPECT_STREQ(it3.GetKey(), "abbb");
+
+ ASSERT_TRUE(it3.Advance());
+ ASSERT_TRUE(it3.IsValid());
+ EXPECT_STREQ(it3.GetKey(), "abcdefg");
+
+ ASSERT_TRUE(it3.Advance());
+ ASSERT_TRUE(it3.IsValid());
+ EXPECT_STREQ(it3.GetKey(), "abd");
+
+ EXPECT_FALSE(it3.Advance());
+ EXPECT_FALSE(it3.IsValid());
+
+ it3.Reset();
+ }
+
+ // Should match only one key exactly.
+ constexpr std::string_view kOneMatch[] = {
+ "abd",
+ "abcd",
+ "abcdef",
+ "abcdefg",
+ };
+ // With the following match:
+ constexpr std::string_view kOneMatchMatched[] = {
+ "abd",
+ "abcdefg",
+ "abcdefg",
+ "abcdefg",
+ };
+
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kOneMatch); k++) {
+ IcingDynamicTrie::Iterator it_single(trie, kOneMatch[k].data());
+ for (int i = 0; i < 2; i++) {
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_STREQ(it_single.GetKey(), kOneMatchMatched[k].data());
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
+
+ it_single.Reset();
+ }
+ }
+
+ // Matches nothing.
+ constexpr std::string_view kNoMatch[] = {
+ "abbd",
+ "abcdeg",
+ "abcdefh",
+ };
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kNoMatch); k++) {
+ IcingDynamicTrie::Iterator it_empty(trie, kNoMatch[k].data());
+ for (int i = 0; i < 2; i++) {
+ EXPECT_FALSE(it_empty.IsValid());
+
+ it_empty.Reset();
+ }
+ }
+
+ // Clear.
+ trie.Clear();
+ EXPECT_FALSE(IcingDynamicTrie::Iterator(trie, "").IsValid());
+ EXPECT_EQ(0u, trie.size());
+ EXPECT_EQ(1.0, trie.min_free_fraction());
+}
+
+TEST_F(IcingDynamicTrieTest, Persistence) {
+ // Test persistence on the English dictionary.
+ IcingFilesystem filesystem;
+ {
+ // Test with a trie including strings in words. Test will fail if
+ // words are not unique.
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ EXPECT_FALSE(trie.Init());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ ASSERT_TRUE(trie.Insert(kCommonEnglishWords[i].data(), &i));
+ }
+ // Explicitly omit sync.
+
+ StatsDump(trie);
+ }
+
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Init());
+ EXPECT_EQ(0U, trie.size());
+
+ for (uint32_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ ASSERT_TRUE(trie.Insert(kCommonEnglishWords[i].data(), &i));
+ }
+ trie.Sync();
+
+ StatsDump(trie);
+ }
+
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Init());
+
+ // Make sure we can find everything with the right value.
+ uint32_t found_count = 0;
+ uint32_t matched_count = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ uint32_t val;
+ bool found = trie.Find(kCommonEnglishWords[i].data(), &val);
+ if (found) {
+ found_count++;
+ if (i == val) {
+ matched_count++;
+ }
+ }
+ }
+ EXPECT_EQ(found_count, kCommonEnglishWordArrayLen);
+ EXPECT_EQ(matched_count, kCommonEnglishWordArrayLen);
+
+ StatsDump(trie);
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, PersistenceShared) {
+ // Test persistence on the English dictionary.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie::RuntimeOptions ropt;
+
+ {
+ // Test with a trie including strings in words. Test will fail if
+ // words are not unique.
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc;
+ IcingDynamicTrie trie(trie_files_prefix_, ropt, &filesystem);
+ EXPECT_FALSE(trie.Init());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t next_reopen = kCommonEnglishWordArrayLen / 16;
+ for (uint32_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ ASSERT_TRUE(trie.Insert(kCommonEnglishWords[i].data(), &i));
+
+ if (i == next_reopen) {
+ ASSERT_NE(0u, trie.UpdateCrc());
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+
+ next_reopen += next_reopen / 2;
+ }
+ }
+ // Explicitly omit sync. Shared should automatically persist.
+
+ StatsDump(trie);
+ }
+
+ // Go back and forth between the two policies.
+ for (int i = 0; i < 5; i++) {
+ if (i % 2 == 0) {
+ DLOG(INFO) << "Opening with map shared";
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc;
+ } else {
+ DLOG(INFO) << "Opening with explicit flush";
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kExplicitFlush;
+ }
+ IcingDynamicTrie trie(trie_files_prefix_, ropt, &filesystem);
+ ASSERT_TRUE(trie.Init());
+
+ // Make sure we can find everything with the right value.
+ uint32_t found_count = 0;
+ uint32_t matched_count = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ uint32_t val;
+ bool found = trie.Find(kCommonEnglishWords[i].data(), &val);
+ if (found) {
+ found_count++;
+ if (i == val) {
+ matched_count++;
+ }
+ }
+ }
+ EXPECT_EQ(found_count, kCommonEnglishWordArrayLen);
+ EXPECT_EQ(matched_count, kCommonEnglishWordArrayLen);
+
+ StatsDump(trie);
+ }
+
+ // Clear and re-open.
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc;
+ IcingDynamicTrie trie(trie_files_prefix_, ropt, &filesystem);
+ ASSERT_TRUE(trie.Init());
+ trie.Clear();
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+}
+
+TEST_F(IcingDynamicTrieTest, Sync) {
+ IcingFilesystem filesystem;
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_TRUE(trie.Insert(kKeys[i].data(), &i));
+
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ StatsDump(trie);
+ PrintTrie(trie);
+
+ trie.Sync();
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+ }
+
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ StatsDump(trie);
+ PrintTrie(trie);
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, LimitsZero) {
+ // Don't crash if we set limits to 0.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_FALSE(trie.CreateIfNotExist(IcingDynamicTrie::Options(0, 0, 0, 0)));
+}
+
+TEST_F(IcingDynamicTrieTest, LimitsSmall) {
+ // Test limits with a few keys.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(
+ IcingDynamicTrie::Options(10, 300, 30, sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ ASSERT_LT(3U, kNumKeys);
+
+ for (uint32_t i = 0; i < 3; i++) {
+ ASSERT_TRUE(trie.Insert(kKeys[i].data(), &i)) << i;
+
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ uint32_t val = 3;
+ EXPECT_FALSE(trie.Insert(kKeys[3].data(), &val));
+
+ StatsDump(trie);
+ PrintTrie(trie);
+}
+
+TEST_F(IcingDynamicTrieTest, DISABLEDFingerprintedKeys) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie::Options options(4 << 20, 4 << 20, 20 << 20,
+ sizeof(uint32_t));
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(options));
+ ASSERT_TRUE(trie.Init());
+ IcingDynamicTrie triefp(trie_files_prefix_ + ".fps",
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(triefp.CreateIfNotExist(options));
+ ASSERT_TRUE(triefp.Init());
+
+ static const uint32_t kNumKeys = 1000000;
+ std::string key;
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ key.clear();
+ IcingStringUtil::SStringAppendF(
+ &key, 1000, "content://gmail-ls/account/conversation/%u/message/%u", i,
+ 10 * i);
+ ASSERT_TRUE(trie.Insert(key.c_str(), &i));
+
+ // Now compute a fingerprint.
+ uint64_t fpkey = tc3farmhash::Fingerprint64(key);
+
+ // Convert to base255 since keys in trie cannot contain 0.
+ uint8_t fpkey_base255[9];
+ for (int j = 0; j < 8; j++) {
+ fpkey_base255[j] = (fpkey % 255) + 1;
+ fpkey /= 255;
+ }
+ fpkey_base255[8] = '\0';
+ ASSERT_TRUE(
+ triefp.Insert(reinterpret_cast<const char*>(fpkey_base255), &i));
+
+ // Sync periodically to gauge write locality.
+ if ((i + 1) % (kNumKeys / 10) == 0) {
+ DLOG(INFO) << "Trie sync";
+ trie.Sync();
+ DLOG(INFO) << "Trie fp sync";
+ triefp.Sync();
+ }
+ }
+
+ DLOG(INFO) << "Trie stats";
+ StatsDump(trie);
+ DLOG(INFO) << "Trie fp stats";
+ StatsDump(triefp);
+}
+
+TEST_F(IcingDynamicTrieTest, AddDups) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kNumKeys = 5000;
+ AddToTrie(&trie, kNumKeys);
+ CheckTrie(trie, kNumKeys);
+
+ DLOG(INFO) << "Trie stats";
+ StatsDump(trie);
+
+ AddToTrie(&trie, kNumKeys);
+ CheckTrie(trie, kNumKeys);
+ DLOG(INFO) << "Trie stats";
+ StatsDump(trie);
+}
+
+TEST_F(IcingDynamicTrieTest, Properties) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kOne = 1;
+ uint32_t val_idx;
+ trie.Insert("abcd", &kOne, &val_idx, false);
+ trie.SetProperty(val_idx, 0);
+ trie.SetProperty(val_idx, 3);
+
+ {
+ IcingDynamicTrie::PropertyReader reader(trie, 3);
+ ASSERT_TRUE(reader.Exists());
+ EXPECT_TRUE(reader.HasProperty(val_idx));
+ EXPECT_FALSE(reader.HasProperty(1000));
+ }
+
+ // Disappear after close.
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+ {
+ IcingDynamicTrie::PropertyReader reader(trie, 3);
+ EXPECT_FALSE(reader.HasProperty(val_idx));
+ }
+
+ // Persist after sync.
+ trie.Insert("abcd", &kOne, &val_idx, false);
+ trie.SetProperty(val_idx, 1);
+ ASSERT_TRUE(trie.Sync());
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t val;
+ ASSERT_TRUE(trie.Find("abcd", &val, &val_idx));
+ EXPECT_EQ(1u, val);
+ {
+ IcingDynamicTrie::PropertyReader reader(trie, 1);
+ EXPECT_TRUE(reader.HasProperty(val_idx));
+ }
+
+ // Get all.
+ {
+ IcingDynamicTrie::PropertyReadersAll readers(trie);
+ ASSERT_EQ(4u, readers.size());
+ EXPECT_TRUE(readers.Exists(0));
+ EXPECT_TRUE(readers.Exists(1));
+ EXPECT_FALSE(readers.Exists(2));
+ EXPECT_TRUE(readers.Exists(3));
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, ClearSingleProperty) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kOne = 1;
+ uint32_t val_idx[3];
+ trie.Insert("abcd", &kOne, &val_idx[0], false);
+ trie.SetProperty(val_idx[0], 0);
+ trie.SetProperty(val_idx[0], 3);
+
+ trie.Insert("efgh", &kOne, &val_idx[1], false);
+ trie.SetProperty(val_idx[1], 0);
+ trie.SetProperty(val_idx[1], 3);
+
+ trie.Insert("ijkl", &kOne, &val_idx[2], false);
+ trie.SetProperty(val_idx[2], 0);
+ trie.SetProperty(val_idx[2], 3);
+
+ {
+ IcingDynamicTrie::PropertyReadersAll readers(trie);
+ ASSERT_EQ(4u, readers.size());
+ EXPECT_TRUE(readers.Exists(0));
+ EXPECT_FALSE(readers.Exists(1));
+ EXPECT_FALSE(readers.Exists(2));
+ EXPECT_TRUE(readers.Exists(3));
+ for (size_t i = 0; i < readers.size(); i++) {
+ if (readers.Exists(i)) {
+ for (size_t j = 0; j < sizeof(val_idx) / sizeof(uint32_t); ++j) {
+ EXPECT_TRUE(readers.HasProperty(i, val_idx[j]));
+ }
+ }
+ }
+ }
+
+ EXPECT_TRUE(trie.ClearPropertyForAllValues(3));
+
+ {
+ IcingDynamicTrie::PropertyReadersAll readers(trie);
+ ASSERT_EQ(4u, readers.size());
+ EXPECT_TRUE(readers.Exists(0));
+ EXPECT_FALSE(readers.Exists(1));
+ EXPECT_FALSE(readers.Exists(2));
+ // Clearing the property causes all values to be deleted.
+ EXPECT_FALSE(readers.Exists(3));
+ for (size_t i = 0; i < readers.size(); i++) {
+ for (size_t j = 0; j < sizeof(val_idx) / sizeof(uint32_t); ++j) {
+ if (i == 0) {
+ EXPECT_TRUE(readers.HasProperty(i, val_idx[j]));
+ } else {
+ EXPECT_FALSE(readers.HasProperty(i, val_idx[j]));
+ }
+ }
+ }
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, Compact) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_TRUE(trie.Insert(kKeys[i].data(), &i));
+
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ EXPECT_EQ(trie.size(), kNumKeys);
+
+ StatsDump(trie);
+ PrintTrie(trie);
+
+ IcingDynamicTrie trie2(trie_files_prefix_ + "-2",
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie2.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie2.Init());
+
+ std::unordered_map<uint32_t, uint32_t> old_to_new_tvi;
+ trie.Compact(NewValueMap(), &trie2, &old_to_new_tvi);
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ uint32_t val;
+ bool found = trie2.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ EXPECT_TRUE(old_to_new_tvi.find(val) != old_to_new_tvi.end());
+ }
+}
+
+} // namespace
+
+// The tests below are accessing private methods and fields of IcingDynamicTrie
+// so can't be in the anonymous namespace.
+
+TEST_F(IcingDynamicTrieTest, TrieShouldRespectLimits) {
+ // Test limits on numbers of nodes, nexts, and suffixes size.
+ IcingFilesystem filesystem;
+
+ // These 3 numbers are the entities we need in order to insert all the test
+ // words before the last one.
+ uint32_t num_nodes_enough;
+ uint32_t num_nexts_enough;
+ uint32_t suffixes_size_enough;
+
+ // First, try to fill the 3 numbers above.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ // Creates a trie with enough numbers of nodes, nexts, and suffix file size.
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ /*max_nodes_in=*/1000, /*max_nexts_in=*/1000,
+ /*max_suffixes_size_in=*/1000, sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_TRUE(trie.Insert(kCommonEnglishWords[i].data(), &value));
+ }
+
+ IcingDynamicTrieHeader header;
+ trie.GetHeader(&header);
+
+ // Before each insertion, it requires that there're (2 + 1 + key_length)
+ // nodes left, so we need 8 nodes to insert the last word. +7 here will make
+ // it just enough to insert the word before the last one.
+ num_nodes_enough = header.num_nodes() + 7;
+
+ // Before each insertion, it requires that there're (2 + 1 + key_length +
+ // kMaxNextArraySize) nexts left, so we need (8 + kMaxNextArraySize) nexts
+ // to insert the last word. (7 + kMaxNextArraySize) here will make it just
+ // enough to insert the word before the last one.
+ num_nexts_enough =
+ header.num_nexts() + 7 + IcingDynamicTrie::kMaxNextArraySize;
+
+ // Before each insertion, it requires that there're (1 + key_length +
+ // value_size) bytes left for suffixes, so we need (6 + sizeof(uint32_t))
+ // bytes to insert the last word. (5 + sizeof(uint32_t)) here will make it
+ // just enough to insert the word before the last one.
+ suffixes_size_enough = header.suffixes_size() + 5 + sizeof(uint32_t);
+ }
+
+ // Test a trie with just enough number of nodes.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ num_nodes_enough, /*max_nexts_in=*/1000,
+ /*max_suffixes_size_in=*/1000, sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_TRUE(trie.Insert(kCommonEnglishWords[i].data(), &value));
+ }
+
+ // Fails to insert the last word because no enough nodes left.
+ EXPECT_FALSE(trie.Insert(
+ kCommonEnglishWords[kCommonEnglishWordArrayLen - 1].data(), &value));
+ }
+
+ // Test a trie with just enough number of nexts.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ /*max_nodes_in=*/1000, num_nexts_enough,
+ /*max_suffixes_size_in=*/1000, sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_TRUE(trie.Insert(kCommonEnglishWords[i].data(), &value));
+ }
+
+ // Fails to insert the last word because no enough nexts left.
+ EXPECT_FALSE(trie.Insert(
+ kCommonEnglishWords[kCommonEnglishWordArrayLen - 1].data(), &value));
+ }
+
+ // Test a trie with just enough suffixes size.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ /*max_nodes_in=*/1000, /*max_nexts_in=*/1000, suffixes_size_enough,
+ sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_TRUE(trie.Insert(kCommonEnglishWords[i].data(), &value));
+ }
+
+ // Fails to insert the last word because no enough space for more suffixes.
+ EXPECT_FALSE(trie.Insert(
+ kCommonEnglishWords[kCommonEnglishWordArrayLen - 1].data(), &value));
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, SyncErrorRecovery) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kNumKeys = 5000;
+ AddToTrie(&trie, kNumKeys);
+ CheckTrie(trie, kNumKeys);
+
+ trie.Sync();
+ trie.Close();
+
+ // Reach into the file and set the value_size.
+ ASSERT_TRUE(trie.Init());
+ IcingDynamicTrieHeader hdr;
+ trie.GetHeader(&hdr);
+ hdr.set_value_size(hdr.value_size() + 123);
+ trie.SetHeader(hdr);
+ trie.Close();
+
+ ASSERT_FALSE(trie.Init());
+}
+
+TEST_F(IcingDynamicTrieTest, BitmapsClosedWhenInitFails) {
+ // Create trie with one property.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(
+ trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions().set_storage_policy(
+ IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+ ASSERT_TRUE(trie.deleted_bitmap_);
+ trie.SetProperty(0, 0);
+ ASSERT_EQ(1, trie.property_bitmaps_.size());
+ ASSERT_TRUE(trie.property_bitmaps_[0]);
+ trie.Close();
+
+ // Intentionally corrupt deleted_bitmap file to make Init() fail.
+ FILE* fp = fopen(trie.deleted_bitmap_filename_.c_str(), "r+");
+ ASSERT_TRUE(fp);
+ ASSERT_EQ(16, fwrite("################", 1, 16, fp));
+ fclose(fp);
+ ASSERT_FALSE(trie.Init());
+
+ // Check that both the bitmap and the property files have been closed.
+ ASSERT_FALSE(trie.deleted_bitmap_);
+ ASSERT_EQ(0, trie.property_bitmaps_.size());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
index 2b10c1c..f645632 100644
--- a/icing/legacy/index/icing-filesystem.h
+++ b/icing/legacy/index/icing-filesystem.h
@@ -17,13 +17,15 @@
#ifndef ICING_LEGACY_INDEX_ICING_FILESYSTEM_H_
#define ICING_LEGACY_INDEX_ICING_FILESYSTEM_H_
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
+#include <sys/types.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
#include <memory>
#include <string>
#include <unordered_set>
+#include <utility>
#include <vector>
namespace icing {
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index 9abd369..3b3521a 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -11,9 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2012 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
//
// A disk-backed bitmap.
//
diff --git a/icing/legacy/index/icing-mmapper.h b/icing/legacy/index/icing-mmapper.h
index bf62aa5..d054c11 100644
--- a/icing/legacy/index/icing-mmapper.h
+++ b/icing/legacy/index/icing-mmapper.h
@@ -22,9 +22,11 @@
#ifndef ICING_LEGACY_INDEX_ICING_MMAPPER_H_
#define ICING_LEGACY_INDEX_ICING_MMAPPER_H_
-#include <stdint.h>
#include <unistd.h>
+#include <cstddef>
+#include <cstdint>
+
namespace icing {
namespace lib {
diff --git a/icing/legacy/index/icing-storage.h b/icing/legacy/index/icing-storage.h
index cc06c54..58b6aa1 100644
--- a/icing/legacy/index/icing-storage.h
+++ b/icing/legacy/index/icing-storage.h
@@ -20,6 +20,7 @@
#ifndef ICING_LEGACY_INDEX_ICING_STORAGE_H_
#define ICING_LEGACY_INDEX_ICING_STORAGE_H_
+#include <cstdint>
#include <string>
namespace icing {
diff --git a/icing/performance-configuration.cc b/icing/performance-configuration.cc
new file mode 100644
index 0000000..aeaa449
--- /dev/null
+++ b/icing/performance-configuration.cc
@@ -0,0 +1,91 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/performance-configuration.h"
+
+#include "icing/result/result-state.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Search-related thresholds:
+// Search performance mainly involves the following components:
+// 1. QueryProcessor
+// Running time is O(query_length) according to results of
+// //icing/query:query-processor_benchmark.
+// 2. ScoringProcessor and Ranker
+// Running time is O(num_to_score) according to results of
+// //icing/scoring:score-and-rank_benchmark. Note that the
+// process includes scoring, building a heap, and popping results from the
+// heap.
+// 3. ResultRetriever
+// Running time is O(page_size). Assuming page_size is always a small number,
+// it's actually O(1).
+//
+// Overall Search performance goal: 33 ms. On a 60FPS screen it's the time of
+// rendering 2 frames.
+//
+// With the information above, we then try to choose default values for
+// query_length and num_to_score so that the overall time can comfortably fit
+// in with our goal.
+// 1. Set query_length to 23000 so that any query can be executed by
+// QueryProcessor within 15 ms on a Pixel 3 XL according to results of
+// //icing/query:query-processor_benchmark.
+// 2. Set num_to_score to 30000 so that results can be scored and ranked within
+// 3 ms on a Pixel 3 XL according to results of
+// //icing/scoring:score-and-rank_benchmark.
+//
+// In the worse-case scenario, we still have [33 ms - 15 ms - 3 ms] = 15 ms left
+// for all the other things like proto parsing, document fetching, and even
+// Android Binder calls if Icing search engine runs in a separate process.
+constexpr int kMaxQueryLength = 23000;
+constexpr int kDefaultNumToScore = 30000;
+
+// New Android devices nowadays all allow more than 16 MB memory per app. Using
+// that as a guideline, we set 16 MB as the safe memory threshold.
+// TODO(b/150029642): Android apps / framework have better understanding of how
+// much memory is allowed, so it would be better to let clients pass in this
+// value.
+constexpr int kSafeMemoryUsage = 16 * 1024 * 1024; // 16MB
+
+// This number is not determined by benchmarks. We just assume that returning
+// the best 1000 scored document hits of a query is enough. To find the best
+// 1000 scored document hits from a heap, we need roughly 0.7 ms on a Pixel 3 XL
+// according to //icing/scoring:ranker_benchmark.
+constexpr int kMaxNumHitsPerQuery = 1000;
+
+// A rough estimation of the size of ResultState if it stores the maximum number
+// of scored document hits.
+constexpr int kMaxMemoryPerResult =
+ sizeof(ResultState) + kMaxNumHitsPerQuery * sizeof(ScoredDocumentHit);
+
+// To be safer, we assume that all the Results contain the maximum number of
+// hits and only use half of the memory allowed.
+constexpr int kDefaultNumResultsToCache =
+ kSafeMemoryUsage / 2 / kMaxMemoryPerResult;
+
+static_assert(
+ kDefaultNumResultsToCache > 500,
+ "Default number of results to cache has changed, please update and make "
+ "sure it still meets our requirements.");
+} // namespace
+
+PerformanceConfiguration::PerformanceConfiguration()
+ : PerformanceConfiguration(kMaxQueryLength, kDefaultNumToScore,
+ kMaxNumHitsPerQuery, kDefaultNumResultsToCache) {
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/performance-configuration.h b/icing/performance-configuration.h
new file mode 100644
index 0000000..fa4050b
--- /dev/null
+++ b/icing/performance-configuration.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_PERFORMANCE_CONFIGURATION_H_
+#define ICING_PERFORMANCE_CONFIGURATION_H_
+
+namespace icing {
+namespace lib {
+
+// Stores key thresholds that affect performance of Icing search engine.
+struct PerformanceConfiguration {
+ // Loads default configuration.
+ PerformanceConfiguration();
+
+ PerformanceConfiguration(int max_query_length_in, int num_to_score_in,
+ int max_num_hits_per_query_in,
+ int max_num_cache_results_in)
+ : max_query_length(max_query_length_in),
+ num_to_score(num_to_score_in),
+ max_num_hits_per_query(max_num_hits_per_query_in),
+ max_num_cache_results(max_num_cache_results_in) {}
+
+ // Search performance
+
+ // Maximum length of query to execute in IndexProcessor.
+ int max_query_length;
+
+ // Number of results to score in ScoringProcessor for every query.
+ int num_to_score;
+
+ // Memory
+
+ // Maximum number of ScoredDocumentHits to return per query.
+ int max_num_hits_per_query;
+
+ // Maximum number of ResultStates to store in ResultStateManager.
+ int max_num_cache_results;
+};
+
+// TODO(b/149040810): Consider creating a class to manage performance
+// configurations according to different devices/platforms/clients and even
+// real-time memory usage.
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_PERFORMANCE_CONFIGURATION_H_
diff --git a/icing/query/query-processor-test-jni-layer.cc b/icing/query/query-processor-test-jni-layer.cc
new file mode 100644
index 0000000..9f7e42e
--- /dev/null
+++ b/icing/query/query-processor-test-jni-layer.cc
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_QueryProcessorJniTest_testsMain(JNIEnv* env, jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc
index e585350..4d714f8 100644
--- a/icing/query/query-processor.cc
+++ b/icing/query/query-processor.cc
@@ -19,7 +19,7 @@
#include <stack>
#include <string>
#include <string_view>
-#include <unordered_set>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -270,7 +270,7 @@
break;
}
case Token::Type::INVALID:
- U_FALLTHROUGH;
+ [[fallthrough]];
default:
// This wouldn't happen if tokenizer and query processor both work
// correctly. An unknown token indicates inconsistency between tokenizer
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index dc4a387..000bf3a 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,6 +16,7 @@
#include "gmock/gmock.h"
#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-processor.h"
@@ -26,6 +27,8 @@
#include "icing/testing/fake-clock.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
#include "icing/util/logging.h"
// Run on a Linux workstation:
@@ -64,7 +67,8 @@
void AddTokenToIndex(Index* index, DocumentId document_id, SectionId section_id,
TermMatchType::Code term_match_type,
const std::string& token) {
- Index::Editor editor = index->Edit(document_id, section_id, term_match_type);
+ Index::Editor editor =
+ index->Edit(document_id, section_id, term_match_type, /*namespace_id=*/0);
ICING_ASSERT_OK(editor.AddHit(token.c_str()));
}
@@ -75,29 +79,27 @@
}
std::unique_ptr<Normalizer> CreateNormalizer() {
- return Normalizer::Create(
+ return normalizer_factory::Create(
+
/*max_term_byte_size=*/std::numeric_limits<int>::max())
.ValueOrDie();
}
-void CleanUp(const Filesystem& filesystem, const std::string& base_dir) {
- filesystem.DeleteDirectoryRecursively(base_dir.c_str());
-}
-
void BM_QueryOneTerm(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem icing_filesystem;
Filesystem filesystem;
- const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
- CleanUp(filesystem, base_dir);
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
@@ -106,7 +108,7 @@
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -151,18 +153,41 @@
}
}
- // Destroy document store before the whole directory is removed because it
- // persists data in destructor.
+ // Destroy document store and schema store before the whole directory is
+ // removed because they persist data in destructor.
document_store.reset();
- CleanUp(filesystem, base_dir);
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
BENCHMARK(BM_QueryOneTerm)
+ // The testing numbers are in an ascending order with a fixed interval, that
+ // way we can tell if the performance increments are linear, exponential, or
+ // something else.
->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
+ ->Arg(3000)
+ ->Arg(5000)
+ ->Arg(7000)
+ ->Arg(9000)
+ ->Arg(11000)
+ ->Arg(13000)
+ ->Arg(15000)
+ ->Arg(17000)
+ ->Arg(19000)
+ ->Arg(21000)
+ ->Arg(23000)
+ ->Arg(25000)
+ ->Arg(27000)
+ ->Arg(29000)
+ ->Arg(31000)
+ ->Arg(33000)
+ ->Arg(35000)
+ ->Arg(37000)
+ ->Arg(39000)
+ ->Arg(41000)
+ ->Arg(43000)
+ ->Arg(45000)
+ ->Arg(47000)
+ ->Arg(49000)
->Arg(64000)
->Arg(128000)
->Arg(256000)
@@ -175,17 +200,18 @@
void BM_QueryFiveTerms(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem icing_filesystem;
Filesystem filesystem;
- const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
- CleanUp(filesystem, base_dir);
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
@@ -194,7 +220,7 @@
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -257,18 +283,41 @@
}
}
- // Destroy document store before the whole directory is removed because it
- // persists data in destructor.
+ // Destroy document store and schema store before the whole directory is
+ // removed because they persist data in destructor.
document_store.reset();
- CleanUp(filesystem, base_dir);
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
BENCHMARK(BM_QueryFiveTerms)
+ // The testing numbers are in an ascending order with a fixed interval, that
+ // way we can tell if the performance increments are linear, exponential, or
+ // something else.
->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
+ ->Arg(3000)
+ ->Arg(5000)
+ ->Arg(7000)
+ ->Arg(9000)
+ ->Arg(11000)
+ ->Arg(13000)
+ ->Arg(15000)
+ ->Arg(17000)
+ ->Arg(19000)
+ ->Arg(21000)
+ ->Arg(23000)
+ ->Arg(25000)
+ ->Arg(27000)
+ ->Arg(29000)
+ ->Arg(31000)
+ ->Arg(33000)
+ ->Arg(35000)
+ ->Arg(37000)
+ ->Arg(39000)
+ ->Arg(41000)
+ ->Arg(43000)
+ ->Arg(45000)
+ ->Arg(47000)
+ ->Arg(49000)
->Arg(64000)
->Arg(128000)
->Arg(256000)
@@ -281,17 +330,18 @@
void BM_QueryDiacriticTerm(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem icing_filesystem;
Filesystem filesystem;
- const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
- CleanUp(filesystem, base_dir);
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
@@ -300,7 +350,7 @@
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -348,18 +398,41 @@
}
}
- // Destroy document store before the whole directory is removed because it
- // persists data in destructor.
+ // Destroy document store and schema store before the whole directory is
+ // removed because they persist data in destructor.
document_store.reset();
- CleanUp(filesystem, base_dir);
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
BENCHMARK(BM_QueryDiacriticTerm)
+ // The testing numbers are in an ascending order with a fixed interval, that
+ // way we can tell if the performance increments are linear, exponential, or
+ // something else.
->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
+ ->Arg(3000)
+ ->Arg(5000)
+ ->Arg(7000)
+ ->Arg(9000)
+ ->Arg(11000)
+ ->Arg(13000)
+ ->Arg(15000)
+ ->Arg(17000)
+ ->Arg(19000)
+ ->Arg(21000)
+ ->Arg(23000)
+ ->Arg(25000)
+ ->Arg(27000)
+ ->Arg(29000)
+ ->Arg(31000)
+ ->Arg(33000)
+ ->Arg(35000)
+ ->Arg(37000)
+ ->Arg(39000)
+ ->Arg(41000)
+ ->Arg(43000)
+ ->Arg(45000)
+ ->Arg(47000)
+ ->Arg(49000)
->Arg(64000)
->Arg(128000)
->Arg(256000)
@@ -372,17 +445,18 @@
void BM_QueryHiragana(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
IcingFilesystem icing_filesystem;
Filesystem filesystem;
- const std::string base_dir = GetTestTempDir() + "/query_test";
+ const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
- CleanUp(filesystem, base_dir);
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
if (!filesystem.CreateDirectoryRecursively(index_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(schema_dir.c_str()) ||
!filesystem.CreateDirectoryRecursively(doc_store_dir.c_str())) {
@@ -391,7 +465,7 @@
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -439,18 +513,41 @@
}
}
- // Destroy document store before the whole directory is removed because it
- // persists data in destructor.
+ // Destroy document store and schema store before the whole directory is
+ // removed because they persist data in destructor.
document_store.reset();
- CleanUp(filesystem, base_dir);
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
BENCHMARK(BM_QueryHiragana)
+ // The testing numbers are in an ascending order with a fixed interval, that
+ // way we can tell if the performance increments are linear, exponential, or
+ // something else.
->Arg(1000)
- ->Arg(2000)
- ->Arg(4000)
- ->Arg(8000)
- ->Arg(16000)
- ->Arg(32000)
+ ->Arg(3000)
+ ->Arg(5000)
+ ->Arg(7000)
+ ->Arg(9000)
+ ->Arg(11000)
+ ->Arg(13000)
+ ->Arg(15000)
+ ->Arg(17000)
+ ->Arg(19000)
+ ->Arg(21000)
+ ->Arg(23000)
+ ->Arg(25000)
+ ->Arg(27000)
+ ->Arg(29000)
+ ->Arg(31000)
+ ->Arg(33000)
+ ->Arg(35000)
+ ->Arg(37000)
+ ->Arg(39000)
+ ->Arg(41000)
+ ->Arg(43000)
+ ->Arg(45000)
+ ->Arg(47000)
+ ->Arg(49000)
->Arg(64000)
->Arg(128000)
->Arg(256000)
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index d67071a..dc94a72 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -17,11 +17,13 @@
#include <memory>
#include <string>
+#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
@@ -36,10 +38,14 @@
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -52,100 +58,75 @@
using ::testing::Test;
using ::testing::UnorderedElementsAre;
+SchemaTypeConfigProto* AddSchemaType(SchemaProto* schema,
+ std::string schema_type) {
+ SchemaTypeConfigProto* type_config = schema->add_types();
+ type_config->set_schema_type(schema_type);
+ return type_config;
+}
+
+void AddIndexedProperty(SchemaTypeConfigProto* type_config, std::string name) {
+ PropertyConfigProto* property_config = type_config->add_properties();
+ property_config->set_property_name(name);
+ property_config->set_data_type(PropertyConfigProto::DataType::STRING);
+ property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property_config->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property_config->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
+}
+
+void AddUnindexedProperty(SchemaTypeConfigProto* type_config,
+ std::string name) {
+ PropertyConfigProto* property_config = type_config->add_properties();
+ property_config->set_property_name(name);
+ property_config->set_data_type(PropertyConfigProto::DataType::STRING);
+}
+
class QueryProcessorTest : public Test {
protected:
QueryProcessorTest()
: test_dir_(GetTestTempDir() + "/icing"),
- index_dir_(test_dir_ + "/index"),
- store_dir_(test_dir_ + "/store") {}
+ store_dir_(test_dir_ + "/store"),
+ index_dir_(test_dir_ + "/index") {}
void SetUp() override {
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
+#ifndef ICING_REVERSE_JNI_SEGMENTATION
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+#endif // ICING_REVERSE_JNI_SEGMENTATION
Index::Options options(index_dir_,
/*index_merge_size=*/1024 * 1024);
ICING_ASSERT_OK_AND_ASSIGN(index_,
Index::Create(options, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- LanguageSegmenter::Create());
-
- ICING_ASSERT_OK_AND_ASSIGN(normalizer_,
- Normalizer::Create(/*max_term_byte_size=*/1000));
-
- SchemaProto schema;
-
- // Message schema
- auto type_config = schema.add_types();
- type_config->set_schema_type("message");
-
- // Add an indexed property so we generate section metadata on it
- auto property = type_config->add_properties();
- property->set_property_name("foo");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- // Add another indexed property so we generate section metadata on it
- property = type_config->add_properties();
- property->set_property_name(indexed_property_);
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- // Since we order indexed properties alphabetically, "foo" gets section id
- // 0, and "subject" gets section id 1 for messages
- indexed_message_section_id_ = 1;
-
- // Email schema
- type_config = schema.add_types();
- type_config->set_schema_type("email");
-
- // Add an indexed property so we generate section metadata on it
- property = type_config->add_properties();
- property->set_property_name(indexed_property_);
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- // First and only indexed property, so it gets the first id of 0
- indexed_email_section_id_ = 0;
-
- // Add an unindexed property
- property = type_config->add_properties();
- property->set_property_name(unindexed_property_);
- property->set_data_type(PropertyConfigProto::DataType::STRING);
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
+ language_segmenter_factory::SegmenterOptions segmenter_options(
+ ULOC_US, jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ language_segmenter_,
+ language_segmenter_factory::Create(segmenter_options));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
}
libtextclassifier3::Status AddTokenToIndex(
DocumentId document_id, SectionId section_id,
TermMatchType::Code term_match_type, const std::string& token) {
- Index::Editor editor =
- index_->Edit(document_id, section_id, term_match_type);
+ Index::Editor editor = index_->Edit(document_id, section_id,
+ term_match_type, /*namespace_id=*/0);
return editor.AddHit(token.c_str());
}
@@ -155,23 +136,20 @@
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
}
+ Filesystem filesystem_;
+ const std::string test_dir_;
+ const std::string store_dir_;
std::unique_ptr<Index> index_;
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<Normalizer> normalizer_;
std::unique_ptr<SchemaStore> schema_store_;
std::unique_ptr<DocumentStore> document_store_;
FakeClock fake_clock_;
- const std::string indexed_property_ = "subject";
- const std::string unindexed_property_ = "to";
- int indexed_email_section_id_;
- int indexed_message_section_id_;
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
private:
IcingFilesystem icing_filesystem_;
- Filesystem filesystem_;
- const std::string test_dir_;
const std::string index_dir_;
- const std::string store_dir_;
};
TEST_F(QueryProcessorTest, CreationWithNullPointerShouldFail) {
@@ -206,8 +184,18 @@
}
TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) {
- // We don't need to insert anything in the index since the empty query will
- // match all DocumentIds from the DocumentStore
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
@@ -220,6 +208,10 @@
.SetSchema("email")
.Build()));
+ // We don't need to insert anything in the index since the empty query will
+ // match all DocumentIds from the DocumentStore
+
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -239,8 +231,18 @@
}
TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) {
- // We don't need to insert anything in the index since the empty query will
- // match all DocumentIds from the DocumentStore
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
@@ -253,6 +255,10 @@
.SetSchema("email")
.Build()));
+ // We don't need to insert anything in the index since the empty query will
+ // match all DocumentIds from the DocumentStore
+
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -272,9 +278,18 @@
}
TEST_F(QueryProcessorTest, QueryTermNormalized) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -285,6 +300,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
EXPECT_THAT(
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
@@ -292,6 +312,7 @@
AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -313,9 +334,18 @@
}
TEST_F(QueryProcessorTest, OneTermPrefixMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -326,10 +356,16 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
EXPECT_THAT(
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -351,9 +387,18 @@
}
TEST_F(QueryProcessorTest, OneTermExactMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -364,10 +409,16 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
EXPECT_THAT(
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -389,9 +440,18 @@
}
TEST_F(QueryProcessorTest, AndTwoTermExactMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -402,6 +462,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
EXPECT_THAT(
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
@@ -409,6 +474,7 @@
AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -430,9 +496,18 @@
}
TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -443,6 +518,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
EXPECT_THAT(
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
@@ -450,6 +530,7 @@
AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -471,8 +552,18 @@
}
TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -483,13 +574,19 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
EXPECT_THAT(AddTokenToIndex(document_id, section_id,
TermMatchType::EXACT_ONLY, "hello"),
IsOk());
EXPECT_THAT(
- AddTokenToIndex(document_id, section_id, TermMatchType::PREFIX, "world"),
+ AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -498,7 +595,7 @@
SearchSpecProto search_spec;
search_spec.set_query("hello wo");
- search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_term_match_type(term_match_type);
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
query_processor->ParseSearch(search_spec));
@@ -511,9 +608,18 @@
}
TEST_F(QueryProcessorTest, OrTwoTermExactMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -529,6 +635,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
EXPECT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
IsOk());
@@ -536,6 +647,7 @@
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -558,9 +670,18 @@
}
TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -576,6 +697,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
EXPECT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
IsOk());
@@ -583,6 +709,7 @@
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -605,8 +732,18 @@
}
TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -622,6 +759,10 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+
EXPECT_THAT(AddTokenToIndex(document_id1, section_id,
TermMatchType::EXACT_ONLY, "hello"),
IsOk());
@@ -629,6 +770,7 @@
AddTokenToIndex(document_id2, section_id, TermMatchType::PREFIX, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -651,9 +793,18 @@
}
TEST_F(QueryProcessorTest, CombinedAndOrTerms) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -668,6 +819,10 @@
.SetKey("namespace", "2")
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "animal puppy dog"
EXPECT_THAT(
@@ -689,6 +844,7 @@
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -754,9 +910,18 @@
}
TEST_F(QueryProcessorTest, OneGroup) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -772,6 +937,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "puppy dog"
EXPECT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
@@ -786,6 +956,7 @@
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -810,9 +981,18 @@
}
TEST_F(QueryProcessorTest, TwoGroups) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -828,6 +1008,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "puppy dog"
EXPECT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
@@ -868,9 +1053,18 @@
}
TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -886,6 +1080,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "puppy dog"
EXPECT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
@@ -900,6 +1099,7 @@
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -924,9 +1124,18 @@
}
TEST_F(QueryProcessorTest, OneLevelNestedGrouping) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that the DocHitInfoIterators will see that the
@@ -942,6 +1151,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "puppy dog"
EXPECT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
@@ -956,6 +1170,7 @@
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -980,8 +1195,18 @@
}
TEST_F(QueryProcessorTest, ExcludeTerm) {
- SectionId section_id = 0;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that they'll bump the last_added_document_id,
@@ -997,6 +1222,10 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
ASSERT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
IsOk());
@@ -1004,6 +1233,7 @@
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1026,8 +1256,18 @@
}
TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) {
- SectionId section_id = 0;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that they'll bump the last_added_document_id,
@@ -1042,6 +1282,9 @@
.SetKey("namespace", "2")
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
ASSERT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "hello"),
@@ -1050,6 +1293,7 @@
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1071,8 +1315,18 @@
}
TEST_F(QueryProcessorTest, ExcludeAnd) {
- SectionId section_id = 0;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that they'll bump the last_added_document_id,
@@ -1088,6 +1342,10 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "animal dog"
ASSERT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
@@ -1102,6 +1360,7 @@
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1141,9 +1400,18 @@
}
TEST_F(QueryProcessorTest, ExcludeOr) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're just
// inserting the documents so that they'll bump the last_added_document_id,
@@ -1159,6 +1427,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "animal dog"
ASSERT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
@@ -1173,6 +1446,7 @@
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1213,9 +1487,18 @@
}
TEST_F(QueryProcessorTest, DeletedFilter) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1232,6 +1515,11 @@
.Build()));
EXPECT_THAT(document_store_->Delete("namespace", "1"), IsOk());
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "animal dog"
ASSERT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
@@ -1246,6 +1534,7 @@
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1267,9 +1556,18 @@
}
TEST_F(QueryProcessorTest, NamespaceFilter) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1285,6 +1583,11 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "animal dog"
ASSERT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
@@ -1299,6 +1602,7 @@
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1321,9 +1625,19 @@
}
TEST_F(QueryProcessorTest, SchemaTypeFilter) {
- SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+ AddSchemaType(&schema, "message");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1339,6 +1653,11 @@
.SetSchema("message")
.Build()));
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document 1 has content "animal dog"
ASSERT_THAT(
AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
@@ -1349,6 +1668,7 @@
AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1371,8 +1691,22 @@
}
TEST_F(QueryProcessorTest, SectionFilterForOneDocument) {
- SectionIdMask section_id_mask = 1U << indexed_email_section_id_;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
+
+ // First and only indexed property, so it gets a section_id of 0
+ AddIndexedProperty(email_type, "subject");
+ int subject_section_id = 0;
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1383,11 +1717,16 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ SectionIdMask section_id_mask = 1U << subject_section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Document has content "animal"
- ASSERT_THAT(AddTokenToIndex(document_id, indexed_email_section_id_,
- term_match_type, "animal"),
+ ASSERT_THAT(AddTokenToIndex(document_id, subject_section_id, term_match_type,
+ "animal"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1396,7 +1735,7 @@
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>'
- search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.set_query("subject:animal");
search_spec.set_term_match_type(term_match_type);
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
@@ -1406,14 +1745,31 @@
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
ElementsAre(DocHitInfo(document_id, section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
- EXPECT_THAT(results.query_terms[indexed_property_],
- UnorderedElementsAre("animal"));
+ EXPECT_THAT(results.query_terms["subject"], UnorderedElementsAre("animal"));
}
TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) {
- SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
- SectionIdMask message_section_id_mask = 1U << indexed_message_section_id_;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(email_type, "a"); // Section "a" would get sectionId 0
+ AddIndexedProperty(email_type, "foo");
+ int email_foo_section_id = 1;
+
+ SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(message_type, "foo");
+ int message_foo_section_id = 0;
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1429,16 +1785,22 @@
.SetSchema("message")
.Build()));
+ // Populate the index
+ SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
+ SectionIdMask message_section_id_mask = 1U << message_foo_section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Email document has content "animal"
- ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id,
term_match_type, "animal"),
IsOk());
// Message document has content "animal"
- ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_message_section_id_,
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id,
term_match_type, "animal"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1447,7 +1809,7 @@
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>'
- search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.set_query("foo:animal");
search_spec.set_term_match_type(term_match_type);
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
@@ -1460,13 +1822,30 @@
ElementsAre(DocHitInfo(message_document_id, message_section_id_mask),
DocHitInfo(email_document_id, email_section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
- EXPECT_THAT(results.query_terms[indexed_property_],
- UnorderedElementsAre("animal"));
+ EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) {
- SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(email_type, "foo");
+ int email_foo_section_id = 0;
+
+ SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(message_type, "foo");
+ int message_foo_section_id = 0;
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1482,16 +1861,21 @@
.SetSchema("message")
.Build()));
+ // Populate the index
+ SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Email document has content "animal"
- ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id,
term_match_type, "animal"),
IsOk());
// Message document has content "animal"
- ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_message_section_id_,
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id,
term_match_type, "animal"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1501,7 +1885,7 @@
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>', but only look within
// documents of email schema
- search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.set_query("foo:animal");
search_spec.add_schema_type_filters("email");
search_spec.set_term_match_type(term_match_type);
@@ -1514,13 +1898,30 @@
GetDocHitInfos(results.root_iterator.get()),
ElementsAre(DocHitInfo(email_document_id, email_section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
- EXPECT_THAT(results.query_terms[indexed_property_],
- UnorderedElementsAre("animal"));
+ EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) {
- SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(email_type, "foo");
+ int email_foo_section_id = 0;
+
+ SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(message_type, "bar");
+ int message_foo_section_id = 0;
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1536,18 +1937,23 @@
.SetSchema("message")
.Build()));
+ // Populate the index
+ SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Email document has content "animal"
- ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id,
term_match_type, "animal"),
IsOk());
// Message document has content "animal", but put it in the same section id as
// the indexed email section id, the same id as indexed property "foo" in the
// message type
- ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id,
term_match_type, "animal"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1557,7 +1963,7 @@
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>', but only look within
// documents of email schema
- search_spec.set_query(indexed_property_ + ":animal");
+ search_spec.set_query("foo:animal");
search_spec.set_term_match_type(term_match_type);
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
@@ -1569,12 +1975,22 @@
GetDocHitInfos(results.root_iterator.get()),
ElementsAre(DocHitInfo(email_document_id, email_section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
- EXPECT_THAT(results.query_terms[indexed_property_],
- UnorderedElementsAre("animal"));
+ EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) {
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1585,11 +2001,15 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Email document has content "animal"
- ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(email_document_id, /*section_id=*/0,
term_match_type, "animal"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1599,7 +2019,7 @@
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>', but only look within
// documents of email schema
- search_spec.set_query("nonexistent.section:animal");
+ search_spec.set_query("nonexistent:animal");
search_spec.set_term_match_type(term_match_type);
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
@@ -1609,12 +2029,24 @@
// doesn't match to the name of the section filter
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
EXPECT_THAT(results.query_terms, SizeIs(1));
- EXPECT_THAT(results.query_terms["nonexistent.section"],
+ EXPECT_THAT(results.query_terms["nonexistent"],
UnorderedElementsAre("animal"));
}
TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) {
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
+ AddUnindexedProperty(email_type, "foo");
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1625,11 +2057,15 @@
.SetSchema("email")
.Build()));
+ // Populate the index
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Email document has content "animal"
- ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(email_document_id, /*section_id=*/0,
term_match_type, "animal"),
IsOk());
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1639,7 +2075,7 @@
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>', but only look within
// documents of email schema
- search_spec.set_query(unindexed_property_ + ":animal");
+ search_spec.set_query("foo:animal");
search_spec.set_term_match_type(term_match_type);
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
@@ -1649,14 +2085,30 @@
// doesn't match to the name of the section filter
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
EXPECT_THAT(results.query_terms, SizeIs(1));
- EXPECT_THAT(results.query_terms[unindexed_property_],
- UnorderedElementsAre("animal"));
+ EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) {
- SectionIdMask email_section_id_mask = 1U << indexed_email_section_id_;
- SectionIdMask message_section_id_mask = 1U << indexed_message_section_id_;
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ // Create the schema and document store
+ SchemaProto schema;
+ SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(email_type, "foo");
+ int email_foo_section_id = 0;
+
+ SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
+ // SectionIds are assigned in ascending order per schema type, alphabetically.
+ AddIndexedProperty(message_type, "foo");
+ int message_foo_section_id = 0;
+
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1672,16 +2124,21 @@
.SetSchema("message")
.Build()));
+ // Populate the index
+ SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
+ SectionIdMask message_section_id_mask = 1U << message_foo_section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
// Email document has content "animal"
- ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id,
term_match_type, "animal"),
IsOk());
- ASSERT_THAT(AddTokenToIndex(email_document_id, indexed_email_section_id_,
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id,
term_match_type, "cat"),
IsOk());
// Message document has content "animal"
- ASSERT_THAT(AddTokenToIndex(message_document_id, indexed_message_section_id_,
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id,
term_match_type, "animal"),
IsOk());
@@ -1693,7 +2150,7 @@
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>'
- search_spec.set_query("cat OR " + indexed_property_ + ":animal");
+ search_spec.set_query("cat OR foo:animal");
search_spec.set_term_match_type(term_match_type);
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
@@ -1707,30 +2164,46 @@
DocHitInfo(email_document_id, email_section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(2));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("cat"));
- EXPECT_THAT(results.query_terms[indexed_property_],
- UnorderedElementsAre("animal"));
+ EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) {
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id,
- document_store_->Put(DocumentBuilder()
- .SetKey("namespace", "1")
- .SetSchema("email")
- .SetCreationTimestampMs(0)
- .SetTtlMs(100)
- .Build()));
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
- EXPECT_THAT(AddTokenToIndex(document_id, indexed_email_section_id_,
- term_match_type, "hello"),
- IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(0)
+ .SetTtlMs(100)
+ .Build()));
+
+ // Populate the index
+ int section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
// Arbitrary value, just has to be less than the document's creation
// timestamp + ttl
FakeClock fake_clock;
fake_clock.SetSystemTimeMilliseconds(50);
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
@@ -1744,31 +2217,46 @@
ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
query_processor->ParseSearch(search_spec));
- SectionIdMask section_id_mask = 1U << indexed_email_section_id_;
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
ElementsAre(DocHitInfo(document_id, section_id_mask)));
}
TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) {
- TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentId document_id,
- document_store_->Put(DocumentBuilder()
- .SetKey("namespace", "1")
- .SetSchema("email")
- .SetCreationTimestampMs(0)
- .SetTtlMs(100)
- .Build()));
+ // Create the schema and document store
+ SchemaProto schema;
+ AddSchemaType(&schema, "email");
- EXPECT_THAT(AddTokenToIndex(document_id, indexed_email_section_id_,
- term_match_type, "hello"),
- IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_));
+ ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_store_,
+ DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(0)
+ .SetTtlMs(100)
+ .Build()));
+
+ // Populate the index
+ int section_id = 0;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
// Arbitrary value, just has to be greater than the document's creation
// timestamp + ttl
FakeClock fake_clock;
fake_clock.SetSystemTimeMilliseconds(200);
+ // Perform query
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
QueryProcessor::Create(index_.get(), language_segmenter_.get(),
diff --git a/icing/result/page-result-state.h b/icing/result/page-result-state.h
new file mode 100644
index 0000000..a26c44e
--- /dev/null
+++ b/icing/result/page-result-state.h
@@ -0,0 +1,54 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_PAGE_RESULT_STATE_H_
+#define ICING_RESULT_PAGE_RESULT_STATE_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "icing/result/snippet-context.h"
+#include "icing/scoring/scored-document-hit.h"
+
+namespace icing {
+namespace lib {
+
+// Contains information of results of one page.
+struct PageResultState {
+ PageResultState(std::vector<ScoredDocumentHit> scored_document_hits_in,
+ uint64_t next_page_token_in,
+ SnippetContext snippet_context_in,
+ int num_previously_returned_in)
+ : scored_document_hits(std::move(scored_document_hits_in)),
+ next_page_token(next_page_token_in),
+ snippet_context(std::move(snippet_context_in)),
+ num_previously_returned(num_previously_returned_in) {}
+
+ // Results of one page
+ std::vector<ScoredDocumentHit> scored_document_hits;
+
+ // Token used to fetch the next page
+ uint64_t next_page_token;
+
+ // Information needed for snippeting.
+ SnippetContext snippet_context;
+
+ // Number of results that have been returned in previous pages.
+ int num_previously_returned;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_PAGE_RESULT_STATE_H_
diff --git a/icing/result/result-retriever.cc b/icing/result/result-retriever.cc
index 597afe3..f09d834 100644
--- a/icing/result/result-retriever.cc
+++ b/icing/result/result-retriever.cc
@@ -17,6 +17,8 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/result/page-result-state.h"
+#include "icing/result/snippet-context.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -25,6 +27,7 @@
ResultRetriever::Create(const DocumentStore* doc_store,
const SchemaStore* schema_store,
const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer,
bool ignore_bad_document_ids) {
ICING_RETURN_ERROR_IF_NULL(doc_store);
ICING_RETURN_ERROR_IF_NULL(schema_store);
@@ -32,7 +35,7 @@
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<SnippetRetriever> snippet_retriever,
- SnippetRetriever::Create(schema_store, language_segmenter));
+ SnippetRetriever::Create(schema_store, language_segmenter, normalizer));
return std::unique_ptr<ResultRetriever>(new ResultRetriever(
doc_store, std::move(snippet_retriever), ignore_bad_document_ids));
@@ -40,19 +43,21 @@
libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
ResultRetriever::RetrieveResults(
- const ResultSpecProto& result_spec,
- const SectionRestrictQueryTermsMap& query_terms,
- TermMatchType::Code match_type,
- const std::vector<ScoredDocumentHit>& scored_document_hits) const {
- const int num_results_returned =
- std::min(static_cast<int>(scored_document_hits.size()),
- result_spec.num_to_retrieve());
+ const PageResultState& page_result_state) const {
std::vector<SearchResultProto::ResultProto> search_results;
- search_results.reserve(num_results_returned);
- for (const auto& scored_document_hit : scored_document_hits) {
- if (search_results.size() >= result_spec.num_to_retrieve()) {
- break;
- }
+ search_results.reserve(page_result_state.scored_document_hits.size());
+
+ const SnippetContext& snippet_context = page_result_state.snippet_context;
+ // Calculates how many snippets to return for this page.
+ int remaining_num_to_snippet = snippet_context.snippet_spec.num_to_snippet() -
+ page_result_state.num_previously_returned;
+
+ if (remaining_num_to_snippet < 0) {
+ remaining_num_to_snippet = 0;
+ }
+
+ for (const auto& scored_document_hit :
+ page_result_state.scored_document_hits) {
libtextclassifier3::StatusOr<DocumentProto> document_or =
doc_store_.Get(scored_document_hit.document_id());
@@ -71,11 +76,12 @@
SearchResultProto::ResultProto result;
// Add the snippet if requested.
- if (result_spec.snippet_spec().num_matches_per_property() > 0 &&
- result_spec.snippet_spec().num_to_snippet() > search_results.size()) {
+ if (snippet_context.snippet_spec.num_matches_per_property() > 0 &&
+ remaining_num_to_snippet > search_results.size()) {
SnippetProto snippet_proto = snippet_retriever_->RetrieveSnippet(
- query_terms, match_type, result_spec.snippet_spec(),
- document_or.ValueOrDie(), scored_document_hit.hit_section_id_mask());
+ snippet_context.query_terms, snippet_context.match_type,
+ snippet_context.snippet_spec, document_or.ValueOrDie(),
+ scored_document_hit.hit_section_id_mask());
*result.mutable_snippet() = std::move(snippet_proto);
}
diff --git a/icing/result/result-retriever.h b/icing/result/result-retriever.h
index 3c0e6dc..6f33eef 100644
--- a/icing/result/result-retriever.h
+++ b/icing/result/result-retriever.h
@@ -22,13 +22,16 @@
#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-terms.h"
+#include "icing/result/page-result-state.h"
+#include "icing/result/snippet-context.h"
+#include "icing/result/snippet-retriever.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/scoring/scored-document-hit.h"
-#include "icing/result/snippet-retriever.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
namespace icing {
namespace lib {
@@ -44,33 +47,34 @@
// FAILED_PRECONDITION on any null pointer input
static libtextclassifier3::StatusOr<std::unique_ptr<ResultRetriever>> Create(
const DocumentStore* doc_store, const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter,
+ const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
bool ignore_bad_document_ids = true);
- // Gets results (pairs of DocumentProtos and SnippetProtos) with the given
- // document ids from the given document store. The
- // order of documents returned is the same as the order of document_ids.
+ // Retrieves results (pairs of DocumentProtos and SnippetProtos) with the
+ // given document and snippet information. The expected number of documents to
+ // return is the number of all scored document hits inside PageResultState.
+ // The number of snippets to return is based on the total number of snippets
+ // needed and number of snippets that have already been returned previously
+ // for the same query. The order of results returned is the same as the order
+ // of scored document hits inside PageResultState.
//
- // Parameter "ignore_bad_document_ids" indicates whether to ignore invalid and
- // non-existing document_ids. If it's true, errors on some document_ids will
- // be ignored and valid documents will be returned, otherwise any error will
- // be returned immediately. Note that IO errors will always be returned.
+ // "ignore_bad_document_ids" from constructor indicates whether to ignore
+ // invalid and non-existing document ids. If it's true, errors on some
+ // document ids will be ignored and valid documents will be returned,
+ // otherwise any error will be returned immediately. Note that IO errors will
+ // always be returned.
//
// Returns when ignore_bad_document_ids is true:
- // A list of valid documents on success
+ // A list of ResultProto on success
// INTERNAL_ERROR on IO error
//
// Returns when ignore_bad_document_ids is false:
- // A list of documents on success
+ // A list of ResultProto on success
// INVALID_ARGUMENT if any document_id < 0
// NOT_FOUND if any doc doesn't exist or has been deleted
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
- RetrieveResults(
- const ResultSpecProto& result_spec,
- const SectionRestrictQueryTermsMap& query_terms,
- TermMatchType::Code match_type,
- const std::vector<ScoredDocumentHit>& scored_document_hits) const;
+ RetrieveResults(const PageResultState& page_result_state) const;
private:
explicit ResultRetriever(const DocumentStore* doc_store,
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index e79b632..36dbfd9 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -17,16 +17,15 @@
#include <limits>
#include <memory>
-#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
-#include "icing/query/query-terms.h"
#include "icing/schema/schema-store.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
@@ -34,6 +33,9 @@
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
namespace icing {
namespace lib {
@@ -42,6 +44,7 @@
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::IsEmpty;
using ::testing::Return;
using ::testing::SizeIs;
@@ -49,38 +52,20 @@
protected:
ResultRetrieverTest() : test_dir_(GetTestTempDir() + "/icing") {
filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- test_document1_ = DocumentBuilder()
- .SetKey("icing", "email/1")
- .SetSchema("email")
- .AddStringProperty("subject", "subject foo")
- .AddStringProperty("body", "body bar")
- .SetCreationTimestampMs(1574365086666)
- .Build();
- test_document2_ = DocumentBuilder()
- .SetKey("icing", "email/2")
- .SetSchema("email")
- .AddStringProperty("subject", "subject foo 2")
- .AddStringProperty("body", "body bar 2")
- .SetCreationTimestampMs(1574365087777)
- .Build();
- test_document3_ = DocumentBuilder()
- .SetKey("icing", "email/3")
- .SetSchema("email")
- .AddStringProperty("subject", "subject foo 3")
- .AddStringProperty("body", "body bar 3")
- .SetCreationTimestampMs(1574365088888)
- .Build();
}
void SetUp() override {
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
SchemaProto schema;
auto type_config = schema.add_types();
@@ -102,14 +87,6 @@
prop_config->mutable_indexing_config()->set_tokenizer_type(
IndexingConfig::TokenizerType::PLAIN);
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- result_spec_no_snippet_ = ResultSpecProto::default_instance();
-
- result_spec_snippet_.mutable_snippet_spec()->set_num_to_snippet(
- std::numeric_limits<int>::max());
- result_spec_snippet_.mutable_snippet_spec()->set_num_matches_per_property(
- std::numeric_limits<int>::max());
- result_spec_snippet_.mutable_snippet_spec()->set_max_window_bytes(1024);
}
void TearDown() override {
@@ -117,21 +94,35 @@
}
const Filesystem filesystem_;
- ResultSpecProto result_spec_no_snippet_;
- ResultSpecProto result_spec_snippet_;
const std::string test_dir_;
- DocumentProto test_document1_;
- DocumentProto test_document2_;
- DocumentProto test_document3_;
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
FakeClock fake_clock_;
};
+ResultSpecProto::SnippetSpecProto CreateSnippetSpec() {
+ ResultSpecProto::SnippetSpecProto snippet_spec;
+ snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max());
+ snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max());
+ snippet_spec.set_max_window_bytes(1024);
+ return snippet_spec;
+}
+
+DocumentProto CreateDocument(int id) {
+ return DocumentBuilder()
+ .SetKey("icing", "email/" + std::to_string(id))
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo " + std::to_string(id))
+ .AddStringProperty("body", "body bar " + std::to_string(id))
+ .SetCreationTimestampMs(1574365086666 + id)
+ .Build();
+}
+
TEST_F(ResultRetrieverTest, CreationWithNullPointerShouldFail) {
EXPECT_THAT(
ResultRetriever::Create(/*doc_store=*/nullptr, schema_store_.get(),
- language_segmenter_.get()),
+ language_segmenter_.get(), normalizer_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -139,25 +130,31 @@
DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
schema_store_.get()));
- EXPECT_THAT(ResultRetriever::Create(doc_store.get(), /*schema_store=*/nullptr,
- language_segmenter_.get()),
+ EXPECT_THAT(
+ ResultRetriever::Create(doc_store.get(), /*schema_store=*/nullptr,
+ language_segmenter_.get(), normalizer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(ResultRetriever::Create(doc_store.get(), schema_store_.get(),
+ /*language_segmenter=*/nullptr,
+ normalizer_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
EXPECT_THAT(ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- /*language_segmenter=*/nullptr),
+ language_segmenter_.get(),
+ /*normalizer=*/nullptr),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(ResultRetrieverTest, Simple) {
+TEST_F(ResultRetrieverTest, ShouldRetrieveSimpleResults) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
+ doc_store->Put(CreateDocument(/*id=*/1)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
+ doc_store->Put(CreateDocument(/*id=*/2)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(test_document3_));
+ doc_store->Put(CreateDocument(/*id=*/3)));
std::vector<ScoredDocumentHit> scored_document_hits = {
{document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
@@ -166,66 +163,37 @@
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ResultRetriever> result_retriever,
ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get()));
+ language_segmenter_.get(), normalizer_.get()));
SearchResultProto::ResultProto result1;
- *result1.mutable_document() = test_document1_;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
SearchResultProto::ResultProto result2;
- *result2.mutable_document() = test_document2_;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
SearchResultProto::ResultProto result3;
- *result3.mutable_document() = test_document3_;
+ *result3.mutable_document() = CreateDocument(/*id=*/3);
- SectionRestrictQueryTermsMap query_terms{};
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{},
+ ResultSpecProto::SnippetSpecProto::default_instance(),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
EXPECT_THAT(
- result_retriever->RetrieveResults(result_spec_no_snippet_, query_terms,
- TermMatchType::EXACT_ONLY,
- scored_document_hits),
+ result_retriever->RetrieveResults(page_result_state),
IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2),
EqualsProto(result3))));
}
-TEST_F(ResultRetrieverTest, OnlyOneResultRequested) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(test_document3_));
-
- result_spec_no_snippet_.set_num_to_retrieve(1);
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get()));
-
- SearchResultProto::ResultProto result1;
- *result1.mutable_document() = test_document1_;
-
- SectionRestrictQueryTermsMap query_terms{};
- EXPECT_THAT(result_retriever->RetrieveResults(
- result_spec_no_snippet_, query_terms,
- TermMatchType::EXACT_ONLY, scored_document_hits),
- IsOkAndHolds(ElementsAre(EqualsProto(result1))));
-}
-
TEST_F(ResultRetrieverTest, IgnoreErrors) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
+ doc_store->Put(CreateDocument(/*id=*/1)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
+ doc_store->Put(CreateDocument(/*id=*/2)));
DocumentId invalid_document_id = -1;
std::vector<ScoredDocumentHit> scored_document_hits = {
@@ -235,19 +203,23 @@
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ResultRetriever> result_retriever,
ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(),
+ language_segmenter_.get(), normalizer_.get(),
/*ignore_bad_document_ids=*/true));
SearchResultProto::ResultProto result1;
- *result1.mutable_document() = test_document1_;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
SearchResultProto::ResultProto result2;
- *result2.mutable_document() = test_document2_;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
- SectionRestrictQueryTermsMap query_terms{};
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{},
+ ResultSpecProto::SnippetSpecProto::default_instance(),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
EXPECT_THAT(
- result_retriever->RetrieveResults(result_spec_no_snippet_, query_terms,
- TermMatchType::EXACT_ONLY,
- scored_document_hits),
+ result_retriever->RetrieveResults(page_result_state),
IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2))));
}
@@ -257,9 +229,9 @@
DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
+ doc_store->Put(CreateDocument(/*id=*/1)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
+ doc_store->Put(CreateDocument(/*id=*/2)));
DocumentId invalid_document_id = -1;
std::vector<ScoredDocumentHit> scored_document_hits = {
@@ -269,28 +241,30 @@
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ResultRetriever> result_retriever,
ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(),
+ language_segmenter_.get(), normalizer_.get(),
/*ignore_bad_document_ids=*/false));
- SectionRestrictQueryTermsMap query_terms{};
- EXPECT_THAT(result_retriever->RetrieveResults(
- result_spec_no_snippet_, query_terms,
- TermMatchType::EXACT_ONLY, scored_document_hits),
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{},
+ ResultSpecProto::SnippetSpecProto::default_instance(),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
+ EXPECT_THAT(result_retriever->RetrieveResults(page_result_state),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
DocumentId non_existing_document_id = 4;
- scored_document_hits = {
+ page_result_state.scored_document_hits = {
{document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
{document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
{non_existing_document_id, /*hit_section_id_mask=*/0b00000011,
/*score=*/0}};
- EXPECT_THAT(result_retriever->RetrieveResults(
- result_spec_no_snippet_, query_terms,
- TermMatchType::EXACT_ONLY, scored_document_hits),
+ EXPECT_THAT(result_retriever->RetrieveResults(page_result_state),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(ResultRetrieverTest, IOError) {
+TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) {
MockFilesystem mock_filesystem;
ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));
@@ -299,37 +273,42 @@
DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_,
schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
+ doc_store->Put(CreateDocument(/*id=*/1)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
+ doc_store->Put(CreateDocument(/*id=*/2)));
std::vector<ScoredDocumentHit> scored_document_hits = {
{document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
{document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- SectionRestrictQueryTermsMap query_terms{};
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ResultRetriever> result_retriever,
ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(),
+ language_segmenter_.get(), normalizer_.get(),
/*ignore_bad_document_ids=*/true));
- EXPECT_THAT(result_retriever->RetrieveResults(
- result_spec_no_snippet_, query_terms,
- TermMatchType::EXACT_ONLY, scored_document_hits),
+
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{},
+ ResultSpecProto::SnippetSpecProto::default_instance(),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
+ EXPECT_THAT(result_retriever->RetrieveResults(page_result_state),
StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
-TEST_F(ResultRetrieverTest, SnippetingDisabled) {
+TEST_F(ResultRetrieverTest, DefaultSnippetSpecShouldDisableSnippeting) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
+ doc_store->Put(CreateDocument(/*id=*/1)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
+ doc_store->Put(CreateDocument(/*id=*/2)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(test_document3_));
+ doc_store->Put(CreateDocument(/*id=*/3)));
std::vector<ScoredDocumentHit> scored_document_hits = {
{document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
@@ -338,14 +317,18 @@
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ResultRetriever> result_retriever,
ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get()));
+ language_segmenter_.get(), normalizer_.get()));
- SectionRestrictQueryTermsMap query_terms{};
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{},
+ ResultSpecProto::SnippetSpecProto::default_instance(),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<SearchResultProto::ResultProto> results,
- result_retriever->RetrieveResults(result_spec_no_snippet_, query_terms,
- TermMatchType::EXACT_ONLY,
- scored_document_hits));
+ result_retriever->RetrieveResults(page_result_state));
ASSERT_THAT(results, SizeIs(3));
EXPECT_THAT(results.at(0).snippet(),
EqualsProto(SnippetProto::default_instance()));
@@ -361,11 +344,11 @@
DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
+ doc_store->Put(CreateDocument(/*id=*/1)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
+ doc_store->Put(CreateDocument(/*id=*/2)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(test_document3_));
+ doc_store->Put(CreateDocument(/*id=*/3)));
std::vector<ScoredDocumentHit> scored_document_hits = {
{document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
@@ -374,27 +357,30 @@
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ResultRetriever> result_retriever,
ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get()));
+ language_segmenter_.get(), normalizer_.get()));
- SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{{"", {"foo", "bar"}}}, CreateSnippetSpec(),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<SearchResultProto::ResultProto> result,
- result_retriever->RetrieveResults(result_spec_snippet_, query_terms,
- TermMatchType::EXACT_ONLY,
- scored_document_hits));
+ result_retriever->RetrieveResults(page_result_state));
EXPECT_THAT(result, SizeIs(3));
- EXPECT_THAT(result[0].document(), EqualsProto(test_document1_));
+ EXPECT_THAT(result[0].document(), EqualsProto(CreateDocument(/*id=*/1)));
EXPECT_THAT(
GetWindow(result[0].document(), result[0].snippet(), "subject", 0),
- Eq("subject foo"));
+ Eq("subject foo 1"));
EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "subject", 0),
Eq("foo"));
EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0),
- Eq("body bar"));
+ Eq("body bar 1"));
EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0),
Eq("bar"));
- EXPECT_THAT(result[1].document(), EqualsProto(test_document2_));
+ EXPECT_THAT(result[1].document(), EqualsProto(CreateDocument(/*id=*/2)));
EXPECT_THAT(
GetWindow(result[1].document(), result[1].snippet(), "subject", 0),
Eq("subject foo 2"));
@@ -405,7 +391,7 @@
EXPECT_THAT(GetMatch(result[1].document(), result[1].snippet(), "body", 0),
Eq("bar"));
- EXPECT_THAT(result[2].document(), EqualsProto(test_document3_));
+ EXPECT_THAT(result[2].document(), EqualsProto(CreateDocument(/*id=*/3)));
EXPECT_THAT(
GetWindow(result[2].document(), result[2].snippet(), "subject", 0),
Eq("subject foo 3"));
@@ -423,13 +409,14 @@
DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
schema_store_.get()));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(test_document1_));
+ doc_store->Put(CreateDocument(/*id=*/1)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(test_document2_));
+ doc_store->Put(CreateDocument(/*id=*/2)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(test_document3_));
+ doc_store->Put(CreateDocument(/*id=*/3)));
- result_spec_snippet_.mutable_snippet_spec()->set_num_to_snippet(1);
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(1);
std::vector<ScoredDocumentHit> scored_document_hits = {
{document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
@@ -438,35 +425,161 @@
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ResultRetriever> result_retriever,
ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get()));
+ language_segmenter_.get(), normalizer_.get()));
- SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
+ SnippetContext snippet_context(/*query_terms_in=*/{{"", {"foo", "bar"}}},
+ snippet_spec, TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<SearchResultProto::ResultProto> result,
- result_retriever->RetrieveResults(result_spec_snippet_, query_terms,
- TermMatchType::EXACT_ONLY,
- scored_document_hits));
+ result_retriever->RetrieveResults(page_result_state));
EXPECT_THAT(result, SizeIs(3));
- EXPECT_THAT(result[0].document(), EqualsProto(test_document1_));
+ EXPECT_THAT(result[0].document(), EqualsProto(CreateDocument(/*id=*/1)));
EXPECT_THAT(
GetWindow(result[0].document(), result[0].snippet(), "subject", 0),
- Eq("subject foo"));
+ Eq("subject foo 1"));
EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "subject", 0),
Eq("foo"));
EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0),
- Eq("body bar"));
+ Eq("body bar 1"));
EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0),
Eq("bar"));
- EXPECT_THAT(result[1].document(), EqualsProto(test_document2_));
+ EXPECT_THAT(result[1].document(), EqualsProto(CreateDocument(/*id=*/2)));
EXPECT_THAT(result[1].snippet(),
EqualsProto(SnippetProto::default_instance()));
- EXPECT_THAT(result[2].document(), EqualsProto(test_document3_));
+ EXPECT_THAT(result[2].document(), EqualsProto(CreateDocument(/*id=*/3)));
EXPECT_THAT(result[2].snippet(),
EqualsProto(SnippetProto::default_instance()));
}
+TEST_F(ResultRetrieverTest, ShouldSnippetAllResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ doc_store->Put(CreateDocument(/*id=*/3)));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetriever> result_retriever,
+ ResultRetriever::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(5);
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{{"", {"foo", "bar"}}}, std::move(snippet_spec),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/0);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<SearchResultProto::ResultProto> result,
+ result_retriever->RetrieveResults(page_result_state));
+ // num_to_snippet = 5, num_previously_returned_in = 0,
+ // We can return 5 - 0 = 5 snippets at most. We're able to return all 3
+ // snippets here.
+ ASSERT_THAT(result, SizeIs(3));
+ EXPECT_THAT(result[0].snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(result[1].snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(result[2].snippet().entries(), Not(IsEmpty()));
+}
+
+TEST_F(ResultRetrieverTest, ShouldSnippetSomeResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ doc_store->Put(CreateDocument(/*id=*/3)));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetriever> result_retriever,
+ ResultRetriever::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(5);
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{{"", {"foo", "bar"}}}, std::move(snippet_spec),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/3);
+
+ // num_to_snippet = 5, num_previously_returned_in = 3,
+ // We can return 5 - 3 = 2 snippets.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<SearchResultProto::ResultProto> result,
+ result_retriever->RetrieveResults(page_result_state));
+ ASSERT_THAT(result, SizeIs(3));
+ EXPECT_THAT(result[0].snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(result[1].snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(result[2].snippet().entries(), IsEmpty());
+}
+
+TEST_F(ResultRetrieverTest, ShouldNotSnippetAnyResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ doc_store->Put(CreateDocument(/*id=*/3)));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
+ {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetriever> result_retriever,
+ ResultRetriever::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(5);
+ SnippetContext snippet_context(
+ /*query_terms_in=*/{{"", {"foo", "bar"}}}, std::move(snippet_spec),
+ TermMatchType::EXACT_ONLY);
+ PageResultState page_result_state(
+ std::move(scored_document_hits), /*next_page_token_in=*/1,
+ std::move(snippet_context), /*num_previously_returned_in=*/6);
+
+ // num_to_snippet = 5, num_previously_returned_in = 6,
+ // We can't return any snippets for this page.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<SearchResultProto::ResultProto> result,
+ result_retriever->RetrieveResults(page_result_state));
+ ASSERT_THAT(result, SizeIs(3));
+ EXPECT_THAT(result[0].snippet().entries(), IsEmpty());
+ EXPECT_THAT(result[1].snippet().entries(), IsEmpty());
+ EXPECT_THAT(result[2].snippet().entries(), IsEmpty());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/result/result-state-manager.cc b/icing/result/result-state-manager.cc
new file mode 100644
index 0000000..e9ae0ab
--- /dev/null
+++ b/icing/result/result-state-manager.cc
@@ -0,0 +1,170 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-state-manager.h"
+
+#include "icing/proto/search.pb.h"
+#include "icing/util/clock.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+ResultStateManager::ResultStateManager(int max_hits_per_query,
+ int max_result_states)
+ : max_hits_per_query_(max_hits_per_query),
+ max_result_states_(max_result_states),
+ random_generator_(GetSteadyTimeNanoseconds()) {}
+
+libtextclassifier3::StatusOr<PageResultState>
+ResultStateManager::RankAndPaginate(ResultState result_state) {
+ if (!result_state.HasMoreResults()) {
+ return absl_ports::InvalidArgumentError("ResultState has no results");
+ }
+
+ // Truncates scored document hits so that they don't take up too much space.
+ result_state.TruncateHitsTo(max_hits_per_query_);
+
+ // Gets the number before calling GetNextPage() because num_returned() may
+ // change after returning more results.
+ int num_previously_returned = result_state.num_returned();
+
+ std::vector<ScoredDocumentHit> page_result_document_hits =
+ result_state.GetNextPage();
+
+ if (!result_state.HasMoreResults()) {
+ // No more pages, won't store ResultState, returns directly
+ return PageResultState(
+ std::move(page_result_document_hits), kInvalidNextPageToken,
+ result_state.snippet_context(), num_previously_returned);
+ }
+
+ absl_ports::unique_lock l(&mutex_);
+
+ // ResultState has multiple pages, storing it
+ SnippetContext snippet_context_copy = result_state.snippet_context();
+ uint64_t next_page_token = Add(std::move(result_state));
+
+ return PageResultState(std::move(page_result_document_hits), next_page_token,
+ std::move(snippet_context_copy),
+ num_previously_returned);
+}
+
+uint64_t ResultStateManager::Add(ResultState result_state) {
+ RemoveStatesIfNeeded();
+
+ uint64_t new_token = GetUniqueToken();
+
+ result_state_map_.emplace(new_token, std::move(result_state));
+ // Tracks the insertion order
+ token_queue_.push(new_token);
+
+ return new_token;
+}
+
+libtextclassifier3::StatusOr<PageResultState> ResultStateManager::GetNextPage(
+ uint64_t next_page_token) {
+ absl_ports::unique_lock l(&mutex_);
+
+ const auto& state_iterator = result_state_map_.find(next_page_token);
+ if (state_iterator == result_state_map_.end()) {
+ return absl_ports::NotFoundError("next_page_token not found");
+ }
+
+ int num_returned = state_iterator->second.num_returned();
+ std::vector<ScoredDocumentHit> result_of_page =
+ state_iterator->second.GetNextPage();
+  if (result_of_page.empty()) {
+    // This shouldn't happen; all our active states should contain results. We
+    // keep a sanity check here in case of any data inconsistency.
+ InternalInvalidateResultState(next_page_token);
+ return absl_ports::NotFoundError(
+ "No more results, token has been invalidated.");
+ }
+
+ // Copies the SnippetContext in case the ResultState is invalidated.
+ SnippetContext snippet_context_copy =
+ state_iterator->second.snippet_context();
+
+ if (!state_iterator->second.HasMoreResults()) {
+ InternalInvalidateResultState(next_page_token);
+ }
+
+ return PageResultState(result_of_page, next_page_token,
+ std::move(snippet_context_copy), num_returned);
+}
+
+void ResultStateManager::InvalidateResultState(uint64_t next_page_token) {
+ if (next_page_token == kInvalidNextPageToken) {
+ return;
+ }
+
+ absl_ports::unique_lock l(&mutex_);
+
+ InternalInvalidateResultState(next_page_token);
+}
+
+void ResultStateManager::InvalidateAllResultStates() {
+ absl_ports::unique_lock l(&mutex_);
+
+ result_state_map_.clear();
+ invalidated_token_set_.clear();
+ token_queue_ = {};
+}
+
+uint64_t ResultStateManager::GetUniqueToken() {
+ uint64_t new_token = random_generator_();
+ // There's a small chance of collision between the random numbers, here we're
+ // trying to avoid any collisions by checking the keys.
+ while (result_state_map_.find(new_token) != result_state_map_.end() ||
+ invalidated_token_set_.find(new_token) !=
+ invalidated_token_set_.end() ||
+ new_token == kInvalidNextPageToken) {
+ new_token = random_generator_();
+ }
+ return new_token;
+}
+
+void ResultStateManager::RemoveStatesIfNeeded() {
+ if (result_state_map_.empty() || token_queue_.empty()) {
+ return;
+ }
+
+ // Removes any tokens that were previously invalidated.
+ while (!token_queue_.empty() &&
+ invalidated_token_set_.find(token_queue_.front()) !=
+ invalidated_token_set_.end()) {
+ invalidated_token_set_.erase(token_queue_.front());
+ token_queue_.pop();
+ }
+
+ // Removes the oldest state
+ if (result_state_map_.size() >= max_result_states_ && !token_queue_.empty()) {
+ result_state_map_.erase(token_queue_.front());
+ token_queue_.pop();
+ }
+}
+
+void ResultStateManager::InternalInvalidateResultState(uint64_t token) {
+  // Removes the entry in result_state_map_ and inserts the token into
+  // invalidated_token_set_. The entry in token_queue_ can't be easily removed
+  // right now (it could take O(n) time), so we leave it there and completely
+  // remove the token later in RemoveStatesIfNeeded().
+ if (result_state_map_.erase(token) > 0) {
+ invalidated_token_set_.insert(token);
+ }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-state-manager.h b/icing/result/result-state-manager.h
new file mode 100644
index 0000000..eaf9eb5
--- /dev/null
+++ b/icing/result/result-state-manager.h
@@ -0,0 +1,127 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_RESULT_STATE_MANAGER_H_
+#define ICING_RESULT_RESULT_STATE_MANAGER_H_
+
+#include <queue>
+#include <random>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/page-result-state.h"
+#include "icing/result/result-state.h"
+
+namespace icing {
+namespace lib {
+
+// This should be the same as the default value of
+// SearchResultProto.next_page_token.
+inline constexpr uint64_t kInvalidNextPageToken = 0;
+
+// Used to store and manage ResultState.
+class ResultStateManager {
+ public:
+ explicit ResultStateManager(int max_hits_per_query, int max_result_states);
+
+ ResultStateManager(const ResultStateManager&) = delete;
+ ResultStateManager& operator=(const ResultStateManager&) = delete;
+
+ // Ranks the results and returns the first page of them. The result object
+ // PageResultState contains a next_page_token which can be used to fetch more
+  // pages later. It will be set to a default value 0 if there are no more
+ //
+ // NOTE: it's caller's responsibility not to call this method with the same
+ // ResultState more than once, otherwise duplicate states will be stored
+ // internally.
+ //
+ // Returns:
+ // A PageResultState on success
+ // INVALID_ARGUMENT if the input state contains no results
+ libtextclassifier3::StatusOr<PageResultState> RankAndPaginate(
+ ResultState result_state) ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Retrieves and returns the next page of results wrapped in PageResultState.
+ // The returned results won't exist in ResultStateManager anymore. If the
+ // query has no more pages after this retrieval, the input token will be
+ // invalidated.
+ //
+ // Returns:
+ // PageResultState on success, guaranteed to have non-empty results
+ // NOT_FOUND if failed to find any more results
+ libtextclassifier3::StatusOr<PageResultState> GetNextPage(
+ uint64_t next_page_token) ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Invalidates the result state associated with the given next-page token.
+ void InvalidateResultState(uint64_t next_page_token)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Invalidates all result states / tokens currently in ResultStateManager.
+ void InvalidateAllResultStates() ICING_LOCKS_EXCLUDED(mutex_);
+
+ private:
+ absl_ports::shared_mutex mutex_;
+
+ // The maximum number of scored document hits to return for a query. When we
+ // have more than the maximum number, extra hits will be truncated.
+ const int max_hits_per_query_;
+
+ // The maximum number of result states. When we have more than the maximum
+ // number, the oldest / firstly added result state will be removed.
+ const int max_result_states_;
+
+ // A hash map of (next-page token -> result state)
+ std::unordered_map<uint64_t, ResultState> result_state_map_
+ ICING_GUARDED_BY(mutex_);
+
+ // A queue used to track the insertion order of tokens
+ std::queue<uint64_t> token_queue_ ICING_GUARDED_BY(mutex_);
+
+ // A set to temporarily store the invalidated tokens before they're finally
+ // removed from token_queue_. We store the invalidated tokens to ensure the
+ // uniqueness of new generated tokens.
+ std::unordered_set<uint64_t> invalidated_token_set_ ICING_GUARDED_BY(mutex_);
+
+ // A random 64-bit number generator
+ std::mt19937_64 random_generator_ ICING_GUARDED_BY(mutex_);
+
+ // Puts a new result state into the internal storage and returns a next-page
+ // token associated with it. The token is guaranteed to be unique among all
+ // currently valid tokens. When the maximum number of result states is
+ // reached, the oldest / firstly added result state will be removed to make
+ // room for the new state.
+ uint64_t Add(ResultState result_state) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Helper method to generate a next-page token that is unique among all
+  // tokens currently in result_state_map_ and invalidated_token_set_.
+  uint64_t GetUniqueToken() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to remove old states to make room for incoming states.
+ void RemoveStatesIfNeeded() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to remove a result state from result_state_map_, the token
+ // will then be temporarily kept in invalidated_token_set_ until it's finally
+ // removed from token_queue_.
+ void InternalInvalidateResultState(uint64_t token)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_RESULT_STATE_MANAGER_H_
diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc
new file mode 100644
index 0000000..6defa6f
--- /dev/null
+++ b/icing/result/result-state-manager_test.cc
@@ -0,0 +1,479 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-state-manager.h"
+
+#include "gtest/gtest.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::IsEmpty;
+
+ScoredDocumentHit CreateScoredDocumentHit(DocumentId document_id) {
+ return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1);
+}
+
+ScoringSpecProto CreateScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+ResultSpecProto CreateResultSpec(int num_per_page) {
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+ResultState CreateResultState(
+ const std::vector<ScoredDocumentHit>& scored_document_hits,
+ int num_per_page) {
+ return ResultState(scored_document_hits, /*query_terms=*/{},
+ SearchSpecProto::default_instance(), CreateScoringSpec(),
+ CreateResultSpec(num_per_page));
+}
+
+TEST(ResultStateManagerTest, ShouldRankAndPaginateOnePage) {
+ ResultState original_result_state =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/3)},
+ /*num_per_page=*/10);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state,
+ result_state_manager.RankAndPaginate(std::move(original_result_state)));
+
+ EXPECT_THAT(page_result_state.next_page_token, Eq(kInvalidNextPageToken));
+
+ // Should get the original scored document hits
+ EXPECT_THAT(
+ page_result_state.scored_document_hits,
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1))));
+}
+
+TEST(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) {
+ ResultState original_result_state =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/4),
+ CreateScoredDocumentHit(/*document_id=*/5)},
+ /*num_per_page=*/2);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+
+ // First page, 2 results
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state1,
+ result_state_manager.RankAndPaginate(std::move(original_result_state)));
+ EXPECT_THAT(
+ page_result_state1.scored_document_hits,
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4))));
+
+ uint64_t next_page_token = page_result_state1.next_page_token;
+
+ // Second page, 2 results
+ ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
+ result_state_manager.GetNextPage(next_page_token));
+ EXPECT_THAT(
+ page_result_state2.scored_document_hits,
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2))));
+
+ // Third page, 1 result
+ ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3,
+ result_state_manager.GetNextPage(next_page_token));
+ EXPECT_THAT(page_result_state3.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(
+ CreateScoredDocumentHit(/*document_id=*/1))));
+
+ // No results
+ EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST(ResultStateManagerTest, EmptyStateShouldReturnError) {
+ ResultState empty_result_state = CreateResultState({}, /*num_per_page=*/1);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+ EXPECT_THAT(
+ result_state_manager.RankAndPaginate(std::move(empty_result_state)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ResultStateManagerTest, ShouldInvalidateOneToken) {
+ ResultState result_state1 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/3)},
+ /*num_per_page=*/1);
+ ResultState result_state2 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/4),
+ CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/6)},
+ /*num_per_page=*/1);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state1,
+ result_state_manager.RankAndPaginate(std::move(result_state1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state2,
+ result_state_manager.RankAndPaginate(std::move(result_state2)));
+
+ result_state_manager.InvalidateResultState(
+ page_result_state1.next_page_token);
+
+ // page_result_state1.next_page_token shouldn't be found
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(page_result_state1.next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // page_result_state2.next_page_token should still exist
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_state2,
+ result_state_manager.GetNextPage(page_result_state2.next_page_token));
+ EXPECT_THAT(page_result_state2.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(
+ CreateScoredDocumentHit(/*document_id=*/5))));
+}
+
+TEST(ResultStateManagerTest, ShouldInvalidateAllTokens) {
+ ResultState result_state1 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/3)},
+ /*num_per_page=*/1);
+ ResultState result_state2 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/4),
+ CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/6)},
+ /*num_per_page=*/1);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state1,
+ result_state_manager.RankAndPaginate(std::move(result_state1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state2,
+ result_state_manager.RankAndPaginate(std::move(result_state2)));
+
+ result_state_manager.InvalidateAllResultStates();
+
+ // page_result_state1.next_page_token shouldn't be found
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(page_result_state1.next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // page_result_state2.next_page_token shouldn't be found
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(page_result_state2.next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST(ResultStateManagerTest, ShouldRemoveOldestResultState) {
+ ResultState result_state1 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2)},
+ /*num_per_page=*/1);
+ ResultState result_state2 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/4)},
+ /*num_per_page=*/1);
+ ResultState result_state3 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/6)},
+ /*num_per_page=*/1);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/2);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state1,
+ result_state_manager.RankAndPaginate(std::move(result_state1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state2,
+ result_state_manager.RankAndPaginate(std::move(result_state2)));
+ // Adding state 3 should cause state 1 to be removed.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state3,
+ result_state_manager.RankAndPaginate(std::move(result_state3)));
+
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(page_result_state1.next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_state2,
+ result_state_manager.GetNextPage(page_result_state2.next_page_token));
+ EXPECT_THAT(page_result_state2.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
+ /*document_id=*/3))));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_state3,
+ result_state_manager.GetNextPage(page_result_state3.next_page_token));
+ EXPECT_THAT(page_result_state3.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
+ /*document_id=*/5))));
+}
+
+TEST(ResultStateManagerTest,
+ PreviouslyInvalidatedResultStateShouldNotBeCounted) {
+ ResultState result_state1 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2)},
+ /*num_per_page=*/1);
+ ResultState result_state2 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/4)},
+ /*num_per_page=*/1);
+ ResultState result_state3 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/6)},
+ /*num_per_page=*/1);
+ ResultState result_state4 =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/7),
+ CreateScoredDocumentHit(/*document_id=*/8)},
+ /*num_per_page=*/1);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/3);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state1,
+ result_state_manager.RankAndPaginate(std::move(result_state1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state2,
+ result_state_manager.RankAndPaginate(std::move(result_state2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state3,
+ result_state_manager.RankAndPaginate(std::move(result_state3)));
+
+ // Invalidates state 2, so that the number of valid tokens becomes 2.
+ result_state_manager.InvalidateResultState(
+ page_result_state2.next_page_token);
+
+ // Adding state 4 shouldn't affect rest of the states
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state4,
+ result_state_manager.RankAndPaginate(std::move(result_state4)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_state1,
+ result_state_manager.GetNextPage(page_result_state1.next_page_token));
+ EXPECT_THAT(page_result_state1.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
+ /*document_id=*/1))));
+
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(page_result_state2.next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_state3,
+ result_state_manager.GetNextPage(page_result_state3.next_page_token));
+ EXPECT_THAT(page_result_state3.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
+ /*document_id=*/5))));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_state4,
+ result_state_manager.GetNextPage(page_result_state4.next_page_token));
+ EXPECT_THAT(page_result_state4.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
+ /*document_id=*/7))));
+}
+
+TEST(ResultStateManagerTest, ShouldGetSnippetContext) {
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SectionRestrictQueryTermsMap query_terms_map;
+ query_terms_map.emplace("term1", std::unordered_set<std::string>());
+
+ ResultState original_result_state = ResultState(
+ /*scored_document_hits=*/{CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2)},
+ query_terms_map, search_spec, CreateScoringSpec(), result_spec);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state,
+ result_state_manager.RankAndPaginate(std::move(original_result_state)));
+
+ ASSERT_THAT(page_result_state.next_page_token, Gt(kInvalidNextPageToken));
+
+ EXPECT_THAT(page_result_state.snippet_context.match_type,
+ Eq(TermMatchType::EXACT_ONLY));
+ EXPECT_TRUE(page_result_state.snippet_context.query_terms.find("term1") !=
+ page_result_state.snippet_context.query_terms.end());
+ EXPECT_THAT(page_result_state.snippet_context.snippet_spec,
+ EqualsProto(result_spec.snippet_spec()));
+}
+
+TEST(ResultStateManagerTest, ShouldGetDefaultSnippetContext) {
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
+ // 0 indicates no snippeting
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(0);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(0);
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SectionRestrictQueryTermsMap query_terms_map;
+ query_terms_map.emplace("term1", std::unordered_set<std::string>());
+
+ ResultState original_result_state = ResultState(
+ /*scored_document_hits=*/{CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2)},
+ query_terms_map, search_spec, CreateScoringSpec(), result_spec);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state,
+ result_state_manager.RankAndPaginate(std::move(original_result_state)));
+
+ ASSERT_THAT(page_result_state.next_page_token, Gt(kInvalidNextPageToken));
+
+ EXPECT_THAT(page_result_state.snippet_context.query_terms, IsEmpty());
+ EXPECT_THAT(
+ page_result_state.snippet_context.snippet_spec,
+ EqualsProto(ResultSpecProto::SnippetSpecProto::default_instance()));
+ EXPECT_THAT(page_result_state.snippet_context.match_type,
+ Eq(TermMatchType::UNKNOWN));
+}
+
+TEST(ResultStateManagerTest, ShouldGetCorrectNumPreviouslyReturned) {
+ ResultState original_result_state =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/4),
+ CreateScoredDocumentHit(/*document_id=*/5)},
+ /*num_per_page=*/2);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/std::numeric_limits<int>::max(),
+ /*max_result_states=*/std::numeric_limits<int>::max());
+
+ // First page, 2 results
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state1,
+ result_state_manager.RankAndPaginate(std::move(original_result_state)));
+ ASSERT_THAT(page_result_state1.scored_document_hits.size(), Eq(2));
+
+ // No previously returned results
+ EXPECT_THAT(page_result_state1.num_previously_returned, Eq(0));
+
+ uint64_t next_page_token = page_result_state1.next_page_token;
+
+ // Second page, 2 results
+ ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
+ result_state_manager.GetNextPage(next_page_token));
+ ASSERT_THAT(page_result_state2.scored_document_hits.size(), Eq(2));
+
+ // num_previously_returned = size of first page
+ EXPECT_THAT(page_result_state2.num_previously_returned, Eq(2));
+
+ // Third page, 1 result
+ ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3,
+ result_state_manager.GetNextPage(next_page_token));
+ ASSERT_THAT(page_result_state3.scored_document_hits.size(), Eq(1));
+
+ // num_previously_returned = size of first and second pages
+ EXPECT_THAT(page_result_state3.num_previously_returned, Eq(4));
+
+ // No more results
+ EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST(ResultStateManagerTest, ShouldStoreMaxNumberOfScoredDocumentHits) {
+ ResultState original_result_state =
+ CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/4),
+ CreateScoredDocumentHit(/*document_id=*/5)},
+ /*num_per_page=*/2);
+
+ ResultStateManager result_state_manager(
+ /*max_hits_per_query=*/3,
+ /*max_result_states=*/std::numeric_limits<int>::max());
+
+ // The 5 input scored document hits will be truncated to 3.
+
+ // First page, 2 results
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultState page_result_state1,
+ result_state_manager.RankAndPaginate(std::move(original_result_state)));
+ EXPECT_THAT(
+ page_result_state1.scored_document_hits,
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4))));
+
+ uint64_t next_page_token = page_result_state1.next_page_token;
+
+ // Second page, 1 result.
+ ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
+ result_state_manager.GetNextPage(next_page_token));
+ EXPECT_THAT(page_result_state2.scored_document_hits,
+ ElementsAre(EqualsScoredDocumentHit(
+ CreateScoredDocumentHit(/*document_id=*/3))));
+
+ // No third page.
+ EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-state.cc b/icing/result/result-state.cc
new file mode 100644
index 0000000..bf28f52
--- /dev/null
+++ b/icing/result/result-state.cc
@@ -0,0 +1,70 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-state.h"
+
+#include "icing/scoring/ranker.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+SnippetContext CreateSnippetContext(SectionRestrictQueryTermsMap query_terms,
+ const SearchSpecProto& search_spec,
+ const ResultSpecProto& result_spec) {
+ if (result_spec.snippet_spec().num_to_snippet() > 0 &&
+ result_spec.snippet_spec().num_matches_per_property() > 0) {
+ // Needs snippeting
+ return SnippetContext(std::move(query_terms), result_spec.snippet_spec(),
+ search_spec.term_match_type());
+ }
+ return SnippetContext(/*query_terms_in=*/{},
+ ResultSpecProto::SnippetSpecProto::default_instance(),
+ TermMatchType::UNKNOWN);
+}
+
+ResultState::ResultState(std::vector<ScoredDocumentHit> scored_document_hits,
+ SectionRestrictQueryTermsMap query_terms,
+ const SearchSpecProto& search_spec,
+ const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec)
+ : scored_document_hits_(std::move(scored_document_hits)),
+ snippet_context_(CreateSnippetContext(std::move(query_terms), search_spec,
+ result_spec)),
+ num_per_page_(result_spec.num_per_page()),
+ num_returned_(0),
+ scored_document_hit_comparator_(scoring_spec.order_by() ==
+ ScoringSpecProto::Order::DESC) {
+ BuildHeapInPlace(&scored_document_hits_, scored_document_hit_comparator_);
+}
+
+std::vector<ScoredDocumentHit> ResultState::GetNextPage() {
+ std::vector<ScoredDocumentHit> scored_document_hits = PopTopResultsFromHeap(
+ &scored_document_hits_, num_per_page_, scored_document_hit_comparator_);
+ num_returned_ += scored_document_hits.size();
+ return scored_document_hits;
+}
+
+void ResultState::TruncateHitsTo(int new_size) {
+ if (new_size < 0 || scored_document_hits_.size() <= new_size) {
+ return;
+ }
+
+ // Copying the best new_size results.
+ scored_document_hits_ = PopTopResultsFromHeap(
+ &scored_document_hits_, new_size, scored_document_hit_comparator_);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-state.h b/icing/result/result-state.h
new file mode 100644
index 0000000..82e783b
--- /dev/null
+++ b/icing/result/result-state.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_RESULT_STATE_H_
+#define ICING_RESULT_RESULT_STATE_H_
+
+#include <vector>
+
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/snippet-context.h"
+#include "icing/scoring/scored-document-hit.h"
+
+namespace icing {
+namespace lib {
+
+// Used to hold information needed across multiple pagination requests of the
+// same query. Stored in ResultStateManager.
+class ResultState {
+ public:
+ explicit ResultState(std::vector<ScoredDocumentHit> scored_document_hits,
+ SectionRestrictQueryTermsMap query_terms,
+ const SearchSpecProto& search_spec,
+ const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec);
+
+ // Returns the next page of results. The size of page is passed in from
+ // ResultSpecProto in constructor. Calling this method could increase the
+ // value of num_returned(), so be careful of the order of calling these
+ // methods.
+ std::vector<ScoredDocumentHit> GetNextPage();
+
+ // Truncates the vector of ScoredDocumentHits to the given size. The best
+ // ScoredDocumentHits are kept.
+ void TruncateHitsTo(int new_size);
+
+ // Returns if the current state has more results to return.
+ bool HasMoreResults() const { return !scored_document_hits_.empty(); }
+
+ // Returns a SnippetContext generated from the specs passed in via
+ // constructor.
+ const SnippetContext& snippet_context() const { return snippet_context_; }
+
+ // The number of results that have already been returned. This number is
+ // increased when GetNextPage() is called.
+ int num_returned() const { return num_returned_; }
+
+ private:
+ // The scored document hits. It represents a heap data structure when ranking
+ // is required so that we can get top K hits in O(KlgN) time. If no ranking is
+ // required, it's just a vector of ScoredDocumentHits in the original order.
+ std::vector<ScoredDocumentHit> scored_document_hits_;
+
+ // Information needed for snippeting.
+ SnippetContext snippet_context_;
+
+ // Number of results to return in each page.
+ int num_per_page_;
+
+ // Number of results that have already been returned.
+ int num_returned_;
+
+ // Used to compare two scored document hits.
+ ScoredDocumentHitComparator scored_document_hit_comparator_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_RESULT_STATE_H_
diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc
new file mode 100644
index 0000000..85cb242
--- /dev/null
+++ b/icing/result/result-state_test.cc
@@ -0,0 +1,214 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-state.h"
+
+#include "gtest/gtest.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+ScoredDocumentHit CreateScoredDocumentHit(DocumentId document_id) {
+ return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1);
+}
+
+SearchSpecProto CreateSearchSpec(TermMatchType::Code match_type) {
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(match_type);
+ return search_spec;
+}
+
+ScoringSpecProto CreateScoringSpec(bool is_descending_order) {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_order_by(is_descending_order ? ScoringSpecProto::Order::DESC
+ : ScoringSpecProto::Order::ASC);
+ return scoring_spec;
+}
+
+ResultSpecProto CreateResultSpec(int num_per_page) {
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+// ResultState::ResultState() and ResultState::GetNextPage() are calling
+// Ranker::BuildHeapInPlace() and Ranker::PopTopResultsFromHeap() directly, so
+// we don't need to test much on what order is returned as that is tested in
+// Ranker's tests. Here we just need one sanity test to make sure that the
+// correct functions are called.
+TEST(ResultStateTest, ShouldReturnNextPage) {
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/4)};
+
+ ResultState result_state(scored_document_hits, /*query_terms=*/{},
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true),
+ CreateResultSpec(/*num_per_page=*/2));
+
+ EXPECT_THAT(
+ result_state.GetNextPage(),
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4))));
+
+ EXPECT_THAT(
+ result_state.GetNextPage(),
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2))));
+
+ EXPECT_THAT(result_state.GetNextPage(),
+ ElementsAre(EqualsScoredDocumentHit(
+ CreateScoredDocumentHit(/*document_id=*/1))));
+}
+
+TEST(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) {
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+
+ SectionRestrictQueryTermsMap query_terms_map;
+ query_terms_map.emplace("term1", std::unordered_set<std::string>());
+
+ ResultState result_state(
+ /*scored_document_hits=*/{}, query_terms_map,
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec);
+
+ const SnippetContext& snippet_context = result_state.snippet_context();
+
+ // Snippet context should be derived from the specs above.
+ EXPECT_TRUE(snippet_context.query_terms.find("term1") !=
+ snippet_context.query_terms.end());
+ EXPECT_THAT(snippet_context.snippet_spec,
+ EqualsProto(result_spec.snippet_spec()));
+ EXPECT_THAT(snippet_context.match_type, Eq(TermMatchType::EXACT_ONLY));
+
+ // The same copy can be fetched multiple times.
+ const SnippetContext& snippet_context2 = result_state.snippet_context();
+ EXPECT_TRUE(snippet_context2.query_terms.find("term1") !=
+ snippet_context2.query_terms.end());
+ EXPECT_THAT(snippet_context2.snippet_spec,
+ EqualsProto(result_spec.snippet_spec()));
+ EXPECT_THAT(snippet_context2.match_type, Eq(TermMatchType::EXACT_ONLY));
+}
+
+TEST(ResultStateTest, NoSnippetingShouldReturnNull) {
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ // Setting num_to_snippet to 0 so that snippeting info won't be
+ // stored.
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
+ result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+
+ SectionRestrictQueryTermsMap query_terms_map;
+ query_terms_map.emplace("term1", std::unordered_set<std::string>());
+
+ ResultState result_state(/*scored_document_hits=*/{}, query_terms_map,
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true),
+ result_spec);
+
+ const SnippetContext& snippet_context = result_state.snippet_context();
+ EXPECT_THAT(snippet_context.query_terms, IsEmpty());
+ EXPECT_THAT(
+ snippet_context.snippet_spec,
+ EqualsProto(ResultSpecProto::SnippetSpecProto::default_instance()));
+ EXPECT_THAT(snippet_context.match_type, TermMatchType::UNKNOWN);
+}
+
+TEST(ResultStateTest, ShouldTruncateToNewSize) {
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/4)};
+
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ ResultState result_state(scored_document_hits, /*query_terms=*/{},
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true),
+ CreateResultSpec(/*num_per_page=*/5));
+
+ result_state.TruncateHitsTo(/*new_size=*/3);
+ // The best 3 are left.
+ EXPECT_THAT(
+ result_state.GetNextPage(),
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3))));
+}
+
+TEST(ResultStateTest, ShouldTruncateToZero) {
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/4)};
+
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ ResultState result_state(scored_document_hits, /*query_terms=*/{},
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true),
+ CreateResultSpec(/*num_per_page=*/5));
+
+ result_state.TruncateHitsTo(/*new_size=*/0);
+ EXPECT_THAT(result_state.GetNextPage(), IsEmpty());
+}
+
+TEST(ResultStateTest, ShouldNotTruncateToNegative) {
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ CreateScoredDocumentHit(/*document_id=*/2),
+ CreateScoredDocumentHit(/*document_id=*/1),
+ CreateScoredDocumentHit(/*document_id=*/3),
+ CreateScoredDocumentHit(/*document_id=*/5),
+ CreateScoredDocumentHit(/*document_id=*/4)};
+
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ ResultState result_state(scored_document_hits, /*query_terms=*/{},
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true),
+ CreateResultSpec(/*num_per_page=*/5));
+
+ result_state.TruncateHitsTo(/*new_size=*/-1);
+ // Results are not affected.
+ EXPECT_THAT(
+ result_state.GetNextPage(),
+ ElementsAre(
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)),
+ EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1))));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/snippet-context.h b/icing/result/snippet-context.h
new file mode 100644
index 0000000..34a0529
--- /dev/null
+++ b/icing/result/snippet-context.h
@@ -0,0 +1,48 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_SNIPPET_CONTEXT_H_
+#define ICING_RESULT_SNIPPET_CONTEXT_H_
+
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/query/query-terms.h"
+
+namespace icing {
+namespace lib {
+
+// Stores data needed for snippeting. With SnippetContext we can fetch snippets
+// for queries with multiple pages.
+struct SnippetContext {
+ explicit SnippetContext(SectionRestrictQueryTermsMap query_terms_in,
+ ResultSpecProto::SnippetSpecProto snippet_spec_in,
+ TermMatchType::Code match_type_in)
+ : query_terms(std::move(query_terms_in)),
+ snippet_spec(std::move(snippet_spec_in)),
+ match_type(match_type_in) {}
+
+ // Query terms that are used to find snippets
+ SectionRestrictQueryTermsMap query_terms;
+
+ // Spec that defines some quantities of snippeting
+ ResultSpecProto::SnippetSpecProto snippet_spec;
+
+ // Defines how we match each term
+ TermMatchType::Code match_type;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_SNIPPET_CONTEXT_H_
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index d429d41..09d0f7a 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -15,20 +15,28 @@
#include "icing/result/snippet-retriever.h"
#include <algorithm>
-#include <cctype>
#include <memory>
+#include <string>
#include <string_view>
+#include <unordered_map>
#include <unordered_set>
+#include <utility>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/proto/term.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
+#include "icing/transform/normalizer.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
-#include "unicode/utf8.h"
namespace icing {
namespace lib {
@@ -45,13 +53,14 @@
public:
explicit TokenMatcherExact(
const std::unordered_set<std::string>& unrestricted_query_terms,
- const std::unordered_set<std::string>& restricted_query_terms)
+ const std::unordered_set<std::string>& restricted_query_terms,
+ const Normalizer& normalizer)
: unrestricted_query_terms_(unrestricted_query_terms),
- restricted_query_terms_(restricted_query_terms) {}
+ restricted_query_terms_(restricted_query_terms),
+ normalizer_(normalizer) {}
bool Matches(Token token) const override {
- // TODO(tjbarron) : Add normalization of token.
- std::string s(token.text);
+ std::string s = normalizer_.NormalizeTerm(token.text);
return (unrestricted_query_terms_.count(s) > 0) ||
(restricted_query_terms_.count(s) > 0);
}
@@ -59,51 +68,57 @@
private:
const std::unordered_set<std::string>& unrestricted_query_terms_;
const std::unordered_set<std::string>& restricted_query_terms_;
+ const Normalizer& normalizer_;
};
class TokenMatcherPrefix : public TokenMatcher {
public:
explicit TokenMatcherPrefix(
const std::unordered_set<std::string>& unrestricted_query_terms,
- const std::unordered_set<std::string>& restricted_query_terms)
+ const std::unordered_set<std::string>& restricted_query_terms,
+ const Normalizer& normalizer)
: unrestricted_query_terms_(unrestricted_query_terms),
- restricted_query_terms_(restricted_query_terms) {}
+ restricted_query_terms_(restricted_query_terms),
+ normalizer_(normalizer) {}
bool Matches(Token token) const override {
+ std::string s = normalizer_.NormalizeTerm(token.text);
if (std::any_of(unrestricted_query_terms_.begin(),
unrestricted_query_terms_.end(),
- [&token](const std::string& term) {
- return term.length() <= token.text.length() &&
- token.text.compare(0, term.length(), term) == 0;
+ [&s](const std::string& term) {
+ return term.length() <= s.length() &&
+ s.compare(0, term.length(), term) == 0;
})) {
return true;
}
return std::any_of(restricted_query_terms_.begin(),
restricted_query_terms_.end(),
- [token](const std::string& term) {
- return term.length() <= token.text.length() &&
- token.text.compare(0, term.length(), term) == 0;
+ [&s](const std::string& term) {
+ return term.length() <= s.length() &&
+ s.compare(0, term.length(), term) == 0;
});
}
private:
const std::unordered_set<std::string>& unrestricted_query_terms_;
const std::unordered_set<std::string>& restricted_query_terms_;
+ const Normalizer& normalizer_;
};
libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
TermMatchType::Code match_type,
const std::unordered_set<std::string>& unrestricted_query_terms,
- const std::unordered_set<std::string>& restricted_query_terms) {
+ const std::unordered_set<std::string>& restricted_query_terms,
+ const Normalizer& normalizer) {
switch (match_type) {
case TermMatchType::EXACT_ONLY:
- return std::make_unique<TokenMatcherExact>(unrestricted_query_terms,
- restricted_query_terms);
+ return std::make_unique<TokenMatcherExact>(
+ unrestricted_query_terms, restricted_query_terms, normalizer);
case TermMatchType::PREFIX:
- return std::make_unique<TokenMatcherPrefix>(unrestricted_query_terms,
- restricted_query_terms);
+ return std::make_unique<TokenMatcherPrefix>(
+ unrestricted_query_terms, restricted_query_terms, normalizer);
case TermMatchType::UNKNOWN:
- U_FALLTHROUGH;
+ [[fallthrough]];
default:
return absl_ports::InvalidArgumentError("Invalid match type provided.");
}
@@ -111,19 +126,18 @@
// Returns true if token matches any of the terms in query terms according to
// the provided match type.
-
+//
// Returns:
// the position of the window start if successful
// INTERNAL_ERROR - if a tokenizer error is encountered
libtextclassifier3::StatusOr<int> DetermineWindowStart(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
- int window_start_min =
- std::max((match_mid - snippet_spec.max_window_bytes() / 2), 0);
- if (window_start_min == 0) {
+ int window_start_min = (match_mid - snippet_spec.max_window_bytes() / 2) - 1;
+ if (window_start_min < 0) {
return 0;
}
- if (!iterator->ResetToTokenAfter(window_start_min - 1)) {
+ if (!iterator->ResetToTokenAfter(window_start_min)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
@@ -158,10 +172,9 @@
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
int window_end_max_exclusive =
- std::min((match_mid + snippet_spec.max_window_bytes() / 2),
- static_cast<int>(value.length()));
- if (window_end_max_exclusive == value.length()) {
- return window_end_max_exclusive;
+ match_mid + snippet_spec.max_window_bytes() / 2;
+ if (window_end_max_exclusive >= value.length()) {
+ return value.length();
}
if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) {
return absl_ports::InternalError(
@@ -212,8 +225,11 @@
iterator));
snippet_match.set_window_bytes(window_end_exclusive - window_start);
- // Reset the iterator back to the original position.
- if (!iterator->ResetToTokenAfter(match_pos - 1)) {
+ // DetermineWindowStart/End may change the position of the iterator. So,
+ // reset the iterator back to the original position.
+ bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1)
+ : iterator->ResetToStart();
+ if (!success) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
@@ -260,12 +276,14 @@
libtextclassifier3::StatusOr<std::unique_ptr<SnippetRetriever>>
SnippetRetriever::Create(const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter) {
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer) {
ICING_RETURN_ERROR_IF_NULL(schema_store);
ICING_RETURN_ERROR_IF_NULL(language_segmenter);
+ ICING_RETURN_ERROR_IF_NULL(normalizer);
return std::unique_ptr<SnippetRetriever>(
- new SnippetRetriever(schema_store, language_segmenter));
+ new SnippetRetriever(schema_store, language_segmenter, normalizer));
}
SnippetProto SnippetRetriever::RetrieveSnippet(
@@ -275,8 +293,8 @@
const DocumentProto& document, SectionIdMask section_id_mask) const {
SnippetProto snippet_proto;
ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
- schema_store_.GetSchemaTypeId(document.schema()),
- snippet_proto);
+ schema_store_.GetSchemaTypeId(document.schema()),
+ snippet_proto);
const std::unordered_set<std::string> empty_set;
auto itr = query_terms.find("");
const std::unordered_set<std::string>& unrestricted_set =
@@ -307,8 +325,8 @@
const std::unordered_set<std::string>& restricted_set =
(itr != query_terms.end()) ? itr->second : empty_set;
libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> matcher_or =
- CreateTokenMatcher(section_match_type, unrestricted_set,
- restricted_set);
+ CreateTokenMatcher(section_match_type, unrestricted_set, restricted_set,
+ normalizer_);
if (!matcher_or.ok()) {
continue;
}
diff --git a/icing/result/snippet-retriever.h b/icing/result/snippet-retriever.h
index 2b5cc78..f68e18d 100644
--- a/icing/result/snippet-retriever.h
+++ b/icing/result/snippet-retriever.h
@@ -15,6 +15,9 @@
#ifndef ICING_SNIPPET_RETRIEVER_H_
#define ICING_SNIPPET_RETRIEVER_H_
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
@@ -22,6 +25,7 @@
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
namespace icing {
namespace lib {
@@ -45,7 +49,8 @@
// FAILED_PRECONDITION on any null pointer input
static libtextclassifier3::StatusOr<std::unique_ptr<SnippetRetriever>> Create(
const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter);
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer);
// Retrieve the snippet information for content in document. terms in
// query_terms are matched to content in document according to match_type.
@@ -60,12 +65,15 @@
private:
explicit SnippetRetriever(const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter)
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer)
: schema_store_(*schema_store),
- language_segmenter_(*language_segmenter) {}
+ language_segmenter_(*language_segmenter),
+ normalizer_(*normalizer) {}
const SchemaStore& schema_store_;
const LanguageSegmenter& language_segmenter_;
+ const Normalizer& normalizer_;
};
} // namespace lib
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 8c60b6f..3b3bf61 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -22,6 +22,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
@@ -35,8 +36,10 @@
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/util/i18n-utils.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
namespace icing {
namespace lib {
@@ -55,9 +58,10 @@
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
// Setup the schema
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
@@ -83,10 +87,12 @@
IndexingConfig::TokenizerType::PLAIN);
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
ICING_ASSERT_OK_AND_ASSIGN(
snippet_retriever_,
- SnippetRetriever::Create(schema_store_.get(),
- language_segmenter_.get()));
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
// Set limits to max - effectively no limit. Enable matching and request a
// window of 64 bytes.
@@ -104,17 +110,24 @@
std::unique_ptr<SchemaStore> schema_store_;
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<SnippetRetriever> snippet_retriever_;
+ std::unique_ptr<Normalizer> normalizer_;
ResultSpecProto::SnippetSpecProto snippet_spec_;
std::string test_dir_;
};
TEST_F(SnippetRetrieverTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(SnippetRetriever::Create(/*schema_store=*/nullptr,
- language_segmenter_.get()),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ SnippetRetriever::Create(/*schema_store=*/nullptr,
+ language_segmenter_.get(), normalizer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
EXPECT_THAT(SnippetRetriever::Create(schema_store_.get(),
- /*language_segmenter=*/nullptr),
+ /*language_segmenter=*/nullptr,
+ normalizer_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ /*normalizer=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
@@ -565,6 +578,46 @@
EXPECT_THAT(GetMatch(document, snippet, "body", 1), IsEmpty());
}
+TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "MDI team")
+ .AddStringProperty("body", "Some members are in Zürich.")
+ .Build();
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"md"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::PREFIX, snippet_spec_, document,
+ section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("MDI team"));
+ EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("MDI"));
+}
+
+TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "MDI team")
+ .AddStringProperty("body", "Some members are in Zürich.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
+ section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(GetWindow(document, snippet, "body", 0),
+ Eq("Some members are in Zürich."));
+ EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("Zürich"));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index b9da87c..34ccf22 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -382,6 +382,14 @@
libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
+ auto schema_proto_or = GetSchema();
+ if (absl_ports::IsNotFound(schema_proto_or.status())) {
+ return absl_ports::FailedPreconditionError("Schema not set yet.");
+ } else if (!schema_proto_or.ok()) {
+ // Some other real error, pass it up
+ return schema_proto_or.status();
+ }
+
const auto& type_config_iter =
type_config_map_.find(std::string(schema_type));
if (type_config_iter == type_config_map_.end()) {
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
index a7217de..f5c6588 100644
--- a/icing/schema/schema-store.h
+++ b/icing/schema/schema-store.h
@@ -151,7 +151,9 @@
//
// Returns:
// SchemaTypeConfigProto on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
// NOT_FOUND if schema type name doesn't exist
+ // INTERNAL on any I/O errors
libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const;
diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc
index 1a4188b..7413d73 100644
--- a/icing/schema/schema-util.cc
+++ b/icing/schema/schema-util.cc
@@ -22,6 +22,7 @@
#include <utility>
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/annotate.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/absl_ports/str_join.h"
@@ -36,16 +37,6 @@
namespace {
-bool isAlphaNumeric(std::string_view str) {
- for (char c : str) {
- if (!std::isalnum(c)) {
- return false;
- }
- }
-
- return true;
-}
-
bool IsCardinalityCompatible(const PropertyConfigProto& old_property,
const PropertyConfigProto& new_property) {
if (old_property.cardinality() < new_property.cardinality()) {
@@ -148,8 +139,15 @@
// Need to know what schema_type these Document properties should be
// validated against
std::string_view property_schema_type(property_config.schema_type());
- ICING_RETURN_IF_ERROR(ValidatePropertySchemaType(
- property_schema_type, schema_type, property_name));
+ libtextclassifier3::Status validated_status =
+ ValidateSchemaType(property_schema_type);
+ if (!validated_status.ok()) {
+ return absl_ports::Annotate(
+ validated_status,
+ absl_ports::StrCat("Field 'schema_type' is required for DOCUMENT "
+ "data_types in schema property '",
+ schema_type, " ", property_name, "'"));
+ }
// Need to make sure we eventually see/validate this schema_type
if (known_schema_types.count(property_schema_type) == 0) {
@@ -184,14 +182,6 @@
"Field 'schema_type' cannot be empty.");
}
- // Only support alphanumeric values.
- if (!isAlphaNumeric(schema_type)) {
- return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("Field 'schema_type' '", schema_type,
- "' can only contain "
- "alphanumeric characters."));
- }
-
return libtextclassifier3::Status::OK;
}
@@ -208,7 +198,7 @@
for (char c : property_name) {
if (!std::isalnum(c)) {
return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("Field 'property_name' '", schema_type,
+ absl_ports::StrCat("Field 'property_name' '", property_name,
"' can only contain alphanumeric characters."));
}
}
@@ -230,18 +220,6 @@
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status SchemaUtil::ValidatePropertySchemaType(
- std::string_view property_schema_type, std::string_view schema_type,
- std::string_view property_name) {
- if (property_schema_type.empty()) {
- return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("Field 'schema_type' is required for DOCUMENT "
- "data_types in schema property '",
- schema_type, " ", property_name, "'"));
- }
- return libtextclassifier3::Status::OK;
-}
-
libtextclassifier3::Status SchemaUtil::ValidateCardinality(
PropertyConfigProto::Cardinality::Code cardinality,
std::string_view schema_type, std::string_view property_name) {
@@ -278,23 +256,29 @@
}
}
-void SchemaUtil::BuildPropertyConfigMap(
- const SchemaTypeConfigProto& type_config,
- std::unordered_map<std::string_view, const PropertyConfigProto*>*
- property_config_map,
- int32_t* num_required_properties) {
+SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs(
+ const SchemaTypeConfigProto& type_config) {
+ ParsedPropertyConfigs parsed_property_configs;
+
// TODO(samzheng): consider caching property_config_map for some properties,
// e.g. using LRU cache. Or changing schema.proto to use go/protomap.
- *num_required_properties = 0;
- property_config_map->clear();
for (const PropertyConfigProto& property_config : type_config.properties()) {
- property_config_map->emplace(property_config.property_name(),
- &property_config);
+ parsed_property_configs.property_config_map.emplace(
+ property_config.property_name(), &property_config);
if (property_config.cardinality() ==
PropertyConfigProto::Cardinality::REQUIRED) {
- (*num_required_properties)++;
+ parsed_property_configs.num_required_properties++;
+ }
+
+ // A non-default term_match_type indicates that this property is meant to be
+ // indexed.
+ if (property_config.indexing_config().term_match_type() !=
+ TermMatchType::UNKNOWN) {
+ parsed_property_configs.num_indexed_properties++;
}
}
+
+ return parsed_property_configs;
}
const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
@@ -320,22 +304,21 @@
continue;
}
- std::unordered_map<std::string_view, const PropertyConfigProto*>
- new_property_map;
- int32_t new_required_properties = 0;
- BuildPropertyConfigMap(new_schema_type_and_config->second,
- &new_property_map, &new_required_properties);
+ ParsedPropertyConfigs new_parsed_property_configs =
+ ParsePropertyConfigs(new_schema_type_and_config->second);
// We only need to check the old, existing properties to see if they're
// compatible since we'll have old data that may be invalidated or need to
- // be reindexed. New properties don't have any data that would be
- // invalidated or incompatible, so we blanket accept all new properties.
+ // be reindexed.
int32_t old_required_properties = 0;
+ int32_t old_indexed_properties = 0;
for (const auto& old_property_config : old_type_config.properties()) {
auto new_property_name_and_config =
- new_property_map.find(old_property_config.property_name());
+ new_parsed_property_configs.property_config_map.find(
+ old_property_config.property_name());
- if (new_property_name_and_config == new_property_map.end()) {
+ if (new_property_name_and_config ==
+ new_parsed_property_configs.property_config_map.end()) {
// Didn't find the old property
ICING_VLOG(1) << absl_ports::StrCat("Previously defined property type ",
old_type_config.schema_type(), ".",
@@ -362,6 +345,13 @@
++old_required_properties;
}
+ // A non-default term_match_type indicates that this property is meant to
+ // be indexed.
+ if (old_property_config.indexing_config().term_match_type() !=
+ TermMatchType::UNKNOWN) {
+ ++old_indexed_properties;
+ }
+
// Any change in the indexed property requires a reindexing
if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(),
new_property_config->indexing_config())) {
@@ -374,7 +364,8 @@
// guaranteed from our previous checks that all the old properties are also
// present in the new property config, so we can do a simple int comparison
// here to detect new required properties.
- if (new_required_properties > old_required_properties) {
+ if (new_parsed_property_configs.num_required_properties >
+ old_required_properties) {
ICING_VLOG(1) << absl_ports::StrCat(
"New schema ", old_type_config.schema_type(),
" has REQUIRED properties that are not "
@@ -382,6 +373,18 @@
schema_delta.schema_types_incompatible.insert(
old_type_config.schema_type());
}
+
+ // If we've gained any new indexed properties, then the section ids may
+ // change. Since the section ids are stored in the index, we'll need to
+ // reindex everything.
+ if (new_parsed_property_configs.num_indexed_properties >
+ old_indexed_properties) {
+ ICING_VLOG(1) << absl_ports::StrCat(
+ "Set of indexed properties in schema type '",
+ old_type_config.schema_type(),
+ "' has changed, required reindexing.");
+ schema_delta.index_incompatible = true;
+ }
}
return schema_delta;
diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h
index 87fa1a8..d65dd10 100644
--- a/icing/schema/schema-util.h
+++ b/icing/schema/schema-util.h
@@ -54,6 +54,18 @@
}
};
+ struct ParsedPropertyConfigs {
+ // Mapping of property name to PropertyConfigProto
+ std::unordered_map<std::string_view, const PropertyConfigProto*>
+ property_config_map;
+
+ // Total number of properties that have an indexing config
+ int32_t num_indexed_properties = 0;
+
+ // Total number of properties that were REQUIRED
+ int32_t num_required_properties = 0;
+ };
+
// This function validates:
// 1. SchemaTypeConfigProto.schema_type's must be unique
// 2. Properties within one SchemaTypeConfigProto must be unique
@@ -67,7 +79,7 @@
// 8. PropertyConfigProtos.cardinality cannot be UNKNOWN
// 9. PropertyConfigProtos.schema_type's must correspond to a
// SchemaTypeConfigProto.schema_type
- // 10. All string fields must be alphanumeric.
+ // 10. Property names can only be alphanumeric.
//
// Returns:
// ALREADY_EXISTS for case 1 and 2
@@ -81,14 +93,10 @@
static void BuildTypeConfigMap(const SchemaProto& schema,
TypeConfigMap* type_config_map);
- // Calculate and return a hash map of (property name -> property config)
- // from the given type config. The number of required properties will be
- // assigned to output param num_required_properties.
- static void BuildPropertyConfigMap(
- const SchemaTypeConfigProto& type_config,
- std::unordered_map<std::string_view, const PropertyConfigProto*>*
- property_config_map,
- int32_t* num_required_properties);
+ // Parses the given type_config and returns a struct of easily-parseable
+ // information about the properties.
+ static ParsedPropertyConfigs ParsePropertyConfigs(
+ const SchemaTypeConfigProto& type_config);
// Computes the delta between the old and new schema. There are a few
// differences that'll be reported:
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
index 67cfb50..a3ab96f 100644
--- a/icing/schema/schema-util_test.cc
+++ b/icing/schema/schema-util_test.cc
@@ -125,13 +125,12 @@
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_NotAlphanumericSchemaType) {
+TEST_F(SchemaUtilTest, AnySchemaTypeOk) {
auto type = schema_proto_.add_types();
*type = CreateSchemaTypeConfig(kEmailType);
- type->set_schema_type("_");
+ type->set_schema_type("abc123!@#$%^&*()_-+=[{]}|\\;:'\",<.>?你好");
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
}
TEST_F(SchemaUtilTest, Invalid_ClearedPropertyName) {
@@ -160,7 +159,7 @@
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_NotAlphanumericPropertyName) {
+TEST_F(SchemaUtilTest, NonAlphanumericPropertyNameIsInvalid) {
auto type = schema_proto_.add_types();
*type = CreateSchemaTypeConfig(kEmailType);
@@ -173,6 +172,18 @@
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
+TEST_F(SchemaUtilTest, AlphanumericPropertyNameOk) {
+ auto type = schema_proto_.add_types();
+ *type = CreateSchemaTypeConfig(kEmailType);
+
+ auto property = type->add_properties();
+ property->set_property_name("abc123");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+ ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+}
+
TEST_F(SchemaUtilTest, Invalid_DuplicatePropertyName) {
auto type = schema_proto_.add_types();
*type = CreateSchemaTypeConfig(kEmailType);
@@ -343,8 +354,8 @@
property->set_data_type(PropertyConfigProto::DataType::INT64);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- // Configure new schema, new schema needs to at least have all the previously
- // defined properties
+ // Configure new schema, new schema needs to at least have all the
+ // previously defined properties
SchemaProto new_schema;
type = new_schema.add_types();
*type = CreateSchemaTypeConfig(kEmailType);
@@ -491,6 +502,40 @@
Eq(schema_delta));
}
+TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) {
+ // Configure old schema
+ SchemaProto old_schema;
+ auto old_type = old_schema.add_types();
+ *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+
+ auto old_property = old_type->add_properties();
+ old_property->set_property_name("Property");
+ old_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ // Configure new schema
+ SchemaProto new_schema;
+ auto new_type = new_schema.add_types();
+ *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+
+ auto new_property = new_type->add_properties();
+ new_property->set_property_name("Property");
+ new_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ new_property = new_type->add_properties();
+ new_property->set_property_name("NewIndexedProperty");
+ new_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ new_property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.index_incompatible = true;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ Eq(schema_delta));
+}
+
TEST_F(SchemaUtilTest, AddingTypeIsCompatible) {
// Can add a new type, existing data isn't incompatible, since none of them
// are of this new schema type
diff --git a/icing/scoring/ranker.cc b/icing/scoring/ranker.cc
index 71f0cd1..fecee82 100644
--- a/icing/scoring/ranker.cc
+++ b/icing/scoring/ranker.cc
@@ -20,7 +20,6 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/scoring/scored-document-hit.h"
-#include "icing/store/document-id.h"
#include "icing/util/logging.h"
namespace icing {
@@ -33,9 +32,10 @@
// Helper function to wrap the heapify algorithm, it heapifies the target
// subtree node in place.
-void Heapify(std::vector<ScoredDocumentHit>* scored_document_hits,
- int target_subtree_root_index,
- const ScoredDocumentHitComparator scored_document_hit_comparator) {
+void Heapify(
+ std::vector<ScoredDocumentHit>* scored_document_hits,
+ int target_subtree_root_index,
+ const ScoredDocumentHitComparator& scored_document_hit_comparator) {
const int heap_size = scored_document_hits->size();
if (target_subtree_root_index >= heap_size) {
return;
@@ -71,11 +71,40 @@
}
}
-// Helper function to build a heap in place whose root is the best node defined
-// by scored_document_hit_comparator. Time complexity is O(n).
-void BuildHeap(
+// Helper function to extract the root from the heap. The heap structure will be
+// maintained.
+//
+// Returns:
+// The current root element on success
+// RESOURCE_EXHAUSTED_ERROR if heap is empty
+libtextclassifier3::StatusOr<ScoredDocumentHit> PopRoot(
+ std::vector<ScoredDocumentHit>* scored_document_hits_heap,
+ const ScoredDocumentHitComparator& scored_document_hit_comparator) {
+ if (scored_document_hits_heap->empty()) {
+ // An invalid ScoredDocumentHit
+ return absl_ports::ResourceExhaustedError("Heap is empty");
+ }
+
+ // Steps to extract root from heap:
+ // 1. copy out root
+ ScoredDocumentHit root = scored_document_hits_heap->at(0);
+ const size_t last_node_index = scored_document_hits_heap->size() - 1;
+ // 2. swap root and the last node
+ std::swap(scored_document_hits_heap->at(0),
+ scored_document_hits_heap->at(last_node_index));
+ // 3. remove last node
+ scored_document_hits_heap->pop_back();
+ // 4. heapify root
+ Heapify(scored_document_hits_heap, /*target_subtree_root_index=*/0,
+ scored_document_hit_comparator);
+ return root;
+}
+
+} // namespace
+
+void BuildHeapInPlace(
std::vector<ScoredDocumentHit>* scored_document_hits,
- const ScoredDocumentHitComparator scored_document_hit_comparator) {
+ const ScoredDocumentHitComparator& scored_document_hit_comparator) {
const int heap_size = scored_document_hits->size();
// Since we use a vector to represent the heap, [size / 2 - 1] is the index
// of the parent node of the last node.
@@ -86,50 +115,15 @@
}
}
-// Helper function to extract the root from the heap. The heap structure will be
-// maintained.
-//
-// Returns:
-// The current root element on success
-// RESOURCE_EXHAUSTED_ERROR if heap is empty
-libtextclassifier3::StatusOr<ScoredDocumentHit> ExtractRoot(
- std::vector<ScoredDocumentHit>* scored_document_hits,
- ScoredDocumentHitComparator scored_document_hit_comparator) {
- if (scored_document_hits->empty()) {
- // An invalid ScoredDocumentHit
- return absl_ports::ResourceExhaustedError("Heap is empty");
- }
-
- // Steps to extract root from heap:
- // 1. copy out root
- ScoredDocumentHit root = scored_document_hits->at(0);
- const size_t last_node_index = scored_document_hits->size() - 1;
- // 2. swap root and the last node
- std::swap(scored_document_hits->at(0),
- scored_document_hits->at(last_node_index));
- // 3. remove last node
- scored_document_hits->pop_back();
- // 4. heapify root
- Heapify(scored_document_hits, /*target_subtree_root_index=*/0,
- scored_document_hit_comparator);
- return root;
-}
-
-std::vector<ScoredDocumentHit> HeapifyAndProduceTopN(
- std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
- bool is_descending) {
- // Build a heap in place
- const ScoredDocumentHitComparator scored_document_hit_comparator(
- is_descending);
- BuildHeap(&scored_document_hits, scored_document_hit_comparator);
-
- // Get best nodes from heap one by one
+std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
+ std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
+ const ScoredDocumentHitComparator& scored_document_hit_comparator) {
std::vector<ScoredDocumentHit> scored_document_hit_result;
- int result_size =
- std::min(num_result, static_cast<int>(scored_document_hits.size()));
+ int result_size = std::min(
+ num_results, static_cast<int>(scored_document_hits_heap->size()));
while (result_size-- > 0) {
libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
- ExtractRoot(&scored_document_hits, scored_document_hit_comparator);
+ PopRoot(scored_document_hits_heap, scored_document_hit_comparator);
if (next_best_document_hit_or.ok()) {
scored_document_hit_result.push_back(
std::move(next_best_document_hit_or).ValueOrDie());
@@ -140,14 +134,5 @@
return scored_document_hit_result;
}
-} // namespace
-
-std::vector<ScoredDocumentHit> GetTopNFromScoredDocumentHits(
- std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
- bool is_descending) {
- return HeapifyAndProduceTopN(std::move(scored_document_hits), num_result,
- is_descending);
-}
-
} // namespace lib
} // namespace icing
diff --git a/icing/scoring/ranker.h b/icing/scoring/ranker.h
index 1acd06c..785c133 100644
--- a/icing/scoring/ranker.h
+++ b/icing/scoring/ranker.h
@@ -23,13 +23,21 @@
namespace icing {
namespace lib {
-// Returns the top num_result results from scored_document_hits. The returned
-// vector will be sorted and contain no more than num_result elements.
-// is_descending indicates whether the result is in a descending score order
-// or an ascending score order.
-std::vector<ScoredDocumentHit> GetTopNFromScoredDocumentHits(
- std::vector<ScoredDocumentHit> scored_document_hits, int num_result,
- bool is_descending);
+// Builds a heap of scored document hits. The same vector is used to store the
+// heap structure.
+//
+// REQUIRED: scored_document_hits is not null.
+void BuildHeapInPlace(
+ std::vector<ScoredDocumentHit>* scored_document_hits,
+ const ScoredDocumentHitComparator& scored_document_hit_comparator);
+
+// Returns the top num_results results from the given heap and removes those
+// results from the heap. An empty vector will be returned if the heap is empty.
+//
+// REQUIRED: scored_document_hits_heap is not null.
+std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
+ std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
+ const ScoredDocumentHitComparator& scored_document_hit_comparator);
} // namespace lib
} // namespace icing
diff --git a/icing/scoring/ranker_benchmark.cc b/icing/scoring/ranker_benchmark.cc
index a6621b2..8983dd9 100644
--- a/icing/scoring/ranker_benchmark.cc
+++ b/icing/scoring/ranker_benchmark.cc
@@ -13,11 +13,11 @@
// limitations under the License.
#include <cstdlib>
+#include <random>
#include "testing/base/public/benchmark.h"
#include "icing/scoring/ranker.h"
#include "icing/scoring/scored-document-hit.h"
-#include "icing/util/clock.h"
namespace icing {
namespace lib {
@@ -44,18 +44,33 @@
int num_to_score = state.range(0);
int num_to_return = state.range(1);
+ std::mt19937_64 random_generator;
+ std::uniform_real_distribution<double> distribution(
+ 1, std::numeric_limits<double>::max());
+
std::vector<ScoredDocumentHit> scored_document_hits;
- uint seed = Clock().GetSystemTimeMilliseconds();
+ scored_document_hits.reserve(num_to_score);
for (int i = 0; i < num_to_score; i++) {
- int score = rand_r(&seed);
scored_document_hits.emplace_back(/*document_id=*/0,
- /*hit_section_id_mask=*/0, score);
+ /*hit_section_id_mask=*/0,
+ /*score=*/distribution(random_generator));
}
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+
for (auto _ : state) {
+ // Pauses timer so that the cost of copying data is not included.
+ state.PauseTiming();
+ std::vector<ScoredDocumentHit> scored_document_hits_copy =
+ scored_document_hits;
+ state.ResumeTiming();
+
+ BuildHeapInPlace(&scored_document_hits_copy,
+ scored_document_hit_comparator);
auto result =
- GetTopNFromScoredDocumentHits(scored_document_hits, num_to_return,
- /*is_descending=*/true);
+ PopTopResultsFromHeap(&scored_document_hits_copy, num_to_return,
+ scored_document_hit_comparator);
}
}
BENCHMARK(BM_GetTopN)
@@ -89,6 +104,52 @@
->ArgPair(15000, 30)
->ArgPair(17000, 30)
->ArgPair(19000, 30);
+
+void BM_PopTopResultsFromHeap(benchmark::State& state) {
+ int num_to_score = state.range(0);
+ int num_to_return = state.range(1);
+
+ std::mt19937_64 random_generator;
+ std::uniform_real_distribution<double> distribution(
+ 1, std::numeric_limits<double>::max());
+
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ scored_document_hits.reserve(num_to_score);
+ for (int i = 0; i < num_to_score; i++) {
+ scored_document_hits.emplace_back(/*document_id=*/0,
+ /*hit_section_id_mask=*/0,
+ /*score=*/distribution(random_generator));
+ }
+
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+
+ for (auto _ : state) {
+    // Pauses timer so that the costs of copying data and building a heap are
+    // not included.
+ state.PauseTiming();
+ std::vector<ScoredDocumentHit> scored_document_hits_copy =
+ scored_document_hits;
+ BuildHeapInPlace(&scored_document_hits_copy,
+ scored_document_hit_comparator);
+ state.ResumeTiming();
+
+ auto result =
+ PopTopResultsFromHeap(&scored_document_hits_copy, num_to_return,
+ scored_document_hit_comparator);
+ }
+}
+BENCHMARK(BM_PopTopResultsFromHeap)
+ ->ArgPair(20000, 100) // (num_to_score, num_to_return)
+ ->ArgPair(20000, 300)
+ ->ArgPair(20000, 500)
+ ->ArgPair(20000, 700)
+ ->ArgPair(20000, 900)
+ ->ArgPair(20000, 1100)
+ ->ArgPair(20000, 1300)
+ ->ArgPair(20000, 1500)
+ ->ArgPair(20000, 1700)
+ ->ArgPair(20000, 1900);
} // namespace
} // namespace lib
diff --git a/icing/scoring/ranker_test.cc b/icing/scoring/ranker_test.cc
index 3aa94e6..f0c135a 100644
--- a/icing/scoring/ranker_test.cc
+++ b/icing/scoring/ranker_test.cc
@@ -14,140 +14,158 @@
#include "icing/scoring/ranker.h"
-#include "icing/scoring/scored-document-hit.h"
-#include "icing/testing/common-matchers.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/testing/common-matchers.h"
namespace icing {
namespace lib {
namespace {
using ::testing::ElementsAre;
+using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::Test;
-class RankerTest : public Test {
- protected:
- RankerTest()
- : test_scored_document_hit1_(/*document_id=*/3, /*hit_section_id_mask=*/3,
- /*score=*/1),
- test_scored_document_hit2_(/*document_id=*/1, /*hit_section_id_mask=*/1,
- /*score=*/2),
- test_scored_document_hit3_(/*document_id=*/2, /*hit_section_id_mask=*/2,
- /*score=*/3),
- test_scored_document_hit4_(/*document_id=*/5, /*hit_section_id_mask=*/5,
- /*score=*/4),
- test_scored_document_hit5_(/*document_id=*/4, /*hit_section_id_mask=*/4,
- /*score=*/5) {}
+ScoredDocumentHit CreateScoredDocumentHit(DocumentId document_id,
+ double score) {
+ return ScoredDocumentHit(document_id, kSectionIdMaskAll, score);
+}
- const ScoredDocumentHit& test_scored_document_hit1() {
- return test_scored_document_hit1_;
- }
-
- const ScoredDocumentHit& test_scored_document_hit2() {
- return test_scored_document_hit2_;
- }
-
- const ScoredDocumentHit& test_scored_document_hit3() {
- return test_scored_document_hit3_;
- }
-
- const ScoredDocumentHit& test_scored_document_hit4() {
- return test_scored_document_hit4_;
- }
-
- const ScoredDocumentHit& test_scored_document_hit5() {
- return test_scored_document_hit5_;
- }
-
- private:
- ScoredDocumentHit test_scored_document_hit1_;
- ScoredDocumentHit test_scored_document_hit2_;
- ScoredDocumentHit test_scored_document_hit3_;
- ScoredDocumentHit test_scored_document_hit4_;
- ScoredDocumentHit test_scored_document_hit5_;
-};
-
-TEST_F(RankerTest, ShouldHandleEmpty) {
+TEST(RankerTest, ShouldHandleEmpty) {
std::vector<ScoredDocumentHit> scored_document_hits = {};
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/0,
- /*is_descending=*/true),
- IsEmpty());
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/3,
- /*is_descending=*/true),
- IsEmpty());
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 0, scored_document_hit_comparator),
+ IsEmpty());
+
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 3, scored_document_hit_comparator),
+ IsEmpty());
}
-TEST_F(RankerTest, ShouldCorrectlySortResults) {
+TEST(RankerTest, ShouldCorrectlySortResultsDesc) {
+ ScoredDocumentHit scored_document_hit1 =
+ CreateScoredDocumentHit(/*document_id=*/1, /*score=*/1);
+ ScoredDocumentHit scored_document_hit2 =
+ CreateScoredDocumentHit(/*document_id=*/2, /*score=*/2);
+ ScoredDocumentHit scored_document_hit3 =
+ CreateScoredDocumentHit(/*document_id=*/3, /*score=*/3);
+ ScoredDocumentHit scored_document_hit4 =
+ CreateScoredDocumentHit(/*document_id=*/4, /*score=*/4);
+ ScoredDocumentHit scored_document_hit5 =
+ CreateScoredDocumentHit(/*document_id=*/5, /*score=*/5);
+
std::vector<ScoredDocumentHit> scored_document_hits = {
- test_scored_document_hit2(), test_scored_document_hit1(),
- test_scored_document_hit5(), test_scored_document_hit4(),
- test_scored_document_hit3()};
+ scored_document_hit2, scored_document_hit1, scored_document_hit5,
+ scored_document_hit4, scored_document_hit3};
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/5,
- /*is_descending=*/true),
- ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit5()),
- EqualsScoredDocumentHit(test_scored_document_hit4()),
- EqualsScoredDocumentHit(test_scored_document_hit3()),
- EqualsScoredDocumentHit(test_scored_document_hit2()),
- EqualsScoredDocumentHit(test_scored_document_hit1())));
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 5, scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit5),
+ EqualsScoredDocumentHit(scored_document_hit4),
+ EqualsScoredDocumentHit(scored_document_hit3),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit1)));
}
-TEST_F(RankerTest, ShouldHandleSmallerNumResult) {
+TEST(RankerTest, ShouldCorrectlySortResultsAsc) {
+ ScoredDocumentHit scored_document_hit1 =
+ CreateScoredDocumentHit(/*document_id=*/1, /*score=*/1);
+ ScoredDocumentHit scored_document_hit2 =
+ CreateScoredDocumentHit(/*document_id=*/2, /*score=*/2);
+ ScoredDocumentHit scored_document_hit3 =
+ CreateScoredDocumentHit(/*document_id=*/3, /*score=*/3);
+ ScoredDocumentHit scored_document_hit4 =
+ CreateScoredDocumentHit(/*document_id=*/4, /*score=*/4);
+ ScoredDocumentHit scored_document_hit5 =
+ CreateScoredDocumentHit(/*document_id=*/5, /*score=*/5);
+
std::vector<ScoredDocumentHit> scored_document_hits = {
- test_scored_document_hit2(), test_scored_document_hit1(),
- test_scored_document_hit5(), test_scored_document_hit4(),
- test_scored_document_hit3()};
+ scored_document_hit2, scored_document_hit1, scored_document_hit5,
+ scored_document_hit4, scored_document_hit3};
- // num_result = 3, smaller than the size 5
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/3,
- /*is_descending=*/true),
- ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit5()),
- EqualsScoredDocumentHit(test_scored_document_hit4()),
- EqualsScoredDocumentHit(test_scored_document_hit3())));
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/false);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 5, scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit3),
+ EqualsScoredDocumentHit(scored_document_hit4),
+ EqualsScoredDocumentHit(scored_document_hit5)));
}
-TEST_F(RankerTest, ShouldHandleGreaterNumResult) {
+TEST(RankerTest, ShouldHandleSmallerNumResult) {
+ ScoredDocumentHit scored_document_hit1 =
+ CreateScoredDocumentHit(/*document_id=*/1, /*score=*/1);
+ ScoredDocumentHit scored_document_hit2 =
+ CreateScoredDocumentHit(/*document_id=*/2, /*score=*/2);
+ ScoredDocumentHit scored_document_hit3 =
+ CreateScoredDocumentHit(/*document_id=*/3, /*score=*/3);
+ ScoredDocumentHit scored_document_hit4 =
+ CreateScoredDocumentHit(/*document_id=*/4, /*score=*/4);
+ ScoredDocumentHit scored_document_hit5 =
+ CreateScoredDocumentHit(/*document_id=*/5, /*score=*/5);
+
std::vector<ScoredDocumentHit> scored_document_hits = {
- test_scored_document_hit2(), test_scored_document_hit1(),
- test_scored_document_hit5(), test_scored_document_hit4(),
- test_scored_document_hit3()};
+ scored_document_hit2, scored_document_hit1, scored_document_hit5,
+ scored_document_hit4, scored_document_hit3};
- // num_result = 10, greater than the size 5
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/10,
- /*is_descending=*/true),
- ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit5()),
- EqualsScoredDocumentHit(test_scored_document_hit4()),
- EqualsScoredDocumentHit(test_scored_document_hit3()),
- EqualsScoredDocumentHit(test_scored_document_hit2()),
- EqualsScoredDocumentHit(test_scored_document_hit1())));
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+
+ // num_results = 3, smaller than the size 5
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 3, scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit5),
+ EqualsScoredDocumentHit(scored_document_hit4),
+ EqualsScoredDocumentHit(scored_document_hit3)));
}
-TEST_F(RankerTest, ShouldHandleAcsendingOrder) {
+TEST(RankerTest, ShouldHandleGreaterNumResult) {
+ ScoredDocumentHit scored_document_hit1 =
+ CreateScoredDocumentHit(/*document_id=*/1, /*score=*/1);
+ ScoredDocumentHit scored_document_hit2 =
+ CreateScoredDocumentHit(/*document_id=*/2, /*score=*/2);
+ ScoredDocumentHit scored_document_hit3 =
+ CreateScoredDocumentHit(/*document_id=*/3, /*score=*/3);
+ ScoredDocumentHit scored_document_hit4 =
+ CreateScoredDocumentHit(/*document_id=*/4, /*score=*/4);
+ ScoredDocumentHit scored_document_hit5 =
+ CreateScoredDocumentHit(/*document_id=*/5, /*score=*/5);
+
std::vector<ScoredDocumentHit> scored_document_hits = {
- test_scored_document_hit2(), test_scored_document_hit1(),
- test_scored_document_hit5(), test_scored_document_hit4(),
- test_scored_document_hit3()};
+ scored_document_hit2, scored_document_hit1, scored_document_hit5,
+ scored_document_hit4, scored_document_hit3};
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/5,
- /*is_descending=*/false),
- ElementsAre(EqualsScoredDocumentHit(test_scored_document_hit1()),
- EqualsScoredDocumentHit(test_scored_document_hit2()),
- EqualsScoredDocumentHit(test_scored_document_hit3()),
- EqualsScoredDocumentHit(test_scored_document_hit4()),
- EqualsScoredDocumentHit(test_scored_document_hit5())));
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+
+ // num_results = 10, greater than the size 5
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 10, scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit5),
+ EqualsScoredDocumentHit(scored_document_hit4),
+ EqualsScoredDocumentHit(scored_document_hit3),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit1)));
}
-TEST_F(RankerTest, ShouldRespectDocumentIdWhenScoresAreEqual) {
+TEST(RankerTest, ShouldRespectDocumentIdDescWhenScoresAreEqual) {
ScoredDocumentHit scored_document_hit1(
/*document_id=*/1, /*hit_section_id_mask=*/0, /*score=*/100);
ScoredDocumentHit scored_document_hit2(
@@ -158,19 +176,80 @@
std::vector<ScoredDocumentHit> scored_document_hits = {
scored_document_hit3, scored_document_hit1, scored_document_hit2};
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/3,
- /*is_descending=*/true),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hit3),
- EqualsScoredDocumentHit(scored_document_hit2),
- EqualsScoredDocumentHit(scored_document_hit1)));
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
- EXPECT_THAT(
- GetTopNFromScoredDocumentHits(scored_document_hits, /*num_result=*/3,
- /*is_descending=*/false),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
- EqualsScoredDocumentHit(scored_document_hit2),
- EqualsScoredDocumentHit(scored_document_hit3)));
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 3, scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit3),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit1)));
+}
+
+TEST(RankerTest, ShouldRespectDocumentIdAscWhenScoresAreEqual) {
+ ScoredDocumentHit scored_document_hit1(
+ /*document_id=*/1, /*hit_section_id_mask=*/0, /*score=*/100);
+ ScoredDocumentHit scored_document_hit2(
+ /*document_id=*/2, /*hit_section_id_mask=*/0, /*score=*/100);
+ ScoredDocumentHit scored_document_hit3(
+ /*document_id=*/3, /*hit_section_id_mask=*/0, /*score=*/100);
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ scored_document_hit3, scored_document_hit1, scored_document_hit2};
+
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/false);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/
+ 3, scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit3)));
+}
+
+TEST(RankerTest, ShouldPopResultsFromHeapMultipleTimes) {
+ ScoredDocumentHit scored_document_hit1 =
+ CreateScoredDocumentHit(/*document_id=*/1, /*score=*/1);
+ ScoredDocumentHit scored_document_hit2 =
+ CreateScoredDocumentHit(/*document_id=*/2, /*score=*/2);
+ ScoredDocumentHit scored_document_hit3 =
+ CreateScoredDocumentHit(/*document_id=*/3, /*score=*/3);
+ ScoredDocumentHit scored_document_hit4 =
+ CreateScoredDocumentHit(/*document_id=*/4, /*score=*/4);
+ ScoredDocumentHit scored_document_hit5 =
+ CreateScoredDocumentHit(/*document_id=*/5, /*score=*/5);
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ scored_document_hit2, scored_document_hit1, scored_document_hit5,
+ scored_document_hit4, scored_document_hit3};
+
+ const ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+
+ EXPECT_THAT(scored_document_hits.size(), Eq(5));
+
+ // Pops 2 results
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/2,
+ scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit5),
+ EqualsScoredDocumentHit(scored_document_hit4)));
+ EXPECT_THAT(scored_document_hits.size(), Eq(3));
+
+ // Pops 2 results
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/2,
+ scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit3),
+ EqualsScoredDocumentHit(scored_document_hit2)));
+ EXPECT_THAT(scored_document_hits.size(), Eq(1));
+
+ // Pops last 1 result
+ EXPECT_THAT(PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/2,
+ scored_document_hit_comparator),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit1)));
+ EXPECT_THAT(scored_document_hits.size(), Eq(0));
}
} // namespace
diff --git a/icing/scoring/score-and-rank_benchmark.cc b/icing/scoring/score-and-rank_benchmark.cc
new file mode 100644
index 0000000..c3ed40a
--- /dev/null
+++ b/icing/scoring/score-and-rank_benchmark.cc
@@ -0,0 +1,387 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/scoring/ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scoring-processor.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/clock.h"
+
+// This is an overall benchmark for ScoringProcessor, Scorer, and Ranker. It
+// shows how performance varies when we score different numbers of document
+// hits.
+//
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/scoring:score-and-rank_benchmark
+//
+// $ blaze-bin/icing/scoring/score-and-rank_benchmark
+// --benchmarks=all --benchmark_memory_usage
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/scoring:score-and-rank_benchmark
+//
+// $ adb push blaze-bin/icing/scoring/score-and-rank_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/score-and-rank_benchmark --benchmarks=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+SchemaProto CreateSchemaWithEmailType() {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("Email");
+ auto subject = type_config->add_properties();
+ subject->set_property_name("subject");
+ subject->set_data_type(PropertyConfigProto::DataType::STRING);
+ subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ return schema;
+}
+
+DocumentProto CreateEmailDocument(int id, int document_score,
+ uint64_t creation_timestamp_ms) {
+ return DocumentBuilder()
+ .SetKey("icing", "uri" + std::to_string(id))
+ .SetSchema("Email")
+ .AddStringProperty("subject", "subject" + std::to_string(id))
+ .SetScore(document_score)
+ .SetCreationTimestampMs(creation_timestamp_ms)
+ .Build();
+}
+
+void BM_ScoreAndRankDocumentHitsByDocumentScore(benchmark::State& state) {
+ const std::string base_dir = GetTestTempDir() + "/score_and_rank_benchmark";
+ const std::string document_store_dir = base_dir + "/document_store";
+ const std::string schema_store_dir = base_dir + "/schema_store";
+
+ // Creates file directories
+ Filesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+ filesystem.CreateDirectoryRecursively(document_store_dir.c_str());
+ filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, base_dir));
+
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType()));
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(scoring_spec, document_store.get()));
+
+ int num_to_score = state.range(0);
+ int num_of_documents = state.range(1);
+
+ std::mt19937 random_generator;
+ std::uniform_int_distribution<int> distribution(
+ 1, std::numeric_limits<int>::max());
+
+ // Puts documents into document store
+ std::vector<DocHitInfo> doc_hit_infos;
+ for (int i = 0; i < num_of_documents; i++) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store->Put(CreateEmailDocument(
+ /*id=*/i, /*document_score=*/distribution(random_generator),
+ /*creation_timestamp_ms=*/1)));
+ doc_hit_infos.emplace_back(document_id);
+ }
+
+ ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+
+ for (auto _ : state) {
+ // Creates a dummy DocHitInfoIterator with results, we need to pause the
+ // timer here so that the cost of copying test data is not included.
+ state.PauseTiming();
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ state.ResumeTiming();
+
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ num_to_score);
+
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+ // Ranks and gets the first page, 20 is a common page size
+ std::vector<ScoredDocumentHit> results =
+ PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/20,
+ scored_document_hit_comparator);
+ }
+
+ // Clean up
+ document_store.reset();
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+}
+BENCHMARK(BM_ScoreAndRankDocumentHitsByDocumentScore)
+ // num_to_score, num_of_documents in document store
+ ->ArgPair(1000, 30000)
+ ->ArgPair(3000, 30000)
+ ->ArgPair(5000, 30000)
+ ->ArgPair(7000, 30000)
+ ->ArgPair(9000, 30000)
+ ->ArgPair(11000, 30000)
+ ->ArgPair(13000, 30000)
+ ->ArgPair(15000, 30000)
+ ->ArgPair(17000, 30000)
+ ->ArgPair(19000, 30000)
+ ->ArgPair(21000, 30000)
+ ->ArgPair(23000, 30000)
+ ->ArgPair(25000, 30000)
+ ->ArgPair(27000, 30000)
+ ->ArgPair(29000, 30000)
+ // Starting from this line, we're trying to see if num_of_documents affects
+ // performance
+ ->ArgPair(10000, 10000)
+ ->ArgPair(10000, 12000)
+ ->ArgPair(10000, 14000)
+ ->ArgPair(10000, 16000)
+ ->ArgPair(10000, 18000)
+ ->ArgPair(10000, 20000);
+
+void BM_ScoreAndRankDocumentHitsByCreationTime(benchmark::State& state) {
+ const std::string base_dir = GetTestTempDir() + "/score_and_rank_benchmark";
+ const std::string document_store_dir = base_dir + "/document_store";
+ const std::string schema_store_dir = base_dir + "/schema_store";
+
+ // Creates file directories
+ Filesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+ filesystem.CreateDirectoryRecursively(document_store_dir.c_str());
+ filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, base_dir));
+
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType()));
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(scoring_spec, document_store.get()));
+
+ int num_to_score = state.range(0);
+ int num_of_documents = state.range(1);
+
+ std::mt19937_64 random_generator;
+ std::uniform_int_distribution<int64_t> distribution(
+ 1, std::numeric_limits<int64_t>::max());
+
+ // Puts documents into document store
+ std::vector<DocHitInfo> doc_hit_infos;
+ for (int i = 0; i < num_of_documents; i++) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store->Put(CreateEmailDocument(
+ /*id=*/i, /*document_score=*/1,
+ /*creation_timestamp_ms=*/distribution(random_generator))));
+ doc_hit_infos.emplace_back(document_id);
+ }
+
+ ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+
+ for (auto _ : state) {
+ // Creates a dummy DocHitInfoIterator with results, we need to pause the
+ // timer here so that the cost of copying test data is not included.
+ state.PauseTiming();
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ state.ResumeTiming();
+
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ num_to_score);
+
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+ // Ranks and gets the first page, 20 is a common page size
+ std::vector<ScoredDocumentHit> results =
+ PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/20,
+ scored_document_hit_comparator);
+ }
+
+ // Clean up
+ document_store.reset();
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+}
+BENCHMARK(BM_ScoreAndRankDocumentHitsByCreationTime)
+ // num_to_score, num_of_documents in document store
+ ->ArgPair(1000, 30000)
+ ->ArgPair(3000, 30000)
+ ->ArgPair(5000, 30000)
+ ->ArgPair(7000, 30000)
+ ->ArgPair(9000, 30000)
+ ->ArgPair(11000, 30000)
+ ->ArgPair(13000, 30000)
+ ->ArgPair(15000, 30000)
+ ->ArgPair(17000, 30000)
+ ->ArgPair(19000, 30000)
+ ->ArgPair(21000, 30000)
+ ->ArgPair(23000, 30000)
+ ->ArgPair(25000, 30000)
+ ->ArgPair(27000, 30000)
+ ->ArgPair(29000, 30000)
+ // Starting from this line, we're trying to see if num_of_documents affects
+ // performance
+ ->ArgPair(10000, 10000)
+ ->ArgPair(10000, 12000)
+ ->ArgPair(10000, 14000)
+ ->ArgPair(10000, 16000)
+ ->ArgPair(10000, 18000)
+ ->ArgPair(10000, 20000);
+
+void BM_ScoreAndRankDocumentHitsNoScoring(benchmark::State& state) {
+ const std::string base_dir = GetTestTempDir() + "/score_and_rank_benchmark";
+ const std::string document_store_dir = base_dir + "/document_store";
+ const std::string schema_store_dir = base_dir + "/schema_store";
+
+ // Creates file directories
+ Filesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+ filesystem.CreateDirectoryRecursively(document_store_dir.c_str());
+ filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, base_dir));
+
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType()));
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(scoring_spec, document_store.get()));
+
+ int num_to_score = state.range(0);
+ int num_of_documents = state.range(1);
+
+ // Puts documents into document store
+ std::vector<DocHitInfo> doc_hit_infos;
+ for (int i = 0; i < num_of_documents; i++) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store->Put(CreateEmailDocument(/*id=*/i, /*document_score=*/1,
+ /*creation_timestamp_ms=*/1)));
+ doc_hit_infos.emplace_back(document_id);
+ }
+
+ ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+
+ for (auto _ : state) {
+ // Creates a dummy DocHitInfoIterator with results, we need to pause the
+ // timer here so that the cost of copying test data is not included.
+ state.PauseTiming();
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ state.ResumeTiming();
+
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ num_to_score);
+
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+ // Ranks and gets the first page, 20 is a common page size
+ std::vector<ScoredDocumentHit> results =
+ PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/20,
+ scored_document_hit_comparator);
+ }
+
+ // Clean up
+ document_store.reset();
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+}
+BENCHMARK(BM_ScoreAndRankDocumentHitsNoScoring)
+ // num_to_score, num_of_documents in document store
+ ->ArgPair(1000, 30000)
+ ->ArgPair(3000, 30000)
+ ->ArgPair(5000, 30000)
+ ->ArgPair(7000, 30000)
+ ->ArgPair(9000, 30000)
+ ->ArgPair(11000, 30000)
+ ->ArgPair(13000, 30000)
+ ->ArgPair(15000, 30000)
+ ->ArgPair(17000, 30000)
+ ->ArgPair(19000, 30000)
+ ->ArgPair(21000, 30000)
+ ->ArgPair(23000, 30000)
+ ->ArgPair(25000, 30000)
+ ->ArgPair(27000, 30000)
+ ->ArgPair(29000, 30000)
+ // Starting from this line, we're trying to see if num_of_documents affects
+ // performance
+ ->ArgPair(10000, 10000)
+ ->ArgPair(10000, 12000)
+ ->ArgPair(10000, 14000)
+ ->ArgPair(10000, 16000)
+ ->ArgPair(10000, 18000)
+ ->ArgPair(10000, 20000);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc
index 58892fe..ab5308c 100644
--- a/icing/scoring/scorer.cc
+++ b/icing/scoring/scorer.cc
@@ -67,6 +67,18 @@
double default_score_;
};
+// A special scorer which does nothing but assigns the default score to each
+// document. This is used especially when no scoring is required in a query.
+class NoScorer : public Scorer {
+ public:
+ explicit NoScorer(double default_score) : default_score_(default_score) {}
+
+ double GetScore(DocumentId document_id) override { return default_score_; }
+
+ private:
+ double default_score_;
+};
+
libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create(
ScoringSpecProto::RankingStrategy::Code rank_by, double default_score,
const DocumentStore* document_store) {
@@ -80,8 +92,7 @@
return std::make_unique<DocumentCreationTimestampScorer>(document_store,
default_score);
case ScoringSpecProto::RankingStrategy::NONE:
- return absl_ports::InvalidArgumentError(
- "RankingStrategy NONE not supported");
+ return std::make_unique<NoScorer>(default_score);
}
}
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
index 69bf4e5..4dda603 100644
--- a/icing/scoring/scorer_test.cc
+++ b/icing/scoring/scorer_test.cc
@@ -102,12 +102,6 @@
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(ScorerTest, CreationWithInvalidRankingStrategyShouldFail) {
- EXPECT_THAT(Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
- /*default_score=*/0, document_store()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
TEST_F(ScorerTest, ShouldGetDefaultScore) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
@@ -193,6 +187,25 @@
Eq(fake_clock2().GetSystemTimeMilliseconds()));
}
+TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
+ /*default_score=*/3, document_store()));
+
+ EXPECT_THAT(scorer->GetScore(/*document_id=*/0), Eq(3));
+ EXPECT_THAT(scorer->GetScore(/*document_id=*/1), Eq(3));
+ EXPECT_THAT(scorer->GetScore(/*document_id=*/2), Eq(3));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
+ /*default_score=*/111, document_store()));
+
+ EXPECT_THAT(scorer->GetScore(/*document_id=*/4), Eq(111));
+ EXPECT_THAT(scorer->GetScore(/*document_id=*/5), Eq(111));
+ EXPECT_THAT(scorer->GetScore(/*document_id=*/6), Eq(111));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/scoring/scoring-processor.cc b/icing/scoring/scoring-processor.cc
index cc2b4f2..0933094 100644
--- a/icing/scoring/scoring-processor.cc
+++ b/icing/scoring/scoring-processor.cc
@@ -54,21 +54,15 @@
// Using `new` to access a non-public constructor.
return std::unique_ptr<ScoringProcessor>(
- new ScoringProcessor(std::move(scorer), is_descending_order));
+ new ScoringProcessor(std::move(scorer)));
}
-std::vector<ScoredDocumentHit> ScoringProcessor::ScoreAndRank(
+std::vector<ScoredDocumentHit> ScoringProcessor::Score(
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator,
- int num_to_return) {
+ int num_to_score) {
std::vector<ScoredDocumentHit> scored_document_hits;
- if (num_to_return <= 0) {
- return scored_document_hits;
- }
-
- // TODO(b/145025400) Determine if we want to score all DocHitInfo or enforce
- // an upper limit.
- while (doc_hit_info_iterator->Advance().ok()) {
+ while (doc_hit_info_iterator->Advance().ok() && num_to_score-- > 0) {
const DocHitInfo& doc_hit_info = doc_hit_info_iterator->doc_hit_info();
// TODO(b/144955274) Calculate hit demotion factor from HitScore
double hit_demotion_factor = 1.0;
@@ -80,8 +74,7 @@
doc_hit_info.document_id(), doc_hit_info.hit_section_ids_mask(), score);
}
- return GetTopNFromScoredDocumentHits(std::move(scored_document_hits),
- num_to_return, is_descending_);
+ return scored_document_hits;
}
} // namespace lib
diff --git a/icing/scoring/scoring-processor.h b/icing/scoring/scoring-processor.h
index f7984e1..60c3b32 100644
--- a/icing/scoring/scoring-processor.h
+++ b/icing/scoring/scoring-processor.h
@@ -39,26 +39,27 @@
// Returns:
// A ScoringProcessor on success
// FAILED_PRECONDITION on any null pointer input
- // INVALID_ARGUMENT if unable to create what the spec specifies
static libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>> Create(
const ScoringSpecProto& scoring_spec,
const DocumentStore* document_store);
- // Returns a vector of ScoredDocHit sorted by their scores. The size of it is
- // no more than num_to_return.
- std::vector<ScoredDocumentHit> ScoreAndRank(
+ // Assigns scores to DocHitInfos from the given DocHitInfoIterator and returns
+ // a vector of ScoredDocumentHits. The size of results is no more than
+ // num_to_score. The order of results is the same as DocHitInfos from
+ // DocHitInfoIterator.
+ //
+ // NOTE: if the scoring spec doesn't require a scoring strategy, all
+ // ScoredDocumentHits will be assigned a default score 0.
+ std::vector<ScoredDocumentHit> Score(
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator,
- int num_to_return);
+ int num_to_score);
private:
- explicit ScoringProcessor(std::unique_ptr<Scorer> scorer, bool is_descending)
- : scorer_(std::move(scorer)), is_descending_(is_descending) {}
+ explicit ScoringProcessor(std::unique_ptr<Scorer> scorer)
+ : scorer_(std::move(scorer)) {}
+ // The component that assigns scores to documents.
std::unique_ptr<Scorer> scorer_;
-
- // If true, the final result will be sorted in a descending order, otherwise
- // ascending.
- bool is_descending_;
};
} // namespace lib
diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc
index 9ba9a52..b93bf1a 100644
--- a/icing/scoring/scoring-processor_test.cc
+++ b/icing/scoring/scoring-processor_test.cc
@@ -126,12 +126,6 @@
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(ScoringProcessorTest, CreationWithInvalidRankingStrategyShouldFail) {
- ScoringSpecProto spec_proto;
- EXPECT_THAT(ScoringProcessor::Create(spec_proto, document_store()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
TEST_F(ScoringProcessorTest, ShouldCreateInstance) {
ScoringSpecProto spec_proto;
spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
@@ -139,28 +133,25 @@
}
TEST_F(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
-
// Creates an empty DocHitInfoIterator
std::vector<DocHitInfo> doc_hit_infos = {};
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
ScoringProcessor::Create(spec_proto, document_store()));
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/5),
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/5),
IsEmpty());
}
-TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToReturn) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
-
+TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToScore) {
// Sets up documents
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id1,
@@ -173,63 +164,57 @@
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
ScoringProcessor::Create(spec_proto, document_store()));
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/-1),
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/-1),
IsEmpty());
doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/0),
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/0),
IsEmpty());
}
-TEST_F(ScoringProcessorTest, ShouldRespectNumToReturn) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
-
+TEST_F(ScoringProcessorTest, ShouldRespectNumToScore) {
// Sets up documents
ICING_ASSERT_OK_AND_ASSIGN(
auto doc_hit_result_pair,
CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
- // Disarrays doc_hit_infos
- std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
- std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
-
// Creates a dummy DocHitInfoIterator with 3 results
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
ScoringProcessor::Create(spec_proto, document_store()));
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/2),
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/2),
SizeIs(2));
doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/4),
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/4),
SizeIs(3));
}
-TEST_F(ScoringProcessorTest, ShouldRankByDocumentScoreDesc) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
-
- // Sets up documents, guaranteed relationship:
- // document1 < document2 < document3
- // Intentionally inserts documents in a different order
+TEST_F(ScoringProcessorTest, ShouldScoreByDocumentScore) {
+ // Creates input doc_hit_infos and expected output scored_document_hits
ICING_ASSERT_OK_AND_ASSIGN(
auto doc_hit_result_pair,
CreateAndInsertsDocumentsWithScores(document_store(), {1, 3, 2}));
@@ -237,75 +222,26 @@
std::vector<ScoredDocumentHit> scored_document_hits =
std::move(doc_hit_result_pair.second);
- // Rearranges scored_document_hits so that it's in a document score increasing
- // order.
- std::swap(scored_document_hits.at(1), scored_document_hits.at(2));
-
- // Disarrays doc_hit_infos
- std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
- std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
-
// Creates a dummy DocHitInfoIterator with 3 results
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- // Creates a ScoringProcessor
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
-
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/3),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(2)),
- EqualsScoredDocumentHit(scored_document_hits.at(1)),
- EqualsScoredDocumentHit(scored_document_hits.at(0))));
-}
-
-TEST_F(ScoringProcessorTest, ShouldRankByDocumentScoreAsc) {
ScoringSpecProto spec_proto;
spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
- spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
-
- // Sets up documents, guaranteed relationship:
- // document1 < document2 < document3
- // Intentionally inserts documents in a different order
- ICING_ASSERT_OK_AND_ASSIGN(
- auto doc_hit_result_pair,
- CreateAndInsertsDocumentsWithScores(document_store(), {1, 3, 2}));
- std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
- std::vector<ScoredDocumentHit> scored_document_hits =
- std::move(doc_hit_result_pair.second);
-
- // Rearranges scored_document_hits so that it's in a document score increasing
- // order.
- std::swap(scored_document_hits.at(1), scored_document_hits.at(2));
-
- // Disarrays doc_hit_infos
- std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
- std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
-
- // Creates a dummy DocHitInfoIterator with 3 results
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
ScoringProcessor::Create(spec_proto, document_store()));
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/3),
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3),
ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(0)),
EqualsScoredDocumentHit(scored_document_hits.at(1)),
EqualsScoredDocumentHit(scored_document_hits.at(2))));
}
-TEST_F(ScoringProcessorTest, ShouldRankByCreationTimestampDesc) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
-
- // Sets up documents, guaranteed relationship:
- // document1 < document2 < document3
+TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
DocumentProto document1 =
CreateDocument("icing", "email/1", kDefaultScore,
/*creation_timestamp_ms=*/1571100001111);
@@ -338,142 +274,23 @@
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
ScoringProcessor::Create(spec_proto, document_store()));
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/3),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hit3),
- EqualsScoredDocumentHit(scored_document_hit2),
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit3),
EqualsScoredDocumentHit(scored_document_hit1)));
}
-TEST_F(ScoringProcessorTest, ShouldRankByCreationTimestampAsc) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
- spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
-
- // Sets up documents, guaranteed relationship:
- // document1 < document2 < document3
- DocumentProto document1 =
- CreateDocument("icing", "email/1", kDefaultScore,
- /*creation_timestamp_ms=*/1571100001000);
- DocumentProto document2 =
- CreateDocument("icing", "email/2", kDefaultScore,
- /*creation_timestamp_ms=*/1571100002000);
- DocumentProto document3 =
- CreateDocument("icing", "email/3", kDefaultScore,
- /*creation_timestamp_ms=*/1571100003000);
- // Intentionally inserts documents in a different order
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store()->Put(document1));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- document_store()->Put(document3));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- document_store()->Put(document2));
- DocHitInfo doc_hit_info1(document_id1);
- DocHitInfo doc_hit_info2(document_id2);
- DocHitInfo doc_hit_info3(document_id3);
- ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
- document1.creation_timestamp_ms());
- ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
- document2.creation_timestamp_ms());
- ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone,
- document3.creation_timestamp_ms());
-
- // Creates a dummy DocHitInfoIterator with 3 results
- std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info2, doc_hit_info3,
- doc_hit_info1};
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- // Creates a ScoringProcessor which ranks in descending order
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
-
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/3),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
- EqualsScoredDocumentHit(scored_document_hit2),
- EqualsScoredDocumentHit(scored_document_hit3)));
-}
-
-TEST_F(ScoringProcessorTest, ShouldHandleSameScoresDesc) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
-
- // Creates 3 documents with the same score.
- ICING_ASSERT_OK_AND_ASSIGN(
- auto doc_hit_result_pair,
- CreateAndInsertsDocumentsWithScores(document_store(), {100, 100, 100}));
- std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
- std::vector<ScoredDocumentHit> scored_document_hits =
- std::move(doc_hit_result_pair.second);
-
- // Disarrays doc_hit_infos
- std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
- std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
-
- // Creates a dummy DocHitInfoIterator with 3 results
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- // Creates a ScoringProcessor which ranks in descending order
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
-
- // Results should be ranked in descending document id order.
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/3),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(2)),
- EqualsScoredDocumentHit(scored_document_hits.at(1)),
- EqualsScoredDocumentHit(scored_document_hits.at(0))));
-}
-
-TEST_F(ScoringProcessorTest, ShouldHandleSameScoresAsc) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
- spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
-
- // Creates 3 documents with the same score.
- ICING_ASSERT_OK_AND_ASSIGN(
- auto doc_hit_result_pair,
- CreateAndInsertsDocumentsWithScores(document_store(), {100, 100, 100}));
- std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
- std::vector<ScoredDocumentHit> scored_document_hits =
- std::move(doc_hit_result_pair.second);
-
- // Disarrays doc_hit_infos
- std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
- std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
-
- // Creates a dummy DocHitInfoIterator with 3 results
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- // Creates a ScoringProcessor which ranks in ascending order
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
-
- // Results should be ranked in ascending document id order.
- EXPECT_THAT(scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/3),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(0)),
- EqualsScoredDocumentHit(scored_document_hits.at(1)),
- EqualsScoredDocumentHit(scored_document_hits.at(2))));
-}
-
-TEST_F(ScoringProcessorTest, ShouldHandleNoScoresDesc) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
-
- // Sets up documents, guaranteed relationship:
- // document1 < document2 < document3
+TEST_F(ScoringProcessorTest, ShouldHandleNoScores) {
+ // Creates input doc_hit_infos and corresponding scored_document_hits
ICING_ASSERT_OK_AND_ASSIGN(
auto doc_hit_result_pair,
CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
@@ -481,10 +298,6 @@
std::vector<ScoredDocumentHit> scored_document_hits =
std::move(doc_hit_result_pair.second);
- // Disarrays doc_hit_infos
- std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
- std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
-
// Creates a dummy DocHitInfoIterator with 4 results one of which doesn't have
// a score.
doc_hit_infos.emplace(doc_hit_infos.begin(), /*document_id_in=*/4,
@@ -494,63 +307,71 @@
// The document hit without a score will be be assigned the default score 0 in
// a descending order.
- ScoredDocumentHit scored_document_hit_default_desc =
+ ScoredDocumentHit scored_document_hit_default =
ScoredDocumentHit(4, kSectionIdMaskNone, /*score=*/0.0);
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
ScoringProcessor::Create(spec_proto, document_store()));
- EXPECT_THAT(
- scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/4),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(2)),
- EqualsScoredDocumentHit(scored_document_hits.at(1)),
- EqualsScoredDocumentHit(scored_document_hits.at(0)),
- EqualsScoredDocumentHit(scored_document_hit_default_desc)));
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/4),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit_default),
+ EqualsScoredDocumentHit(scored_document_hits.at(0)),
+ EqualsScoredDocumentHit(scored_document_hits.at(1)),
+ EqualsScoredDocumentHit(scored_document_hits.at(2))));
}
-TEST_F(ScoringProcessorTest, ShouldHandleNoScoresAsc) {
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
- spec_proto.set_order_by(ScoringSpecProto::Order::ASC);
+TEST_F(ScoringProcessorTest, ShouldWrapResultsWhenNoScoring) {
+ DocumentProto document1 = CreateDocument("icing", "email/1", /*score=*/1,
+ kDefaultCreationTimestampMs);
+ DocumentProto document2 = CreateDocument("icing", "email/2", /*score=*/2,
+ kDefaultCreationTimestampMs);
+ DocumentProto document3 = CreateDocument("icing", "email/3", /*score=*/3,
+ kDefaultCreationTimestampMs);
- // Sets up documents, guaranteed relationship:
- // document1 < document2 < document3
- ICING_ASSERT_OK_AND_ASSIGN(
- auto doc_hit_result_pair,
- CreateAndInsertsDocumentsWithScores(document_store(), {1, 2, 3}));
- std::vector<DocHitInfo> doc_hit_infos = std::move(doc_hit_result_pair.first);
- std::vector<ScoredDocumentHit> scored_document_hits =
- std::move(doc_hit_result_pair.second);
+ // Intentionally inserts documents in a different order
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store()->Put(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store()->Put(document3));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store()->Put(document2));
+ DocHitInfo doc_hit_info1(document_id1);
+ DocHitInfo doc_hit_info2(document_id2);
+ DocHitInfo doc_hit_info3(document_id3);
- // Disarrays doc_hit_infos
- std::swap(doc_hit_infos.at(0), doc_hit_infos.at(1));
- std::swap(doc_hit_infos.at(1), doc_hit_infos.at(2));
+ // The expected results should all have the default score 0.
+ ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
+ kDefaultScore);
+ ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
+ kDefaultScore);
+ ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone,
+ kDefaultScore);
- // Creates a dummy DocHitInfoIterator with 4 results one of which doesn't have
- // a score.
- doc_hit_infos.emplace(doc_hit_infos.begin(), /*document_id_in=*/4,
- kSectionIdMaskNone);
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info2, doc_hit_info3,
+ doc_hit_info1};
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- // The document hit without a score will be be assigned the default score
- // max of double in an ascending order.
- ScoredDocumentHit scored_document_hit_default_asc = ScoredDocumentHit(
- 4, kSectionIdMaskNone, /*score=*/std::numeric_limits<double>::max());
+ // A ScoringSpecProto with no scoring strategy
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
- // Creates a ScoringProcessor which ranks in ascending order
+ // Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
ScoringProcessor::Create(spec_proto, document_store()));
- EXPECT_THAT(
- scoring_processor->ScoreAndRank(std::move(doc_hit_info_iterator),
- /*num_to_return=*/4),
- ElementsAre(EqualsScoredDocumentHit(scored_document_hits.at(0)),
- EqualsScoredDocumentHit(scored_document_hits.at(1)),
- EqualsScoredDocumentHit(scored_document_hits.at(2)),
- EqualsScoredDocumentHit(scored_document_hit_default_asc)));
+
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit3),
+ EqualsScoredDocumentHit(scored_document_hit1)));
}
} // namespace
diff --git a/icing/store/document-filter-data.h b/icing/store/document-filter-data.h
index 99da6b1..198bc49 100644
--- a/icing/store/document-filter-data.h
+++ b/icing/store/document-filter-data.h
@@ -19,11 +19,11 @@
#include <type_traits>
#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/store/namespace-id.h"
namespace icing {
namespace lib {
-using NamespaceId = int16_t;
using SchemaTypeId = int16_t;
class DocumentFilterData {
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index f5b94d0..93cebaa 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -40,6 +40,7 @@
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-id.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
@@ -777,6 +778,36 @@
return document_id_or.ValueOrDie();
}
+std::vector<std::string> DocumentStore::GetAllNamespaces() const {
+ std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
+ namespace_mapper_->GetValuesToKeys();
+
+ std::unordered_set<NamespaceId> existing_namespace_ids;
+ for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
+ ++document_id) {
+ // filter_cache_->Get can only fail if document_id is < 0
+ // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
+ auto status_or_data = filter_cache_->Get(document_id);
+ if (!status_or_data.ok()) {
+ ICING_LOG(ERROR)
+ << "Error while iterating over filter cache in GetAllNamespaces";
+ return std::vector<std::string>();
+ }
+ const DocumentFilterData* data = status_or_data.ValueOrDie();
+
+ if (DoesDocumentExist(document_id)) {
+ existing_namespace_ids.insert(data->namespace_id());
+ }
+ }
+
+ std::vector<std::string> existing_namespaces;
+ for (auto itr = existing_namespace_ids.begin();
+ itr != existing_namespace_ids.end(); ++itr) {
+ existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
+ }
+ return existing_namespaces;
+}
+
libtextclassifier3::StatusOr<int64_t>
DocumentStore::DoesDocumentExistAndGetFileOffset(DocumentId document_id) const {
if (!IsDocumentIdValid(document_id)) {
@@ -817,11 +848,7 @@
const std::string_view name_space, const std::string_view uri) {
// Try to get the DocumentId first
auto document_id_or = GetDocumentId(name_space, uri);
- if (absl_ports::IsNotFound(document_id_or.status())) {
- // No need to delete nonexistent (name_space, uri)
- return libtextclassifier3::Status::OK;
- } else if (!document_id_or.ok()) {
- // Real error
+ if (!document_id_or.ok()) {
return absl_ports::Annotate(
document_id_or.status(),
absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
@@ -831,15 +858,11 @@
// Check if the DocumentId's Document still exists.
DocumentId document_id = document_id_or.ValueOrDie();
auto file_offset_or = DoesDocumentExistAndGetFileOffset(document_id);
- if (absl_ports::IsNotFound(file_offset_or.status())) {
- // No need to delete nonexistent documents
- return libtextclassifier3::Status::OK;
- } else if (!file_offset_or.ok()) {
- // Real error, pass it up
+ if (!file_offset_or.ok()) {
return absl_ports::Annotate(
file_offset_or.status(),
- IcingStringUtil::StringPrintf(
- "Failed to retrieve file offset for DocumentId %d", document_id));
+ absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
+ ", uri: ", uri));
}
// Update ground truth first.
@@ -852,10 +875,9 @@
document_log_->WriteProto(CreateDocumentTombstone(name_space, uri))
.status();
if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete Document. namespace: " << name_space
- << ", uri: " << uri;
- return status;
+ return absl_ports::Annotate(
+ status, absl_ports::StrCat("Failed to delete Document. namespace: ",
+ name_space, ", uri: ", uri));
}
ICING_RETURN_IF_ERROR(
@@ -894,12 +916,11 @@
libtextclassifier3::Status DocumentStore::DeleteByNamespace(
std::string_view name_space) {
auto namespace_id_or = namespace_mapper_->Get(name_space);
- if (absl_ports::IsNotFound(namespace_id_or.status())) {
- // Namespace doesn't exist. Don't need to delete anything.
- return libtextclassifier3::Status::OK;
- } else if (!namespace_id_or.ok()) {
- // Real error, pass it up.
- return namespace_id_or.status();
+ if (!namespace_id_or.ok()) {
+ return absl_ports::Annotate(
+ namespace_id_or.status(),
+ absl_ports::StrCat("Failed to delete by namespace. namespace: ",
+ name_space));
}
// Update ground truth first.
@@ -916,23 +937,31 @@
return status;
}
- return UpdateDerivedFilesNamespaceDeleted(name_space);
+ ICING_ASSIGN_OR_RETURN(bool updated_existing_document,
+ UpdateDerivedFilesNamespaceDeleted(name_space));
+ if (!updated_existing_document) {
+ // Treat the fact that no existing documents had this namespace to be the
+ // same as this namespace not existing at all.
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
+ }
+ return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocumentStore::UpdateDerivedFilesNamespaceDeleted(
- std::string_view name_space) {
+libtextclassifier3::StatusOr<bool>
+DocumentStore::UpdateDerivedFilesNamespaceDeleted(std::string_view name_space) {
auto namespace_id_or = namespace_mapper_->Get(name_space);
- if (absl_ports::IsNotFound(namespace_id_or.status())) {
- // Namespace doesn't exist. Don't need to delete anything.
- return libtextclassifier3::Status::OK;
- } else if (!namespace_id_or.ok()) {
- // Real error, pass it up.
+ if (!namespace_id_or.ok()) {
return namespace_id_or.status();
}
// Guaranteed to have a NamespaceId now.
NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+ // Tracks if there were any existing documents with this namespace that we
+ // will mark as deleted.
+ bool updated_existing_document = false;
+
// Traverse FilterCache and delete all docs that match namespace_id
for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
++document_id) {
@@ -941,6 +970,10 @@
ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
filter_cache_->Get(document_id));
if (data->namespace_id() == namespace_id) {
+ if (DoesDocumentExist(document_id)) {
+ updated_existing_document = true;
+ }
+
// docid_mapper_->Set can only fail if document_id is < 0
// or >= docid_mapper_->num_elements. So the only possible way to get an
// error here would be if filter_cache_->num_elements >
@@ -950,18 +983,17 @@
}
}
- return libtextclassifier3::Status::OK;
+ return updated_existing_document;
}
libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
std::string_view schema_type) {
auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
- if (absl_ports::IsNotFound(schema_type_id_or.status())) {
- // SchemaType doesn't exist. Don't need to delete anything.
- return libtextclassifier3::Status::OK;
- } else if (!schema_type_id_or.ok()) {
- // Real error, pass it up.
- return schema_type_id_or.status();
+ if (!schema_type_id_or.ok()) {
+ return absl_ports::Annotate(
+ schema_type_id_or.status(),
+ absl_ports::StrCat("Failed to delete by schema type. schema_type: ",
+ schema_type));
}
// Update ground truth first.
@@ -1076,7 +1108,11 @@
} else {
// Document is no longer valid with the new SchemaStore. Mark as
// deleted
- ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri()));
+ auto delete_status = Delete(document.namespace_(), document.uri());
+ if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
+ // Real error, pass up
+ return delete_status;
+ }
}
}
@@ -1167,7 +1203,11 @@
if (!document_validator_.Validate(document).ok()) {
// Document is no longer valid with the new SchemaStore. Mark as
// deleted
- ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri()));
+ auto delete_status = Delete(document.namespace_(), document.uri());
+ if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
+ // Real error, pass up
+ return delete_status;
+ }
}
}
}
@@ -1225,6 +1265,59 @@
return libtextclassifier3::Status::OK;
}
+libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
+DocumentStore::GetOptimizeInfo() const {
+ OptimizeInfo optimize_info;
+
+ // Figure out our ratio of optimizable/total docs.
+ int32_t num_documents = document_id_mapper_->num_elements();
+ for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
+ ++document_id) {
+ if (!DoesDocumentExist(document_id)) {
+ ++optimize_info.optimizable_docs;
+ }
+
+ ++optimize_info.total_docs;
+ }
+
+ if (optimize_info.total_docs == 0) {
+ // Can exit early since there's nothing to calculate.
+ return optimize_info;
+ }
+
+ // Get the total element size.
+ //
+ // We use file size instead of disk usage here because the files are not
+ // sparse, so it's more accurate. Disk usage rounds up to the nearest block
+ // size.
+ ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
+ document_log_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
+ document_id_mapper_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
+ score_cache_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
+ filter_cache_->GetElementsFileSize());
+
+ // We use a combined disk usage and file size for the KeyMapper because it's
+ // backed by a trie, which has some sparse property bitmaps.
+ ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
+ document_key_mapper_->GetElementsSize());
+
+ // We don't include the namespace mapper because it's not clear if we could
+ // recover any space even if Optimize were called. Deleting 100s of documents
+ // could still leave a few documents of a namespace, and then there would be
+ // no change.
+
+ int64_t total_size = document_log_file_size + document_key_mapper_size +
+ document_id_mapper_file_size + score_cache_file_size +
+ filter_cache_file_size;
+
+ optimize_info.estimated_optimizable_bytes =
+ total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
+ return optimize_info;
+}
+
libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
return score_cache_->Set(document_id, score_data);
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 5bb167d..3f4b72f 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -33,6 +33,7 @@
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-id.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/document-validator.h"
@@ -53,6 +54,20 @@
uint32_t checksum;
};
+ struct OptimizeInfo {
+ // The estimated size in bytes of the optimizable docs. We don't track the
+ // size of each document, so we estimate by taking the size of the entire
+ // DocumentStore and dividing that by the total number of documents we have.
+ // So we end up with an average document size.
+ int64_t estimated_optimizable_bytes = 0;
+
+ // Number of total documents the DocumentStore tracks.
+ int32_t total_docs = 0;
+
+ // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
+ int32_t optimizable_docs = 0;
+ };
+
// Not copyable
DocumentStore(const DocumentStore&) = delete;
DocumentStore& operator=(const DocumentStore&) = delete;
@@ -94,6 +109,9 @@
//
// Returns:
// A newly generated document id on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // NOT_FOUND if the schema_type or a property config of the document doesn't
+ // exist in schema
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<DocumentId> Put(const DocumentProto& document);
libtextclassifier3::StatusOr<DocumentId> Put(DocumentProto&& document);
@@ -118,8 +136,15 @@
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<DocumentProto> Get(DocumentId document_id) const;
- // Returns true if there's an existing document associated with the given
- // document id.
+ // Returns all namespaces which have at least 1 active document (not deleted
+ // or expired). Order of namespaces is undefined.
+ std::vector<std::string> GetAllNamespaces() const;
+
+ // Check if a document exists. Existence means it hasn't been deleted and it
+ // hasn't expired yet.
+ //
+ // Returns:
+ // boolean whether a document exists or not
bool DoesDocumentExist(DocumentId document_id) const;
// Deletes the document identified by the given namespace and uri
@@ -129,6 +154,7 @@
//
// Returns:
// OK on success
+ // NOT_FOUND if no document exists with namespace, uri
// INTERNAL_ERROR on IO error
libtextclassifier3::Status Delete(std::string_view name_space,
std::string_view uri);
@@ -178,6 +204,7 @@
//
// Returns:
// OK on success
+ // NOT_FOUND if namespace doesn't exist
// INTERNAL_ERROR on IO error
libtextclassifier3::Status DeleteByNamespace(std::string_view name_space);
@@ -188,6 +215,7 @@
//
// Returns:
// OK on success
+ // NOT_FOUND if schema_type doesn't exist
// INTERNAL_ERROR on IO error
libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type);
@@ -198,7 +226,8 @@
// INTERNAL on I/O error
libtextclassifier3::Status PersistToDisk();
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
@@ -263,6 +292,15 @@
// INTERNAL_ERROR on IO error
libtextclassifier3::Status OptimizeInto(const std::string& new_directory);
+ // Calculates status for a potential Optimize call. Includes how many docs
+ // there are vs how many would be optimized away. And also includes an
+ // estimated size gains, in bytes, if Optimize were called.
+ //
+ // Returns:
+ // OptimizeInfo on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;
+
// Computes the combined checksum of the document store - includes the ground
// truth and all derived files.
//
@@ -394,9 +432,9 @@
// called.
//
// Returns:
- // OK on success
+ // bool on whether an existing document was actually updated to be deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status UpdateDerivedFilesNamespaceDeleted(
+ libtextclassifier3::StatusOr<bool> UpdateDerivedFilesNamespaceDeleted(
std::string_view name_space);
// Update derived files that the schema type schema_type_id has been deleted.
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index e246ca9..ad56b9a 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -33,6 +33,7 @@
#include "icing/schema/schema-store.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
#include "icing/testing/tmp-directory.h"
@@ -46,10 +47,12 @@
using ::testing::Eq;
using ::testing::Gt;
using ::testing::HasSubstr;
+using ::testing::IsEmpty;
using ::testing::IsFalse;
using ::testing::IsTrue;
using ::testing::Not;
using ::testing::Return;
+using ::testing::UnorderedElementsAre;
class DocumentStoreTest : public ::testing::Test {
protected:
@@ -340,7 +343,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, Delete) {
+TEST_F(DocumentStoreTest, DeleteOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -352,18 +355,45 @@
EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
EXPECT_THAT(doc_store->Get(document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
// Validates that deleting something non-existing won't append anything to
// ground truth
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- // icing + email/1 has already been deleted.
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+
+ EXPECT_THAT(
+ document_store->Delete("nonexistent_namespace", "nonexistent_uri"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
int64_t ground_truth_size_after = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
+TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+
+ // First time is OK
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ // Deleting it again is NOT_FOUND
+ EXPECT_THAT(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
@@ -403,7 +433,7 @@
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceOk) {
+TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -414,13 +444,30 @@
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- ICING_EXPECT_OK(doc_store->DeleteByNamespace("nonexistent_namespace"));
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
+TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ // At this point, there are no existing documents with the namespace, even
+ // though Icing's derived files know about this namespace. We should still
+ // return NOT_FOUND since nothing existing has this namespace.
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
DocumentProto document1 = test_document1_;
document1.set_namespace_("namespace.1");
@@ -568,7 +615,7 @@
IsOkAndHolds(EqualsProto(person_document)));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeOk) {
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -579,7 +626,8 @@
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("nonexistent_type"));
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
@@ -587,6 +635,21 @@
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
+TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ // At this point, there are no existing documents with the schema type, but we
+ // still return OK because the SchemaStore is the ground truth on schemas and
+ // knows about the type
+ ICING_EXPECT_OK(document_store->DeleteBySchemaType(test_document1_.schema()));
+}
+
TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
SchemaProto schema;
auto type_config = schema.add_types();
@@ -1007,16 +1070,25 @@
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(int64_t size1, doc_store->GetDiskUsage());
- EXPECT_THAT(size1, Gt(0));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_doc_store_size,
+ doc_store->GetDiskUsage());
+ EXPECT_THAT(empty_doc_store_size, Gt(0));
- ICING_ASSERT_OK(doc_store->Put(test_document1_));
- ICING_ASSERT_OK_AND_ASSIGN(int64_t size2, doc_store->GetDiskUsage());
- EXPECT_THAT(size2, Gt(size1));
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "foo")
+ .Build();
- ICING_ASSERT_OK(doc_store->Put(test_document2_));
- EXPECT_THAT(doc_store->GetDiskUsage(), IsOkAndHolds(Gt(size2)));
- doc_store.reset();
+ // Since our GetDiskUsage can only get sizes in increments of block_size, we
+ // need to insert enough documents so the disk usage will increase by at least
+ // 1 block size. The number 100 is a bit arbitrary, gotten from manually
+ // testing.
+ for (int i = 0; i < 100; ++i) {
+ ICING_ASSERT_OK(doc_store->Put(document));
+ }
+ EXPECT_THAT(doc_store->GetDiskUsage(),
+ IsOkAndHolds(Gt(empty_doc_store_size)));
// Bad file system
MockFilesystem mock_filesystem;
@@ -1896,5 +1968,121 @@
IsOkAndHolds(EqualsProto(message_document)));
}
+TEST_F(DocumentStoreTest, GetOptimizeInfo) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Nothing should be optimizable yet
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentStore::OptimizeInfo optimize_info,
+ document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(0));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
+
+ ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
+
+ // Adding a document, still nothing is optimizable
+ ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(1));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
+
+ // Delete a document. Now something is optimizable
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+ ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(1));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(1));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Gt(0));
+
+ // Optimize it into a different directory, should bring us back to nothing
+ // since all documents were optimized away.
+ std::string optimized_dir = document_store_dir_ + "_optimize";
+ EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir));
+ document_store.reset();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> optimized_document_store,
+ DocumentStore::Create(&filesystem_, optimized_dir, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(optimize_info,
+ optimized_document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(0));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
+}
+
+TEST_F(DocumentStoreTest, GetAllNamespaces) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Empty namespaces to start with
+ EXPECT_THAT(document_store->GetAllNamespaces(), IsEmpty());
+
+ DocumentProto namespace1 = DocumentBuilder()
+ .SetKey("namespace1", "uri")
+ .SetSchema("email")
+ .SetCreationTimestampMs(0)
+ .SetTtlMs(500)
+ .Build();
+ DocumentProto namespace2_uri1 = DocumentBuilder()
+ .SetKey("namespace2", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(0)
+ .SetTtlMs(500)
+ .Build();
+ DocumentProto namespace2_uri2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(0)
+ .SetTtlMs(500)
+ .Build();
+ DocumentProto namespace3 = DocumentBuilder()
+ .SetKey("namespace3", "uri")
+ .SetSchema("email")
+ .SetCreationTimestampMs(0)
+ .SetTtlMs(100)
+ .Build();
+
+ ICING_ASSERT_OK(document_store->Put(namespace1));
+ ICING_ASSERT_OK(document_store->Put(namespace2_uri1));
+ ICING_ASSERT_OK(document_store->Put(namespace2_uri2));
+ ICING_ASSERT_OK(document_store->Put(namespace3));
+
+ auto get_result = document_store->Get("namespace1", "uri");
+ get_result = document_store->Get("namespace2", "uri1");
+ get_result = document_store->Get("namespace2", "uri2");
+ get_result = document_store->Get("namespace3", "uri");
+
+ // Have all the namespaces now
+ EXPECT_THAT(document_store->GetAllNamespaces(),
+ UnorderedElementsAre("namespace1", "namespace2", "namespace3"));
+
+ // After deleting namespace2_uri1, there's still namespace2_uri2, so
+ // "namespace2" still shows up in results
+ ICING_EXPECT_OK(document_store->Delete("namespace2", "uri1"));
+
+ EXPECT_THAT(document_store->GetAllNamespaces(),
+ UnorderedElementsAre("namespace1", "namespace2", "namespace3"));
+
+ // After deleting namespace2_uri2, there's no more documents in "namespace2"
+ ICING_EXPECT_OK(document_store->Delete("namespace2", "uri2"));
+
+ EXPECT_THAT(document_store->GetAllNamespaces(),
+ UnorderedElementsAre("namespace1", "namespace3"));
+
+ // Some arbitrary time past namespace3's creation time (0) and ttl (100)
+ fake_clock_.SetSystemTimeMilliseconds(110);
+
+ EXPECT_THAT(document_store->GetAllNamespaces(),
+ UnorderedElementsAre("namespace1"));
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h
index b01a8f1..4571df2 100644
--- a/icing/store/key-mapper.h
+++ b/icing/store/key-mapper.h
@@ -84,6 +84,9 @@
// Returns any encountered IO errors.
libtextclassifier3::StatusOr<T> Get(std::string_view key) const;
+ // Returns a map of values to keys. Empty map if the mapper is empty.
+ std::unordered_map<T, std::string> GetValuesToKeys() const;
+
// Count of unique keys stored in the KeyMapper.
int32_t num_keys() const { return trie_.size(); }
@@ -99,13 +102,23 @@
// INTERNAL on I/O error
libtextclassifier3::Status PersistToDisk();
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+ // Returns the size of the elements held in the key mapper. This excludes the
+ // size of any internal metadata of the key mapper, e.g. the key mapper's
+ // header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
// Computes and returns the checksum of the header and contents.
Crc32 ComputeChecksum();
@@ -242,6 +255,21 @@
}
template <typename T>
+std::unordered_map<T, std::string> KeyMapper<T>::GetValuesToKeys() const {
+ std::unordered_map<T, std::string> values_to_keys;
+ for (IcingDynamicTrie::Iterator itr(trie_, /*prefix=*/""); itr.IsValid();
+ itr.Advance()) {
+ if (itr.IsValid()) {
+ T value;
+ memcpy(&value, itr.GetValue(), sizeof(T));
+ values_to_keys.insert({value, itr.GetKey()});
+ }
+ }
+
+ return values_to_keys;
+}
+
+template <typename T>
libtextclassifier3::Status KeyMapper<T>::PersistToDisk() {
if (!trie_.Sync()) {
return absl_ports::InternalError(
@@ -261,6 +289,16 @@
}
template <typename T>
+libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetElementsSize() const {
+ int64_t size = trie_.GetElementsSize();
+ if (size == IcingFilesystem::kBadFileSize || size < 0) {
+ return absl_ports::InternalError(
+ "Failed to get disk usage of elements in the key mapper");
+ }
+ return size;
+}
+
+template <typename T>
Crc32 KeyMapper<T>::ComputeChecksum() {
return Crc32(trie_.UpdateCrc());
}
diff --git a/icing/store/key-mapper_test.cc b/icing/store/key-mapper_test.cc
index c75c203..4e3dd8a 100644
--- a/icing/store/key-mapper_test.cc
+++ b/icing/store/key-mapper_test.cc
@@ -22,6 +22,9 @@
using ::testing::_;
using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
namespace icing {
namespace lib {
@@ -163,6 +166,23 @@
EXPECT_THAT(key_mapper->num_keys(), 1);
}
+TEST_F(KeyMapperTest, GetValuesToKeys) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ EXPECT_THAT(key_mapper->GetValuesToKeys(), IsEmpty());
+
+ ICING_EXPECT_OK(key_mapper->Put("foo", /*value=*/1));
+ ICING_EXPECT_OK(key_mapper->Put("bar", /*value=*/2));
+ EXPECT_THAT(key_mapper->GetValuesToKeys(),
+ UnorderedElementsAre(Pair(1, "foo"), Pair(2, "bar")));
+
+ ICING_EXPECT_OK(key_mapper->Put("baz", /*value=*/3));
+ EXPECT_THAT(
+ key_mapper->GetValuesToKeys(),
+ UnorderedElementsAre(Pair(1, "foo"), Pair(2, "bar"), Pair(3, "baz")));
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/testing/i18n-test-utils.h b/icing/store/namespace-id.h
similarity index 73%
copy from icing/testing/i18n-test-utils.h
copy to icing/store/namespace-id.h
index 4e8a3b8..4225be3 100644
--- a/icing/testing/i18n-test-utils.h
+++ b/icing/store/namespace-id.h
@@ -12,19 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TESTING_I18N_TEST_UTILS_H_
-#define ICING_TESTING_I18N_TEST_UTILS_H_
+#ifndef ICING_STORE_NAMESPACE_ID_H_
+#define ICING_STORE_NAMESPACE_ID_H_
-#include <string>
-
-#include "unicode/umachine.h"
+#include <cstdint>
namespace icing {
namespace lib {
-std::string UcharToString(UChar32 uchar);
+// Id of unique namespace in DocumentProto. Generated in DocumentStore.
+using NamespaceId = int16_t;
} // namespace lib
} // namespace icing
-#endif // ICING_TESTING_I18N_TEST_UTILS_H_
+#endif // ICING_STORE_NAMESPACE_ID_H_
diff --git a/icing/testing/document-generator.h b/icing/testing/document-generator.h
new file mode 100644
index 0000000..ba48de3
--- /dev/null
+++ b/icing/testing/document-generator.h
@@ -0,0 +1,134 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_DOCUMENT_GENERATOR_H_
+#define ICING_TESTING_DOCUMENT_GENERATOR_H_
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "icing/document-builder.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+
+namespace icing {
+namespace lib {
+
+class EvenDistributionNamespaceSelector {
+ public:
+ explicit EvenDistributionNamespaceSelector(
+ const std::vector<std::string>& namespaces)
+ : namespaces_(&namespaces), num_invocations_(0) {}
+ const std::string& operator()() {
+ return namespaces_->at(num_invocations_++ % namespaces_->size());
+ }
+
+ private:
+ const std::vector<std::string>* namespaces_;
+ int num_invocations_;
+};
+
+class EvenDistributionTypeSelector {
+ public:
+ explicit EvenDistributionTypeSelector(const SchemaProto& schema)
+ : schema_(&schema), num_invocations_(0) {}
+ const SchemaTypeConfigProto& operator()() {
+ return schema_->types(num_invocations_++ % schema_->types_size());
+ }
+
+ private:
+ const SchemaProto* schema_;
+ int num_invocations_;
+};
+
+template <typename Rand>
+class UniformDistributionLanguageTokenGenerator {
+ public:
+ explicit UniformDistributionLanguageTokenGenerator(
+ const std::vector<std::string>& language, Rand* r)
+ : language_(&language),
+ rand_(r),
+ dist_(0, language.size() - 1),
+ num_invocations_(0) {}
+ const std::string& operator()() { return language_->at(dist_(*rand_)); }
+
+ private:
+ const std::vector<std::string>* language_;
+ Rand* rand_;
+ std::uniform_int_distribution<> dist_;
+ int num_invocations_;
+};
+
+template <typename NamespaceSelector, typename TypeSelector,
+ typename TokenGenerator>
+class DocumentGenerator {
+ public:
+ explicit DocumentGenerator(NamespaceSelector* namespaces,
+ TypeSelector* schema_types, TokenGenerator* tokens,
+ int doc_content_size)
+ : namespaces_(namespaces),
+ schema_types_(schema_types),
+ tokens_(tokens),
+ doc_content_size_(doc_content_size),
+ num_docs_generated_(0) {}
+
+ DocumentProto generateDoc() {
+ const SchemaTypeConfigProto& type_config = (*schema_types_)();
+ const std::string& name_space = (*namespaces_)();
+ DocumentBuilder doc_builder = DocumentBuilder()
+ .SetNamespace(name_space)
+ .SetSchema(type_config.schema_type())
+ .SetUri(GetUri());
+ // Reserve room to add a token for the namespace in the first section. This
+ // ensures that each document will contain at least one token that will be
+ // stable across all runs.
+ std::string starting_content = name_space + " ";
+ // Distribute content evenly between all properties, but add a token with
+ // the namespace to the first property.
+ int prop_content_size = (doc_content_size_ - starting_content.length()) /
+ type_config.properties_size();
+ for (const PropertyConfigProto& prop : type_config.properties()) {
+ doc_builder.AddStringProperty(
+ prop.property_name(),
+ starting_content + GetPropertyContent(prop_content_size, name_space));
+ // We've added the namespace token now. No need for more starting_content.
+ starting_content.clear();
+ }
+ ++num_docs_generated_;
+ return doc_builder.Build();
+ }
+
+ private:
+ std::string GetUri() { return std::to_string(num_docs_generated_); }
+ std::string GetPropertyContent(int content_size,
+ const std::string& name_space) {
+ std::string content;
+ while (content.size() < content_size) {
+ content += " " + (*tokens_)();
+ }
+ return content;
+ }
+
+ NamespaceSelector* namespaces_;
+ TypeSelector* schema_types_;
+ TokenGenerator* tokens_;
+ int doc_content_size_;
+ int num_docs_generated_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_DOCUMENT_GENERATOR_H_
diff --git a/icing/testing/hit-test-utils.cc b/icing/testing/hit-test-utils.cc
new file mode 100644
index 0000000..0e2eb2a
--- /dev/null
+++ b/icing/testing/hit-test-utils.cc
@@ -0,0 +1,58 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/hit-test-utils.h"
+
+namespace icing {
+namespace lib {
+
+// Returns a hit that has a delta of desired_byte_length from last_hit.
+Hit CreateHit(Hit last_hit, int desired_byte_length) {
+ Hit hit =
+ (last_hit.section_id() == kMinSectionId)
+ ? Hit(kMaxSectionId, last_hit.document_id() + 1, last_hit.score())
+ : Hit(last_hit.section_id() - 1, last_hit.document_id(),
+ last_hit.score());
+ uint8_t buf[5];
+ while (VarInt::Encode(last_hit.value() - hit.value(), buf) <
+ desired_byte_length) {
+ hit = (hit.section_id() == kMinSectionId)
+ ? Hit(kMaxSectionId, hit.document_id() + 1, hit.score())
+ : Hit(hit.section_id() - 1, hit.document_id(), hit.score());
+ }
+ return hit;
+}
+
+// Returns a vector of num_hits Hits with the first hit starting at start_docid
+// and with 1-byte deltas.
+std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits,
+ int desired_byte_length) {
+ std::vector<Hit> hits;
+ if (num_hits < 1) {
+ return hits;
+ }
+ hits.push_back(
+ Hit(/*section_id=*/1, /*document_id=*/start_docid, Hit::kMaxHitScore));
+ while (hits.size() < num_hits) {
+ hits.push_back(CreateHit(hits.back(), desired_byte_length));
+ }
+ return hits;
+}
+
+std::vector<Hit> CreateHits(int num_hits, int desired_byte_length) {
+ return CreateHits(/*start_docid=*/0, num_hits, desired_byte_length);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/hit-test-utils.h b/icing/testing/hit-test-utils.h
new file mode 100644
index 0000000..e236ec0
--- /dev/null
+++ b/icing/testing/hit-test-utils.h
@@ -0,0 +1,43 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_HIT_TEST_UTILS_H_
+#define ICING_TESTING_HIT_TEST_UTILS_H_
+
+#include <vector>
+
+#include "icing/index/hit/hit.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Returns a hit that has a delta of desired_byte_length from last_hit.
+Hit CreateHit(Hit last_hit, int desired_byte_length);
+
+// Returns a vector of num_hits Hits with the first hit starting at start_docid
+// and with desired_byte_length deltas.
+std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits,
+ int desired_byte_length);
+
+// Returns a vector of num_hits Hits with the first hit starting at 0 and each
+// with desired_byte_length deltas.
+std::vector<Hit> CreateHits(int num_hits, int desired_byte_length);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_HIT_TEST_UTILS_H_
diff --git a/icing/testing/i18n-test-utils.cc b/icing/testing/icu-i18n-test-utils.cc
similarity index 93%
rename from icing/testing/i18n-test-utils.cc
rename to icing/testing/icu-i18n-test-utils.cc
index 3839dc8..09878db 100644
--- a/icing/testing/i18n-test-utils.cc
+++ b/icing/testing/icu-i18n-test-utils.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/testing/i18n-test-utils.h"
+#include "icing/testing/icu-i18n-test-utils.h"
#include <cstdint>
#include <string>
@@ -24,7 +24,7 @@
namespace icing {
namespace lib {
-std::string UcharToString(UChar32 uchar) {
+std::string UCharToString(UChar32 uchar) {
std::string result;
uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes
diff --git a/icing/testing/i18n-test-utils.h b/icing/testing/icu-i18n-test-utils.h
similarity index 80%
rename from icing/testing/i18n-test-utils.h
rename to icing/testing/icu-i18n-test-utils.h
index 4e8a3b8..3e79b32 100644
--- a/icing/testing/i18n-test-utils.h
+++ b/icing/testing/icu-i18n-test-utils.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TESTING_I18N_TEST_UTILS_H_
-#define ICING_TESTING_I18N_TEST_UTILS_H_
+#ifndef ICING_TESTING_ICU_I18N_TEST_UTILS_H_
+#define ICING_TESTING_ICU_I18N_TEST_UTILS_H_
#include <string>
@@ -22,9 +22,9 @@
namespace icing {
namespace lib {
-std::string UcharToString(UChar32 uchar);
+std::string UCharToString(UChar32 uchar);
} // namespace lib
} // namespace icing
-#endif // ICING_TESTING_I18N_TEST_UTILS_H_
+#endif // ICING_TESTING_ICU_I18N_TEST_UTILS_H_
diff --git a/icing/testing/jni-test-helpers.h b/icing/testing/jni-test-helpers.h
new file mode 100644
index 0000000..adc469a
--- /dev/null
+++ b/icing/testing/jni-test-helpers.h
@@ -0,0 +1,42 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_JNI_TEST_HELPERS_H_
+#define ICING_TESTING_JNI_TEST_HELPERS_H_
+
+#include "icing/jni/jni-cache.h"
+
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+
+#include <jni.h>
+
+extern JNIEnv* g_jenv;
+
+#define ICING_TEST_JNI_CACHE JniCache::Create(g_jenv).ValueOrDie()
+
+#else
+
+#define ICING_TEST_JNI_CACHE nullptr
+
+#endif // ICING_REVERSE_JNI_SEGMENTATION
+
+namespace icing {
+namespace lib {
+
+std::unique_ptr<JniCache> GetTestJniCache() { return ICING_TEST_JNI_CACHE; }
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_JNI_TEST_HELPERS_H_
diff --git a/icing/testing/logging-event-listener.cc b/icing/testing/logging-event-listener.cc
new file mode 100644
index 0000000..4b42825
--- /dev/null
+++ b/icing/testing/logging-event-listener.cc
@@ -0,0 +1,121 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/logging-event-listener.h"
+
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+void LoggingEventListener::OnTestProgramStart(
+ const testing::UnitTest& /* unit_test */) {}
+
+void LoggingEventListener::OnTestIterationStart(
+ const testing::UnitTest& unit_test, int iteration) {
+ ICING_LOG(INFO) << "[==========] Running " << unit_test.test_to_run_count()
+ << " test(s) from " << unit_test.test_case_to_run_count()
+ << " test case(s)";
+}
+
+void LoggingEventListener::OnEnvironmentsSetUpStart(
+ const testing::UnitTest& unit_test) {
+ ICING_LOG(INFO) << "[----------] Global test environment set-up.";
+}
+
+void LoggingEventListener::OnEnvironmentsSetUpEnd(
+ const testing::UnitTest& /* unit_test */) {}
+
+void LoggingEventListener::OnTestCaseStart(const testing::TestCase& test_case) {
+ std::string param_text;
+ if (test_case.type_param()) {
+ param_text = IcingStringUtil::StringPrintf(", where TypeParam = %s",
+ test_case.type_param());
+ }
+ ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count()
+ << " test(s) from " << test_case.name() << param_text;
+}
+
+void LoggingEventListener::OnTestStart(const testing::TestInfo& test_info) {
+ ICING_LOG(INFO) << "[ RUN ] " << test_info.test_case_name() << "."
+ << test_info.name();
+}
+
+void LoggingEventListener::OnTestPartResult(
+ const testing::TestPartResult& test_part_result) {
+ if (test_part_result.type() != testing::TestPartResult::kSuccess) {
+ ICING_LOG(ERROR) << test_part_result.file_name() << ":"
+ << test_part_result.line_number() << ": Failure "
+ << test_part_result.message();
+ }
+}
+
+void LoggingEventListener::OnTestEnd(const testing::TestInfo& test_info) {
+ if (test_info.result()->Passed()) {
+ ICING_LOG(INFO) << "[ OK ] " << test_info.test_case_name() << "."
+ << test_info.name();
+ } else {
+ ICING_LOG(ERROR) << "[ FAILED ] " << test_info.test_case_name() << "."
+ << test_info.name();
+ }
+}
+
+void LoggingEventListener::OnTestCaseEnd(const testing::TestCase& test_case) {
+ ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count()
+ << " test(s) from " << test_case.name() << " ("
+ << test_case.elapsed_time() << " ms total)";
+}
+
+void LoggingEventListener::OnEnvironmentsTearDownStart(
+ const testing::UnitTest& unit_test) {
+ ICING_LOG(INFO) << "[----------] Global test environment tear-down.";
+}
+
+void LoggingEventListener::OnEnvironmentsTearDownEnd(
+ const testing::UnitTest& /* unit_test */) {}
+
+void LoggingEventListener::OnTestIterationEnd(
+ const testing::UnitTest& unit_test, int iteration) {
+ ICING_LOG(INFO) << "[==========] " << unit_test.test_to_run_count()
+ << " test(s) from " << unit_test.test_case_to_run_count()
+ << " test case(s) ran. (" << unit_test.elapsed_time()
+ << " ms total)";
+ ICING_LOG(INFO) << "[ PASSED ] " << unit_test.successful_test_count()
+ << " test(s)";
+ if (!unit_test.Passed()) {
+ ICING_LOG(ERROR) << "[ FAILED ] " << unit_test.failed_test_count()
+ << " test(s), listed below:";
+ for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+ const testing::TestCase& test_case = *unit_test.GetTestCase(i);
+ if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+ continue;
+ }
+ for (int j = 0; j < test_case.total_test_count(); ++j) {
+ const testing::TestInfo& test_info = *test_case.GetTestInfo(j);
+ if (!test_info.should_run() || test_info.result()->Passed()) {
+ continue;
+ }
+ ICING_LOG(ERROR) << "[ FAILED ] " << test_case.name() << "."
+ << test_info.name();
+ }
+ }
+ }
+}
+
+void LoggingEventListener::OnTestProgramEnd(
+ const testing::UnitTest& /* unit_test */) {}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/logging-event-listener.h b/icing/testing/logging-event-listener.h
new file mode 100644
index 0000000..8024222
--- /dev/null
+++ b/icing/testing/logging-event-listener.h
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_LOGGING_EVENT_LISTENER_H_
+#define ICING_TESTING_LOGGING_EVENT_LISTENER_H_
+
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+
+// TestEventListener that writes test results to the log so that they will be
+// visible in the logcat output in Sponge.
+// The formatting of the output is patterned after the output produced by the
+// standard PrettyUnitTestResultPrinter.
+class LoggingEventListener : public ::testing::TestEventListener {
+ public:
+ void OnTestProgramStart(const testing::UnitTest& unit_test) override;
+
+ void OnTestIterationStart(const testing::UnitTest& unit_test,
+ int iteration) override;
+
+ void OnEnvironmentsSetUpStart(const testing::UnitTest& unit_test) override;
+
+ void OnEnvironmentsSetUpEnd(const testing::UnitTest& unit_test) override;
+
+ void OnTestCaseStart(const testing::TestCase& test_case) override;
+
+ void OnTestStart(const testing::TestInfo& test_info) override;
+
+ void OnTestPartResult(
+ const testing::TestPartResult& test_part_result) override;
+
+ void OnTestEnd(const testing::TestInfo& test_info) override;
+
+ void OnTestCaseEnd(const testing::TestCase& test_case) override;
+
+ void OnEnvironmentsTearDownStart(const testing::UnitTest& unit_test) override;
+
+ void OnEnvironmentsTearDownEnd(const testing::UnitTest& unit_test) override;
+
+ void OnTestIterationEnd(const testing::UnitTest& unit_test,
+ int iteration) override;
+
+ void OnTestProgramEnd(const testing::UnitTest& unit_test) override;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_LOGGING_EVENT_LISTENER_H_
diff --git a/icing/testing/recorder-test-utils.cc b/icing/testing/recorder-test-utils.cc
new file mode 100644
index 0000000..ca87fd0
--- /dev/null
+++ b/icing/testing/recorder-test-utils.cc
@@ -0,0 +1,70 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/recorder-test-utils.h"
+
+#include <list>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "perftools/profiles/proto/profile.proto.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "util/gzip/gzipstring.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// The implementation for these functions is shamelessly plagiarized from
+// https://cs.corp.google.com/search/?q=heap/alloc_recorder_test.cc
+void ReadProfileFromFile(const std::string& filename,
+ perftools::profiles::Profile* profile) {
+ Filesystem filesystem;
+ ScopedFd sfd(filesystem.OpenForRead(filename.c_str()));
+ int size = filesystem.GetFileSize(sfd.get());
+ std::string str_profilez;
+ str_profilez.resize(size);
+ ASSERT_TRUE(filesystem.Read(sfd.get(), str_profilez.data(), size));
+
+ std::string str_profile;
+ ASSERT_TRUE(GunzipString(str_profilez, &str_profile));
+
+ ASSERT_TRUE(profile->ParseFromString(str_profile));
+}
+
+} // namespace
+
+ProfileInfo SummarizeProfileProto(const std::string& filename) {
+ perftools::profiles::Profile profile;
+ ReadProfileFromFile(filename, &profile);
+ ProfileInfo profile_info{};
+ for (const auto &sample : profile.sample()) {
+ EXPECT_THAT(sample.value(), testing::SizeIs(6));
+ profile_info.peak_objects += sample.value(0);
+ profile_info.peak_bytes += sample.value(1);
+ profile_info.total_alloc_objects += sample.value(2);
+ profile_info.total_alloc_bytes += sample.value(3);
+ profile_info.inuse_objects += sample.value(4);
+ profile_info.inuse_bytes += sample.value(5);
+ }
+
+ return profile_info;
+}
+
+} // namespace lib
+} // namespace icing
+
diff --git a/icing/testing/recorder-test-utils.h b/icing/testing/recorder-test-utils.h
new file mode 100644
index 0000000..7242728
--- /dev/null
+++ b/icing/testing/recorder-test-utils.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_RECORDER_TEST_UTILS_H_
+#define ICING_TESTING_RECORDER_TEST_UTILS_H_
+
+#include <cstdint>
+#include <string>
+
+namespace icing {
+namespace lib {
+
+struct ProfileInfo {
+ int64_t peak_objects;
+ int64_t peak_bytes;
+ int64_t total_alloc_objects;
+ int64_t total_alloc_bytes;
+ int64_t inuse_objects;
+ int64_t inuse_bytes;
+};
+
+// Reads a compressed profile.proto from a file and returns
+// statistics summed across all samples. This functions is heavily tied to the
+// implementation of https://cs.corp.google.com/search/?q=heap/alloc_recorder.cc
+ProfileInfo SummarizeProfileProto(const std::string& filename);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_RECORDER_TEST_UTILS_H_
diff --git a/icing/testing/schema-generator.h b/icing/testing/schema-generator.h
new file mode 100644
index 0000000..e733612
--- /dev/null
+++ b/icing/testing/schema-generator.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_SCHEMA_GENERATOR_H_
+#define ICING_TESTING_SCHEMA_GENERATOR_H_
+
+#include <random>
+#include <string>
+
+#include "icing/proto/schema.proto.h"
+#include "icing/proto/schema.pb.h"
+
+namespace icing {
+namespace lib {
+
+class ExactStringPropertyGenerator {
+ public:
+ PropertyConfigProto operator()(std::string_view name) const {
+ PropertyConfigProto prop;
+ prop.set_property_name(name.data(), name.length());
+ prop.set_data_type(PropertyConfigProto::DataType::STRING);
+ prop.set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ IndexingConfig* indexing_config = prop.mutable_indexing_config();
+ indexing_config->set_term_match_type(TermMatchType::EXACT_ONLY);
+ indexing_config->set_tokenizer_type(IndexingConfig::TokenizerType::PLAIN);
+ return prop;
+ }
+};
+
+template <typename Rand, typename PropertyGenerator>
+class RandomSchemaGenerator {
+ public:
+ explicit RandomSchemaGenerator(Rand* rand, PropertyGenerator* prop_generator)
+ : rand_(rand), prop_generator_(prop_generator) {}
+
+ SchemaProto GenerateSchema(int num_types, int max_num_properties) {
+ SchemaProto schema;
+ std::uniform_int_distribution<> dist(1, max_num_properties);
+ while (--num_types >= 0) {
+ int num_properties = dist(*rand_);
+ SetType(schema.add_types(), "Type" + std::to_string(num_types),
+ num_properties);
+ }
+ return schema;
+ }
+
+ private:
+ void SetType(SchemaTypeConfigProto* type_config, std::string_view name,
+ int num_properties) const {
+ type_config->set_schema_type(name.data(), name.length());
+ while (--num_properties >= 0) {
+ std::string prop_name = "Prop" + std::to_string(num_properties);
+ (*type_config->add_properties()) = (*prop_generator_)(prop_name);
+ }
+ }
+
+ Rand* rand_;
+ PropertyGenerator* prop_generator_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_SCHEMA_GENERATOR_H_
diff --git a/icing/testing/test-data.cc b/icing/testing/test-data.cc
index 3460657..db2af51 100644
--- a/icing/testing/test-data.cc
+++ b/icing/testing/test-data.cc
@@ -34,41 +34,5 @@
"/google3/", google3_relative_file_path);
}
-// ICU data file needs to be set up only once every test run, it can be shared
-// between different test cases. Setting up the file too many times may cause
-// out-of-memory / segmentation fault errors in Android emulator.
-bool has_set_up_icu_data_file = false;
-
-libtextclassifier3::Status SetUpICUDataFile(
- const std::string& icu_data_file_relative_path) {
- if (has_set_up_icu_data_file) {
- return libtextclassifier3::Status::OK;
- }
-
- const std::string& file_path = GetTestFilePath(icu_data_file_relative_path);
-
- Filesystem filesystem;
- int64_t file_size = filesystem.GetFileSize(file_path.c_str());
- ScopedFd fd(filesystem.OpenForRead(file_path.c_str()));
-
- // TODO(samzheng): figure out why icing::MemoryMappedFile causes
- // segmentation fault here.
- const void* data =
- mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd.get(), 0);
-
- UErrorCode status = U_ZERO_ERROR;
- udata_setCommonData(data, &status);
-
- if (U_FAILURE(status)) {
- return absl_ports::InternalError(
- "Failed to set up ICU data, please check if you have the data file at "
- "the given path.");
- }
-
- has_set_up_icu_data_file = true;
-
- return libtextclassifier3::Status::OK;
-}
-
} // namespace lib
} // namespace icing
diff --git a/icing/testing/test-data.h b/icing/testing/test-data.h
index a13749b..875214a 100644
--- a/icing/testing/test-data.h
+++ b/icing/testing/test-data.h
@@ -31,16 +31,6 @@
// portable_cc_test or any other test build rules.
std::string GetTestFilePath(const std::string& google3_relative_file_path);
-// This is for unit testing in Google3. The library binary doesn't contain any
-// ICU data files, so we generate a .dat file at compile time and here make ICU
-// use that file.
-//
-// Returns:
-// Ok on success
-// INTERNAL_ERROR if failed on any errors
-libtextclassifier3::Status SetUpICUDataFile(
- const std::string& icu_data_file_relative_path);
-
} // namespace lib
} // namespace icing
diff --git a/icing/testing/tmp-directory.cc b/icing/testing/tmp-directory.cc
index ea25fe2..c38be4b 100644
--- a/icing/testing/tmp-directory.cc
+++ b/icing/testing/tmp-directory.cc
@@ -32,13 +32,18 @@
// The sparse file related methods are mostly for reporting/logging purposes
// and not affecting any system behaviors.
std::string GetTestTempDir() {
-#ifdef __ANDROID__
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+ // The "icing.jni" should be whatever the package name is set for the
+ // Android test app. Set in //icing/testing/AndroidManifest.xml
+ // For JNI tests, we can only write to our package's directory.
+ return "/data/data/icing.jni";
+#elif defined(__ANDROID__)
return "/data/local/tmp";
#elif defined(__APPLE__)
return absl_ports::StrCat(getenv("HOME"), "/tmp");
#else
return "/tmp";
-#endif // __ANDROID__
+#endif // ICING_REVERSE_JNI_SEGMENTATION
}
} // namespace lib
diff --git a/icing/text_classifier/lib3/utils/base/logging.h b/icing/text_classifier/lib3/utils/base/logging.h
index 6ba8dc7..92d775e 100644
--- a/icing/text_classifier/lib3/utils/base/logging.h
+++ b/icing/text_classifier/lib3/utils/base/logging.h
@@ -22,7 +22,6 @@
#include "icing/text_classifier/lib3/utils/base/logging_levels.h"
#include "icing/text_classifier/lib3/utils/base/port.h"
-
namespace libtextclassifier3 {
namespace logging {
@@ -142,7 +141,7 @@
// Debug checks: a TC3_DCHECK<suffix> macro should behave like TC3_CHECK<suffix>
// in debug mode an don't check / don't print anything in non-debug mode.
-#ifdef NDEBUG
+#if defined(NDEBUG) && !defined(TC3_DEBUG_LOGGING) && !defined(TC3_DEBUG_CHECKS)
#define TC3_DCHECK(x) TC3_NULLSTREAM
#define TC3_DCHECK_EQ(x, y) TC3_NULLSTREAM
diff --git a/icing/text_classifier/lib3/utils/base/logging_raw.cc b/icing/text_classifier/lib3/utils/base/logging_raw.cc
index 08ad13a..9f2cb3a 100644
--- a/icing/text_classifier/lib3/utils/base/logging_raw.cc
+++ b/icing/text_classifier/lib3/utils/base/logging_raw.cc
@@ -15,8 +15,14 @@
#include "icing/text_classifier/lib3/utils/base/logging_raw.h"
#include <stdio.h>
+
#include <string>
+#define TC3_RETURN_IF_NOT_ERROR_OR_FATAL \
+ if (severity != ERROR && severity != FATAL) { \
+ return; \
+ }
+
// NOTE: this file contains two implementations: one for Android, one for all
// other cases. We always build exactly one implementation.
#if defined(__ANDROID__)
@@ -47,13 +53,10 @@
void LowLevelLogging(LogSeverity severity, const std::string& tag,
const std::string& message) {
- const int android_log_level = GetAndroidLogLevel(severity);
#if !defined(TC3_DEBUG_LOGGING)
- if (android_log_level != ANDROID_LOG_ERROR &&
- android_log_level != ANDROID_LOG_FATAL) {
- return;
- }
+ TC3_RETURN_IF_NOT_ERROR_OR_FATAL
#endif
+ const int android_log_level = GetAndroidLogLevel(severity);
__android_log_write(android_log_level, tag.c_str(), message.c_str());
}
@@ -86,6 +89,9 @@
void LowLevelLogging(LogSeverity severity, const std::string &tag,
const std::string &message) {
+#if !defined(TC3_DEBUG_LOGGING)
+ TC3_RETURN_IF_NOT_ERROR_OR_FATAL
+#endif
fprintf(stderr, "[%s] %s : %s\n", LogSeverityToString(severity), tag.c_str(),
message.c_str());
fflush(stderr);
diff --git a/icing/text_classifier/lib3/utils/base/status_macros.h b/icing/text_classifier/lib3/utils/base/status_macros.h
index 532691e..466e5b0 100644
--- a/icing/text_classifier/lib3/utils/base/status_macros.h
+++ b/icing/text_classifier/lib3/utils/base/status_macros.h
@@ -54,11 +54,20 @@
// TC3_RETURN_IF_ERROR(foo.Method(args...));
// return libtextclassifier3::Status();
// }
-#define TC3_RETURN_IF_ERROR(expr) \
+#define TC3_RETURN_IF_ERROR(expr) \
+ TC3_RETURN_IF_ERROR_INTERNAL(expr, std::move(adapter).status())
+
+#define TC3_RETURN_NULL_IF_ERROR(expr) \
+ TC3_RETURN_IF_ERROR_INTERNAL(expr, nullptr)
+
+#define TC3_RETURN_FALSE_IF_ERROR(expr) \
+ TC3_RETURN_IF_ERROR_INTERNAL(expr, false)
+
+#define TC3_RETURN_IF_ERROR_INTERNAL(expr, return_value) \
TC3_STATUS_MACROS_IMPL_ELSE_BLOCKER_ \
if (::libtextclassifier3::StatusAdapter adapter{expr}) { \
} else /* NOLINT */ \
- return std::move(adapter).status()
+ return return_value
// The GNU compiler emits a warning for code like:
//
diff --git a/icing/text_classifier/lib3/utils/base/statusor.h b/icing/text_classifier/lib3/utils/base/statusor.h
index 89c94e9..9ec3d91 100644
--- a/icing/text_classifier/lib3/utils/base/statusor.h
+++ b/icing/text_classifier/lib3/utils/base/statusor.h
@@ -32,7 +32,7 @@
inline StatusOr();
// Builds from a non-OK status. Crashes if an OK status is specified.
- inline StatusOr(const Status& status); // NOLINT
+ inline StatusOr(const Status& status); // NOLINT
// Builds from the specified value.
inline StatusOr(const T& value); // NOLINT
@@ -86,6 +86,8 @@
// Conversion assignment operator, T must be assignable from U
template <typename U>
inline StatusOr& operator=(const StatusOr<U>& other);
+ template <typename U>
+ inline StatusOr& operator=(StatusOr<U>&& other);
inline ~StatusOr();
@@ -134,6 +136,40 @@
friend class StatusOr;
private:
+ void Clear() {
+ if (ok()) {
+ value_.~T();
+ }
+ }
+
+ // Construct the value through placement new with the passed argument.
+ template <typename... Arg>
+ void MakeValue(Arg&&... arg) {
+ new (&value_) T(std::forward<Arg>(arg)...);
+ }
+
+ // Creates a valid instance of type T constructed with U and assigns it to
+ // value_. Handles how to properly assign to value_ if value_ was never
+ // actually initialized (if this is currently non-OK).
+ template <typename U>
+ void AssignValue(U&& value) {
+ if (ok()) {
+ value_ = std::forward<U>(value);
+ } else {
+ MakeValue(std::forward<U>(value));
+ status_ = Status::OK;
+ }
+ }
+
+ // Creates a status constructed with U and assigns it to status_. It also
+ // properly destroys value_ if this is OK and value_ represents a valid
+ // instance of T.
+ template <typename U>
+ void AssignStatus(U&& v) {
+ Clear();
+ status_ = static_cast<Status>(std::forward<U>(v));
+ }
+
Status status_;
// The members of unions do not require initialization and are not destructed
// unless specifically called. This allows us to construct instances of
@@ -210,35 +246,47 @@
template <typename T>
inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr& other) {
- status_ = other.status_;
- if (status_.ok()) {
- value_ = other.value_;
+ if (other.ok()) {
+ AssignValue(other.value_);
+ } else {
+ AssignStatus(other.status_);
}
return *this;
}
template <typename T>
inline StatusOr<T>& StatusOr<T>::operator=(StatusOr&& other) {
- status_ = other.status_;
- if (status_.ok()) {
- value_ = std::move(other.value_);
+ if (other.ok()) {
+ AssignValue(std::move(other.value_));
+ } else {
+ AssignStatus(std::move(other.status_));
}
return *this;
}
template <typename T>
inline StatusOr<T>::~StatusOr() {
- if (ok()) {
- value_.~T();
- }
+ Clear();
}
template <typename T>
template <typename U>
inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
- status_ = other.status_;
- if (status_.ok()) {
- value_ = other.value_;
+ if (other.ok()) {
+ AssignValue(other.value_);
+ } else {
+ AssignStatus(other.status_);
+ }
+ return *this;
+}
+
+template <typename T>
+template <typename U>
+inline StatusOr<T>& StatusOr<T>::operator=(StatusOr<U>&& other) {
+ if (other.ok()) {
+ AssignValue(std::move(other.value_));
+ } else {
+ AssignStatus(std::move(other.status_));
}
return *this;
}
@@ -257,7 +305,17 @@
#define TC3_ASSIGN_OR_RETURN_FALSE(lhs, rexpr) \
TC3_ASSIGN_OR_RETURN(lhs, rexpr, false)
-#define TC3_ASSIGN_OR_RETURN_0(lhs, rexpr) TC3_ASSIGN_OR_RETURN(lhs, rexpr, 0)
+#define TC3_ASSIGN_OR_RETURN_0(...) \
+ TC_STATUS_MACROS_IMPL_GET_VARIADIC_( \
+ (__VA_ARGS__, TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_0_3_, \
+ TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_0_2_)) \
+ (__VA_ARGS__)
+
+#define TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_0_2_(lhs, rexpr) \
+ TC3_ASSIGN_OR_RETURN(lhs, rexpr, 0)
+#define TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_0_3_(lhs, rexpr, \
+ log_expression) \
+ TC3_ASSIGN_OR_RETURN(lhs, rexpr, (log_expression, 0))
// =================================================================
// == Implementation details, do not rely on anything below here. ==
@@ -279,11 +337,11 @@
#define TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_(lhs, rexpr) \
TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_(lhs, rexpr, _)
-#define TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_(lhs, rexpr, \
- error_expression) \
- TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_( \
- TC_STATUS_MACROS_IMPL_CONCAT_(_status_or_value, __LINE__), lhs, rexpr, \
- error_expression)
+#define TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_(lhs, rexpr, \
+ error_expression) \
+ TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_( \
+ TC_STATUS_MACROS_IMPL_CONCAT_(_status_or_value, __COUNTER__), lhs, \
+ rexpr, error_expression)
#define TC_STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_(statusor, lhs, rexpr, \
error_expression) \
auto statusor = (rexpr); \
diff --git a/icing/text_classifier/lib3/utils/java/jni-base.cc b/icing/text_classifier/lib3/utils/java/jni-base.cc
new file mode 100644
index 0000000..897628c
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-base.cc
@@ -0,0 +1,35 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+
+namespace libtextclassifier3 {
+
+bool EnsureLocalCapacity(JNIEnv* env, int capacity) {
+ return env->EnsureLocalCapacity(capacity) == JNI_OK;
+}
+
+bool JniExceptionCheckAndClear(JNIEnv* env) {
+ TC3_CHECK(env != nullptr);
+ const bool result = env->ExceptionCheck();
+ if (result) {
+ env->ExceptionDescribe();
+ env->ExceptionClear();
+ }
+ return result;
+}
+
+} // namespace libtextclassifier3
diff --git a/icing/text_classifier/lib3/utils/java/jni-base.h b/icing/text_classifier/lib3/utils/java/jni-base.h
new file mode 100644
index 0000000..5876eba
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-base.h
@@ -0,0 +1,215 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_
+#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_
+
+#include <jni.h>
+
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+// When we use a macro as an argument for a macro, an additional level of
+// indirection is needed if the macro argument is used with # or ##.
+#define TC3_ADD_QUOTES_HELPER(TOKEN) #TOKEN
+#define TC3_ADD_QUOTES(TOKEN) TC3_ADD_QUOTES_HELPER(TOKEN)
+
+#ifndef TC3_PACKAGE_NAME
+#define TC3_PACKAGE_NAME com_google_knowledge_cerebra_sense_textclassifier_lib3
+#endif
+
+#ifndef TC3_PACKAGE_PATH
+#define TC3_PACKAGE_PATH \
+ "com/google/knowledge/cerebra/sense/textclassifier/lib3/"
+#endif
+
+#define TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name) \
+ Java_##package_name##_##class_name##_##method_name
+
+#define TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, \
+ method_name) \
+ JNIEXPORT return_type JNICALL TC3_JNI_METHOD_NAME_INTERNAL( \
+ package_name, class_name, method_name)
+
+// The indirection is needed to correctly expand the TC3_PACKAGE_NAME macro.
+// See the explanation near TC3_ADD_QUOTES macro.
+#define TC3_JNI_METHOD2(return_type, package_name, class_name, method_name) \
+ TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, method_name)
+
+#define TC3_JNI_METHOD(return_type, class_name, method_name) \
+ TC3_JNI_METHOD2(return_type, TC3_PACKAGE_NAME, class_name, method_name)
+
+#define TC3_JNI_METHOD_NAME2(package_name, class_name, method_name) \
+ TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name)
+
+#define TC3_JNI_METHOD_NAME(class_name, method_name) \
+ TC3_JNI_METHOD_NAME2(TC3_PACKAGE_NAME, class_name, method_name)
+
+namespace libtextclassifier3 {
+
+// Returns true if the requested capacity is available.
+bool EnsureLocalCapacity(JNIEnv* env, int capacity);
+
+// Returns true if there was an exception. Also it clears the exception.
+bool JniExceptionCheckAndClear(JNIEnv* env);
+
+// A deleter to be used with std::unique_ptr to delete JNI global references.
+class GlobalRefDeleter {
+ public:
+ explicit GlobalRefDeleter(JavaVM* jvm) : jvm_(jvm) {}
+
+ GlobalRefDeleter(const GlobalRefDeleter& orig) = default;
+
+ // Copy assignment to allow move semantics in ScopedGlobalRef.
+ GlobalRefDeleter& operator=(const GlobalRefDeleter& rhs) {
+ TC3_CHECK_EQ(jvm_, rhs.jvm_);
+ return *this;
+ }
+
+ // The delete operator.
+ void operator()(jobject object) const {
+ JNIEnv* env;
+ if (object != nullptr && jvm_ != nullptr &&
+ JNI_OK ==
+ jvm_->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_4)) {
+ env->DeleteGlobalRef(object);
+ }
+ }
+
+ private:
+ // The jvm_ stashed to use for deletion.
+ JavaVM* const jvm_;
+};
+
+// A deleter to be used with std::unique_ptr to delete JNI local references.
+class LocalRefDeleter {
+ public:
+ explicit LocalRefDeleter(JNIEnv* env)
+ : env_(env) {} // NOLINT(runtime/explicit)
+
+ LocalRefDeleter(const LocalRefDeleter& orig) = default;
+
+ // Copy assignment to allow move semantics in ScopedLocalRef.
+ LocalRefDeleter& operator=(const LocalRefDeleter& rhs) {
+ env_ = rhs.env_;
+ return *this;
+ }
+
+ // The delete operator.
+ void operator()(jobject object) const {
+ if (env_) {
+ env_->DeleteLocalRef(object);
+ }
+ }
+
+ private:
+ // The env_ stashed to use for deletion. Thread-local, don't share!
+ JNIEnv* env_;
+};
+
+// A smart pointer that deletes a reference when it goes out of scope.
+//
+// Note that this class is not thread-safe since it caches JNIEnv in
+// the deleter. Do not use the same jobject across different threads.
+template <typename T, typename Env, typename Deleter>
+class ScopedRef {
+ public:
+ ScopedRef() : ptr_(nullptr, Deleter(nullptr)) {}
+ ScopedRef(T value, Env* env) : ptr_(value, Deleter(env)) {}
+
+ T get() const { return ptr_.get(); }
+
+ T release() { return ptr_.release(); }
+
+ bool operator!() const { return !ptr_; }
+
+ bool operator==(void* value) const { return ptr_.get() == value; }
+
+ explicit operator bool() const { return ptr_ != nullptr; }
+
+ void reset(T value, Env* env) {
+ ptr_.reset(value);
+ ptr_.get_deleter() = Deleter(env);
+ }
+
+ private:
+ std::unique_ptr<typename std::remove_pointer<T>::type, Deleter> ptr_;
+};
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator==(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() == y.get();
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator==(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) {
+ return x.get() == nullptr;
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator==(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) {
+ return nullptr == x.get();
+}
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator!=(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() != y.get();
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator!=(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) {
+ return x.get() != nullptr;
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator!=(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) {
+ return nullptr != x.get();
+}
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator<(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() < y.get();
+}
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator>(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() > y.get();
+}
+
+// A smart pointer that deletes a JNI global reference when it goes out
+// of scope. Usage is:
+// ScopedGlobalRef<jobject> scoped_global(env->JniFunction(), jvm);
+template <typename T>
+using ScopedGlobalRef = ScopedRef<T, JavaVM, GlobalRefDeleter>;
+
+// Ditto, but usage is:
+// ScopedLocalRef<jobject> scoped_local(env->JniFunction(), env);
+template <typename T>
+using ScopedLocalRef = ScopedRef<T, JNIEnv, LocalRefDeleter>;
+
+// A helper to create global references.
+template <typename T>
+ScopedGlobalRef<T> MakeGlobalRef(T object, JNIEnv* env, JavaVM* jvm) {
+ const jobject global_object = env->NewGlobalRef(object);
+ return ScopedGlobalRef<T>(reinterpret_cast<T>(global_object), jvm);
+}
+
+} // namespace libtextclassifier3
+
+#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_
diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.cc b/icing/text_classifier/lib3/utils/java/jni-helper.cc
new file mode 100644
index 0000000..60a9dfb
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-helper.cc
@@ -0,0 +1,286 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+
+#include "icing/text_classifier/lib3/utils/base/status_macros.h"
+
+namespace libtextclassifier3 {
+
+StatusOr<ScopedLocalRef<jclass>> JniHelper::FindClass(JNIEnv* env,
+ const char* class_name) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jclass> result(env->FindClass(class_name), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jclass>> JniHelper::GetObjectClass(JNIEnv* env,
+ jobject object) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jclass> result(env->GetObjectClass(object), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<jmethodID> JniHelper::GetMethodID(JNIEnv* env, jclass clazz,
+ const char* method_name,
+ const char* signature) {
+ jmethodID result = env->GetMethodID(clazz, method_name, signature);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<jmethodID> JniHelper::GetStaticMethodID(JNIEnv* env, jclass clazz,
+ const char* method_name,
+ const char* signature) {
+ jmethodID result = env->GetStaticMethodID(clazz, method_name, signature);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<jfieldID> JniHelper::GetFieldID(JNIEnv* env, jclass clazz,
+ const char* field_name,
+ const char* signature) {
+ jfieldID result = env->GetFieldID(clazz, field_name, signature);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<jfieldID> JniHelper::GetStaticFieldID(JNIEnv* env, jclass clazz,
+ const char* field_name,
+ const char* signature) {
+ jfieldID result = env->GetStaticFieldID(clazz, field_name, signature);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jobject>> JniHelper::GetStaticObjectField(
+ JNIEnv* env, jclass class_name, jfieldID field_id) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jobject> result(
+ env->GetStaticObjectField(class_name, field_id), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<jint> JniHelper::GetStaticIntField(JNIEnv* env, jclass class_name,
+ jfieldID field_id) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ jint result = env->GetStaticIntField(class_name, field_id);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jbyteArray>> JniHelper::NewByteArray(JNIEnv* env,
+ jsize length) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jbyteArray> result(env->NewByteArray(length), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+Status JniHelper::CallVoidMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ env->CallVoidMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+StatusOr<bool> JniHelper::CallBooleanMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ bool result = env->CallBooleanMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<int32> JniHelper::CallIntMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jint result = env->CallIntMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<int64> JniHelper::CallLongMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jlong result = env->CallLongMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<float> JniHelper::CallFloatMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jfloat result = env->CallFloatMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<double> JniHelper::CallDoubleMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jdouble result = env->CallDoubleMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jintArray>> JniHelper::NewIntArray(JNIEnv* env,
+ jsize length) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jintArray> result(env->NewIntArray(length), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jfloatArray>> JniHelper::NewFloatArray(JNIEnv* env,
+ jsize length) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jfloatArray> result(env->NewFloatArray(length), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+Status JniHelper::SetObjectArrayElement(JNIEnv* env, jobjectArray array,
+ jsize index, jobject val) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ env->SetObjectArrayElement(array, index, val);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+StatusOr<jsize> JniHelper::GetArrayLength(JNIEnv* env, jarray array) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ jsize result = env->GetArrayLength(array);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+Status JniHelper::GetByteArrayRegion(JNIEnv* env, jbyteArray array, jsize start,
+ jsize len, jbyte* buf) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ env->GetByteArrayRegion(array, start, len, buf);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+Status JniHelper::SetByteArrayRegion(JNIEnv* env, jbyteArray array, jsize start,
+ jsize len, const jbyte* buf) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ env->SetByteArrayRegion(array, start, len, buf);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+Status JniHelper::SetIntArrayRegion(JNIEnv* env, jintArray array, jsize start,
+ jsize len, const jint* buf) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ env->SetIntArrayRegion(array, start, len, buf);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+Status JniHelper::SetFloatArrayRegion(JNIEnv* env, jfloatArray array,
+ jsize start, jsize len,
+ const jfloat* buf) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ env->SetFloatArrayRegion(array, start, len, buf);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+StatusOr<ScopedLocalRef<jobjectArray>> JniHelper::NewObjectArray(
+ JNIEnv* env, jsize length, jclass element_class, jobject initial_element) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jobjectArray> result(
+ env->NewObjectArray(length, element_class, initial_element), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jstring>> JniHelper::NewStringUTF(JNIEnv* env,
+ const char* bytes) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jstring> result(env->NewStringUTF(bytes), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<std::string> JByteArrayToString(JNIEnv* env, jbyteArray array) {
+ std::string result;
+ TC3_ASSIGN_OR_RETURN(const int array_length,
+ JniHelper::GetArrayLength(env, array));
+ result.resize(array_length);
+ TC3_RETURN_IF_ERROR(JniHelper::GetByteArrayRegion(
+ env, array, 0, array_length,
+ reinterpret_cast<jbyte*>(const_cast<char*>(result.data()))));
+ return result;
+}
+
+StatusOr<std::string> JStringToUtf8String(JNIEnv* env, jstring jstr) {
+ if (jstr == nullptr) {
+ return "";
+ }
+
+ TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jclass> string_class,
+ JniHelper::FindClass(env, "java/lang/String"));
+ TC3_ASSIGN_OR_RETURN(
+ jmethodID get_bytes_id,
+ JniHelper::GetMethodID(env, string_class.get(), "getBytes",
+ "(Ljava/lang/String;)[B"));
+
+ TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> encoding,
+ JniHelper::NewStringUTF(env, "UTF-8"));
+
+ TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jbyteArray> array,
+ JniHelper::CallObjectMethod<jbyteArray>(
+ env, jstr, get_bytes_id, encoding.get()));
+
+ return JByteArrayToString(env, array.get());
+}
+
+} // namespace libtextclassifier3
diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.h b/icing/text_classifier/lib3/utils/java/jni-helper.h
new file mode 100644
index 0000000..907ad0d
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-helper.h
@@ -0,0 +1,189 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Utility class that provides calls similar to JNIEnv's, but performs
+// additional checks on them, so that it is harder to use them incorrectly.
+
+#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_
+#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_
+
+#include <jni.h>
+
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+#define TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN \
+ if (!EnsureLocalCapacity(env, 1)) { \
+ TC3_LOG(ERROR) << "EnsureLocalCapacity(1) failed."; \
+ return {Status::UNKNOWN}; \
+ }
+
+#define TC3_NO_EXCEPTION_OR_RETURN \
+ if (JniExceptionCheckAndClear(env)) { \
+ return {Status::UNKNOWN}; \
+ }
+
+#define TC3_NOT_NULL_OR_RETURN \
+ if (result == nullptr) { \
+ return {Status::UNKNOWN}; \
+ }
+
+#define TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD( \
+ METHOD_NAME, RETURN_TYPE, INPUT_TYPE, POST_CHECK) \
+ template <typename T = RETURN_TYPE> \
+ static StatusOr<ScopedLocalRef<T>> METHOD_NAME( \
+ JNIEnv* env, INPUT_TYPE object, jmethodID method_id, ...) { \
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; \
+ \
+ va_list args; \
+ va_start(args, method_id); \
+ ScopedLocalRef<T> result( \
+ reinterpret_cast<T>(env->METHOD_NAME##V(object, method_id, args)), \
+ env); \
+ POST_CHECK \
+ va_end(args); \
+ \
+ TC3_NO_EXCEPTION_OR_RETURN; \
+ return result; \
+ }
+
+#define TC3_JNI_NO_CHECK \
+ {}
+
+namespace libtextclassifier3 {
+
+class JniHelper {
+ public:
+ // Misc methods.
+ static StatusOr<ScopedLocalRef<jclass>> FindClass(JNIEnv* env,
+ const char* class_name);
+
+ static StatusOr<ScopedLocalRef<jclass>> GetObjectClass(JNIEnv* env,
+ jobject object);
+
+ template <typename T = jobject>
+ static StatusOr<ScopedLocalRef<T>> GetObjectArrayElement(JNIEnv* env,
+ jobjectArray array,
+ jsize index);
+ static StatusOr<jmethodID> GetMethodID(JNIEnv* env, jclass clazz,
+ const char* method_name,
+ const char* signature);
+ static StatusOr<jmethodID> GetStaticMethodID(JNIEnv* env, jclass clazz,
+ const char* method_name,
+ const char* signature);
+
+ static StatusOr<jfieldID> GetFieldID(JNIEnv* env, jclass clazz,
+ const char* field_name,
+ const char* signature);
+ static StatusOr<jfieldID> GetStaticFieldID(JNIEnv* env, jclass clazz,
+ const char* field_name,
+ const char* signature);
+
+ static StatusOr<ScopedLocalRef<jobject>> GetStaticObjectField(
+ JNIEnv* env, jclass class_name, jfieldID field_id);
+ static StatusOr<jint> GetStaticIntField(JNIEnv* env, jclass class_name,
+ jfieldID field_id);
+
+ // New* methods.
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(NewObject, jobject, jclass,
+ TC3_NOT_NULL_OR_RETURN);
+ static StatusOr<ScopedLocalRef<jobjectArray>> NewObjectArray(
+ JNIEnv* env, jsize length, jclass element_class,
+ jobject initial_element = nullptr);
+ static StatusOr<ScopedLocalRef<jbyteArray>> NewByteArray(JNIEnv* env,
+ jsize length);
+ static StatusOr<ScopedLocalRef<jintArray>> NewIntArray(JNIEnv* env,
+ jsize length);
+ static StatusOr<ScopedLocalRef<jstring>> NewStringUTF(JNIEnv* env,
+ const char* bytes);
+ static StatusOr<ScopedLocalRef<jfloatArray>> NewFloatArray(JNIEnv* env,
+ jsize length);
+
+ static StatusOr<jsize> GetArrayLength(JNIEnv* env, jarray array);
+
+ static Status SetObjectArrayElement(JNIEnv* env, jobjectArray array,
+ jsize index, jobject val);
+
+ static Status GetByteArrayRegion(JNIEnv* env, jbyteArray array, jsize start,
+ jsize len, jbyte* buf);
+
+ static Status SetByteArrayRegion(JNIEnv* env, jbyteArray array, jsize start,
+ jsize len, const jbyte* buf);
+
+ static Status SetIntArrayRegion(JNIEnv* env, jintArray array, jsize start,
+ jsize len, const jint* buf);
+
+ static Status SetFloatArrayRegion(JNIEnv* env, jfloatArray array, jsize start,
+ jsize len, const jfloat* buf);
+
+ // Call* methods.
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallObjectMethod, jobject,
+ jobject, TC3_JNI_NO_CHECK);
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallStaticObjectMethod,
+ jobject, jclass,
+ TC3_JNI_NO_CHECK);
+ static Status CallVoidMethod(JNIEnv* env, jobject object, jmethodID method_id,
+ ...);
+ static StatusOr<bool> CallBooleanMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<int32> CallIntMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<int64> CallLongMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<float> CallFloatMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<double> CallDoubleMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+
+ template <class T>
+ static StatusOr<T> CallStaticIntMethod(JNIEnv* env, jclass clazz,
+ jmethodID method_id, ...);
+};
+
+template <typename T>
+StatusOr<ScopedLocalRef<T>> JniHelper::GetObjectArrayElement(JNIEnv* env,
+ jobjectArray array,
+ jsize index) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<T> result(
+ reinterpret_cast<T>(env->GetObjectArrayElement(array, index)), env);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+template <class T>
+StatusOr<T> JniHelper::CallStaticIntMethod(JNIEnv* env, jclass clazz,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jint result = env->CallStaticIntMethodV(clazz, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+// Converts Java byte[] object to std::string.
+StatusOr<std::string> JByteArrayToString(JNIEnv* env, jbyteArray array);
+
+// Converts Java String object to UTF8-encoded std::string.
+StatusOr<std::string> JStringToUtf8String(JNIEnv* env, jstring jstr);
+
+} // namespace libtextclassifier3
+
+#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_
diff --git a/icing/tokenization/icu/icu-language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc
new file mode 100644
index 0000000..0ef1824
--- /dev/null
+++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc
@@ -0,0 +1,55 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/icu/icu-language-segmenter.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+namespace {
+constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
+} // namespace
+
+// Creates a language segmenter with the given locale.
+//
+// Returns:
+// A LanguageSegmenter on success
+// INVALID_ARGUMENT if locale string is invalid
+//
+// TODO(samzheng): Figure out if we want to verify locale strings and notify
+// users. Right now illegal locale strings will be ignored by ICU. ICU
+// components will be created with its default locale.
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions options) {
+ // Word connector rules for "en_US_POSIX" (American English (Computer)) are
+ // different from other locales. E.g. "email.subject" will be split into 3
+ // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
+ // term in other locales. Our current LanguageSegmenter doesn't handle this
+ // special rule, so we replace it with "en_US".
+ if (options.locale == kLocaleAmericanEnglishComputer) {
+ ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer
+ << " not supported. Converting to locale " << ULOC_US;
+ options.locale = ULOC_US;
+ }
+ return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale));
+}
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
new file mode 100644
index 0000000..d43a78d
--- /dev/null
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -0,0 +1,261 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/icu/icu-language-segmenter.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/status-macros.h"
+#include "unicode/ubrk.h"
+#include "unicode/uchar.h"
+#include "unicode/umachine.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+constexpr char kASCIISpace = ' ';
+} // namespace
+
+class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
+ public:
+ // Factory function to create a segment iterator based on the given locale.
+ //
+ // Returns:
+ // An iterator on success
+ // INTERNAL_ERROR if unable to create
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<LanguageSegmenter::Iterator>>
+ Create(std::string_view text, std::string_view locale) {
+ std::unique_ptr<IcuLanguageSegmenterIterator> iterator(
+ new IcuLanguageSegmenterIterator(text, locale));
+ if (iterator->Initialize()) {
+ return iterator;
+ }
+ return absl_ports::InternalError("Unable to create a term iterator");
+ }
+
+ ~IcuLanguageSegmenterIterator() {
+ ubrk_close(break_iterator_);
+ utext_close(&u_text_);
+ }
+
+ // Advances to the next term. Returns false if it has reached the end.
+ bool Advance() override {
+ // Prerequisite check
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ return false;
+ }
+
+ if (term_end_index_exclusive_ == 0) {
+ // First Advance() call
+ term_start_index_ = ubrk_first(break_iterator_);
+ } else {
+ term_start_index_ = term_end_index_exclusive_;
+ }
+ term_end_index_exclusive_ = ubrk_next(break_iterator_);
+
+ // Reached the end
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
+ return false;
+ }
+
+ if (!IsValidSegment()) {
+ return Advance();
+ }
+ return true;
+ }
+
+ // Returns the current term. It can be called only when Advance() returns
+ // true.
+ std::string_view GetTerm() const override {
+ int term_length = term_end_index_exclusive_ - term_start_index_;
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ term_length = 0;
+ } else if (text_[term_start_index_] == kASCIISpace) {
+ // Rule 3: multiple continuous whitespaces are treated as one.
+ term_length = 1;
+ }
+ return text_.substr(term_start_index_, term_length);
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
+ term_start_index_ = ubrk_following(break_iterator_, offset);
+ if (term_start_index_ == UBRK_DONE) {
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ term_end_index_exclusive_ = ubrk_next(break_iterator_);
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ if (!IsValidSegment()) {
+ if (!Advance()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ }
+ return term_start_index_;
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
+ ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset));
+ if (term_end_index_exclusive_ > offset) {
+ // This term ends after offset. So we need to get the term just before
+ // this one.
+ ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(term_start_index_));
+ }
+ return term_start_index_;
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ term_start_index_ = 0;
+ term_end_index_exclusive_ = 0;
+ if (!Advance()) {
+ return absl_ports::NotFoundError("");
+ }
+ return term_start_index_;
+ }
+
+ private:
+ explicit IcuLanguageSegmenterIterator(std::string_view text,
+ std::string_view locale)
+ : break_iterator_(nullptr),
+ text_(text),
+ locale_(locale),
+ u_text_(UTEXT_INITIALIZER),
+ term_start_index_(0),
+ term_end_index_exclusive_(0) {}
+
+ // Returns true on success
+ bool Initialize() {
+ UErrorCode status = U_ZERO_ERROR;
+ utext_openUTF8(&u_text_, text_.data(), /*length=*/-1, &status);
+ break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
+ /*textLength=*/0, &status);
+ ubrk_setUText(break_iterator_, &u_text_, &status);
+ return !U_FAILURE(status);
+ }
+
+ libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) {
+ term_start_index_ = ubrk_preceding(break_iterator_, offset);
+ if (term_start_index_ == UBRK_DONE) {
+ MarkAsDone();
+ return absl_ports::NotFoundError("");
+ }
+ term_end_index_exclusive_ = ubrk_next(break_iterator_);
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
+ return absl_ports::NotFoundError("");
+ }
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Ensures that all members are consistent with the 'Done' state.
+ // In the 'Done' state, term_start_index_ will point to the first character
+ // and term_end_index_exclusive_ will be marked with the kDone value.
+ // break_iterator_ may be in any state.
+ void MarkAsDone() {
+ term_end_index_exclusive_ = UBRK_DONE;
+ term_start_index_ = 0;
+ }
+
+ bool IsValidSegment() const {
+ // Rule 1: all ASCII terms will be returned.
+    // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[term_start_index_])) {
+ return true;
+ }
+
+ UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
+ term_start_index_);
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (u_isUAlphabetic(uchar32)) {
+ return true;
+ }
+ return false;
+ }
+
+ // The underlying class that does the segmentation, ubrk_close() must be
+ // called after using.
+ UBreakIterator* break_iterator_;
+
+ // Text to be segmented
+ std::string_view text_;
+
+ // Locale of the input text, used to help segment more accurately. If a
+ // wrong locale is set, text could probably still be segmented correctly
+ // because the default break iterator behavior is used for most locales.
+ std::string_view locale_;
+
+ // A thin wrapper around the input UTF8 text, needed by break_iterator_.
+ // utext_close() must be called after using.
+ UText u_text_;
+
+ // The start and end indices are used to track the positions of current
+ // term.
+ int term_start_index_;
+ int term_end_index_exclusive_;
+};
+
+IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale)
+ : locale_(std::move(locale)) {}
+
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+IcuLanguageSegmenter::Segment(const std::string_view text) const {
+ return IcuLanguageSegmenterIterator::Create(text, locale_);
+}
+
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
+ Segment(text));
+ std::vector<std::string_view> terms;
+ while (iterator->Advance()) {
+ terms.push_back(iterator->GetTerm());
+ }
+ return terms;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/icu/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h
new file mode 100644
index 0000000..4115461
--- /dev/null
+++ b/icing/tokenization/icu/icu-language-segmenter.h
@@ -0,0 +1,79 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+// This class is used to segment sentences into words based on rules
+// (https://unicode.org/reports/tr29/#Word_Boundaries) and language
+// understanding. Based on the basic segmentation done by UBreakIterator,
+// some extra rules are applied in this class:
+//
+// 1. All ASCII terms will be returned.
+// 2. For non-ASCII terms, only the alphabetic terms are returned, which means
+// non-ASCII punctuation and special characters are left out.
+// 3. Multiple continuous whitespaces are treated as one.
+//
+// The rules above are common to the high-level tokenizers that might use this
+// class. Other special tokenization logic will be in each tokenizer.
+class IcuLanguageSegmenter : public LanguageSegmenter {
+ public:
+ explicit IcuLanguageSegmenter(std::string locale);
+
+ IcuLanguageSegmenter(const IcuLanguageSegmenter&) = delete;
+ IcuLanguageSegmenter& operator=(const IcuLanguageSegmenter&) = delete;
+
+ // The segmentation depends on the language detected in the input text.
+ //
+ // Note: It could happen that the language detected from text is wrong, then
+ // there would be a small chance that the text is segmented incorrectly.
+ //
+ // Returns:
+ // An iterator of terms on success
+ // INTERNAL_ERROR if any error occurs
+ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ Segment(std::string_view text) const override;
+
+ // The segmentation depends on the language detected in the input text.
+ //
+ // Note: It could happen that the language detected from text is wrong, then
+ // there would be a small chance that the text is segmented incorrectly.
+ //
+ // Returns:
+ // A list of terms on success
+ // INTERNAL_ERROR if any error occurs
+ libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
+ std::string_view text) const override;
+
+ private:
+ // Used to help segment text
+ const std::string locale_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
new file mode 100644
index 0000000..31c2726
--- /dev/null
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -0,0 +1,1016 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+// Returns a vector containing all terms retrieved by Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsAdvance(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling
+// ResetToStart/ResetAfter with the current position to simulate Advancing on
+// the iterator.
+std::vector<std::string_view> GetAllTermsResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ if (!itr->ResetToStart().ok()) {
+ return terms;
+ }
+ terms.push_back(itr->GetTerm());
+ const char* text_begin = itr->GetTerm().data();
+ // Calling ResetToTermStartingAfter with the current position should get the
+ // very next term in the sequence.
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by alternating calls to
+// Advance and calls to ResetAfter with the current position to simulate
+// Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ std::vector<std::string_view> terms;
+
+ bool is_ok = true;
+ int current_pos = 0;
+ while (is_ok) {
+ // Alternate between using Advance and ResetToTermAfter.
+ if (terms.size() % 2 == 0) {
+ is_ok = itr->Advance();
+ } else {
+ // Calling ResetToTermStartingAfter with the current position should get
+ // the very next term in the sequence.
+ current_pos = itr->GetTerm().data() - text_begin;
+ is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
+ }
+ if (is_ok) {
+ terms.push_back(itr->GetTerm());
+ }
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetBefore with
+// the current position, starting at the end of the text. This vector should be
+// in reverse order of GetAllTerms and missing the last term.
+std::vector<std::string_view> GetAllTermsResetBefore(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ int last_pos = 0;
+ while (itr->Advance()) {
+ last_pos = itr->GetTerm().data() - text_begin;
+ }
+ std::vector<std::string_view> terms;
+ // Calling ResetToTermEndingBefore with the current position should get the
+ // previous term in the sequence.
+ for (int current_pos = last_pos;
+ itr->ResetToTermEndingBefore(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+class IcuLanguageSegmenterAllLocalesTest
+ : public testing::TestWithParam<const char*> {
+ protected:
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ static std::string GetLocale() { return GetParam(); }
+ static language_segmenter_factory::SegmenterOptions GetOptions() {
+ return language_segmenter_factory::SegmenterOptions(GetLocale());
+ }
+};
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII punctuation marks are kept
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("Hello, World!!!"),
+ IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+ IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+ IsOkAndHolds(ElementsAre("100", "%")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
+ IsOkAndHolds(ElementsAre("A", "&", "B")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII special characters are kept
+ EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
+ IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
+ IsOkAndHolds(ElementsAre("A", "+", "B")));
+ // 0x0009 is the unicode for tab (within ASCII range).
+ std::string text_with_tab = absl_ports::StrCat(
+ "Hello", UCharToString(0x0009), UCharToString(0x0009), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
+ IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009),
+ UCharToString(0x0009), "World")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Full-width (non-ASCII) punctuation marks and special characters are left
+ // out.
+ EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
+ IsOkAndHolds(ElementsAre("Hello")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
+ IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
+ IsOkAndHolds(ElementsAre("I.B.M", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
+ IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
+ IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // According to unicode word break rules
+ // WB6(https://unicode.org/reports/tr29/#WB6),
+ // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
+ // punctuation characters are used as word connectors. That is, words don't
+ // break before and after them. Here we just test some that we care about.
+
+ // Word connectors
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
+ IsOkAndHolds(ElementsAre("com.google.android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com:google:android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
+ IsOkAndHolds(ElementsAre("com'google'android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
+ IsOkAndHolds(ElementsAre("com_google_android")));
+
+ // Word connectors can be mixed
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
+ IsOkAndHolds(ElementsAre("com.google.android:icing")));
+
+ // Any leading and trailing characters are not connectors
+ EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
+ IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
+
+ // Not word connectors
+ EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
+ IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
+ IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
+ IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
+ IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
+ IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
+ IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
+ IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
+ IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
+ IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com\"google\"android"),
+ IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
+ IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
+ IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
+ IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
+ IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
+ // 0x2019 is the single right quote, should be treated the same as "'"
+ std::string token_with_quote =
+ absl_ports::StrCat("He", UCharToString(0x2019), "ll");
+ std::string text_with_quote =
+ absl_ports::StrCat(token_with_quote, " be back.");
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms(text_with_quote),
+ IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
+ IsOkAndHolds(ElementsAre("(", "Hello", ")")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
+ IsOkAndHolds(ElementsAre(")", "Hello", "(")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
+ IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
+ IsOkAndHolds(ElementsAre("'", "Hello", "'")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+ IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+ IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+ IsOkAndHolds(ElementsAre("3,456.789")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+ IsOkAndHolds(ElementsAre("-", "123")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ const int kNumSeparators = 256;
+ std::string text_with_spaces =
+ absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+
+ // Multiple continuous whitespaces are treated as one. Whitespace at the
+ // beginning of the text doesn't affect the results of GetTerm() after the
+ // iterator is done.
+ text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
+ "Hello", " ", "World");
+ ICING_ASSERT_OK_AND_ASSIGN(auto itr,
+ language_segmenter->Segment(text_with_spaces));
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World"));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
+ // have whitespaces as word delimiter.
+
+ // Chinese
+ EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
+ // Japanese
+ EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
+ "い", "てい", "ます")));
+ // Khmer
+ EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
+ // Thai
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
+ IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
+}
+
+// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
+TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Turkish
+ EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
+ IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
+ // Korean
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
+}
+
+// TODO(samzheng): more mixed languages test cases
+TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
+ IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
+ "吗", "お", "元気", "です", "か")));
+
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 California에 산다"),
+ IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Validates that the input strings are not copied
+ const std::string text = "Hello World";
+ const char* word1_address = text.c_str();
+ const char* word2_address = text.c_str() + 6;
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(text));
+ ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+ const char* word1_result_address = terms.at(0).data();
+ const char* word2_result_address = terms.at(2).data();
+
+ // The underlying char* should be the same
+ EXPECT_THAT(word1_address, Eq(word1_result_address));
+ EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ ASSERT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+}
+
+// Tests that ResetToTermAfter and Advance produce the same output. With the
+// exception of the first term which is inaccessible via ResetToTermAfter,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermAfter calls with the current position
+// provided as the argument.
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ThaiResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ KoreanResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as
+// ResetToTermAfter(current_position) can be used to simulate Advance, users
+// should be able to mix ResetToTermAfter(current_position) calls and Advance
+// calls to mimic calling Advance.
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ThaiResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ KoreanResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->GetTerm(), Eq("你好"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->GetTerm(), Eq("吗"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ContinuousWhitespacesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiters. This case tests Chinese.
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("每天"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("走路"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ ASSERT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+}
+
+// Tests that ResetToTermBefore and Advance produce the same output. With the
+// exception of the last term, which is inaccessible via ResetToTermBefore,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermBefore calls with the current position
+// provided as the argument (after their order has been reversed).
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty());
+ EXPECT_THAT(advance_itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ThaiResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ KoreanResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("元気"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->GetTerm(), Eq("です"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ContinuousWhitespacesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("我"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("去"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->GetTerm(), Eq("てい"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ LocaleName, IcuLanguageSegmenterAllLocalesTest,
+ testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
+ ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
+ ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE,
+ ULOC_TRADITIONAL_CHINESE,
+ "es_ES", // Spanish
+ "hi_IN", // Hindi
+ "th_TH", // Thai
+ "lo_LA", // Lao
+ "km_KH", // Khmer
+ "ar_DZ", // Arabic
+ "ru_RU", // Russian
+ "pt_PT", // Portuguese
+                    "en_US_POSIX",   // American English (Computer)
+                    "wrong_locale",  // Will fall back to ICU default locale
+                    ""               // Will fall back to ICU default locale
+ ));
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
new file mode 100644
index 0000000..ce50d0b
--- /dev/null
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -0,0 +1,56 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_LANGUAGE_SEGMENTER_FACTORY_H_
+#define ICING_TOKENIZATION_LANGUAGE_SEGMENTER_FACTORY_H_
+
+#include <memory>
+#include <string_view>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/i18n-utils.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+struct SegmenterOptions {
+ explicit SegmenterOptions(std::string locale = ULOC_US,
+ const JniCache* jni_cache = nullptr)
+ : locale(std::move(locale)), jni_cache(jni_cache) {}
+
+ std::string locale;
+
+ // Does not hold ownership.
+ const JniCache* jni_cache;
+};
+
+// Creates a language segmenter with the given locale.
+//
+// Returns:
+// A LanguageSegmenter on success
+// INVALID_ARGUMENT if locale string is invalid
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions options = SegmenterOptions());
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_LANGUAGE_SEGMENTER_FACTORY_H_
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
new file mode 100644
index 0000000..c7b068d
--- /dev/null
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -0,0 +1,173 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+// We have a separate test just for the language segmenter iterators because we
+// don't need to stress test the implementation's definition of a term. These
+// test that it advances and traverses through simple terms consistently between
+// all the implementations.
+class LanguageSegmenterIteratorTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+};
+
+TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
+ language_segmenter->Segment("foo bar"));
+
+ EXPECT_TRUE(iterator->Advance());
+ EXPECT_THAT(iterator->GetTerm(), Eq("foo"));
+
+ EXPECT_TRUE(iterator->Advance());
+ EXPECT_THAT(iterator->GetTerm(), Eq(" "));
+
+ EXPECT_TRUE(iterator->Advance());
+ EXPECT_THAT(iterator->GetTerm(), Eq("bar"));
+
+ EXPECT_FALSE(iterator->Advance());
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermStartingAfterWithOffsetInText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
+ language_segmenter->Segment("foo bar"));
+
+ EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/0),
+ IsOkAndHolds(3)); // The term " "
+ EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/3),
+ IsOkAndHolds(4)); // The term "bar"
+ EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/4),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermStartingAfterWithNegativeOffsetNotOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
+ language_segmenter->Segment("foo bar"));
+
+ EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0));
+ EXPECT_THAT(iterator->GetTerm(), Eq("foo"));
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
+ std::string text = "foo bar";
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
+
+ EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
+ std::string text = "foo bar";
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
+
+ EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
+ language_segmenter->Segment("foo bar"));
+
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/6),
+ IsOkAndHolds(3)); // The term " "
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/3),
+ IsOkAndHolds(0)); // The term "foo"
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeWithZeroNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
+ language_segmenter->Segment("foo bar"));
+
+ // Zero is a valid argument, but there aren't any terms that end before it.
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
+ language_segmenter->Segment("foo bar"));
+
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
+ std::string text = "foo bar";
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
+
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter.cc b/icing/tokenization/language-segmenter.cc
deleted file mode 100644
index 3dde330..0000000
--- a/icing/tokenization/language-segmenter.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/language-segmenter.h"
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <utility>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/util/i18n-utils.h"
-#include "icing/util/status-macros.h"
-#include "unicode/ubrk.h"
-#include "unicode/uchar.h"
-#include "unicode/umachine.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-constexpr char kASCIISpace = ' ';
-constexpr char kLocaleAmericanEnglishComputer[] = "en_US_POSIX";
-} // namespace
-
-LanguageSegmenter::LanguageSegmenter(std::string locale)
- : locale_(std::move(locale)) {}
-
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>>
-LanguageSegmenter::Create(const std::string& locale) {
- // TODO(samzheng): Figure out if we want to verify locale strings and notify
- // users. Right now illegal locale strings will be ignored by ICU. ICU
- // components will be created with its default locale.\
-
- // Word connector rules for "en_US_POSIX" (American English (Computer)) are
- // different from other locales. E.g. "email.subject" will be split into 3
- // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
- // term in other locales. Our current LanguageSegmenter doesn't handle this
- // special rule, so we replace it with "en_US".
- if (locale == kLocaleAmericanEnglishComputer) {
- return std::unique_ptr<LanguageSegmenter>(new LanguageSegmenter(ULOC_US));
- }
- return std::unique_ptr<LanguageSegmenter>(new LanguageSegmenter(locale));
-}
-
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-LanguageSegmenter::Segment(const std::string_view text) const {
- return LanguageSegmenter::Iterator::Create(text, locale_);
-}
-
-libtextclassifier3::StatusOr<std::vector<std::string_view>>
-LanguageSegmenter::GetAllTerms(const std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Iterator> iterator, Segment(text));
- std::vector<std::string_view> terms;
- while (iterator->Advance()) {
- terms.push_back(iterator->GetTerm());
- }
- return terms;
-}
-
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-LanguageSegmenter::Iterator::Create(std::string_view text,
- const std::string locale) {
- std::unique_ptr<Iterator> iterator(new Iterator(text, std::move(locale)));
- if (iterator->Initialize()) {
- return iterator;
- }
- return absl_ports::InternalError("Unable to create a term iterator");
-}
-
-LanguageSegmenter::Iterator::Iterator(const std::string_view text,
- const std::string&& locale)
- : break_iterator_(nullptr),
- text_(text),
- locale_(std::move(locale)),
- u_text_(UTEXT_INITIALIZER),
- term_start_index_(0),
- term_end_index_exclusive_(0) {}
-
-LanguageSegmenter::Iterator::~Iterator() {
- ubrk_close(break_iterator_);
- utext_close(&u_text_);
-}
-
-bool LanguageSegmenter::Iterator::Initialize() {
- UErrorCode status = U_ZERO_ERROR;
- utext_openUTF8(&u_text_, text_.data(), /*length=*/-1, &status);
- break_iterator_ = ubrk_open(UBRK_WORD, locale_.c_str(), /*text=*/nullptr,
- /*textLength=*/0, &status);
- ubrk_setUText(break_iterator_, &u_text_, &status);
- return !U_FAILURE(status);
-}
-
-bool LanguageSegmenter::Iterator::Advance() {
- // Prerequisite check
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return false;
- }
-
- if (term_end_index_exclusive_ == 0) {
- // First Advance() call
- term_start_index_ = ubrk_first(break_iterator_);
- } else {
- term_start_index_ = term_end_index_exclusive_;
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
-
- // Reached the end
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return false;
- }
-
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[term_start_index_])) {
- return true;
- }
-
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), term_start_index_);
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (u_isUAlphabetic(uchar32)) {
- return true;
- } else {
- return Advance();
- }
-}
-
-std::string_view LanguageSegmenter::Iterator::GetTerm() const {
- if (text_[term_start_index_] == kASCIISpace) {
- // Rule 3: multiple continuous whitespaces are treated as one.
- return std::string_view(&text_[term_start_index_], 1);
- }
- return text_.substr(term_start_index_,
- term_end_index_exclusive_ - term_start_index_);
-}
-
-libtextclassifier3::StatusOr<int32_t>
-LanguageSegmenter::Iterator::ResetToTermStartingAfter(int32_t offset) {
- term_start_index_ = ubrk_following(break_iterator_, offset);
- if (term_start_index_ == UBRK_DONE) {
- return absl_ports::NotFoundError("");
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return absl_ports::NotFoundError("");
- }
- return term_start_index_;
-}
-
-libtextclassifier3::Status
-LanguageSegmenter::Iterator::ResetToTermStartingBefore(int32_t offset) {
- term_start_index_ = ubrk_preceding(break_iterator_, offset);
- if (term_start_index_ == UBRK_DONE) {
- return absl_ports::NotFoundError("");
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return absl_ports::NotFoundError("");
- }
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::StatusOr<int32_t>
-LanguageSegmenter::Iterator::ResetToTermEndingBefore(int32_t offset) {
- ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset));
- if (term_end_index_exclusive_ > offset) {
- // This term ends after offset. So we need to get the term just before this
- // one.
- ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(term_start_index_));
- }
- return term_start_index_;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h
index 2e10751..fdb1846 100644
--- a/icing/tokenization/language-segmenter.h
+++ b/icing/tokenization/language-segmenter.h
@@ -17,42 +17,26 @@
#include <cstdint>
#include <memory>
-#include <string>
#include <string_view>
#include <vector>
-#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "unicode/ubrk.h"
-#include "unicode/uloc.h"
namespace icing {
namespace lib {
-// This class is used to segment sentences into words based on rules
-// (https://unicode.org/reports/tr29/#Word_Boundaries) and language
-// understanding. Based on the basic segmentation done by UBreakIterator,
-// some extra rules are applied in this class:
+// A base class that all other LanguageSegmenters should inherit from. It
+// provides interfaces that allow callers to segment text. The return value
+// could be an iterator or a list of tokens. Example usage:
//
-// 1. All ASCII terms will be returned.
-// 2. For non-ASCII terms, only the alphabetic terms are returned, which means
-// non-ASCII punctuation and special characters are left out.
-// 3. Multiple continuous whitespaces are treated as one.
-//
-// The rules above are common to the high-level tokenizers that might use this
-// class. Other special tokenization logic will be in each tokenizer.
+// std::unique_ptr<LanguageSegmenter> segmenter = GetSegmenter();
+// ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iter,
+// segmenter->Segment(text));
+// ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> segments,
+// segmenter->GetAllTerms(text));
class LanguageSegmenter {
public:
- LanguageSegmenter(const LanguageSegmenter&) = delete;
- LanguageSegmenter& operator=(const LanguageSegmenter&) = delete;
-
- // Creates a language segmenter with the given locale.
- //
- // Returns:
- // A LanguageSegmenter on success
- // INVALID_ARGUMENT if locale string is invalid
- static libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>>
- Create(const std::string& locale = ULOC_US);
+ virtual ~LanguageSegmenter() = default;
// An iterator helping to find terms in the input text.
// Example usage:
@@ -63,85 +47,65 @@
// }
class Iterator {
public:
- // Factory function to create a segment iterator based on the given locale.
- //
- // Returns:
- // An iterator on success
- // INTERNAL_ERROR if unable to create
- static libtextclassifier3::StatusOr<
- std::unique_ptr<LanguageSegmenter::Iterator>>
- Create(std::string_view text, const std::string locale);
-
- ~Iterator();
+ virtual ~Iterator() = default;
// Advances to the next term. Returns false if it has reached the end.
- bool Advance();
+ virtual bool Advance() = 0;
// Returns the current term. It can be called only when Advance() returns
// true.
- std::string_view GetTerm() const;
+ virtual std::string_view GetTerm() const = 0;
// Resets the iterator to point to the first term that starts after offset.
- // GetTerm will now return that term.
+ // GetTerm will now return that term. For example:
+ //
+ // language_segmenter = language_segmenter_factory::Create(type);
+ // iterator = language_segmenter->Segment("foo bar baz");
+ // iterator.ResetToTermStartingAfter(4);
+ // iterator.GetTerm() // returns "baz";
+ //
+ // Return types of OK and NOT_FOUND indicate that the function call was
+ // valid and the state of the iterator has changed. Return type of
+ // INVALID_ARGUMENT will leave the iterator unchanged.
//
// Returns:
// On success, the starting position of the first term that starts after
// offset.
// NOT_FOUND if an error occurred or there are no terms that start after
// offset.
- libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
- int32_t offset);
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ int32_t offset) = 0;
// Resets the iterator to point to the first term that ends before offset.
- // GetTerm will now return that term.
+ // GetTerm will now return that term. For example:
+ //
+ // language_segmenter = language_segmenter_factory::Create(type);
+ // iterator = language_segmenter->Segment("foo bar baz");
+ // iterator.ResetToTermEndingBefore(7);
+ // iterator.GetTerm() // returns "bar";
+ //
+ // Return types of OK and NOT_FOUND indicate that the function call was
+ // valid and the state of the iterator has changed. Return type of
+ // INVALID_ARGUMENT will leave the iterator unchanged.
//
// Returns:
// On success, the starting position of the first term that ends before
// offset.
// NOT_FOUND if an error occurred or there are no terms that ends before
// offset.
- libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
- int32_t offset);
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ int32_t offset) = 0;
- private:
- Iterator(std::string_view text, const std::string&& locale);
-
- // Returns true on success
- bool Initialize();
-
- // Resets the iterator to point to the first term that starts before offset.
- // GetTerm will now return that term.
- //
- // Returns:
- // OK on success
- // NOT_FOUND if an error occurred or there are no terms that start before
- // offset.
- libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset);
-
- // The underlying class that does the segmentation, ubrk_close() must be
- // called after using.
- UBreakIterator* break_iterator_;
-
- // Text to be segmented
- const std::string_view text_;
-
- // Locale of the input text, used to help segment more accurately. If a
- // wrong locale is set, text could probably still be segmented correctly
- // because the default break iterator behavior is used for most locales.
- const std::string locale_;
-
- // A thin wrapper around the input UTF8 text, needed by break_iterator_.
- // utext_close() must be called after using.
- UText u_text_;
-
- // The start and end indices are used to track the positions of current
- // term.
- int term_start_index_;
- int term_end_index_exclusive_;
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0;
};
- // Segments the input text into terms. The segmentation depends on the
- // language detected in the input text.
+ // Segments the input text into terms.
//
// Returns:
// An iterator of terms on success
@@ -150,14 +114,11 @@
// Note: The underlying char* data of the input string won't be copied but
// shared with the return strings, so please make sure the input string
// outlives the returned iterator.
- //
- // Note: It could happen that the language detected from text is wrong, then
- // there would be a small chance that the text is segmented incorrectly.
- libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
- Segment(std::string_view text) const;
+ virtual libtextclassifier3::StatusOr<
+ std::unique_ptr<LanguageSegmenter::Iterator>>
+ Segment(std::string_view text) const = 0;
- // Segments and returns all terms in the input text. The segmentation depends
- // on the language detected in the input text.
+ // Segments and returns all terms in the input text.
//
// Returns:
// A list of terms on success
@@ -166,17 +127,8 @@
// Note: The underlying char* data of the input string won't be copied but
// shared with the return strings, so please make sure the input string
// outlives the returned terms.
- //
- // Note: It could happen that the language detected from text is wrong, then
- // there would be a small chance that the text is segmented incorrectly.
- libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
- std::string_view text) const;
-
- private:
- explicit LanguageSegmenter(std::string locale);
-
- // Used to help segment text
- const std::string locale_;
+ virtual libtextclassifier3::StatusOr<std::vector<std::string_view>>
+ GetAllTerms(std::string_view text) const = 0;
};
} // namespace lib
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index cf7aa70..49ddfca 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -14,8 +14,10 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
@@ -53,11 +55,12 @@
void BM_SegmentNoSpace(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::string input_string(state.range(0), 'A');
@@ -88,11 +91,12 @@
void BM_SegmentWithSpaces(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::string input_string(state.range(0), 'A');
for (int i = 1; i < input_string.length(); i += 2) {
@@ -126,11 +130,12 @@
void BM_SegmentCJK(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
std::unique_ptr<LanguageSegmenter> language_segmenter =
- LanguageSegmenter::Create().ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::string input_string;
while (input_string.length() < state.range(0)) {
diff --git a/icing/tokenization/language-segmenter_test.cc b/icing/tokenization/language-segmenter_test.cc
deleted file mode 100644
index ce10be9..0000000
--- a/icing/tokenization/language-segmenter_test.cc
+++ /dev/null
@@ -1,336 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/language-segmenter.h"
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/i18n-test-utils.h"
-#include "icing/testing/test-data.h"
-#include "unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-namespace {
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-class LanguageSegmenterAllLocalesTest
- : public testing::TestWithParam<const char*> {
- protected:
- void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
- }
-
- static std::string GetLocale() { return GetParam(); }
-};
-
-TEST_P(LanguageSegmenterAllLocalesTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // ASCII punctuation marks are kept
- EXPECT_THAT(
- language_segmenter->GetAllTerms("Hello, World!!!"),
- IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
- EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
- IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
- EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
- IsOkAndHolds(ElementsAre("100", "%")));
- EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
- IsOkAndHolds(ElementsAre("A", "&", "B")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // ASCII special characters are kept
- EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
- IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
- EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
- IsOkAndHolds(ElementsAre("A", "+", "B")));
- // 0x0009 is the unicode for tab (within ASCII range).
- std::string text_with_tab = absl_ports::StrCat(
- "Hello", UcharToString(0x0009), UcharToString(0x0009), "World");
- EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
- IsOkAndHolds(ElementsAre("Hello", UcharToString(0x0009),
- UcharToString(0x0009), "World")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // Full-width (non-ASCII) punctuation marks and special characters are left
- // out.
- EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, Acronym) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
- IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
- EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
- IsOkAndHolds(ElementsAre("I.B.M", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
- IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
- EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
- IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, WordConnector) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // According to unicode word break rules
- // WB6(https://unicode.org/reports/tr29/#WB6),
- // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
- // punctuation characters are used as word connecters. That is, words don't
- // break before and after them. Here we just test some that we care about.
-
- // Word connecters
- EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
- IsOkAndHolds(ElementsAre("com.google.android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
- IsOkAndHolds(ElementsAre("com:google:android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
- IsOkAndHolds(ElementsAre("com'google'android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
- IsOkAndHolds(ElementsAre("com_google_android")));
-
- // Word connecters can be mixed
- EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
- IsOkAndHolds(ElementsAre("com.google.android:icing")));
-
- // Any heading and trailing characters are not connecters
- EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
- IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
-
- // Not word connecters
- EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
- IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
- IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
- IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
- IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
- IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
- IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
- IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
- IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
- IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
- IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
- EXPECT_THAT(
- language_segmenter->GetAllTerms("com\"google\"android"),
- IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, Apostrophes) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
- IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
- IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
- IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
- IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
- // 0x2019 is the single right quote, should be treated the same as "'"
- std::string token_with_quote =
- absl_ports::StrCat("He", UcharToString(0x2019), "ll");
- std::string text_with_quote =
- absl_ports::StrCat(token_with_quote, " be back.");
- EXPECT_THAT(
- language_segmenter->GetAllTerms(text_with_quote),
- IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
- IsOkAndHolds(ElementsAre("(", "Hello", ")")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
- IsOkAndHolds(ElementsAre(")", "Hello", "(")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, Quotes) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
- IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
- IsOkAndHolds(ElementsAre("'", "Hello", "'")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
- IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(
- language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
- IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
- IsOkAndHolds(ElementsAre("3,456.789")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
- IsOkAndHolds(ElementsAre("-", "123")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // Multiple continuous whitespaces are treated as one.
- const int kNumSeparators = 256;
- const std::string text_with_spaces =
- absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
- EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
- // have whitespaces as word delimiter.
-
- // Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
- // Japanese
- EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
- "い", "てい", "ます")));
- // Khmer
- EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
- // Thai
- EXPECT_THAT(
- language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
- IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
- IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
-}
-
-// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
-TEST_P(LanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // Turkish
- EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
- IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
- // Korean
- EXPECT_THAT(
- language_segmenter->GetAllTerms("나는 매일 출근합니다."),
- IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
-}
-
-// TODO(samzheng): more mixed languages test cases
-TEST_P(LanguageSegmenterAllLocalesTest, MixedLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
- IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
- "吗", "お", "元気", "です", "か")));
-
- EXPECT_THAT(
- language_segmenter->GetAllTerms("나는 California에 산다"),
- IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다")));
-}
-
-TEST_P(LanguageSegmenterAllLocalesTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create(GetLocale()));
- // Validates that the input strings are not copied
- const std::string text = "Hello World";
- const char* word1_address = text.c_str();
- const char* word2_address = text.c_str() + 6;
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
- language_segmenter->GetAllTerms(text));
- ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
- const char* word1_result_address = terms.at(0).data();
- const char* word2_result_address = terms.at(2).data();
-
- // The underlying char* should be the same
- EXPECT_THAT(word1_address, Eq(word1_result_address));
- EXPECT_THAT(word2_address, Eq(word2_result_address));
-}
-
-INSTANTIATE_TEST_SUITE_P(
- LocaleName, LanguageSegmenterAllLocalesTest,
- testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
- ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
- ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE,
- ULOC_TRADITIONAL_CHINESE,
- "es_ES", // Spanish
- "hi_IN", // Hindi
- "th_TH", // Thai
- "lo_LA", // Lao
- "km_KH", // Khmer
- "ar_DZ", // Arabic
- "ru_RU", // Russian
- "pt_PT", // Portuguese
- "en_US_POSIX" // American English (Computer)
- "wrong_locale" // Will fall back to ICU default locale
- "" // Will fall back to ICU default locale
- ));
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index 9dee491..6e54af9 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -20,7 +20,6 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
-#include "unicode/umachine.h"
namespace icing {
namespace lib {
@@ -40,8 +39,8 @@
}
// Gets the first unicode character. We can know what the whole term is by
// checking only the first character.
- UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), 0);
- return !u_isUWhiteSpace(uchar32) && !u_ispunct(uchar32);
+ return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) &&
+ !i18n_utils::IsPunctuationAt(term, /*position=*/0);
}
} // namespace
@@ -97,6 +96,18 @@
return true;
}
+ bool ResetToStart() override {
+ if (!base_iterator_->ResetToStart().ok()) {
+ return false;
+ }
+ current_term_ = base_iterator_->GetTerm();
+ if (!IsValidTerm(current_term_)) {
+ // If the current value isn't valid, advance to the next valid value.
+ return Advance();
+ }
+ return true;
+ }
+
private:
std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_;
std::string_view current_term_;
diff --git a/icing/tokenization/plain-tokenizer.h b/icing/tokenization/plain-tokenizer.h
index f6a4a92..25b40fd 100644
--- a/icing/tokenization/plain-tokenizer.h
+++ b/icing/tokenization/plain-tokenizer.h
@@ -15,6 +15,10 @@
#ifndef ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
#define ICING_TOKENIZATION_PLAIN_TOKENIZER_H_
+#include <memory>
+#include <string_view>
+#include <vector>
+
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/tokenizer.h"
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index 26e7dc8..f2fc678 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -18,9 +18,11 @@
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/i18n-test-utils.h"
+#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
namespace icing {
@@ -34,7 +36,8 @@
void SetUp() override {
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
};
@@ -47,7 +50,7 @@
TEST_F(PlainTokenizerTest, Simple) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -79,7 +82,7 @@
TEST_F(PlainTokenizerTest, Whitespace) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -90,14 +93,14 @@
// 0x0009 is horizontal tab, considered as a whitespace
std::string text_with_horizontal_tab =
- absl_ports::StrCat("Hello", UcharToString(0x0009), "World");
+ absl_ports::StrCat("Hello", UCharToString(0x0009), "World");
EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
EqualsToken(Token::REGULAR, "World"))));
// 0x000B is vertical tab, considered as a whitespace
std::string text_with_vertical_tab =
- absl_ports::StrCat("Hello", UcharToString(0x000B), "World");
+ absl_ports::StrCat("Hello", UCharToString(0x000B), "World");
EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab),
IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
EqualsToken(Token::REGULAR, "World"))));
@@ -105,7 +108,7 @@
TEST_F(PlainTokenizerTest, Punctuation) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -134,7 +137,7 @@
TEST_F(PlainTokenizerTest, SpecialCharacters) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -155,7 +158,7 @@
TEST_F(PlainTokenizerTest, CJKT) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -207,7 +210,7 @@
TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -224,7 +227,7 @@
TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -241,7 +244,7 @@
TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -289,7 +292,7 @@
TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index e811397..8b2edc9 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -14,9 +14,20 @@
#include "icing/tokenization/raw-query-tokenizer.h"
+#include <stddef.h>
+
+#include <cctype>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
@@ -306,7 +317,7 @@
}
// Checks the first char to see if it's an ASCII term
if (i18n_utils::IsAscii(term[0])) {
- if (u_isalnum(term[0])) {
+ if (std::isalnum(term[0])) {
return ALPHANUMERIC_TERM;
}
return OTHER;
@@ -431,7 +442,7 @@
case OUTPUT:
ICING_RETURN_IF_ERROR(
OutputToken(new_state, *current_term, *current_term_type, tokens));
- U_FALLTHROUGH;
+ [[fallthrough]];
case IGNORE:
*current_term = next_term;
*current_term_type = next_term_type;
diff --git a/icing/tokenization/raw-query-tokenizer.h b/icing/tokenization/raw-query-tokenizer.h
index 8706886..6316e45 100644
--- a/icing/tokenization/raw-query-tokenizer.h
+++ b/icing/tokenization/raw-query-tokenizer.h
@@ -15,10 +15,13 @@
#ifndef ICING_TOKENIZATION_RAW_QUERY_TOKENIZER_H_
#define ICING_TOKENIZATION_RAW_QUERY_TOKENIZER_H_
+#include <memory>
#include <string_view>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer.h"
namespace icing {
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 59cda88..351f7c1 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -16,8 +16,10 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
@@ -32,7 +34,8 @@
void SetUp() override {
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
};
@@ -44,7 +47,7 @@
TEST_F(RawQueryTokenizerTest, Simple) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -57,7 +60,7 @@
TEST_F(RawQueryTokenizerTest, Parentheses) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -157,7 +160,7 @@
TEST_F(RawQueryTokenizerTest, Exclustion) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -224,7 +227,7 @@
TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -312,7 +315,7 @@
TEST_F(RawQueryTokenizerTest, OR) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -433,7 +436,7 @@
// here we test Chinese and Japanese to represent CJKT.
TEST_F(RawQueryTokenizerTest, CJKT) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -486,7 +489,7 @@
// so we can choose comma "," to represent all OTHER characters.
TEST_F(RawQueryTokenizerTest, OtherChars) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -531,7 +534,7 @@
TEST_F(RawQueryTokenizerTest, Mix) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- LanguageSegmenter::Create());
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
new file mode 100644
index 0000000..f79bc68
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+namespace {
+constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
+} // namespace
+
+// Creates a language segmenter with the given locale.
+//
+// Returns:
+// A LanguageSegmenter on success
+// INVALID_ARGUMENT if locale string is invalid
+//
+// TODO(samzheng): Figure out if we want to verify locale strings and notify
+// users. Right now illegal locale strings will be ignored by ICU. ICU
+// components will be created with its default locale.
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions options) {
+ if (options.jni_cache == nullptr) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot create Reverse Jni Language Segmenter without a valid JniCache "
+ "pointer");
+ }
+ // Word connector rules for "en_US_POSIX" (American English (Computer)) are
+ // different from other locales. E.g. "email.subject" will be split into 3
+ // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
+ // term in other locales. Our current LanguageSegmenter doesn't handle this
+ // special rule, so we replace it with "en_US".
+ if (options.locale == kLocaleAmericanEnglishComputer) {
+ ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer
+ << " not supported. Converting to locale " << ULOC_US;
+ options.locale = ULOC_US;
+ }
+ return std::make_unique<ReverseJniLanguageSegmenter>(
+ std::move(options.locale), options.jni_cache);
+}
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
new file mode 100644
index 0000000..8392363
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
@@ -0,0 +1,37 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain(
+ JNIEnv* env, jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "reverse-jni-language-segmenter-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
new file mode 100644
index 0000000..a01d944
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
@@ -0,0 +1,1085 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h"
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace test_internal {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+namespace {
+
+language_segmenter_factory::SegmenterOptions GetSegmenterOptions(
+ const std::string& locale, const JniCache* jni_cache) {
+ return language_segmenter_factory::SegmenterOptions(locale, jni_cache);
+}
+
+// Returns a vector containing all terms retrieved by Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsAdvance(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetAfter with
+// the current position to simulate Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ if (!itr->ResetToStart().ok()) {
+ return terms;
+ }
+ terms.push_back(itr->GetTerm());
+ const char* text_begin = itr->GetTerm().data();
+ // Calling ResetToTermStartingAfter with the current position should get the
+ // very next term in the sequence.
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by alternating calls to
+// Advance and calls to ResetAfter with the current position to simulate
+// Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ std::vector<std::string_view> terms;
+
+ bool is_ok = true;
+ int current_pos = 0;
+ while (is_ok) {
+ // Alternate between using Advance and ResetToTermAfter.
+ if (terms.size() % 2 == 0) {
+ is_ok = itr->Advance();
+ } else {
+ // Calling ResetToTermStartingAfter with the current position should get
+ // the very next term in the sequence.
+ current_pos = itr->GetTerm().data() - text_begin;
+ is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
+ }
+ if (is_ok) {
+ terms.push_back(itr->GetTerm());
+ }
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetBefore with
+// the current position, starting at the end of the text. This vector should be
+// in reverse order of GetAllTerms and missing the last term.
+std::vector<std::string_view> GetAllTermsResetBefore(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ int last_pos = 0;
+ while (itr->Advance()) {
+ last_pos = itr->GetTerm().data() - text_begin;
+ }
+ std::vector<std::string_view> terms;
+ // Calling ResetToTermEndingBefore with the current position should get the
+ // previous term in the sequence.
+ for (int current_pos = last_pos;
+ itr->ResetToTermEndingBefore(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+} // namespace
+
+TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, SimpleText) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ASCII_Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // ASCII punctuation marks are kept
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("Hello, World!!!"),
+ IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+ IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+ IsOkAndHolds(ElementsAre("100", "%")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
+ IsOkAndHolds(ElementsAre("A", "&", "B")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ASCII_SpecialCharacter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // ASCII special characters are kept
+ EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
+ IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
+ IsOkAndHolds(ElementsAre("A", "+", "B")));
+ // 0x0009 is the unicode for tab (within ASCII range).
+ std::string text_with_tab = absl_ports::StrCat(
+ "Hello", UCharToString(0x0009), UCharToString(0x0009), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
+ IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009),
+ UCharToString(0x0009), "World")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Full-width (non-ASCII) punctuation marks and special characters are left
+ // out.
+ EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
+ IsOkAndHolds(ElementsAre("Hello")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Acronym) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"),
+ IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
+ IsOkAndHolds(ElementsAre("I.B.M", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
+ IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
+ IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // According to unicode word break rules
+ // WB6(https://unicode.org/reports/tr29/#WB6),
+ // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
+ // punctuation characters are used as word connecters. That is, words don't
+ // break before and after them. Here we just test some that we care about.
+
+ // Word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
+ IsOkAndHolds(ElementsAre("com.google.android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com:google:android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
+ IsOkAndHolds(ElementsAre("com'google'android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
+ IsOkAndHolds(ElementsAre("com_google_android")));
+
+ // Word connecters can be mixed
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
+ IsOkAndHolds(ElementsAre("com.google.android:icing")));
+
+ // Any heading and trailing characters are not connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
+ IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
+
+ // Not word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
+ IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
+ IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
+ IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
+ IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
+ IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
+ IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
+ IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
+ IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
+ IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com\"google\"android"),
+ IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Apostrophes) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
+ IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
+ IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
+ IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
+ IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
+ // 0x2019 is the single right quote, should be treated the same as "'"
+ std::string token_with_quote =
+ absl_ports::StrCat("He", UCharToString(0x2019), "ll");
+ std::string text_with_quote =
+ absl_ports::StrCat(token_with_quote, " be back.");
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms(text_with_quote),
+ IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Parentheses) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
+ IsOkAndHolds(ElementsAre("(", "Hello", ")")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
+ IsOkAndHolds(ElementsAre(")", "Hello", "(")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Quotes) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
+ IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
+ IsOkAndHolds(ElementsAre("'", "Hello", "'")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Alphanumeric) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+ IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Number) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+ IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+ IsOkAndHolds(ElementsAre("3,456.789")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+ IsOkAndHolds(ElementsAre("-", "123")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Multiple continuous whitespaces are treated as one.
+ const int kNumSeparators = 256;
+ std::string text_with_spaces =
+ absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+
+ // Multiple continuous whitespaces are treated as one. Whitespace at the
+ // beginning of the text doesn't affect the results of GetTerm() after the
+ // iterator is done.
+ text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
+ "Hello", " ", "World");
+ ICING_ASSERT_OK_AND_ASSIGN(auto itr,
+ language_segmenter->Segment(text_with_spaces));
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World"));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, CJKT) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
+ // have whitespaces as word delimiter.
+
+ // Chinese
+ EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
+ // Japanese
+ EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
+ "い", "てい", "ます")));
+ // Khmer
+ EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
+ // Thai
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, LatinLettersWithAccents) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
+ IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
+}
+
+// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
+TEST_P(ReverseJniLanguageSegmenterTest, WhitespaceSplitLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Turkish
+ EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
+ IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
+ // Korean
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
+}
+
+// TODO(samzheng): more mixed languages test cases
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
+ IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
+ "吗", "お", "元気", "です", "か")));
+
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 California에 산다"),
+ IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Validates that the input strings are not copied
+ const std::string text = "Hello World";
+ const char* word1_address = text.c_str();
+ const char* word2_address = text.c_str() + 6;
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(text));
+ ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+ const char* word1_result_address = terms.at(0).data();
+ const char* word2_result_address = terms.at(2).data();
+
+ // The underlying char* should be the same
+ EXPECT_THAT(word1_address, Eq(word1_result_address));
+ EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ ASSERT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+}
+
+// Tests that ResetToTermAfter and Advance produce the same output. With the
+// exception of the first term which is inacessible via ResetToTermAfter,
+// the stream of terms produced by Advance calls should exacly match the
+// terms produced by ResetToTermAfter calls with the current position
+// provided as the argument.
+TEST_P(ReverseJniLanguageSegmenterTest,
+ MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ThaiResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ KoreanResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as
+// ResetToTermAfter(current_position) can be used to simulate Advance, users
+// should be able to mix ResetToTermAfter(current_position) calls and Advance
+// calls to mimic calling Advance.
+TEST_P(ReverseJniLanguageSegmenterTest,
+ MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ThaiResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ KoreanResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->GetTerm(), Eq("你好"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->GetTerm(), Eq("吗"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("每天"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("走路"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ ASSERT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+}
+
+// Tests that ResetToTermBefore and Advance produce the same output. With the
+// exception of the last term which is inaccessible via ResetToTermBefore,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermBefore calls with the current position
+// provided as the argument (after their order has been reversed).
+TEST_P(ReverseJniLanguageSegmenterTest,
+ MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty());
+ EXPECT_THAT(advance_itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ThaiResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ KoreanResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("元気"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->GetTerm(), Eq("です"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ContinuousWhitespacesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("我"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("去"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->GetTerm(), Eq("てい"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ LocaleName, ReverseJniLanguageSegmenterTest,
+ testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
+ ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
+ ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE,
+ ULOC_TRADITIONAL_CHINESE,
+ "es_ES", // Spanish
+ "hi_IN", // Hindi
+ "th_TH", // Thai
+ "lo_LA", // Lao
+ "km_KH", // Khmer
+ "ar_DZ", // Arabic
+ "ru_RU", // Russian
+ "pt_PT", // Portuguese
+                    "es_ES",        // Spanish
+                    "wrong_locale", // Will fall back to ICU default locale
+                    ""              // Will fall back to ICU default locale
+ ));
+
+} // namespace test_internal
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
new file mode 100644
index 0000000..64b68ec
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
+#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
+
+#include <jni.h>
+
+#include "icing/jni/jni-cache.h"
+#include "gtest/gtest.h"
+
+extern JNIEnv* g_jenv;
+
+namespace icing {
+namespace lib {
+
+namespace test_internal {
+
+class ReverseJniLanguageSegmenterTest
+ : public testing::TestWithParam<const char*> {
+ protected:
+ ReverseJniLanguageSegmenterTest()
+ : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {}
+
+ static std::string GetLocale() { return GetParam(); }
+
+ std::unique_ptr<JniCache> jni_cache_;
+};
+
+} // namespace test_internal
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
new file mode 100644
index 0000000..2256022
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -0,0 +1,452 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
+
+#include <cctype>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/jni/reverse-jni-break-iterator.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Returns the lead byte of the UTF-8 character that includes the byte at
+// current_byte_index within it.
+int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
+ while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
+ --current_byte_index;
+ }
+ return current_byte_index;
+}
+
+class CharacterIterator {
+ public:
+ explicit CharacterIterator(std::string_view text)
+ : CharacterIterator(text, 0, 0) {}
+ CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
+ : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: desired_utf8_index <= text_.length()
+ // desired_utf8_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work forwards.
+ while (utf8_index_ < desired_utf8_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > desired_utf8_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+ }
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: 0 <= desired_utf8_index
+ bool RewindToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index < 0) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work backwards.
+ while (utf8_index_ > desired_utf8_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at
+ // requested_byte_index or an earlier byte.
+ return false;
+ }
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+ }
+
+ // Advances current position to desired_utf16_index.
+ // REQUIRES: desired_utf16_index <= text_.utf16_length()
+ // desired_utf16_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf16(int desired_utf16_index) {
+ while (utf16_index_ < desired_utf16_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+ if (utf16_index_ + utf16_length > desired_utf16_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += utf16_length;
+ }
+ return true;
+ }
+
+ // Rewinds current position to desired_utf16_index.
+ // REQUIRES: 0 <= desired_utf16_index
+ bool RewindToUtf16(int desired_utf16_index) {
+ if (desired_utf16_index < 0) {
+ return false;
+ }
+ while (utf16_index_ > desired_utf16_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+ }
+
+ bool IsValidCharacter() const {
+ // Rule 1: all ASCII terms will be returned.
+ // We know it's a ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[utf8_index_])) {
+ return true;
+ }
+
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ int utf8_index() const { return utf8_index_; }
+ int utf16_index() const { return utf16_index_; }
+
+ private:
+ std::string_view text_;
+ int utf8_index_;
+ int utf16_index_;
+};
+
+} // namespace
+
+class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
+ public:
+ explicit ReverseJniLanguageSegmenterIterator(
+ std::string_view text,
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator)
+ : break_iterator_(std::move(break_iterator)),
+ text_(text),
+ term_start_(text),
+ term_end_exclusive_(text) {}
+
+ // Advances to the next term. Returns false if it has reached the end.
+ bool Advance() override {
+ // Prerequisite check
+ if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ return false;
+ }
+
+ if (term_end_exclusive_.utf16_index() == 0) {
+ int first = break_iterator_->First();
+ if (!term_start_.AdvanceToUtf16(first)) {
+      // First is guaranteed to succeed and return a position within bounds. So
+ // the only possible failure could be an invalid sequence. Mark as DONE
+ // and return.
+ MarkAsDone();
+ return false;
+ }
+ } else {
+ term_start_ = term_end_exclusive_;
+ }
+
+ int next_utf16_index_exclusive = break_iterator_->Next();
+ // Reached the end
+ if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return false;
+ }
+ if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) {
+    // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
+ // the check for kDone above. So the only possible failure could be an
+ // invalid sequence. Mark as DONE and return.
+ MarkAsDone();
+ return false;
+ }
+
+ // Check if the current term is valid. We consider any term valid if its
+ // first character is valid. If it's not valid, then we need to advance to
+ // the next term.
+ if (term_start_.IsValidCharacter()) {
+ return true;
+ }
+ return Advance();
+ }
+
+ // Returns the current term. It can be called only when Advance() returns
+ // true.
+ std::string_view GetTerm() const override {
+ int term_length =
+ term_end_exclusive_.utf8_index() - term_start_.utf8_index();
+ if (term_length > 0 && std::isspace(text_[term_start_.utf8_index()])) {
+ // Rule 3: multiple continuous whitespaces are treated as one.
+ term_length = 1;
+ }
+ return text_.substr(term_start_.utf8_index(), term_length);
+ }
+
+ // Resets the iterator to point to the first term that starts after offset.
+ // GetTerm will now return that term.
+ //
+ // Returns:
+ // On success, the starting position of the first term that starts after
+ // offset.
+ // NOT_FOUND if an error occurred or there are no terms that start after
+ // offset.
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
+ if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ // We're done. Need to start from the beginning if we're going to reset
+ // properly.
+ term_start_ = CharacterIterator(text_);
+ term_end_exclusive_ = CharacterIterator(text_);
+ }
+
+ // 1. Find the unicode character that contains the byte at offset.
+ CharacterIterator offset_iterator = term_end_exclusive_;
+ bool success = (offset > offset_iterator.utf8_index())
+ ? offset_iterator.AdvanceToUtf8(offset)
+ : offset_iterator.RewindToUtf8(offset);
+ if (!success) {
+ // Offset is guaranteed to be within bounds thanks to the check above. So
+ // the only possible failure could be an invalid sequence. Mark as DONE
+ // and return.
+ MarkAsDone();
+ return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ }
+
+ // 2. We've got the unicode character containing byte offset. Now, we need
+ // to point to the segment that starts after this character.
+ int following_utf16_index =
+ break_iterator_->Following(offset_iterator.utf16_index());
+ if (following_utf16_index == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) {
+    // following_utf16_index is guaranteed to be within bounds thanks to the
+ // check for kDone above. So the only possible failure could be an invalid
+ // sequence. Mark as DONE and return.
+ MarkAsDone();
+ return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ }
+ term_end_exclusive_ = offset_iterator;
+
+ // 3. The term_end_exclusive_ points to the term that we want to return. We
+ // need to Advance so that term_start_ will now point to this term.
+ if (!Advance()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ return term_start_.utf8_index();
+ }
+
+ // Resets the iterator to point to the first term that ends before offset.
+ // GetTerm will now return that term.
+ //
+ // Returns:
+ // On success, the starting position of the first term that ends before
+ // offset.
+ // NOT_FOUND if an error occurred or there are no terms that end before
+ // offset.
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
+ if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ // We're done. Need to start from the beginning if we're going to reset
+ // properly.
+ term_start_ = CharacterIterator(text_);
+ term_end_exclusive_ = CharacterIterator(text_);
+ }
+
+ // 1. Find the unicode character that contains the byte at offset.
+ CharacterIterator offset_iterator = term_end_exclusive_;
+ bool success = (offset > offset_iterator.utf8_index())
+ ? offset_iterator.AdvanceToUtf8(offset)
+ : offset_iterator.RewindToUtf8(offset);
+ if (!success) {
+ // Offset is guaranteed to be within bounds thanks to the check above. So
+ // the only possible failure could be an invalid sequence. Mark as DONE
+ // and return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+
+ // 2. We've got the unicode character containing byte offset. Now, we need
+ // to point to the segment that starts before this character.
+ int starting_utf16_index =
+ break_iterator_->Preceding(offset_iterator.utf16_index());
+ if (starting_utf16_index == ReverseJniBreakIterator::kDone) {
+ // Rewind the end indices.
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments end before provided offset %d.", offset));
+ }
+ if (!offset_iterator.RewindToUtf16(starting_utf16_index)) {
+    // starting_utf16_index is guaranteed to be within bounds thanks to the
+ // check for kDone above. So the only possible failure could be an invalid
+ // sequence. Mark as DONE and return.
+ MarkAsDone();
+ return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ }
+ term_start_ = offset_iterator;
+
+ // 3. We've correctly set the start index and the iterator currently points
+ // to that position. Now we need to find the correct end position and
+ // advance the iterator to that position.
+ int end_utf16_index = break_iterator_->Next();
+ term_end_exclusive_ = term_start_;
+ term_end_exclusive_.AdvanceToUtf16(end_utf16_index);
+
+ // 4. The start and end indices point to a segment, but we need to ensure
+ // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
+ // need a segment prior to this one.
+ if (term_end_exclusive_.utf8_index() > offset ||
+ !term_start_.IsValidCharacter()) {
+ return ResetToTermEndingBefore(term_start_.utf8_index());
+ }
+ return term_start_.utf8_index();
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ term_start_ = CharacterIterator(text_);
+ term_end_exclusive_ = CharacterIterator(text_);
+ if (!Advance()) {
+ return absl_ports::NotFoundError("");
+ }
+ return term_start_.utf8_index();
+ }
+
+ private:
+ // Ensures that all members are consistent with the 'Done' state.
+ // In the 'Done' state, both term_start_.utf8_index() and
+ // term_end_exclusive_.utf8_index() will point to the same character, causing
+ // GetTerm() to return an empty string and term_start_.utf16_index() and
+ // term_end_exclusive_.utf16_index() will be marked with the kDone value.
+ // break_iterator_ may be in any state.
+ void MarkAsDone() {
+ term_start_ =
+ CharacterIterator(text_, /*utf8_index=*/0,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ term_end_exclusive_ =
+ CharacterIterator(text_, /*utf8_index=*/0,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ }
+
+ // All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
+ // this class needs to maintain state to convert between UTF-16 and UTF-8.
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
+
+ // Text to be segmented
+ std::string_view text_;
+
+ // Index used to track the start position of current term.
+ CharacterIterator term_start_;
+
+ // Index used to track the end position of current term.
+ CharacterIterator term_end_exclusive_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ReverseJniLanguageSegmenter::Segment(const std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator,
+ ReverseJniBreakIterator::Create(jni_cache_, text, locale_));
+ return std::make_unique<ReverseJniLanguageSegmenterIterator>(
+ text, std::move(break_iterator));
+}
+
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+ReverseJniLanguageSegmenter::GetAllTerms(const std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
+ Segment(text));
+ std::vector<std::string_view> terms;
+ while (iterator->Advance()) {
+ terms.push_back(iterator->GetTerm());
+ }
+ return terms;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
new file mode 100644
index 0000000..f06dac9
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+class ReverseJniLanguageSegmenter : public LanguageSegmenter {
+ public:
+ ReverseJniLanguageSegmenter(std::string locale, const JniCache* jni_cache)
+ : locale_(std::move(locale)), jni_cache_(jni_cache) {}
+
+ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ Segment(std::string_view text) const override;
+
+ libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
+ std::string_view text) const override;
+
+ private:
+ std::string locale_;
+
+ const JniCache* jni_cache_; // does not own!
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc
new file mode 100644
index 0000000..1cca603
--- /dev/null
+++ b/icing/tokenization/simple/space-language-segmenter-factory.cc
@@ -0,0 +1,41 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/simple/space-language-segmenter.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+// Creates a space-based language segmenter. The locale in SegmenterOptions is
+// ignored: segmentation is purely whitespace-based and language-independent.
+//
+// Returns:
+// A LanguageSegmenter on success
+//
+// TODO(samzheng): Decide whether the locale string should be validated here.
+// Right now it has no effect because SpaceLanguageSegmenter does not use
+// locale information.
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions) {
+ return std::make_unique<SpaceLanguageSegmenter>();
+}
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/simple/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc
new file mode 100644
index 0000000..7e301ec
--- /dev/null
+++ b/icing/tokenization/simple/space-language-segmenter.cc
@@ -0,0 +1,205 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/simple/space-language-segmenter.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+constexpr char kASCIISpace = ' ';
+} // namespace
+
+class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
+ public:
+ SpaceLanguageSegmenterIterator(std::string_view text)
+ : text_(text), term_start_index_(0), term_end_index_exclusive_(0) {}
+
+ // Advances to the next term. Returns false if it has reached the end.
+ bool Advance() override {
+ if (term_end_index_exclusive_ >= text_.size() ||
+ term_start_index_ >= text_.size()) {
+ // Reached the end
+ return false;
+ }
+
+ // Next term starts where we left off.
+ term_start_index_ = term_end_index_exclusive_;
+
+ // We know a term is at least one character long, so we can +1 first.
+ term_end_index_exclusive_++;
+
+ // We alternate terms between space and non-space. Figure out what type of
+ // term we're currently on so we know how to stop.
+ bool is_space = text_[term_start_index_] == kASCIISpace;
+
+ while (term_end_index_exclusive_ < text_.size()) {
+ bool end_is_space = text_[term_end_index_exclusive_] == kASCIISpace;
+ if (is_space != end_is_space) {
+ // We finally see a different type of character, reached the end.
+ break;
+ }
+ // We're still seeing the same types of characters (saw a space and
+ // still seeing spaces, or saw a non-space and still seeing non-spaces).
+ // Haven't reached the next term yet, keep advancing.
+ term_end_index_exclusive_++;
+ }
+
+ return true;
+ }
+
+ // Returns the current term. It can be called only when Advance() returns
+ // true.
+ std::string_view GetTerm() const override {
+ if (text_[term_start_index_] == kASCIISpace) {
+ // Rule: multiple continuous whitespaces are treated as one.
+ return std::string_view(&text_[term_start_index_], 1);
+ }
+ return text_.substr(term_start_index_,
+ term_end_index_exclusive_ - term_start_index_);
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ int32_t offset) override {
+ if (offset < 0) {
+ // Start over from the beginning to find the first term.
+ term_start_index_ = 0;
+ term_end_index_exclusive_ = 0;
+ } else {
+ // Offset points to a term right now. Advance to get past the current
+ // term.
+ term_end_index_exclusive_ = offset;
+ if (!Advance()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No term found in '%s' that starts after offset %d",
+ std::string(text_).c_str(), offset));
+ }
+ }
+
+ // Advance again so we can point to the next term.
+ if (!Advance()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No term found in '%s' that starts after offset %d",
+ std::string(text_).c_str(), offset));
+ }
+
+ return term_start_index_;
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ int32_t offset) override {
+ if (offset <= 0 || offset > text_.size()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No term found in '%s' that ends before offset %d",
+ std::string(text_).c_str(), offset));
+ }
+
+ if (offset == text_.size()) {
+ // Special-case if the offset is the text length, this is the last term in
+ // the text, which is also considered to be "ending before" the offset.
+ term_end_index_exclusive_ = offset;
+ ICING_ASSIGN_OR_RETURN(term_start_index_, GetTermStartingBefore(offset));
+ return term_start_index_;
+ }
+
+ // Otherwise, this is just the end of the previous term and we still need to
+ // find the start of the previous term.
+ ICING_ASSIGN_OR_RETURN(term_end_index_exclusive_,
+ GetTermStartingBefore(offset));
+
+ if (term_end_index_exclusive_ == 0) {
+ // The current term starts at the beginning of the underlying text_.
+ // There is no term before this.
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No term found in '%s' that ends before offset %d",
+ std::string(text_).c_str(), offset));
+ }
+
+ // Reset ourselves to find the term before the end.
+ ICING_ASSIGN_OR_RETURN(
+ term_start_index_,
+ GetTermStartingBefore(term_end_index_exclusive_ - 1));
+ return term_start_index_;
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ term_start_index_ = 0;
+ term_end_index_exclusive_ = 0;
+ if (!Advance()) {
+ return absl_ports::NotFoundError("");
+ }
+ return term_start_index_;
+ }
+
+ private:
+ // Return the start offset of the term starting right before the given offset.
+ libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) {
+ // Special-case: if offset equals the text length, step back *before*
+ // reading text_, since text_[text_.size()] is out of bounds.
+ if (offset == text_.size()) {
+ --offset;
+ }
+
+ bool is_space = text_[offset] == kASCIISpace;
+
+ // While it's the same type of character (space vs non-space), we're in the
+ // same term. So keep iterating backwards until we see a change.
+ while (offset >= 0 && (text_[offset] == kASCIISpace) == is_space) {
+ --offset;
+ }
+
+ // +1 is because offset was off-by-one to exit the while-loop.
+ return ++offset;
+ }
+
+ // Text to be segmented
+ std::string_view text_;
+
+ // The start and end indices are used to track the positions of current
+ // term.
+ int term_start_index_;
+ int term_end_index_exclusive_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+SpaceLanguageSegmenter::Segment(const std::string_view text) const {
+ return std::make_unique<SpaceLanguageSegmenterIterator>(text);
+}
+
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+SpaceLanguageSegmenter::GetAllTerms(const std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
+ Segment(text));
+ std::vector<std::string_view> terms;
+ while (iterator->Advance()) {
+ terms.push_back(iterator->GetTerm());
+ }
+ return terms;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/simple/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h
new file mode 100644
index 0000000..de0a6d3
--- /dev/null
+++ b/icing/tokenization/simple/space-language-segmenter.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+// Simple segmenter that splits on spaces, regardless of language. Consecutive
+// whitespace characters are returned as a single whitespace character.
+class SpaceLanguageSegmenter : public LanguageSegmenter {
+ public:
+ SpaceLanguageSegmenter() = default;
+ SpaceLanguageSegmenter(const SpaceLanguageSegmenter&) = delete;
+ SpaceLanguageSegmenter& operator=(const SpaceLanguageSegmenter&) = delete;
+
+ // Segmentation is based purely on whitespace; does not take into account the
+ // language of the text.
+ //
+ // Returns:
+ // An iterator of terms on success
+ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ Segment(std::string_view text) const override;
+
+ // Does not take into account the language of the text.
+ //
+ // Returns:
+ // A list of terms on success
+ // INTERNAL_ERROR if any error occurs
+ libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
+ std::string_view text) const override;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
new file mode 100644
index 0000000..8ed38b2
--- /dev/null
+++ b/icing/tokenization/simple/space-language-segmenter_test.cc
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+TEST(SpaceLanguageSegmenterTest, EmptyText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(SpaceLanguageSegmenterTest, SimpleText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST(SpaceLanguageSegmenterTest, Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
+ IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+ IsOkAndHolds(ElementsAre("Open-source", " ", "project")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+ IsOkAndHolds(ElementsAre("100%")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("(A&B)"),
+ IsOkAndHolds(ElementsAre("(A&B)")));
+}
+
+TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+ IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+TEST(SpaceLanguageSegmenterTest, Number) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+ IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+ IsOkAndHolds(ElementsAre("3,456.789")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+ IsOkAndHolds(ElementsAre("-123")));
+}
+
+TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+
+ // Multiple continuous whitespaces are treated as one.
+ const int kNumSeparators = 256;
+ const std::string text_with_spaces =
+ absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ // Validates that the input strings are not copied
+ const std::string text = "Hello World";
+ const char* word1_address = text.c_str();
+ const char* word2_address = text.c_str() + 6;
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(text));
+ ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+ const char* word1_result_address = terms.at(0).data();
+ const char* word2_result_address = terms.at(2).data();
+
+ // The underlying char* should be the same
+ EXPECT_THAT(word1_address, Eq(word1_result_address));
+ EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc
index f11e544..9ebbce5 100644
--- a/icing/tokenization/tokenizer-factory.cc
+++ b/icing/tokenization/tokenizer-factory.cc
@@ -14,10 +14,15 @@
#include "icing/tokenization/tokenizer-factory.h"
+#include <memory>
+
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/plain-tokenizer.h"
#include "icing/tokenization/raw-query-tokenizer.h"
+#include "icing/tokenization/tokenizer.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -34,7 +39,7 @@
case IndexingConfig::TokenizerType::PLAIN:
return std::make_unique<PlainTokenizer>(lang_segmenter);
case IndexingConfig::TokenizerType::NONE:
- U_FALLTHROUGH;
+ [[fallthrough]];
default:
// This should never happen.
return absl_ports::InvalidArgumentError(
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 87c3933..38c4745 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -16,9 +16,12 @@
#define ICING_TOKENIZATION_TOKENIZER_H_
#include <cstdint>
+#include <memory>
+#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/token.h"
+
namespace icing {
namespace lib {
@@ -82,6 +85,8 @@
// // "foo".
// PrintToken(iterator.GetToken()); // prints "foo"
virtual bool ResetToTokenBefore(int32_t offset) { return false; }
+
+ virtual bool ResetToStart() { return false; }
};
// Tokenizes the input text. The input text should outlive the returned
diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc
new file mode 100644
index 0000000..493aeb5
--- /dev/null
+++ b/icing/transform/icu/icu-normalizer-factory.cc
@@ -0,0 +1,52 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_
+#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/icu/icu-normalizer.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates an ICU-based normalizer. max_term_byte_size enforces the max size of
+// text after normalization, text will be truncated if exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size) {
+ if (max_term_byte_size <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "max_term_byte_size must be greater than zero.");
+ }
+ return IcuNormalizer::Create(max_term_byte_size);
+}
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_
diff --git a/icing/transform/normalizer.cc b/icing/transform/icu/icu-normalizer.cc
similarity index 70%
rename from icing/transform/normalizer.cc
rename to icing/transform/icu/icu-normalizer.cc
index 5479c44..0bb8326 100644
--- a/icing/transform/normalizer.cc
+++ b/icing/transform/icu/icu-normalizer.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/transform/normalizer.h"
+#include "icing/transform/icu/icu-normalizer.h"
#include <cctype>
#include <memory>
@@ -23,6 +23,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/transform/normalizer.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
@@ -54,37 +55,70 @@
constexpr int kTransformRulesLength =
sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
-// An invalid value defined by Unicode.
-constexpr UChar32 kInvalidUchar32 = 0xFFFD;
+// Transforms a Unicode character with diacritics to its counterpart in ASCII
+// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
+// the transformation is successful.
+//
+// NOTE: According to our convention this function should have returned
+// StatusOr<char>. However, this function is performance-sensitive because it
+// could be called on every Latin character in normalization, so we make it
+// return a bool here to save a bit more time and memory.
+bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
+ char* char_out) {
+ if (i18n_utils::IsAscii(uchar32_in)) {
+ // The Unicode character is within ASCII range
+ if (char_out != nullptr) {
+ *char_out = uchar32_in;
+ }
+ return true;
+ }
+
+ // Maximum number of pieces a Unicode character can be decomposed into.
+ // TODO(samzheng): Figure out whether this capacity is always sufficient.
+ constexpr int kDecompositionBufferCapacity = 5;
+
+ // A buffer used to store Unicode decomposition mappings of only one
+ // character.
+ UChar decomposition_buffer[kDecompositionBufferCapacity];
+
+ // Decomposes the Unicode character, trying to get an ASCII char and some
+ // diacritic chars.
+ UErrorCode status = U_ZERO_ERROR;
+ if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
+ kDecompositionBufferCapacity, &status) > 0 &&
+ !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
+ if (char_out != nullptr) {
+ *char_out = decomposition_buffer[0];
+ }
+ return true;
+ }
+ return false;
+}
+
} // namespace
-// Creates a Normalizer with a valid TermTransformer instance.
+// Creates an IcuNormalizer with a valid TermTransformer instance.
//
// Note: UTokenizer2 is also an option to normalize Unicode strings, but since
// we need some custom transform rules other than NFC/NFKC we have to use
// TermTransformer as a custom transform rule executor.
-libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Normalizer::Create(
- int max_term_byte_size) {
- if (max_term_byte_size <= 0) {
- return absl_ports::InvalidArgumentError(
- "max_term_byte_size must be greater than zero.");
- }
-
+libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>>
+IcuNormalizer::Create(int max_term_byte_size) {
ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<Normalizer::TermTransformer> term_transformer,
- Normalizer::TermTransformer::Create());
+ std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
+ IcuNormalizer::TermTransformer::Create());
- return std::unique_ptr<Normalizer>(
- new Normalizer(std::move(term_transformer), max_term_byte_size));
+ return std::unique_ptr<IcuNormalizer>(
+ new IcuNormalizer(std::move(term_transformer), max_term_byte_size));
}
-Normalizer::Normalizer(
- std::unique_ptr<Normalizer::TermTransformer> term_transformer,
+IcuNormalizer::IcuNormalizer(
+ std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
int max_term_byte_size)
: term_transformer_(std::move(term_transformer)),
max_term_byte_size_(max_term_byte_size) {}
-std::string Normalizer::NormalizeTerm(const std::string_view term) const {
+std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const {
std::string normalized_text;
if (term.empty()) {
@@ -103,8 +137,8 @@
// term can be transformed into ASCII if the first character can.
UChar32 first_uchar32 =
i18n_utils::GetUChar32At(term.data(), term.length(), 0);
- if (normalizer2 != nullptr && first_uchar32 != kInvalidUchar32 &&
- i18n_utils::DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) {
+ if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
+ DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) {
// This is a faster method to normalize Latin terms.
normalized_text = NormalizeLatin(normalizer2, term);
} else {
@@ -118,8 +152,8 @@
return normalized_text;
}
-std::string Normalizer::NormalizeLatin(const UNormalizer2* normalizer2,
- const std::string_view term) const {
+std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
+ const std::string_view term) const {
std::string result;
result.reserve(term.length());
for (int i = 0; i < term.length(); i++) {
@@ -127,13 +161,13 @@
result.push_back(std::tolower(term[i]));
} else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
- if (uchar32 == kInvalidUchar32) {
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
<< " at position" << i;
continue;
}
char ascii_char;
- if (i18n_utils::DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) {
+ if (DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) {
result.push_back(std::tolower(ascii_char));
} else {
// We don't know how to transform / decompose this Unicode character, it
@@ -150,8 +184,8 @@
return result;
}
-libtextclassifier3::StatusOr<std::unique_ptr<Normalizer::TermTransformer>>
-Normalizer::TermTransformer::Create() {
+libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer::TermTransformer>>
+IcuNormalizer::TermTransformer::Create() {
UErrorCode status = U_ZERO_ERROR;
UTransliterator* term_transformer = utrans_openU(
kTransformRulesUtf16, kTransformRulesLength, UTRANS_FORWARD,
@@ -161,20 +195,21 @@
return absl_ports::InternalError("Failed to create UTransliterator.");
}
- return std::unique_ptr<Normalizer::TermTransformer>(
- new Normalizer::TermTransformer(term_transformer));
+ return std::unique_ptr<IcuNormalizer::TermTransformer>(
+ new IcuNormalizer::TermTransformer(term_transformer));
}
-Normalizer::TermTransformer::TermTransformer(UTransliterator* u_transliterator)
+IcuNormalizer::TermTransformer::TermTransformer(
+ UTransliterator* u_transliterator)
: u_transliterator_(u_transliterator) {}
-Normalizer::TermTransformer::~TermTransformer() {
+IcuNormalizer::TermTransformer::~TermTransformer() {
if (u_transliterator_ != nullptr) {
utrans_close(u_transliterator_);
}
}
-std::string Normalizer::TermTransformer::Transform(
+std::string IcuNormalizer::TermTransformer::Transform(
const std::string_view term) const {
auto utf16_term_or = i18n_utils::Utf8ToUtf16(term);
if (!utf16_term_or.ok()) {
diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h
new file mode 100644
index 0000000..f20a9fb
--- /dev/null
+++ b/icing/transform/icu/icu-normalizer.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
+#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/transform/normalizer.h"
+#include "unicode/unorm2.h"
+#include "unicode/utrans.h"
+
+namespace icing {
+namespace lib {
+
+// Used to normalize UTF8 strings for text matching. It enforces a set of rules:
+// 1. Transforms text to be lowercase UTF8.
+// 2. Transforms full-width Latin characters to ASCII characters if possible.
+// 3. Transforms hiragana to katakana.
+// 4. Removes accent / diacritic marks on Latin characters
+// 5. Normalized text must be less than or equal to max_term_byte_size,
+// otherwise it will be truncated.
+//
+// There are some other rules from ICU not listed here; please see the .cc
+// file for details.
+class IcuNormalizer : public Normalizer {
+ public:
+ // Creates a normalizer with the subcomponents it needs. max_term_byte_size
+ // enforces the max size of text after normalization, text will be truncated
+ // if exceeds the max size.
+ //
+ // Returns:
+ // A normalizer on success
+ // INVALID_ARGUMENT if max_term_byte_size <= 0
+ // INTERNAL_ERROR if failed to create any subcomponent
+ static libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>> Create(
+ int max_term_byte_size);
+
+ // Normalizes the input term based on rules. See .cc file for rule details.
+ //
+ // NOTE: Term should not mix Latin and non-Latin characters. Doing so may
+ // result in the non-Latin characters not properly being normalized
+ std::string NormalizeTerm(std::string_view term) const override;
+
+ private:
+ // A handler class that helps manage the lifecycle of UTransliterator. It's
+ // used in IcuNormalizer to transform terms into the formats we need.
+ class TermTransformer {
+ public:
+ // Creates TermTransformer with a valid UTransliterator instance
+ //
+ // Returns:
+ // A term transformer on success
+ // INTERNAL_ERROR if failed to create any subcomponent
+ static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>>
+ Create();
+
+ // Closes the UTransliterator instance
+ ~TermTransformer();
+
+ // Transforms the text based on our rules described at top of this file
+ std::string Transform(std::string_view term) const;
+
+ private:
+ explicit TermTransformer(UTransliterator* u_transliterator);
+
+ // An ICU class to execute custom term transformation / normalization rules.
+ // utrans_close() must be called after use.
+ UTransliterator* u_transliterator_;
+ };
+
+ explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer,
+ int max_term_byte_size);
+
+ // Helper method to normalize Latin terms only. Rules applied:
+ // 1. Uppercase to lowercase
+ // 2. Remove diacritic (accent) marks
+ std::string NormalizeLatin(const UNormalizer2* normalizer2,
+ std::string_view term) const;
+
+ // Used to transform terms into their normalized forms.
+ std::unique_ptr<TermTransformer> term_transformer_;
+
+ // The maximum term length allowed after normalization.
+ int max_term_byte_size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
diff --git a/icing/transform/normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
similarity index 80%
rename from icing/transform/normalizer_benchmark.cc
rename to icing/transform/icu/icu-normalizer_benchmark.cc
index 53adac7..b037538 100644
--- a/icing/transform/normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,15 +14,17 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
+#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
-// //icing/transform:normalizer_benchmark
+// //icing/transform/icu:icu-normalizer_benchmark
//
-// $ blaze-bin/icing/transform/normalizer_benchmark
+// $ blaze-bin/icing/transform/icu/icu-normalizer_benchmark
// --benchmarks=all
//
// Run on an Android device:
@@ -31,12 +33,14 @@
//
// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
-// //icing/transform:normalizer_benchmark
+// //icing/transform/icu:icu-normalizer_benchmark
//
-// $ adb push blaze-bin/icing/transform/normalizer_benchmark
+// $ adb push
+// blaze-bin/icing/transform/icu/icu-normalizer_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/normalizer_benchmark --benchmarks=all --adb
+// $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmarks=all
+// --adb
// Flag to tell the benchmark that it'll be run on an Android device via adb,
// the benchmark will set up data files accordingly.
@@ -50,12 +54,14 @@
void BM_NormalizeUppercase(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
- Normalizer::Create(
+ normalizer_factory::Create(
+
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string(state.range(0), 'A');
@@ -82,12 +88,14 @@
void BM_NormalizeAccent(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
- Normalizer::Create(
+ normalizer_factory::Create(
+
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -118,12 +126,14 @@
void BM_NormalizeHiragana(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
}
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
- Normalizer::Create(
+ normalizer_factory::Create(
+
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
new file mode 100644
index 0000000..83fa972
--- /dev/null
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -0,0 +1,237 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/test-data.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::Eq;
+
+class IcuNormalizerTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/1024));
+ }
+
+ std::unique_ptr<Normalizer> normalizer_;
+};
+
+TEST_F(IcuNormalizerTest, Creation) {
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/5),
+ IsOk());
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Strings that are already normalized won't change if normalized again.
+TEST_F(IcuNormalizerTest, AlreadyNormalized) {
+ EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq(""));
+ EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요"));
+}
+
+TEST_F(IcuNormalizerTest, UppercaseToLowercase) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing"));
+}
+
+TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"),
+ Eq("aaaaaaaaaaaaaaaaaaaa"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"),
+ Eq("ddddddddddddddd"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"),
+ Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"),
+ Eq("hhhhhhhhhhhhh"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"),
+ Eq("iiiiiiiiiiiiiiiii"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"),
+ Eq("lllllllllllll"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"),
+ Eq("nnnnnnnnnnnnnnnn"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"),
+ Eq("oooooooooooooooooooooooo"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ṔṖṕṗ"), Eq("pppp"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"),
+ Eq("rrrrrrrrrrrrrr"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"),
+ Eq("ssssssssssssssssssss"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"),
+ Eq("tttttttttttttt"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"),
+ Eq("uuuuuuuuuuuuuuuuuuuuuuuu"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"),
+ Eq("zzzzzzzzzzzz"));
+}
+
+// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
+// Japanese and Greek
+TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) {
+ // Katakana
+ EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
+ // Greek
+ EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή"));
+
+ // Our current ICU rules can't handle Hebrew properly, e.g. the accents in
+ // "אָלֶף־בֵּית עִבְרִי"
+ // will be removed.
+ // TODO (samzheng): figure out how we should handle Hebrew.
+}
+
+TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) {
+ // Full-width punctuation to ASCII punctuation
+ EXPECT_THAT(normalizer_->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\""));
+ // Full-width 0-9
+ EXPECT_THAT(normalizer_->NormalizeTerm("0123456789"),
+ Eq("0123456789"));
+ // Full-width A-Z
+ EXPECT_THAT(normalizer_->NormalizeTerm(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+ // Full-width a-z
+ EXPECT_THAT(normalizer_->NormalizeTerm(
+ "abcdefghijklmnopqrstuvwxyz"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST_F(IcuNormalizerTest, IdeographicToASCII) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",."));
+}
+
+// For Katakana, each character is normalized to its full-width version.
+TEST_F(IcuNormalizerTest, KatakanaHalfWidthToFullWidth) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ"));
+}
+
+TEST_F(IcuNormalizerTest, HiraganaToKatakana) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("さしすせそ"), Eq("サシスセソ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("たちつてと"), Eq("タチツテト"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("まみむめも"), Eq("マミムメモ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("やゆよ"), Eq("ヤユヨ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("らりるれろ"), Eq("ラリルレロ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ん"), Eq("ン"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ざじずぜぞ"), Eq("ザジズゼゾ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ"));
+}
+
+TEST_F(IcuNormalizerTest, SuperscriptAndSubscriptToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9"));
+}
+
+TEST_F(IcuNormalizerTest, CircledCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a"));
+}
+
+TEST_F(IcuNormalizerTest, RotatedCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}"));
+}
+
+TEST_F(IcuNormalizerTest, SquaredCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート"));
+}
+
+TEST_F(IcuNormalizerTest, FractionsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6"));
+}
+
+TEST_F(IcuNormalizerTest, Truncate) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/5));
+
+ // Won't be truncated
+ EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+ // Truncated to length 5.
+ EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+
+ // Each Japanese character has 3 bytes, so truncating to length 5 results in
+ // only 1 character.
+ EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ"));
+
+ // Each Greek character has 2 bytes, so truncating to length 5 results in 2
+ // character.
+ EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ"));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/2));
+ // The Japanese character has 3 bytes, truncating it results in an empty
+ // string.
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq(""));
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc
new file mode 100644
index 0000000..3bf84b3
--- /dev/null
+++ b/icing/transform/map/map-normalizer-factory.cc
@@ -0,0 +1,48 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/map/map-normalizer.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates a map-based normalizer. max_term_byte_size enforces the max size of
+// text after normalization, text will be truncated if exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size) {
+ if (max_term_byte_size <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "max_term_byte_size must be greater than zero.");
+ }
+
+ return std::make_unique<MapNormalizer>(max_term_byte_size);
+}
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc
new file mode 100644
index 0000000..c888551
--- /dev/null
+++ b/icing/transform/map/map-normalizer.cc
@@ -0,0 +1,86 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/transform/map/map-normalizer.h"
+
+#include <ctype.h>
+
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+
+#include "icing/absl_ports/str_cat.h"
+#include "icing/transform/map/normalization-map.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/logging.h"
+#include "unicode/utypes.h"
+
+namespace icing {
+namespace lib {
+
+std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
+ std::string normalized_text;
+ normalized_text.reserve(term.length());
+
+ for (int i = 0; i < term.length(); ++i) {
+ if (i18n_utils::IsAscii(term[i])) {
+ // The original character has 1 byte.
+ normalized_text.push_back(std::tolower(term[i]));
+ } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
+ << " at position" << i;
+ continue;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (i18n_utils::GetUtf16Length(uchar32) > 1) {
+ // All the characters we need to normalize can be encoded into a
+ // single char16_t. If this character needs more than 1 char16_t code
+ // unit, we can skip normalization and append it directly.
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
+ continue;
+ }
+ // The original character can be encoded into a single char16_t.
+ const std::unordered_map<char16_t, char16_t>& normalization_map =
+ GetNormalizationMap();
+ auto iterator = normalization_map.find(static_cast<char16_t>(uchar32));
+ if (iterator != normalization_map.end()) {
+ // Found a normalization mapping. The normalized character (stored in a
+ // char16_t) can have 1 or 2 bytes.
+ if (i18n_utils::IsAscii(iterator->second)) {
+ // The normalized character has 1 byte.
+ normalized_text.push_back(
+ std::tolower(static_cast<char>(iterator->second)));
+ } else {
+ // The normalized character has 2 bytes.
+ i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second);
+ }
+ } else {
+ // Normalization mapping not found, append the original character.
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
+ }
+ }
+ }
+
+ if (normalized_text.length() > max_term_byte_size_) {
+ i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_);
+ }
+
+ return normalized_text;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h
new file mode 100644
index 0000000..f9c0e42
--- /dev/null
+++ b/icing/transform/map/map-normalizer.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_
+#define ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_
+
+#include <string>
+#include <string_view>
+
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+class MapNormalizer : public Normalizer {
+ public:
+ explicit MapNormalizer(int max_term_byte_size)
+ : max_term_byte_size_(max_term_byte_size){};
+
+ // Normalizes the input term based on character mappings. The mappings
+ // contain the following categories:
+ // - Uppercase -> lowercase
+ // - Hiragana -> Katakana
+ // - Common full-width characters -> ASCII
+ // - Common ideographic punctuation marks -> ASCII
+ // - Common diacritic Latin characters -> ASCII
+ //
+ // Read more mapping details in normalization-map.cc
+ std::string NormalizeTerm(std::string_view term) const override;
+
+ private:
+ // The maximum term length allowed after normalization.
+ int max_term_byte_size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_
diff --git a/icing/transform/normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc
similarity index 73%
copy from icing/transform/normalizer_benchmark.cc
copy to icing/transform/map/map-normalizer_benchmark.cc
index 53adac7..691afc6 100644
--- a/icing/transform/normalizer_benchmark.cc
+++ b/icing/transform/map/map-normalizer_benchmark.cc
@@ -12,53 +12,47 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <memory>
+
#include "testing/base/public/benchmark.h"
-#include "gmock/gmock.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/test-data.h"
+#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
-// //icing/transform:normalizer_benchmark
+// //icing/transform/map:map-normalizer_benchmark
//
-// $ blaze-bin/icing/transform/normalizer_benchmark
+// $ blaze-bin/icing/transform/map/map-normalizer_benchmark
// --benchmarks=all
//
// Run on an Android device:
-// Make target //icing/transform:normalizer depend on
-// //third_party/icu
-//
// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
-// //icing/transform:normalizer_benchmark
+// //icing/transform/map:map-normalizer_benchmark
//
-// $ adb push blaze-bin/icing/transform/normalizer_benchmark
+// $ adb push
+// blaze-bin/icing/transform/map/map-normalizer_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/normalizer_benchmark --benchmarks=all --adb
-
-// Flag to tell the benchmark that it'll be run on an Android device via adb,
-// the benchmark will set up data files accordingly.
-ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
-
+// $ adb shell /data/local/tmp/map-normalizer_benchmark --benchmarks=all
namespace icing {
namespace lib {
namespace {
void BM_NormalizeUppercase(benchmark::State& state) {
- bool run_via_adb = absl::GetFlag(FLAGS_adb);
- if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
- }
-
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
- Normalizer::Create(
+ normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string(state.range(0), 'A');
+
+ // Warms up. map-normalizer may need to load a static map when being invoked
+ // the first time. It takes about 0.05ms on a Pixel3 XL.
+ normalizer->NormalizeTerm(input_string);
+
for (auto _ : state) {
normalizer->NormalizeTerm(input_string);
}
@@ -80,14 +74,9 @@
->Arg(4096000);
void BM_NormalizeAccent(benchmark::State& state) {
- bool run_via_adb = absl::GetFlag(FLAGS_adb);
- if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
- }
-
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
- Normalizer::Create(
+ normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -95,6 +84,10 @@
input_string.append("àáâãā");
}
+ // Warms up. map-normalizer may need to load a static map when being invoked
+ // the first time. It takes about 0.05ms on a Pixel3 XL.
+ normalizer->NormalizeTerm(input_string);
+
for (auto _ : state) {
normalizer->NormalizeTerm(input_string);
}
@@ -116,14 +109,9 @@
->Arg(4096000);
void BM_NormalizeHiragana(benchmark::State& state) {
- bool run_via_adb = absl::GetFlag(FLAGS_adb);
- if (!run_via_adb) {
- ICING_ASSERT_OK(SetUpICUDataFile("icing/icu.dat"));
- }
-
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
- Normalizer::Create(
+ normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -131,6 +119,10 @@
input_string.append("あいうえお");
}
+ // Warms up. map-normalizer may need to load a static map when being invoked
+ // the first time. It takes about 0.05ms on a Pixel3 XL.
+ normalizer->NormalizeTerm(input_string);
+
for (auto _ : state) {
normalizer->NormalizeTerm(input_string);
}
diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc
new file mode 100644
index 0000000..b62ae0e
--- /dev/null
+++ b/icing/transform/map/map-normalizer_test.cc
@@ -0,0 +1,205 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+
+TEST(MapNormalizerTest, Creation) {
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/5),
+ IsOk());
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Strings that are already normalized won't change if normalized again.
+TEST(MapNormalizerTest, AlreadyNormalized) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm(""), Eq(""));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world"));
+ EXPECT_THAT(normalizer->NormalizeTerm("你好"), Eq("你好"));
+ EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キャンパス"));
+ EXPECT_THAT(normalizer->NormalizeTerm("안녕하세요"), Eq("안녕하세요"));
+}
+
+TEST(MapNormalizerTest, UppercaseToLowercase) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("mdi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Icing"), Eq("icing"));
+}
+
+TEST(MapNormalizerTest, LatinLetterRemoveAccent) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("zurich"));
+ EXPECT_THAT(normalizer->NormalizeTerm("après-midi"), Eq("apres-midi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Buenos días"), Eq("buenos dias"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"),
+ Eq("aaaaaaaaaaaaaaaaaaaa"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"),
+ Eq("ddddddddddddddd"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"),
+ Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Ḟḟ"), Eq("ff"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"),
+ Eq("iiiiiiiiiiiiiiiii"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Ĵĵ"), Eq("jj"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"),
+ Eq("nnnnnnnnnnnnnnnn"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"),
+ Eq("oooooooooooooooooooooooo"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ṔṖṕṗ"), Eq("pppp"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"),
+ Eq("rrrrrrrrrrrrrr"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"),
+ Eq("ssssssssssssssssssss"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"),
+ Eq("tttttttttttttt"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"),
+ Eq("uuuuuuuuuuuuuuuuuuuuuuuu"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz"));
+}
+
+// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
+// Japanese and Greek
+TEST(MapNormalizerTest, NonLatinLetterNotRemoveAccent) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Katakana
+ EXPECT_THAT(normalizer->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
+ // Greek
+ EXPECT_THAT(normalizer->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
+ EXPECT_THAT(normalizer->NormalizeTerm("εγγραφή"), Eq("εγγραφή"));
+ // Hebrew
+ EXPECT_THAT(normalizer->NormalizeTerm("אָלֶף־בֵּית עִבְרִי"), Eq("אָלֶף־בֵּית עִבְרִי"));
+}
+
+TEST(MapNormalizerTest, FullWidthCharsToASCII) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Full-width punctuation to ASCII punctuation
+ EXPECT_THAT(normalizer->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\""));
+ // Full-width 0-9
+ EXPECT_THAT(normalizer->NormalizeTerm("0123456789"),
+ Eq("0123456789"));
+ // Full-width A-Z
+ EXPECT_THAT(normalizer->NormalizeTerm(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+ // Full-width a-z
+ EXPECT_THAT(normalizer->NormalizeTerm(
+ "abcdefghijklmnopqrstuvwxyz"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST(MapNormalizerTest, IdeographicToASCII) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",."));
+}
+
+TEST(MapNormalizerTest, HiraganaToKatakana) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm("あいうえお"), Eq("アイウエオ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("かきくけこ"), Eq("カキクケコ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("さしすせそ"), Eq("サシスセソ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("たちつてと"), Eq("タチツテト"));
+ EXPECT_THAT(normalizer->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("まみむめも"), Eq("マミムメモ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("やゆよ"), Eq("ヤユヨ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("らりるれろ"), Eq("ラリルレロ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ん"), Eq("ン"));
+ EXPECT_THAT(normalizer->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ざじずぜぞ"), Eq("ザジズゼゾ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ"));
+}
+
+TEST(MapNormalizerTest, Truncate) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/5));
+
+ // Won't be truncated
+ EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+ // Truncated to length 5.
+ EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+
+ // Each Japanese character has 3 bytes, so truncating to length 5 results in
+ // only 1 character.
+ EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ"));
+
+ // Each Greek character has 2 bytes, so truncating to length 5 results in 2
+ // character.
+ EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ"));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/2));
+ // The Japanese character has 3 bytes, truncating it results in an empty
+ // string.
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq(""));
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc
new file mode 100644
index 0000000..c318036
--- /dev/null
+++ b/icing/transform/map/normalization-map.cc
@@ -0,0 +1,712 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/transform/map/normalization-map.h"
+
+#include <cstdint>
+#include "icing/legacy/core/icing-packed-pod.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// A pair representing the mapping of the 'from' character to 'to' character.
+struct NormalizationPair {
+ // All the mapped characters can be stored in 2 bytes.
+ char16_t from;
+ char16_t to;
+} __attribute__((packed));
+
+// The following mappings contain multiple categories:
+// 1. Hiragana -> Katakana, listed in the order of Hiragana chart rows.
+// All regular and small Hiragana characters are mapped to Katakana. Note
+// that half-width Katakana characters are not handled here.
+// 2. Common full-width characters -> ASCII characters.
+// Full-width characters in the Unicode range of [0xff01, 0xff5e] are mapped
+// to the corresponding ASCII forms.
+// 3. Common ideographic punctuation marks -> ASCII characters.
+// Ideographic characters are in the Unicode range of [0x3000, 0x303f]. Here
+// we list two that are frequently used in CJK and can be converted to ASCII.
+// 4. Common diacritic Latin characters -> ASCII characters.
+// We list most diacritic Latin characters within the Unicode range of
+// [0x00c0, 0x017e], some from [0x01a0, 0x021b], and most from [0x1e00,
+// 0x1ef9].
+//
+// All the characters can be stored in a single UTF16 code unit, so we use
+// char16_t to store them. Size of the following array is about 2.5KiB.
+constexpr NormalizationPair kNormalizationMappings[] = {
+ // Part 1: Hiragana -> Katakana
+ // 'a' row
+ {0x3042, 0x30a2}, // Hiragana letter A -> Katakana letter A
+ {0x3044, 0x30a4}, // Hiragana letter I -> Katakana letter I
+ {0x3046, 0x30a6}, // Hiragana letter U -> Katakana letter U
+ {0x3048, 0x30a8}, // Hiragana letter E -> Katakana letter E
+ {0x304a, 0x30aa}, // Hiragana letter O -> Katakana letter O
+ {0x3041, 0x30a2}, // Hiragana letter small A -> Katakana letter A
+ {0x3043, 0x30a4}, // Hiragana letter small I -> Katakana letter I
+ {0x3045, 0x30a6}, // Hiragana letter small U -> Katakana letter U
+ {0x3047, 0x30a8}, // Hiragana letter small E -> Katakana letter E
+ {0x3049, 0x30aa}, // Hiragana letter small O -> Katakana letter O
+ // 'ka' row
+ {0x304b, 0x30ab}, // Hiragana letter KA -> Katakana letter KA
+ {0x304d, 0x30ad}, // Hiragana letter KI -> Katakana letter KI
+ {0x304f, 0x30af}, // Hiragana letter KU -> Katakana letter KU
+ {0x3051, 0x30b1}, // Hiragana letter KE -> Katakana letter KE
+ {0x3053, 0x30b3}, // Hiragana letter KO -> Katakana letter KO
+ {0x3095, 0x30ab}, // Hiragana letter small KA -> Katakana letter KA
+ {0x3096, 0x30b1}, // Hiragana letter small KE -> Katakana letter KE
+ // 'sa' row
+ {0x3055, 0x30b5}, // Hiragana letter SA -> Katakana letter SA
+ {0x3057, 0x30b7}, // Hiragana letter SI -> Katakana letter SI
+ {0x3059, 0x30b9}, // Hiragana letter SU -> Katakana letter SU
+ {0x305b, 0x30bb}, // Hiragana letter SE -> Katakana letter SE
+ {0x305d, 0x30bd}, // Hiragana letter SO -> Katakana letter SO
+ // 'ta' row
+ {0x305f, 0x30bf}, // Hiragana letter TA -> Katakana letter TA
+ {0x3061, 0x30c1}, // Hiragana letter TI -> Katakana letter TI
+ {0x3063, 0x30c4}, // Hiragana letter small TU -> Katakana letter TU
+ {0x3064, 0x30c4}, // Hiragana letter TU -> Katakana letter TU
+ {0x3066, 0x30c6}, // Hiragana letter TE -> Katakana letter TE
+ {0x3068, 0x30c8}, // Hiragana letter TO -> Katakana letter TO
+ // 'na' row
+ {0x306a, 0x30ca}, // Hiragana letter NA -> Katakana letter NA
+ {0x306b, 0x30cb}, // Hiragana letter NI -> Katakana letter NI
+ {0x306c, 0x30cc}, // Hiragana letter NU -> Katakana letter NU
+ {0x306d, 0x30cd}, // Hiragana letter NE -> Katakana letter NE
+ {0x306e, 0x30ce}, // Hiragana letter NO -> Katakana letter NO
+ // 'ha' row
+ {0x306f, 0x30cf}, // Hiragana letter HA -> Katakana letter HA
+ {0x3072, 0x30d2}, // Hiragana letter HI -> Katakana letter HI
+ {0x3075, 0x30d5}, // Hiragana letter HU -> Katakana letter HU
+ {0x3078, 0x30d8}, // Hiragana letter HE -> Katakana letter HE
+ {0x307b, 0x30db}, // Hiragana letter HO -> Katakana letter HO
+ // 'ma' row
+ {0x307e, 0x30de}, // Hiragana letter MA -> Katakana letter MA
+ {0x307f, 0x30df}, // Hiragana letter MI -> Katakana letter MI
+ {0x3080, 0x30e0}, // Hiragana letter MU -> Katakana letter MU
+ {0x3081, 0x30e1}, // Hiragana letter ME -> Katakana letter ME
+ {0x3082, 0x30e2}, // Hiragana letter MO -> Katakana letter MO
+ // 'ya' row
+ {0x3083, 0x30e4}, // Hiragana letter small YA -> Katakana letter YA
+ {0x3084, 0x30e4}, // Hiragana letter YA -> Katakana letter YA
+ {0x3085, 0x30e6}, // Hiragana letter small YU -> Katakana letter YU
+ {0x3086, 0x30e6}, // Hiragana letter YU -> Katakana letter YU
+ {0x3087, 0x30e8}, // Hiragana letter small YO -> Katakana letter YO
+ {0x3088, 0x30e8}, // Hiragana letter YO -> Katakana letter YO
+ // 'ra' row
+ {0x3089, 0x30e9}, // Hiragana letter RA -> Katakana letter RA
+ {0x308a, 0x30ea}, // Hiragana letter RI -> Katakana letter RI
+ {0x308b, 0x30eb}, // Hiragana letter RU -> Katakana letter RU
+ {0x308c, 0x30ec}, // Hiragana letter RE -> Katakana letter RE
+ {0x308d, 0x30ed}, // Hiragana letter RO -> Katakana letter RO
+ // 'wa' row
+ {0x308e, 0x30ef}, // Hiragana letter small WA -> Katakana letter WA
+ {0x308f, 0x30ef}, // Hiragana letter WA -> Katakana letter WA
+ {0x3090, 0x30f0}, // Hiragana letter WI -> Katakana letter WI
+ {0x3091, 0x30f1}, // Hiragana letter WE -> Katakana letter WE
+ {0x3092, 0x30f2}, // Hiragana letter WO -> Katakana letter WO
+ // 'n'
+ {0x3093, 0x30f3}, // Hiragana letter N -> Katakana letter N
+ // 'ga' row
+ {0x304c, 0x30ac}, // Hiragana letter GA -> Katakana letter GA
+ {0x304e, 0x30ae}, // Hiragana letter GI -> Katakana letter GI
+ {0x3050, 0x30b0}, // Hiragana letter GU -> Katakana letter GU
+ {0x3052, 0x30b2}, // Hiragana letter GE -> Katakana letter GE
+ {0x3054, 0x30b4}, // Hiragana letter GO -> Katakana letter GO
+ // 'za' row
+ {0x3056, 0x30b6}, // Hiragana letter ZA -> Katakana letter ZA
+ {0x3058, 0x30b8}, // Hiragana letter ZI -> Katakana letter ZI
+ {0x305a, 0x30ba}, // Hiragana letter ZU -> Katakana letter ZU
+ {0x305c, 0x30bc}, // Hiragana letter ZE -> Katakana letter ZE
+ {0x305e, 0x30be}, // Hiragana letter ZO -> Katakana letter ZO
+ // 'da' row
+ {0x3060, 0x30c0}, // Hiragana letter DA -> Katakana letter DA
+ {0x3062, 0x30c2}, // Hiragana letter DI -> Katakana letter DI
+ {0x3065, 0x30c5}, // Hiragana letter DU -> Katakana letter DU
+ {0x3067, 0x30c7}, // Hiragana letter DE -> Katakana letter DE
+ {0x3069, 0x30c9}, // Hiragana letter DO -> Katakana letter DO
+ // 'ba' row
+ {0x3070, 0x30d0}, // Hiragana letter BA -> Katakana letter BA
+ {0x3073, 0x30d3}, // Hiragana letter BI -> Katakana letter BI
+ {0x3076, 0x30d6}, // Hiragana letter BU -> Katakana letter BU
+ {0x3079, 0x30d9}, // Hiragana letter BE -> Katakana letter BE
+ {0x307c, 0x30dc}, // Hiragana letter BO -> Katakana letter BO
+ // 'pa' row
+ {0x3071, 0x30d1}, // Hiragana letter PA -> Katakana letter PA
+ {0x3074, 0x30d4}, // Hiragana letter PI -> Katakana letter PI
+ {0x3077, 0x30d7}, // Hiragana letter PU -> Katakana letter PU
+ {0x307a, 0x30da}, // Hiragana letter PE -> Katakana letter PE
+ {0x307d, 0x30dd}, // Hiragana letter PO -> Katakana letter PO
+ // Additional Hiragana
+ {0x3094, 0x30f4}, // Hiragana letter VU -> Katakana letter VU
+ // Part 2: Common full-width characters -> ASCII characters.
+ {0xff01, 33}, // ASCII !
+ {0xff02, 34}, // ASCII "
+ {0xff03, 35}, // ASCII #
+ {0xff04, 36}, // ASCII $
+ {0xff05, 37}, // ASCII %
+ {0xff06, 38}, // ASCII &
+ {0xff07, 39}, // ASCII '
+ {0xff08, 40}, // ASCII (
+ {0xff09, 41}, // ASCII )
+ {0xff0a, 42}, // ASCII *
+ {0xff0b, 43}, // ASCII +
+ {0xff0c, 44}, // ASCII ,
+ {0xff0d, 45}, // ASCII -
+ {0xff0e, 46}, // ASCII .
+ {0xff0f, 47}, // ASCII /
+ {0xff10, 48}, // ASCII 0
+ {0xff11, 49}, // ASCII 1
+ {0xff12, 50}, // ASCII 2
+ {0xff13, 51}, // ASCII 3
+ {0xff14, 52}, // ASCII 4
+ {0xff15, 53}, // ASCII 5
+ {0xff16, 54}, // ASCII 6
+ {0xff17, 55}, // ASCII 7
+ {0xff18, 56}, // ASCII 8
+ {0xff19, 57}, // ASCII 9
+ {0xff1a, 58}, // ASCII :
+ {0xff1b, 59}, // ASCII ;
+ {0xff1c, 60}, // ASCII <
+ {0xff1d, 61}, // ASCII =
+ {0xff1e, 62}, // ASCII >
+ {0xff1f, 63}, // ASCII ?
+ {0xff20, 64}, // ASCII @
+ {0xff21, 65}, // ASCII A
+ {0xff22, 66}, // ASCII B
+ {0xff23, 67}, // ASCII C
+ {0xff24, 68}, // ASCII D
+ {0xff25, 69}, // ASCII E
+ {0xff26, 70}, // ASCII F
+ {0xff27, 71}, // ASCII G
+ {0xff28, 72}, // ASCII H
+ {0xff29, 73}, // ASCII I
+ {0xff2a, 74}, // ASCII J
+ {0xff2b, 75}, // ASCII K
+ {0xff2c, 76}, // ASCII L
+ {0xff2d, 77}, // ASCII M
+ {0xff2e, 78}, // ASCII N
+ {0xff2f, 79}, // ASCII O
+ {0xff30, 80}, // ASCII P
+ {0xff31, 81}, // ASCII Q
+ {0xff32, 82}, // ASCII R
+ {0xff33, 83}, // ASCII S
+ {0xff34, 84}, // ASCII T
+ {0xff35, 85}, // ASCII U
+ {0xff36, 86}, // ASCII V
+ {0xff37, 87}, // ASCII W
+ {0xff38, 88}, // ASCII X
+ {0xff39, 89}, // ASCII Y
+ {0xff3a, 90}, // ASCII Z
+ {0xff3b, 91}, // ASCII [
+    {0xff3c, 92},   // ASCII backslash
+ {0xff3d, 93}, // ASCII ]
+ {0xff3e, 94}, // ASCII ^
+ {0xff3f, 95}, // ASCII _
+ {0xff40, 96}, // ASCII `
+ {0xff41, 97}, // ASCII a
+ {0xff42, 98}, // ASCII b
+ {0xff43, 99}, // ASCII c
+ {0xff44, 100}, // ASCII d
+ {0xff45, 101}, // ASCII e
+ {0xff46, 102}, // ASCII f
+ {0xff47, 103}, // ASCII g
+ {0xff48, 104}, // ASCII h
+ {0xff49, 105}, // ASCII i
+ {0xff4a, 106}, // ASCII j
+ {0xff4b, 107}, // ASCII k
+ {0xff4c, 108}, // ASCII l
+ {0xff4d, 109}, // ASCII m
+ {0xff4e, 110}, // ASCII n
+ {0xff4f, 111}, // ASCII o
+ {0xff50, 112}, // ASCII p
+ {0xff51, 113}, // ASCII q
+ {0xff52, 114}, // ASCII r
+ {0xff53, 115}, // ASCII s
+ {0xff54, 116}, // ASCII t
+ {0xff55, 117}, // ASCII u
+ {0xff56, 118}, // ASCII v
+ {0xff57, 119}, // ASCII w
+ {0xff58, 120}, // ASCII x
+ {0xff59, 121}, // ASCII y
+ {0xff5a, 122}, // ASCII z
+ {0xff5b, 123}, // ASCII {
+ {0xff5c, 124}, // ASCII |
+ {0xff5d, 125}, // ASCII }
+ {0xff5e, 126}, // ASCII ~
+ {0x2018, 39}, // Left single quote -> ASCII apostrophe
+ {0x2019, 39}, // Right single quote -> ASCII apostrophe
+ {0x201c, 34}, // Left double quote -> ASCII quote
+ {0x201d, 34}, // Right double quote -> ASCII quote
+ // Part 3: Common ideographic punctuation marks -> ASCII.
+ // Usually used in CJK.
+ {0x3001, 44}, // ASCII ,
+ {0x3002, 46}, // ASCII .
+ // Part 4: Common diacritic Latin characters -> ASCII characters.
+ {0x00c0, 65}, // À -> A
+ {0x00c1, 65}, // Á -> A
+ {0x00c2, 65}, // Â -> A
+ {0x00c3, 65}, // Ã -> A
+ {0x00c4, 65}, // Ä -> A
+ {0x00c5, 65}, // Å -> A
+ {0x00c7, 67}, // Ç -> C
+ {0x00c8, 69}, // È -> E
+ {0x00c9, 69}, // É -> E
+ {0x00ca, 69}, // Ê -> E
+ {0x00cb, 69}, // Ë -> E
+ {0x00cc, 73}, // Ì -> I
+ {0x00cd, 73}, // Í -> I
+ {0x00ce, 73}, // Î -> I
+ {0x00cf, 73}, // Ï -> I
+ {0x00d0, 68}, // Ð -> D
+ {0x00d1, 78}, // Ñ -> N
+ {0x00d2, 79}, // Ò -> O
+ {0x00d3, 79}, // Ó -> O
+ {0x00d4, 79}, // Ô -> O
+ {0x00d5, 79}, // Õ -> O
+ {0x00d6, 79}, // Ö -> O
+ {0x00d8, 79}, // Ø -> O
+ {0x00d9, 85}, // Ù -> U
+ {0x00da, 85}, // Ú -> U
+ {0x00db, 85}, // Û -> U
+ {0x00dc, 85}, // Ü -> U
+ {0x00dd, 89}, // Ý -> Y
+ {0x00e0, 97}, // à -> a
+ {0x00e1, 97}, // á -> a
+ {0x00e2, 97}, // â -> a
+ {0x00e3, 97}, // ã -> a
+ {0x00e4, 97}, // ä -> a
+ {0x00e5, 97}, // å -> a
+ {0x00e7, 99}, // ç -> c
+ {0x00e8, 101}, // è -> e
+ {0x00e9, 101}, // é -> e
+ {0x00ea, 101}, // ê -> e
+ {0x00eb, 101}, // ë -> e
+ {0x00ec, 105}, // ì -> i
+ {0x00ed, 105}, // í -> i
+ {0x00ee, 105}, // î -> i
+ {0x00ef, 105}, // ï -> i
+ {0x00f0, 100}, // ð -> d
+ {0x00f1, 110}, // ñ -> n
+ {0x00f2, 111}, // ò -> o
+ {0x00f3, 111}, // ó -> o
+ {0x00f4, 111}, // ô -> o
+ {0x00f5, 111}, // õ -> o
+ {0x00f6, 111}, // ö -> o
+ {0x00f8, 111}, // ø -> o
+ {0x00f9, 117}, // ù -> u
+ {0x00fa, 117}, // ú -> u
+ {0x00fb, 117}, // û -> u
+ {0x00fc, 117}, // ü -> u
+ {0x00fd, 121}, // ý -> y
+ {0x00ff, 121}, // ÿ -> y
+ {0x0100, 65}, // Ā -> A
+ {0x0101, 97}, // ā -> a
+ {0x0102, 65}, // Ă -> A
+ {0x0103, 97}, // ă -> a
+ {0x0104, 65}, // Ą -> A
+ {0x0105, 97}, // ą -> a
+ {0x0106, 67}, // Ć -> C
+ {0x0107, 99}, // ć -> c
+ {0x0108, 67}, // Ĉ -> C
+ {0x0109, 99}, // ĉ -> c
+ {0x010a, 67}, // Ċ -> C
+ {0x010b, 99}, // ċ -> c
+ {0x010c, 67}, // Č -> C
+ {0x010d, 99}, // č -> c
+ {0x010e, 68}, // Ď -> D
+ {0x010f, 100}, // ď -> d
+ {0x0110, 68}, // Đ -> D
+ {0x0111, 100}, // đ -> d
+ {0x0112, 69}, // Ē -> E
+ {0x0113, 101}, // ē -> e
+ {0x0114, 69}, // Ĕ -> E
+ {0x0115, 101}, // ĕ -> e
+ {0x0116, 69}, // Ė -> E
+ {0x0117, 101}, // ė -> e
+ {0x0118, 69}, // Ę -> E
+ {0x0119, 101}, // ę -> e
+ {0x011a, 69}, // Ě -> E
+ {0x011b, 101}, // ě -> e
+ {0x011c, 71}, // Ĝ -> G
+ {0x011d, 103}, // ĝ -> g
+ {0x011e, 71}, // Ğ -> G
+ {0x011f, 103}, // ğ -> g
+ {0x0120, 71}, // Ġ -> G
+ {0x0121, 103}, // ġ -> g
+ {0x0122, 71}, // Ģ -> G
+ {0x0123, 103}, // ģ -> g
+ {0x0124, 72}, // Ĥ -> H
+ {0x0125, 104}, // ĥ -> h
+ {0x0126, 72}, // Ħ -> H
+ {0x0127, 104}, // ħ -> h
+ {0x0128, 73}, // Ĩ -> I
+ {0x0129, 105}, // ĩ -> i
+ {0x012a, 73}, // Ī -> I
+ {0x012b, 105}, // ī -> i
+ {0x012c, 73}, // Ĭ -> I
+ {0x012d, 105}, // ĭ -> i
+ {0x012e, 73}, // Į -> I
+ {0x012f, 105}, // į -> i
+ {0x0130, 73}, // İ -> I
+ {0x0131, 105}, // ı -> i
+ {0x0134, 74}, // Ĵ -> J
+ {0x0135, 106}, // ĵ -> j
+ {0x0136, 75}, // Ķ -> K
+ {0x0137, 107}, // ķ -> k
+ {0x0139, 76}, // Ĺ -> L
+ {0x013a, 108}, // ĺ -> l
+ {0x013b, 76}, // Ļ -> L
+ {0x013c, 108}, // ļ -> l
+ {0x013d, 76}, // Ľ -> L
+ {0x013e, 108}, // ľ -> l
+ {0x013f, 76}, // Ŀ -> L
+ {0x0140, 108}, // ŀ -> l
+ {0x0141, 76}, // Ł -> L
+ {0x0142, 108}, // ł -> l
+ {0x0143, 78}, // Ń -> N
+ {0x0144, 110}, // ń -> n
+ {0x0145, 78}, // Ņ -> N
+ {0x0146, 110}, // ņ -> n
+ {0x0147, 78}, // Ň -> N
+ {0x0148, 110}, // ň -> n
+ {0x014a, 78}, // Ŋ -> N
+ {0x014b, 110}, // ŋ -> n
+ {0x014c, 79}, // Ō -> O
+ {0x014d, 111}, // ō -> o
+ {0x014e, 79}, // Ŏ -> O
+ {0x014f, 111}, // ŏ -> o
+ {0x0150, 79}, // Ő -> O
+ {0x0151, 111}, // ő -> o
+ {0x0154, 82}, // Ŕ -> R
+ {0x0155, 114}, // ŕ -> r
+ {0x0156, 82}, // Ŗ -> R
+ {0x0157, 114}, // ŗ -> r
+ {0x0158, 82}, // Ř -> R
+ {0x0159, 114}, // ř -> r
+ {0x015a, 83}, // Ś -> S
+ {0x015b, 115}, // ś -> s
+ {0x015c, 83}, // Ŝ -> S
+ {0x015d, 115}, // ŝ -> s
+ {0x015e, 83}, // Ş -> S
+ {0x015f, 115}, // ş -> s
+ {0x0160, 83}, // Š -> S
+ {0x0161, 115}, // š -> s
+ {0x0162, 84}, // Ţ -> T
+ {0x0163, 116}, // ţ -> t
+ {0x0164, 84}, // Ť -> T
+ {0x0165, 116}, // ť -> t
+ {0x0166, 84}, // Ŧ -> T
+ {0x0167, 116}, // ŧ -> t
+ {0x0168, 85}, // Ũ -> U
+ {0x0169, 117}, // ũ -> u
+ {0x016a, 85}, // Ū -> U
+ {0x016b, 117}, // ū -> u
+ {0x016c, 85}, // Ŭ -> U
+ {0x016d, 117}, // ŭ -> u
+ {0x016e, 85}, // Ů -> U
+ {0x016f, 117}, // ů -> u
+ {0x0170, 85}, // Ű -> U
+ {0x0171, 117}, // ű -> u
+ {0x0172, 85}, // Ų -> U
+ {0x0173, 117}, // ų -> u
+ {0x0174, 87}, // Ŵ -> W
+ {0x0175, 119}, // ŵ -> w
+ {0x0176, 89}, // Ŷ -> Y
+ {0x0177, 121}, // ŷ -> y
+ {0x0178, 89}, // Ÿ -> Y
+ {0x0179, 90}, // Ź -> Z
+ {0x017a, 122}, // ź -> z
+ {0x017b, 90}, // Ż -> Z
+ {0x017c, 122}, // ż -> z
+ {0x017d, 90}, // Ž -> Z
+ {0x017e, 122}, // ž -> z
+ {0x01a0, 79}, // Ơ -> O
+ {0x01a1, 111}, // ơ -> o
+ {0x01af, 85}, // Ư -> U
+ {0x01b0, 117}, // ư -> u
+ {0x01b5, 90}, // Ƶ -> Z
+ {0x01b6, 122}, // ƶ -> z
+ {0x0218, 83}, // Ș -> S
+ {0x0219, 115}, // ș -> s
+ {0x021a, 84}, // Ț -> T
+ {0x021b, 116}, // ț -> t
+ {0x1e00, 65}, // Ḁ -> A
+ {0x1e01, 97}, // ḁ -> a
+ {0x1e02, 66}, // Ḃ -> B
+ {0x1e03, 98}, // ḃ -> b
+ {0x1e04, 66}, // Ḅ -> B
+ {0x1e05, 98}, // ḅ -> b
+ {0x1e06, 66}, // Ḇ -> B
+ {0x1e07, 98}, // ḇ -> b
+ {0x1e08, 67}, // Ḉ -> C
+ {0x1e09, 99}, // ḉ -> c
+ {0x1e0a, 68}, // Ḋ -> D
+ {0x1e0b, 100}, // ḋ -> d
+ {0x1e0c, 68}, // Ḍ -> D
+ {0x1e0d, 100}, // ḍ -> d
+ {0x1e0e, 68}, // Ḏ -> D
+ {0x1e0f, 100}, // ḏ -> d
+ {0x1e10, 68}, // Ḑ -> D
+ {0x1e11, 100}, // ḑ -> d
+ {0x1e12, 68}, // Ḓ -> D
+ {0x1e13, 100}, // ḓ -> d
+ {0x1e14, 69}, // Ḕ -> E
+ {0x1e15, 101}, // ḕ -> e
+ {0x1e16, 69}, // Ḗ -> E
+ {0x1e17, 101}, // ḗ -> e
+ {0x1e18, 69}, // Ḙ -> E
+ {0x1e19, 101}, // ḙ -> e
+ {0x1e1a, 69}, // Ḛ -> E
+ {0x1e1b, 101}, // ḛ -> e
+ {0x1e1c, 69}, // Ḝ -> E
+ {0x1e1d, 101}, // ḝ -> e
+ {0x1e1e, 70}, // Ḟ -> F
+ {0x1e1f, 102}, // ḟ -> f
+ {0x1e20, 71}, // Ḡ -> G
+ {0x1e21, 103}, // ḡ -> g
+ {0x1e22, 72}, // Ḣ -> H
+ {0x1e23, 104}, // ḣ -> h
+ {0x1e24, 72}, // Ḥ -> H
+ {0x1e25, 104}, // ḥ -> h
+ {0x1e26, 72}, // Ḧ -> H
+ {0x1e27, 104}, // ḧ -> h
+ {0x1e28, 72}, // Ḩ -> H
+ {0x1e29, 104}, // ḩ -> h
+ {0x1e2a, 72}, // Ḫ -> H
+ {0x1e2b, 104}, // ḫ -> h
+ {0x1e2c, 73}, // Ḭ -> I
+ {0x1e2d, 105}, // ḭ -> i
+ {0x1e2e, 73}, // Ḯ -> I
+ {0x1e2f, 105}, // ḯ -> i
+ {0x1e30, 75}, // Ḱ -> K
+ {0x1e31, 107}, // ḱ -> k
+ {0x1e32, 75}, // Ḳ -> K
+ {0x1e33, 107}, // ḳ -> k
+ {0x1e34, 75}, // Ḵ -> K
+ {0x1e35, 107}, // ḵ -> k
+ {0x1e36, 76}, // Ḷ -> L
+ {0x1e37, 108}, // ḷ -> l
+ {0x1e38, 76}, // Ḹ -> L
+ {0x1e39, 108}, // ḹ -> l
+ {0x1e3b, 108}, // ḻ -> l
+ {0x1e3c, 76}, // Ḽ -> L
+ {0x1e3d, 108}, // ḽ -> l
+ {0x1e3e, 77}, // Ḿ -> M
+ {0x1e3f, 109}, // ḿ -> m
+ {0x1e40, 77}, // Ṁ -> M
+ {0x1e41, 109}, // ṁ -> m
+ {0x1e42, 77}, // Ṃ -> M
+ {0x1e43, 109}, // ṃ -> m
+ {0x1e44, 78}, // Ṅ -> N
+ {0x1e45, 110}, // ṅ -> n
+ {0x1e46, 78}, // Ṇ -> N
+ {0x1e47, 110}, // ṇ -> n
+ {0x1e48, 78}, // Ṉ -> N
+ {0x1e49, 110}, // ṉ -> n
+ {0x1e4a, 78}, // Ṋ -> N
+ {0x1e4b, 110}, // ṋ -> n
+ {0x1e4c, 79}, // Ṍ -> O
+ {0x1e4d, 111}, // ṍ -> o
+ {0x1e4e, 79}, // Ṏ -> O
+ {0x1e4f, 111}, // ṏ -> o
+ {0x1e50, 79}, // Ṑ -> O
+ {0x1e51, 111}, // ṑ -> o
+ {0x1e52, 79}, // Ṓ -> O
+ {0x1e53, 111}, // ṓ -> o
+ {0x1e54, 80}, // Ṕ -> P
+ {0x1e55, 112}, // ṕ -> p
+ {0x1e56, 80}, // Ṗ -> P
+ {0x1e57, 112}, // ṗ -> p
+ {0x1e58, 82}, // Ṙ -> R
+ {0x1e59, 114}, // ṙ -> r
+ {0x1e5a, 82}, // Ṛ -> R
+ {0x1e5b, 114}, // ṛ -> r
+ {0x1e5c, 82}, // Ṝ -> R
+ {0x1e5d, 114}, // ṝ -> r
+ {0x1e5e, 82}, // Ṟ -> R
+ {0x1e5f, 114}, // ṟ -> r
+ {0x1e60, 83}, // Ṡ -> S
+ {0x1e61, 115}, // ṡ -> s
+ {0x1e62, 83}, // Ṣ -> S
+ {0x1e63, 115}, // ṣ -> s
+ {0x1e64, 83}, // Ṥ -> S
+ {0x1e65, 115}, // ṥ -> s
+ {0x1e66, 83}, // Ṧ -> S
+ {0x1e67, 115}, // ṧ -> s
+ {0x1e68, 83}, // Ṩ -> S
+ {0x1e69, 115}, // ṩ -> s
+ {0x1e6a, 84}, // Ṫ -> T
+ {0x1e6b, 116}, // ṫ -> t
+ {0x1e6c, 84}, // Ṭ -> T
+ {0x1e6d, 116}, // ṭ -> t
+ {0x1e6e, 84}, // Ṯ -> T
+ {0x1e6f, 116}, // ṯ -> t
+ {0x1e70, 84}, // Ṱ -> T
+ {0x1e71, 116}, // ṱ -> t
+ {0x1e72, 85}, // Ṳ -> U
+ {0x1e73, 117}, // ṳ -> u
+ {0x1e74, 85}, // Ṵ -> U
+ {0x1e75, 117}, // ṵ -> u
+ {0x1e76, 85}, // Ṷ -> U
+ {0x1e77, 117}, // ṷ -> u
+ {0x1e78, 85}, // Ṹ -> U
+ {0x1e79, 117}, // ṹ -> u
+ {0x1e7a, 85}, // Ṻ -> U
+ {0x1e7b, 117}, // ṻ -> u
+ {0x1e7c, 86}, // Ṽ -> V
+ {0x1e7d, 118}, // ṽ -> v
+ {0x1e7e, 86}, // Ṿ -> V
+ {0x1e7f, 118}, // ṿ -> v
+ {0x1e80, 87}, // Ẁ -> W
+ {0x1e81, 119}, // ẁ -> w
+ {0x1e82, 87}, // Ẃ -> W
+ {0x1e83, 119}, // ẃ -> w
+ {0x1e84, 87}, // Ẅ -> W
+ {0x1e85, 119}, // ẅ -> w
+ {0x1e86, 87}, // Ẇ -> W
+ {0x1e87, 119}, // ẇ -> w
+ {0x1e88, 87}, // Ẉ -> W
+ {0x1e89, 119}, // ẉ -> w
+ {0x1e8a, 88}, // Ẋ -> X
+ {0x1e8b, 120}, // ẋ -> x
+ {0x1e8c, 88}, // Ẍ -> X
+ {0x1e8d, 120}, // ẍ -> x
+ {0x1e8e, 89}, // Ẏ -> Y
+ {0x1e8f, 121}, // ẏ -> y
+ {0x1e90, 90}, // Ẑ -> Z
+ {0x1e91, 122}, // ẑ -> z
+ {0x1e92, 90}, // Ẓ -> Z
+ {0x1e93, 122}, // ẓ -> z
+ {0x1e94, 90}, // Ẕ -> Z
+ {0x1e95, 122}, // ẕ -> z
+ {0x1e96, 104}, // ẖ -> h
+ {0x1e97, 116}, // ẗ -> t
+ {0x1e98, 119}, // ẘ -> w
+ {0x1e99, 121}, // ẙ -> y
+ {0x1e9a, 97}, // ẚ -> a
+ {0x1e9b, 102}, // ẛ -> f
+ {0x1ea0, 65}, // Ạ -> A
+ {0x1ea1, 97}, // ạ -> a
+ {0x1ea2, 65}, // Ả -> A
+ {0x1ea3, 97}, // ả -> a
+ {0x1ea4, 65}, // Ấ -> A
+ {0x1ea5, 97}, // ấ -> a
+ {0x1ea6, 65}, // Ầ -> A
+ {0x1ea7, 97}, // ầ -> a
+ {0x1ea8, 65}, // Ẩ -> A
+ {0x1ea9, 97}, // ẩ -> a
+ {0x1eaa, 65}, // Ẫ -> A
+ {0x1eab, 97}, // ẫ -> a
+ {0x1eac, 65}, // Ậ -> A
+ {0x1ead, 97}, // ậ -> a
+ {0x1eae, 65}, // Ắ -> A
+ {0x1eaf, 97}, // ắ -> a
+ {0x1eb0, 65}, // Ằ -> A
+ {0x1eb1, 97}, // ằ -> a
+ {0x1eb2, 65}, // Ẳ -> A
+ {0x1eb3, 97}, // ẳ -> a
+ {0x1eb4, 65}, // Ẵ -> A
+ {0x1eb5, 97}, // ẵ -> a
+ {0x1eb6, 65}, // Ặ -> A
+ {0x1eb7, 97}, // ặ -> a
+ {0x1eb8, 69}, // Ẹ -> E
+ {0x1eb9, 101}, // ẹ -> e
+ {0x1eba, 69}, // Ẻ -> E
+ {0x1ebb, 101}, // ẻ -> e
+ {0x1ebc, 69}, // Ẽ -> E
+ {0x1ebd, 101}, // ẽ -> e
+ {0x1ebe, 69}, // Ế -> E
+ {0x1ebf, 101}, // ế -> e
+ {0x1ec0, 69}, // Ề -> E
+ {0x1ec1, 101}, // ề -> e
+ {0x1ec2, 69}, // Ể -> E
+ {0x1ec3, 101}, // ể -> e
+ {0x1ec4, 69}, // Ễ -> E
+ {0x1ec5, 101}, // ễ -> e
+ {0x1ec6, 69}, // Ệ -> E
+ {0x1ec7, 101}, // ệ -> e
+ {0x1ec8, 73}, // Ỉ -> I
+ {0x1ec9, 105}, // ỉ -> i
+ {0x1eca, 73}, // Ị -> I
+ {0x1ecb, 105}, // ị -> i
+ {0x1ecc, 79}, // Ọ -> O
+ {0x1ecd, 111}, // ọ -> o
+ {0x1ece, 79}, // Ỏ -> O
+ {0x1ecf, 111}, // ỏ -> o
+ {0x1ed0, 79}, // Ố -> O
+ {0x1ed1, 111}, // ố -> o
+ {0x1ed2, 79}, // Ồ -> O
+ {0x1ed3, 111}, // ồ -> o
+ {0x1ed4, 79}, // Ổ -> O
+ {0x1ed5, 111}, // ổ -> o
+ {0x1ed6, 79}, // Ỗ -> O
+ {0x1ed7, 111}, // ỗ -> o
+ {0x1ed8, 79}, // Ộ -> O
+ {0x1ed9, 111}, // ộ -> o
+ {0x1eda, 79}, // Ớ -> O
+ {0x1edb, 111}, // ớ -> o
+ {0x1edc, 79}, // Ờ -> O
+ {0x1edd, 111}, // ờ -> o
+ {0x1ede, 79}, // Ở -> O
+ {0x1edf, 111}, // ở -> o
+ {0x1ee0, 79}, // Ỡ -> O
+ {0x1ee1, 111}, // ỡ -> o
+ {0x1ee2, 79}, // Ợ -> O
+ {0x1ee3, 111}, // ợ -> o
+ {0x1ee4, 85}, // Ụ -> U
+ {0x1ee5, 117}, // ụ -> u
+ {0x1ee6, 85}, // Ủ -> U
+ {0x1ee7, 117}, // ủ -> u
+ {0x1ee8, 85}, // Ứ -> U
+ {0x1ee9, 117}, // ứ -> u
+ {0x1eea, 85}, // Ừ -> U
+ {0x1eeb, 117}, // ừ -> u
+ {0x1eec, 85}, // Ử -> U
+ {0x1eed, 117}, // ử -> u
+ {0x1eee, 85}, // Ữ -> U
+ {0x1eef, 117}, // ữ -> u
+ {0x1ef0, 85}, // Ự -> U
+ {0x1ef1, 117}, // ự -> u
+ {0x1ef2, 89}, // Ỳ -> Y
+ {0x1ef3, 121}, // ỳ -> y
+ {0x1ef4, 89}, // Ỵ -> Y
+ {0x1ef5, 121}, // ỵ -> y
+ {0x1ef6, 89}, // Ỷ -> Y
+ {0x1ef7, 121}, // ỷ -> y
+ {0x1ef8, 89}, // Ỹ -> Y
+ {0x1ef9, 121}, // ỹ -> y
+};
+
+} // namespace
+
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() {
+ // The map is allocated dynamically the first time this function is executed.
+ static const std::unordered_map<char16_t, char16_t> normalization_map = [] {
+ std::unordered_map<char16_t, char16_t> map;
+ // Size of all the mappings is about 2.5 KiB.
+ constexpr int numMappings =
+ sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
+ map.reserve(numMappings);
+ for (size_t i = 0; i < numMappings; ++i) {
+ map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to);
+ }
+ return map;
+ }();
+
+ return normalization_map;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/i18n-test-utils.h b/icing/transform/map/normalization-map.h
similarity index 60%
copy from icing/testing/i18n-test-utils.h
copy to icing/transform/map/normalization-map.h
index 4e8a3b8..aea85bd 100644
--- a/icing/testing/i18n-test-utils.h
+++ b/icing/transform/map/normalization-map.h
@@ -12,19 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TESTING_I18N_TEST_UTILS_H_
-#define ICING_TESTING_I18N_TEST_UTILS_H_
+#ifndef ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_
+#define ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_
-#include <string>
-
-#include "unicode/umachine.h"
+#include <unordered_map>
namespace icing {
namespace lib {
-std::string UcharToString(UChar32 uchar);
+// Returns a map containing normalization mappings. A mapping (A -> B) means
+// that we'll transform every character 'A' into 'B'. See normalization-map.cc
+// for mapping details.
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap();
} // namespace lib
} // namespace icing
-#endif // ICING_TESTING_I18N_TEST_UTILS_H_
+#endif // ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_
diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h
new file mode 100644
index 0000000..f1f3f62
--- /dev/null
+++ b/icing/transform/normalizer-factory.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_NORMALIZER_FACTORY_H_
+#define ICING_TRANSFORM_NORMALIZER_FACTORY_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates a normalizer. max_term_byte_size enforces the max size of text after
+// normalization; text will be truncated if it exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size);
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_NORMALIZER_FACTORY_H_
diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h
index 3f2b624..4cbfa63 100644
--- a/icing/transform/normalizer.h
+++ b/icing/transform/normalizer.h
@@ -20,92 +20,25 @@
#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "unicode/unorm2.h"
-#include "unicode/utrans.h"
namespace icing {
namespace lib {
-// Used to normalize UTF8 strings for text matching. It enforces a set of rules:
-// 1. Transforms text to be lowercase UTF8.
-// 2. Transforms full-width Latin characters to ASCII characters if possible.
-// 3. Transforms hiragana to katakana.
-// 4. Removes accent / diacritic marks on Latin characters
-// 5. Normalized text must be less than or equal to max_term_byte_size,
-// otherwise it will be truncated.
-//
-// There're some other rules from ICU not listed here, please see .cc file for
-// details.
+// Normalizes strings for text matching.
//
// Example use:
// ICING_ASSIGN_OR_RETURN(auto normalizer,
-// Normalizer::Create(/*max_term_byte_size=*/5);
+// normalizer_factory::Create(/*max_term_byte_size=*/5);
//
// std::string normalized_text = normalizer->NormalizeText("HELLO!");
// ICING_LOG(INFO) << normalized_text; // prints "hello"
class Normalizer {
public:
- Normalizer(const Normalizer&) = delete;
- Normalizer& operator=(const Normalizer&) = delete;
+ virtual ~Normalizer() = default;
- // Creates a normalizer with the subcomponents it needs. max_term_byte_size
- // enforces the max size of text after normalization, text will be truncated
- // if exceeds the max size.
- //
- // Returns:
- // A normalizer on success
- // INVALID_ARGUMENT if max_term_byte_size <= 0
- // INTERNAL_ERROR if failed to create any subcomponent
- static libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
- int max_term_byte_size);
-
- // Normalizes the input term based on rules. See .cc file for rule details.
- //
- // NOTE: Term should not mix Latin and non-Latin characters. Doing so may
- // result in the non-Latin characters not properly being normalized
- std::string NormalizeTerm(std::string_view term) const;
-
- private:
- // A handler class that helps manage the lifecycle of UTransliterator. It's
- // used in Normalizer to transform terms into the formats we need.
- class TermTransformer {
- public:
- // Creates TermTransformer with a valid UTransliterator instance
- //
- // Returns:
- // A term transformer on success
- // INTERNAL_ERROR if failed to create any subcomponent
- static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>>
- Create();
-
- // Closes the UTransliterator instance
- ~TermTransformer();
-
- // Transforms the text based on our rules described at top of this file
- std::string Transform(std::string_view term) const;
-
- private:
- explicit TermTransformer(UTransliterator* u_transliterator);
-
- // An ICU class to execute custom term transformation / normalization rules.
- // utrans_close() must by called after using.
- UTransliterator* u_transliterator_;
- };
-
- explicit Normalizer(std::unique_ptr<TermTransformer> term_transformer,
- int max_term_byte_size);
-
- // Helper method to normalize Latin terms only. Rules applied:
- // 1. Uppercase to lowercase
- // 2. Remove diacritic (accent) marks
- std::string NormalizeLatin(const UNormalizer2* normalizer2,
- std::string_view term) const;
-
- // Used to transform terms into their normalized forms.
- std::unique_ptr<TermTransformer> term_transformer_;
-
- // The maximum term length allowed after normalization.
- const int max_term_byte_size_;
+ // Normalizes the input term based on rules. See implementation classes for
+ // specific transformation rules.
+ virtual std::string NormalizeTerm(std::string_view term) const = 0;
};
} // namespace lib
diff --git a/icing/transform/normalizer_test.cc b/icing/transform/normalizer_test.cc
deleted file mode 100644
index ec0a782..0000000
--- a/icing/transform/normalizer_test.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/transform/normalizer.h"
-
-#include <memory>
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/i18n-test-utils.h"
-#include "icing/testing/test-data.h"
-
-namespace icing {
-namespace lib {
-namespace {
-using ::testing::Eq;
-
-class NormalizerTest : public testing::Test {
- protected:
- void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- SetUpICUDataFile("icing/icu.dat"));
-
- ICING_ASSERT_OK_AND_ASSIGN(normalizer_, Normalizer::Create(
- /*max_term_byte_size=*/1024));
- }
-
- std::unique_ptr<Normalizer> normalizer_;
-};
-
-TEST_F(NormalizerTest, Creation) {
- EXPECT_THAT(Normalizer::Create(5), IsOk());
- EXPECT_THAT(Normalizer::Create(0),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(Normalizer::Create(-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-// Strings that are already normalized won't change if normalized again.
-TEST_F(NormalizerTest, AlreadyNormalized) {
- EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq(""));
- EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world"));
- EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好"));
- EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス"));
- EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요"));
-}
-
-TEST_F(NormalizerTest, UppercaseToLowercase) {
- EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi"));
- EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing"));
-}
-
-TEST_F(NormalizerTest, LatinLetterRemoveAccent) {
- EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich"));
- EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi"));
- EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias"));
- EXPECT_THAT(normalizer_->NormalizeTerm("āăąḃḅḇčćç"), Eq("aaabbbccc"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ÁȦÄḂḄḆĆČḈ"), Eq("aaabbbccc"));
-}
-
-// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
-// Japanese and Greek
-TEST_F(NormalizerTest, NonLatinLetterNotRemoveAccent) {
- EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
- EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
-}
-
-TEST_F(NormalizerTest, FullWidthCharsToASCII) {
- // Full-width punctuation to ASCII punctuation
- EXPECT_THAT(normalizer_->NormalizeTerm("。,!?:”"), Eq(".,!?:\""));
- // 0xff10 is the full-width number 0
- EXPECT_THAT(normalizer_->NormalizeTerm(UcharToString(0xff10)), Eq("0"));
- // 0xff21 is the full-width letter A
- EXPECT_THAT(normalizer_->NormalizeTerm(UcharToString(0xff21)), Eq("a"));
- // 0xff41 is the full-width letter a
- EXPECT_THAT(normalizer_->NormalizeTerm(UcharToString(0xff41)), Eq("a"));
-}
-
-// For Katakana, each character is normalized to its full-width version.
-TEST_F(NormalizerTest, KatakanaHalfWidthToFullWidth) {
- EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ"));
-}
-
-TEST_F(NormalizerTest, HiraganaToKatakana) {
- EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ぎゃぎゅぎょ"), Eq("ギャギュギョ"));
-}
-
-TEST_F(NormalizerTest, SuperscriptAndSubscriptToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9"));
- EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9"));
-}
-
-TEST_F(NormalizerTest, CircledCharsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1"));
- EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a"));
-}
-
-TEST_F(NormalizerTest, RotatedCharsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{"));
- EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}"));
-}
-
-TEST_F(NormalizerTest, SquaredCharsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート"));
-}
-
-TEST_F(NormalizerTest, FractionsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4"));
- EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6"));
-}
-
-TEST_F(NormalizerTest, Truncate) {
- {
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer,
- Normalizer::Create(/*max_term_byte_size=*/5));
-
- // Won't be truncated
- EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
- EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
-
- // Truncated to length 5.
- EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
-
- // Each Japanese character has 3 bytes, so truncating to length 5 results in
- // only 1 character.
- EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ"));
-
- // Each Greek character has 2 bytes, so truncating to length 5 results in 2
- // character.
- EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ"));
- }
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer,
- Normalizer::Create(/*max_term_byte_size=*/2));
- // The Japanese character has 3 bytes, truncating it results in an empty
- // string.
- EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq(""));
- }
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc
new file mode 100644
index 0000000..6b35270
--- /dev/null
+++ b/icing/transform/simple/none-normalizer-factory.cc
@@ -0,0 +1,53 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
+#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/normalizer.h"
+#include "icing/transform/simple/none-normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates a dummy normalizer. The term is not normalized, but
+// the text will be truncated to max_term_byte_size if it exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size) {
+ if (max_term_byte_size <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "max_term_byte_size must be greater than zero.");
+ }
+
+ return std::make_unique<NoneNormalizer>(max_term_byte_size);
+}
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
diff --git a/icing/transform/simple/none-normalizer.h b/icing/transform/simple/none-normalizer.h
new file mode 100644
index 0000000..47085e1
--- /dev/null
+++ b/icing/transform/simple/none-normalizer.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
+#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
+
+#include <string>
+#include <string_view>
+
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+// This normalizer is not meant for production use. Currently only used to get
+// the Icing library to compile in Jetpack.
+//
+// No normalization is done, but the term is truncated if it exceeds
+// max_term_byte_size.
+class NoneNormalizer : public Normalizer {
+ public:
+ explicit NoneNormalizer(int max_term_byte_size)
+ : max_term_byte_size_(max_term_byte_size){};
+
+ std::string NormalizeTerm(std::string_view term) const override {
+ if (term.length() > max_term_byte_size_) {
+ return std::string(term.substr(0, max_term_byte_size_));
+ }
+ return std::string(term);
+ }
+
+ private:
+ // The maximum term length allowed after normalization.
+ int max_term_byte_size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
diff --git a/icing/transform/simple/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc
new file mode 100644
index 0000000..e074828
--- /dev/null
+++ b/icing/transform/simple/none-normalizer_test.cc
@@ -0,0 +1,74 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(NoneNormalizerTest, Creation) {
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/5),
+ IsOk());
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(IcuNormalizerTest, NoNormalizationDone) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+ EXPECT_THAT(normalizer->NormalizeTerm(""), Eq(""));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world"));
+
+ // Capitalization
+ EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("MDI"));
+
+ // Accents
+ EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("Zürich"));
+
+ // Full-width punctuation to ASCII punctuation
+ EXPECT_THAT(normalizer->NormalizeTerm("。,!?:”"), Eq("。,!?:”"));
+
+ // Half-width katakana
+ EXPECT_THAT(normalizer->NormalizeTerm("カ"), Eq("カ"));
+}
+
+TEST(NoneNormalizerTest, Truncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/5));
+
+ // Won't be truncated
+ EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+ // Truncated to length 5.
+ EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/clock.cc b/icing/util/clock.cc
new file mode 100644
index 0000000..3593f13
--- /dev/null
+++ b/icing/util/clock.cc
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/clock.h"
+
+#include <chrono> // NOLINT. Abseil library is not available in AOSP so we have
+ // to use chrono to get current time in milliseconds.
+
+namespace icing {
+namespace lib {
+
+int64_t Clock::GetSystemTimeMilliseconds() const {
+ return std::chrono::duration_cast<std::chrono::milliseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
+uint64_t GetSteadyTimeNanoseconds() {
+ return std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::steady_clock::now().time_since_epoch())
+ .count();
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/clock.h b/icing/util/clock.h
index fa98960..58628f3 100644
--- a/icing/util/clock.h
+++ b/icing/util/clock.h
@@ -15,8 +15,7 @@
#ifndef ICING_UTIL_CLOCK_H_
#define ICING_UTIL_CLOCK_H_
-#include <chrono> // NOLINT. Abseil library is not available in AOSP so we have
- // to use chrono to get current time in milliseconds.
+#include <cstdint>
namespace icing {
namespace lib {
@@ -29,13 +28,14 @@
// Returns the current time in milliseconds, it's guaranteed that the return
// value is non-negative.
- virtual int64_t GetSystemTimeMilliseconds() const {
- return std::chrono::duration_cast<std::chrono::milliseconds>(
- std::chrono::system_clock::now().time_since_epoch())
- .count();
- }
+ virtual int64_t GetSystemTimeMilliseconds() const;
};
+// Returns the current steady time in nanoseconds. The steady clock is different
+// from the system clock. It's monotonic and never returns a lower value than a
+// previous call, while a system clock can be occasionally adjusted.
+uint64_t GetSteadyTimeNanoseconds();
+
} // namespace lib
} // namespace icing
diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc
index 5b588e7..36b84f8 100644
--- a/icing/util/document-validator.cc
+++ b/icing/util/document-validator.cc
@@ -72,11 +72,9 @@
const SchemaTypeConfigProto* type_config =
std::move(type_config_or).ValueOrDie();
- int32_t num_required_properties_expected = 0;
int32_t num_required_properties_actual = 0;
- PropertyConfigMap property_config_map;
- SchemaUtil::BuildPropertyConfigMap(*type_config, &property_config_map,
- &num_required_properties_expected);
+ SchemaUtil::ParsedPropertyConfigs parsed_property_configs =
+ SchemaUtil::ParsePropertyConfigs(*type_config);
std::unordered_set<std::string_view> unique_properties;
for (const PropertyProto& property : document.properties()) {
@@ -93,8 +91,9 @@
document.namespace_(), ", ", document.uri(), ")."));
}
- const auto& property_iter = property_config_map.find(property.name());
- if (property_iter == property_config_map.end()) {
+ const auto& property_iter =
+ parsed_property_configs.property_config_map.find(property.name());
+ if (property_iter == parsed_property_configs.property_config_map.end()) {
return absl_ports::NotFoundError(absl_ports::StrCat(
"Property config '", property.name(), "' not found for key: (",
document.namespace_(), ", ", document.uri(), ")."));
@@ -165,7 +164,8 @@
}
}
}
- if (num_required_properties_actual < num_required_properties_expected) {
+ if (num_required_properties_actual <
+ parsed_property_configs.num_required_properties) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("One or more required fields missing for key: (",
document.namespace_(), ", ", document.uri(), ")."));
diff --git a/icing/util/document-validator.h b/icing/util/document-validator.h
index d009b5c..34a3217 100644
--- a/icing/util/document-validator.h
+++ b/icing/util/document-validator.h
@@ -60,9 +60,11 @@
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if no schema is set yet
// INVALID_ARGUMENT if any of case 1, 2, 3, 5, 8, 9, 10, 11, 12, 13 fails
// NOT_FOUND if case 4 or 7 fails
// ALREADY_EXISTS if case 6 fails
+ // INTERNAL on any I/O error
libtextclassifier3::Status Validate(const DocumentProto& document);
void UpdateSchemaStore(const SchemaStore* schema_store) {
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index 9838310..9cf992f 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -14,24 +14,34 @@
#include "icing/util/i18n-utils.h"
-#include <sys/types.h>
-
#include <cctype>
-#include <string>
#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/util/logging.h"
+#include "unicode/uchar.h"
#include "unicode/umachine.h"
-#include "unicode/unorm2.h"
#include "unicode/ustring.h"
+#include "unicode/utf16.h"
#include "unicode/utf8.h"
+#include "unicode/utypes.h"
namespace icing {
namespace lib {
namespace i18n_utils {
+namespace {
+
+// All ASCII punctuation that's also in a Unicode Punctuation category
+// (https://www.fileformat.info/info/unicode/category/index.htm). The set of
+// characters that are regarded as punctuation is not the same for std::ispunct
+// and u_ispunct.
+const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])";
+
+} // namespace
+
libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
const std::u16string& utf16_string) {
std::string utf8_string;
@@ -101,20 +111,22 @@
str->resize(0);
}
-bool IsAscii(char c) { return U8_IS_SINGLE((u_int8_t)c); }
+bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
-bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((u_int8_t)c); }
+int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); }
+
+bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); }
bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
if (IsAscii(input[position])) {
if (char_len_out != nullptr) {
*char_len_out = 1;
}
- return std::ispunct(input[position]);
+ return ascii_icu_punctuation.find(input[position]) != std::string::npos;
}
UChar32 c = GetUChar32At(input.data(), input.length(), position);
if (char_len_out != nullptr) {
@@ -123,36 +135,36 @@
return u_ispunct(c);
}
-bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
- char* char_out) {
- if (IsAscii(uchar32_in)) {
- // The Unicode character is within ASCII range
- if (char_out != nullptr) {
- *char_out = uchar32_in;
- }
- return true;
+bool IsWhitespaceAt(std::string_view input, int position) {
+ if (IsAscii(input[position])) {
+ return std::isspace(input[position]);
}
+ UChar32 c = GetUChar32At(input.data(), input.length(), position);
+ return u_isUWhiteSpace(c);
+}
- // Maximum number of pieces a Unicode character can be decomposed into.
- // TODO(samzheng) figure out if this number is proper.
- constexpr int kDecompositionBufferCapacity = 5;
-
- // A buffer used to store Unicode decomposition mappings of only one
- // character.
- UChar decomposition_buffer[kDecompositionBufferCapacity];
-
- // Decomposes the Unicode character, trying to get an ASCII char and some
- // diacritic chars.
- UErrorCode status = U_ZERO_ERROR;
- if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
- kDecompositionBufferCapacity, &status) > 0 &&
- !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
- if (char_out != nullptr) {
- *char_out = decomposition_buffer[0];
- }
- return true;
+bool IsAlphabeticAt(std::string_view input, int position) {
+ if (IsAscii(input[position])) {
+ return std::isalpha(input[position]);
}
- return false;
+ UChar32 c = GetUChar32At(input.data(), input.length(), position);
+ return u_isUAlphabetic(c);
+}
+
+void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) {
+ uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes
+
+ int utf8_index = 0;
+ UBool has_error = FALSE;
+
+ // utf8_index is advanced to the end of the contents if successful
+ U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);
+
+ if (has_error) {
+ ICING_LOG(WARNING) << "Error appending UChar32 to the UTF8 string.";
+ return;
+ }
+ utf8_string->append(reinterpret_cast<char*>(utf8_buffer), utf8_index);
}
} // namespace i18n_utils
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index 86f07a7..e103bab 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -20,12 +20,17 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "unicode/umachine.h"
-#include "unicode/unorm2.h"
namespace icing {
namespace lib {
+
+// Internationalization utils that use standard utilities or custom code. Does
+// not require any special dependencies, such as data files for ICU.
namespace i18n_utils {
+// An invalid value defined by Unicode.
+static constexpr UChar32 kInvalidUChar32 = 0xFFFD;
+
// Converts a UTF16 string to a UTF8 string.
//
// Returns:
@@ -42,8 +47,7 @@
libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
std::string_view utf8_string);
-// Returns the Unicode char at the given position. If anything wrong happens, an
-// invalid value 0xFFFD is returned.
+// Returns the Unicode char at the given position; kInvalidUChar32 on error.
UChar32 GetUChar32At(const char* data, int length, int position);
// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
@@ -56,10 +60,14 @@
// Checks if the Unicode char is within ASCII range.
bool IsAscii(UChar32 c);
-// Returns how many code units (bytes) are used for the UTF-8 encoding of this
+// Returns how many code units (char) are used for the UTF-8 encoding of this
// Unicode character. Returns 0 if not valid.
int GetUtf8Length(UChar32 c);
+// Returns how many code units (char16_t) are used for the UTF-16 encoding of
+// this Unicode character. Returns 0 if not valid.
+int GetUtf16Length(UChar32 c);
+
// Checks if the single char is the first byte of a UTF8 character, note
// that a single ASCII char is also considered a lead byte.
bool IsLeadUtf8Byte(char c);
@@ -70,16 +78,13 @@
bool IsPunctuationAt(std::string_view input, int position,
int* char_len_out = nullptr);
-// Transforms a Unicode character with diacritics to its counterpart in ASCII
-// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
-// the transformation is successful.
-//
-// NOTE: According to our convention this function should have returned
-// StatusOr<char>. However, this function is performance-sensitive because is
-// could be called on every Latin character in normalization, so we make it
-// return a bool here to save a bit more time and memory.
-bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
- char* char_out);
+// Checks if the character at position is a whitespace.
+bool IsWhitespaceAt(std::string_view input, int position);
+
+// Checks if the character at position is alphabetic.
+bool IsAlphabeticAt(std::string_view input, int position);
+
+void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar);
} // namespace i18n_utils
} // namespace lib
diff --git a/icing/util/i18n-utils_test.cc b/icing/util/i18n-utils_test.cc
new file mode 100644
index 0000000..a1e8d4e
--- /dev/null
+++ b/icing/util/i18n-utils_test.cc
@@ -0,0 +1,141 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/i18n-utils.h"
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "unicode/uchar.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(IcuI18nUtilsTest, IsPunctuationAtSameAsIcuIsPunct) {
+ // Iterate through ASCII values
+ for (int i = 0; i <= 127; ++i) {
+ char ascii = i;
+
+ std::string ascii_string = "";
+ ascii_string.push_back(ascii);
+
+ EXPECT_EQ(i18n_utils::IsPunctuationAt(ascii_string, /*position=*/0),
+
+ u_ispunct(ascii));
+ }
+}
+
+TEST(IcuI18nUtilsTest, IsAlphabeticAt) {
+ // Test alphabetic and non-alphabetic ascii characters
+ constexpr std::string_view kSomeAscii = "iJ?9";
+ EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/0)); // 'i'
+ EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/1)); // 'J'
+ EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/2)); // '?'
+ EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/3)); // '9'
+
+ constexpr std::string_view kSomeNonAscii = "👏ñ①カ";
+ EXPECT_FALSE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/0)); // '👏'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 0)),
+ 4);
+ EXPECT_TRUE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/4)); // 'ñ'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 4)),
+ 2);
+ EXPECT_FALSE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/6)); // '①'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 6)),
+ 3);
+ EXPECT_TRUE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/9)); // 'カ'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 9)),
+ 3);
+}
+
+TEST(IcuI18nUtilsTest, GetUtf8Length) {
+ // Test alphabetic and non-alphabetic ascii characters
+ constexpr std::string_view kSomeAscii = "iJ?9";
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 0)),
+ 1); // 'i'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 1)),
+ 1); // 'J'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 2)),
+ 1); // '?'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 3)),
+ 1); // '9'
+
+ constexpr std::string_view kSomeNonAscii = "👏ñ①カ";
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 0)),
+ 4); // '👏'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 4)),
+ 2); // 'ñ'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 6)),
+ 3); // '①'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 9)),
+ 3); // 'カ'
+}
+
+TEST(IcuI18nUtilsTest, SafeTruncate) {
+ // Test alphabetic and non-alphabetic ascii characters
+ constexpr std::string_view kSomeAscii = "iJ?9";
+ std::string truncated(kSomeAscii);
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() + 1);
+ EXPECT_THAT(truncated, Eq("iJ?9"));
+ truncated = kSomeAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length());
+ EXPECT_THAT(truncated, Eq("iJ?9"));
+ truncated = kSomeAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() - 1);
+ EXPECT_THAT(truncated, Eq("iJ?"));
+
+ constexpr std::string_view kSomeNonAscii = "👏ñ①カ";
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() + 1);
+ EXPECT_THAT(truncated, Eq("👏ñ①カ"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length());
+ EXPECT_THAT(truncated, Eq("👏ñ①カ"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 1);
+ EXPECT_THAT(truncated, Eq("👏ñ①"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 2);
+ EXPECT_THAT(truncated, Eq("👏ñ①"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 3);
+ EXPECT_THAT(truncated, Eq("👏ñ①"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 4);
+ EXPECT_THAT(truncated, Eq("👏ñ"));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/java/Android.bp b/java/Android.bp
index 6bc8836..b2c915b 100644
--- a/java/Android.bp
+++ b/java/Android.bp
@@ -16,6 +16,7 @@
name: "libicing-java",
srcs: ["src/**/*.java"],
static_libs: [
+ "androidx.annotation_annotation",
"icing-java-proto-lite",
"libprotobuf-java-lite",
],
diff --git a/java/src/com/google/android/icing/BreakIteratorBatcher.java b/java/src/com/google/android/icing/BreakIteratorBatcher.java
new file mode 100644
index 0000000..58efbfc
--- /dev/null
+++ b/java/src/com/google/android/icing/BreakIteratorBatcher.java
@@ -0,0 +1,92 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.android.icing;
+
+import androidx.annotation.NonNull;
+import androidx.annotation.RestrictTo;
+
+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * A simple wrapper around BreakIterator that allows batching of multiple BreakIterator#next calls
+ * to reduce the number of necessary reverse jni calls.
+ *
+ * <p>Example: The text "我每天走路去上班。" has a length of 9 bytes in UTF-16 and a length of 27 bytes in
+ * UTF-8. The text should be broken up into the following six terms when properly segmented: "我",
+ * "每天", "走路", "去", "上班", "。"
+ *
+ * <pre>{@code
+ * BreakIteratorBatcher brkItrBatcher = new BreakIteratorBatcher(Locale.US);
+ * brkItrBatcher.setText("我每天走路去上班。");
+ * int[] utf16Boundaries = brkItrBatcher.next(5);
+ * assertThat(utf16Boundaries).asList().containsExactly(1, 3, 5, 6, 8);
+ * utf16Boundaries = brkItrBatcher.next(5);
+ * assertThat(utf16Boundaries).asList().containsExactly(9);
+ * }</pre>
+ *
+ * @hide
+ */
+@RestrictTo(RestrictTo.Scope.LIBRARY_GROUP)
+public class BreakIteratorBatcher {
+
+ private final BreakIterator iterator;
+
+ public BreakIteratorBatcher(@NonNull Locale locale) {
+ this.iterator = BreakIterator.getWordInstance(locale);
+ }
+
+ /* Direct calls to BreakIterator */
+ public void setText(@NonNull String text) {
+ iterator.setText(text);
+ }
+
+ public int first() {
+ return iterator.first();
+ }
+
+ public int preceding(int utf16Offset) {
+ return iterator.preceding(utf16Offset);
+ }
+
+ public int following(int utf16Offset) {
+ return iterator.following(utf16Offset);
+ }
+
+ /**
+ * Batched version of next. Returns an array of ints of up to size batchSize, reflecting the
+ * return values of batchSize successful calls to BreakIterator.next. If the BreakIterator reaches
+ * the end of the text (returns BreakIterator#DONE), then only the results of the previous calls
+ * in that batch will be returned.
+ */
+ @NonNull
+ public int[] next(int batchSize) {
+ List<Integer> breakIndices = new ArrayList<>(batchSize);
+ for (int i = 0; i < batchSize; ++i) {
+ int boundary = iterator.next();
+ if (boundary == BreakIterator.DONE) {
+ break;
+ }
+ breakIndices.add(boundary);
+ }
+ int[] breakIndicesArray = new int[breakIndices.size()];
+ for (int i = 0; i < breakIndices.size(); ++i) {
+ breakIndicesArray[i] = breakIndices.get(i);
+ }
+ return breakIndicesArray;
+ }
+}
diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java
index a12352b..7be631c 100644
--- a/java/src/com/google/android/icing/IcingSearchEngine.java
+++ b/java/src/com/google/android/icing/IcingSearchEngine.java
@@ -1,28 +1,27 @@
-/*
- * Copyright (C) 2020 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
package com.google.android.icing;
-import android.annotation.NonNull;
import android.util.Log;
-
+import androidx.annotation.NonNull;
import com.google.android.icing.proto.DeleteByNamespaceResultProto;
import com.google.android.icing.proto.DeleteBySchemaTypeResultProto;
import com.google.android.icing.proto.DeleteResultProto;
import com.google.android.icing.proto.DocumentProto;
+import com.google.android.icing.proto.GetAllNamespacesResultProto;
+import com.google.android.icing.proto.GetOptimizeInfoResultProto;
import com.google.android.icing.proto.GetResultProto;
import com.google.android.icing.proto.GetSchemaResultProto;
import com.google.android.icing.proto.GetSchemaTypeResultProto;
@@ -31,6 +30,7 @@
import com.google.android.icing.proto.OptimizeResultProto;
import com.google.android.icing.proto.PersistToDiskResultProto;
import com.google.android.icing.proto.PutResultProto;
+import com.google.android.icing.proto.ResetResultProto;
import com.google.android.icing.proto.ResultSpecProto;
import com.google.android.icing.proto.SchemaProto;
import com.google.android.icing.proto.ScoringSpecProto;
@@ -38,35 +38,35 @@
import com.google.android.icing.proto.SearchSpecProto;
import com.google.android.icing.proto.SetSchemaResultProto;
import com.google.android.icing.proto.StatusProto;
+import com.google.protobuf.ExtensionRegistryLite;
import com.google.protobuf.InvalidProtocolBufferException;
/** Java wrapper to access native APIs in external/icing/icing/icing-search-engine.h */
public final class IcingSearchEngine {
private static final String TAG = "IcingSearchEngine";
+ private static final ExtensionRegistryLite EXTENSION_REGISTRY_LITE =
+ ExtensionRegistryLite.getEmptyRegistry();
- private long mNativePointer;
+ private final long nativePointer;
static {
// NOTE: This can fail with an UnsatisfiedLinkError
- System.loadLibrary("icing_jni");
+ System.loadLibrary("icing");
}
- /**
- * @throws RuntimeException if IcingSearchEngine fails to be created
- */
- public IcingSearchEngine(IcingSearchEngineOptions options) {
- mNativePointer = nativeCreate(options.toByteArray());
- if (mNativePointer == 0) {
+ /** @throws IllegalStateException if IcingSearchEngine fails to be created */
+ public IcingSearchEngine(@NonNull IcingSearchEngineOptions options) {
+ nativePointer = nativeCreate(options.toByteArray());
+ if (nativePointer == 0) {
Log.e(TAG, "Failed to create IcingSearchEngine.");
- throw new RuntimeException("Failed to create IcingSearchEngine.");
+ throw new IllegalStateException("Failed to create IcingSearchEngine.");
}
}
-
@NonNull
public InitializeResultProto initialize() {
- byte[] initializeResultBytes = nativeInitialize(mNativePointer);
+ byte[] initializeResultBytes = nativeInitialize(nativePointer);
if (initializeResultBytes == null) {
Log.e(TAG, "Received null InitializeResult from native.");
return InitializeResultProto.newBuilder()
@@ -75,7 +75,7 @@
}
try {
- return InitializeResultProto.parseFrom(initializeResultBytes);
+ return InitializeResultProto.parseFrom(initializeResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing InitializeResultProto.", e);
return InitializeResultProto.newBuilder()
@@ -91,9 +91,9 @@
@NonNull
public SetSchemaResultProto setSchema(
- @NonNull SchemaProto schema, boolean ignoreErrorsAndDeleteDocuments) {
+ @NonNull SchemaProto schema, boolean ignoreErrorsAndDeleteDocuments) {
byte[] setSchemaResultBytes =
- nativeSetSchema(mNativePointer, schema.toByteArray(), ignoreErrorsAndDeleteDocuments);
+ nativeSetSchema(nativePointer, schema.toByteArray(), ignoreErrorsAndDeleteDocuments);
if (setSchemaResultBytes == null) {
Log.e(TAG, "Received null SetSchemaResultProto from native.");
return SetSchemaResultProto.newBuilder()
@@ -102,7 +102,7 @@
}
try {
- return SetSchemaResultProto.parseFrom(setSchemaResultBytes);
+ return SetSchemaResultProto.parseFrom(setSchemaResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing SetSchemaResultProto.", e);
return SetSchemaResultProto.newBuilder()
@@ -113,7 +113,7 @@
@NonNull
public GetSchemaResultProto getSchema() {
- byte[] getSchemaResultBytes = nativeGetSchema(mNativePointer);
+ byte[] getSchemaResultBytes = nativeGetSchema(nativePointer);
if (getSchemaResultBytes == null) {
Log.e(TAG, "Received null GetSchemaResultProto from native.");
return GetSchemaResultProto.newBuilder()
@@ -122,7 +122,7 @@
}
try {
- return GetSchemaResultProto.parseFrom(getSchemaResultBytes);
+ return GetSchemaResultProto.parseFrom(getSchemaResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing GetSchemaResultProto.", e);
return GetSchemaResultProto.newBuilder()
@@ -133,7 +133,7 @@
@NonNull
public GetSchemaTypeResultProto getSchemaType(@NonNull String schemaType) {
- byte[] getSchemaTypeResultBytes = nativeGetSchemaType(mNativePointer, schemaType);
+ byte[] getSchemaTypeResultBytes = nativeGetSchemaType(nativePointer, schemaType);
if (getSchemaTypeResultBytes == null) {
Log.e(TAG, "Received null GetSchemaTypeResultProto from native.");
return GetSchemaTypeResultProto.newBuilder()
@@ -142,7 +142,7 @@
}
try {
- return GetSchemaTypeResultProto.parseFrom(getSchemaTypeResultBytes);
+ return GetSchemaTypeResultProto.parseFrom(getSchemaTypeResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing GetSchemaTypeResultProto.", e);
return GetSchemaTypeResultProto.newBuilder()
@@ -153,7 +153,7 @@
@NonNull
public PutResultProto put(@NonNull DocumentProto document) {
- byte[] putResultBytes = nativePut(mNativePointer, document.toByteArray());
+ byte[] putResultBytes = nativePut(nativePointer, document.toByteArray());
if (putResultBytes == null) {
Log.e(TAG, "Received null PutResultProto from native.");
return PutResultProto.newBuilder()
@@ -162,7 +162,7 @@
}
try {
- return PutResultProto.parseFrom(putResultBytes);
+ return PutResultProto.parseFrom(putResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing PutResultProto.", e);
return PutResultProto.newBuilder()
@@ -173,7 +173,7 @@
@NonNull
public GetResultProto get(@NonNull String namespace, @NonNull String uri) {
- byte[] getResultBytes = nativeGet(mNativePointer, namespace, uri);
+ byte[] getResultBytes = nativeGet(nativePointer, namespace, uri);
if (getResultBytes == null) {
Log.e(TAG, "Received null GetResultProto from native.");
return GetResultProto.newBuilder()
@@ -182,7 +182,7 @@
}
try {
- return GetResultProto.parseFrom(getResultBytes);
+ return GetResultProto.parseFrom(getResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing GetResultProto.", e);
return GetResultProto.newBuilder()
@@ -192,11 +192,34 @@
}
@NonNull
+ public GetAllNamespacesResultProto getAllNamespaces() {
+ byte[] getAllNamespacesResultBytes = nativeGetAllNamespaces(nativePointer);
+ if (getAllNamespacesResultBytes == null) {
+ Log.e(TAG, "Received null GetAllNamespacesResultProto from native.");
+ return GetAllNamespacesResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return GetAllNamespacesResultProto.parseFrom(
+ getAllNamespacesResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetAllNamespacesResultProto.", e);
+ return GetAllNamespacesResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
public SearchResultProto search(
- @NonNull SearchSpecProto searchSpec, @NonNull ScoringSpecProto scoringSpec, @NonNull ResultSpecProto resultSpec) {
+ @NonNull SearchSpecProto searchSpec,
+ @NonNull ScoringSpecProto scoringSpec,
+ @NonNull ResultSpecProto resultSpec) {
byte[] searchResultBytes =
nativeSearch(
- mNativePointer,
+ nativePointer,
searchSpec.toByteArray(),
scoringSpec.toByteArray(),
resultSpec.toByteArray());
@@ -208,7 +231,7 @@
}
try {
- return SearchResultProto.parseFrom(searchResultBytes);
+ return SearchResultProto.parseFrom(searchResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing SearchResultProto.", e);
return SearchResultProto.newBuilder()
@@ -218,8 +241,33 @@
}
@NonNull
+ public SearchResultProto getNextPage(long nextPageToken) {
+ byte[] searchResultBytes = nativeGetNextPage(nativePointer, nextPageToken);
+ if (searchResultBytes == null) {
+ Log.e(TAG, "Received null SearchResultProto from native.");
+ return SearchResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return SearchResultProto.parseFrom(searchResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing SearchResultProto.", e);
+ return SearchResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public void invalidateNextPageToken(long nextPageToken) {
+ nativeInvalidateNextPageToken(nativePointer, nextPageToken);
+ }
+
+ @NonNull
public DeleteResultProto delete(@NonNull String namespace, @NonNull String uri) {
- byte[] deleteResultBytes = nativeDelete(mNativePointer, namespace, uri);
+ byte[] deleteResultBytes = nativeDelete(nativePointer, namespace, uri);
if (deleteResultBytes == null) {
Log.e(TAG, "Received null DeleteResultProto from native.");
return DeleteResultProto.newBuilder()
@@ -228,7 +276,7 @@
}
try {
- return DeleteResultProto.parseFrom(deleteResultBytes);
+ return DeleteResultProto.parseFrom(deleteResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing DeleteResultProto.", e);
return DeleteResultProto.newBuilder()
@@ -239,7 +287,7 @@
@NonNull
public DeleteByNamespaceResultProto deleteByNamespace(@NonNull String namespace) {
- byte[] deleteByNamespaceResultBytes = nativeDeleteByNamespace(mNativePointer, namespace);
+ byte[] deleteByNamespaceResultBytes = nativeDeleteByNamespace(nativePointer, namespace);
if (deleteByNamespaceResultBytes == null) {
Log.e(TAG, "Received null DeleteByNamespaceResultProto from native.");
return DeleteByNamespaceResultProto.newBuilder()
@@ -248,7 +296,8 @@
}
try {
- return DeleteByNamespaceResultProto.parseFrom(deleteByNamespaceResultBytes);
+ return DeleteByNamespaceResultProto.parseFrom(
+ deleteByNamespaceResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing DeleteByNamespaceResultProto.", e);
return DeleteByNamespaceResultProto.newBuilder()
@@ -259,7 +308,7 @@
@NonNull
public DeleteBySchemaTypeResultProto deleteBySchemaType(@NonNull String schemaType) {
- byte[] deleteBySchemaTypeResultBytes = nativeDeleteBySchemaType(mNativePointer, schemaType);
+ byte[] deleteBySchemaTypeResultBytes = nativeDeleteBySchemaType(nativePointer, schemaType);
if (deleteBySchemaTypeResultBytes == null) {
Log.e(TAG, "Received null DeleteBySchemaTypeResultProto from native.");
return DeleteBySchemaTypeResultProto.newBuilder()
@@ -268,7 +317,8 @@
}
try {
- return DeleteBySchemaTypeResultProto.parseFrom(deleteBySchemaTypeResultBytes);
+ return DeleteBySchemaTypeResultProto.parseFrom(
+ deleteBySchemaTypeResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing DeleteBySchemaTypeResultProto.", e);
return DeleteBySchemaTypeResultProto.newBuilder()
@@ -279,7 +329,7 @@
@NonNull
public PersistToDiskResultProto persistToDisk() {
- byte[] persistToDiskResultBytes = nativePersistToDisk(mNativePointer);
+ byte[] persistToDiskResultBytes = nativePersistToDisk(nativePointer);
if (persistToDiskResultBytes == null) {
Log.e(TAG, "Received null PersistToDiskResultProto from native.");
return PersistToDiskResultProto.newBuilder()
@@ -288,7 +338,7 @@
}
try {
- return PersistToDiskResultProto.parseFrom(persistToDiskResultBytes);
+ return PersistToDiskResultProto.parseFrom(persistToDiskResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing PersistToDiskResultProto.", e);
return PersistToDiskResultProto.newBuilder()
@@ -299,7 +349,7 @@
@NonNull
public OptimizeResultProto optimize() {
- byte[] optimizeResultBytes = nativeOptimize(mNativePointer);
+ byte[] optimizeResultBytes = nativeOptimize(nativePointer);
if (optimizeResultBytes == null) {
Log.e(TAG, "Received null OptimizeResultProto from native.");
return OptimizeResultProto.newBuilder()
@@ -308,7 +358,7 @@
}
try {
- return OptimizeResultProto.parseFrom(optimizeResultBytes);
+ return OptimizeResultProto.parseFrom(optimizeResultBytes, EXTENSION_REGISTRY_LITE);
} catch (InvalidProtocolBufferException e) {
Log.e(TAG, "Error parsing OptimizeResultProto.", e);
return OptimizeResultProto.newBuilder()
@@ -317,31 +367,82 @@
}
}
+ @NonNull
+ public GetOptimizeInfoResultProto getOptimizeInfo() {
+ byte[] getOptimizeInfoResultBytes = nativeGetOptimizeInfo(nativePointer);
+ if (getOptimizeInfoResultBytes == null) {
+ Log.e(TAG, "Received null GetOptimizeInfoResultProto from native.");
+ return GetOptimizeInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return GetOptimizeInfoResultProto.parseFrom(
+ getOptimizeInfoResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetOptimizeInfoResultProto.", e);
+ return GetOptimizeInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public ResetResultProto reset() {
+ byte[] resetResultBytes = nativeReset(nativePointer);
+ if (resetResultBytes == null) {
+ Log.e(TAG, "Received null ResetResultProto from native.");
+ return ResetResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return ResetResultProto.parseFrom(resetResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing ResetResultProto.", e);
+ return ResetResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
private static native long nativeCreate(byte[] icingSearchEngineOptionsBytes);
- private static native byte[] nativeInitialize(long mNativePointer);
+ private static native byte[] nativeInitialize(long nativePointer);
private static native byte[] nativeSetSchema(
- long mNativePointer, byte[] schemaBytes, boolean ignoreErrorsAndDeleteDocuments);
+ long nativePointer, byte[] schemaBytes, boolean ignoreErrorsAndDeleteDocuments);
- private static native byte[] nativeGetSchema(long mNativePointer);
+ private static native byte[] nativeGetSchema(long nativePointer);
- private static native byte[] nativeGetSchemaType(long mNativePointer, String schemaType);
+ private static native byte[] nativeGetSchemaType(long nativePointer, String schemaType);
- private static native byte[] nativePut(long mNativePointer, byte[] documentBytes);
+ private static native byte[] nativePut(long nativePointer, byte[] documentBytes);
- private static native byte[] nativeGet(long mNativePointer, String namespace, String uri);
+ private static native byte[] nativeGet(long nativePointer, String namespace, String uri);
+
+ private static native byte[] nativeGetAllNamespaces(long nativePointer);
private static native byte[] nativeSearch(
- long mNativePointer, byte[] searchSpecBytes, byte[] scoringSpecBytes, byte[] resultSpecBytes);
+ long nativePointer, byte[] searchSpecBytes, byte[] scoringSpecBytes, byte[] resultSpecBytes);
- private static native byte[] nativeDelete(long mNativePointer, String namespace, String uri);
+ private static native byte[] nativeGetNextPage(long nativePointer, long nextPageToken);
- private static native byte[] nativeDeleteByNamespace(long mNativePointer, String namespace);
+ private static native void nativeInvalidateNextPageToken(long nativePointer, long nextPageToken);
- private static native byte[] nativeDeleteBySchemaType(long mNativePointer, String schemaType);
+ private static native byte[] nativeDelete(long nativePointer, String namespace, String uri);
- private static native byte[] nativePersistToDisk(long mNativePointer);
+ private static native byte[] nativeDeleteByNamespace(long nativePointer, String namespace);
- private static native byte[] nativeOptimize(long mNativePointer);
+ private static native byte[] nativeDeleteBySchemaType(long nativePointer, String schemaType);
+
+ private static native byte[] nativePersistToDisk(long nativePointer);
+
+ private static native byte[] nativeOptimize(long nativePointer);
+
+ private static native byte[] nativeGetOptimizeInfo(long nativePointer);
+
+ private static native byte[] nativeReset(long nativePointer);
}
diff --git a/java/tests/instrumentation/Android.bp b/java/tests/instrumentation/Android.bp
index c941acf..6cb579d 100644
--- a/java/tests/instrumentation/Android.bp
+++ b/java/tests/instrumentation/Android.bp
@@ -30,7 +30,7 @@
],
jni_libs: [
- "libicing_jni",
+ "libicing",
],
test_suites: [
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index 5c502d9..d907d4e 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -1,31 +1,28 @@
-/*
- * Copyright (C) 2020 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
package com.google.android.icing;
import static com.google.common.truth.Truth.assertThat;
-import android.content.Context;
-
-import androidx.test.InstrumentationRegistry;
-
+import androidx.test.core.app.ApplicationProvider;
import com.google.android.icing.proto.DeleteByNamespaceResultProto;
import com.google.android.icing.proto.DeleteBySchemaTypeResultProto;
import com.google.android.icing.proto.DeleteResultProto;
import com.google.android.icing.proto.DocumentProto;
+import com.google.android.icing.proto.GetAllNamespacesResultProto;
+import com.google.android.icing.proto.GetOptimizeInfoResultProto;
import com.google.android.icing.proto.GetResultProto;
import com.google.android.icing.proto.GetSchemaResultProto;
import com.google.android.icing.proto.GetSchemaTypeResultProto;
@@ -38,6 +35,7 @@
import com.google.android.icing.proto.PropertyConfigProto;
import com.google.android.icing.proto.PropertyProto;
import com.google.android.icing.proto.PutResultProto;
+import com.google.android.icing.proto.ResetResultProto;
import com.google.android.icing.proto.ResultSpecProto;
import com.google.android.icing.proto.SchemaProto;
import com.google.android.icing.proto.SchemaTypeConfigProto;
@@ -47,7 +45,9 @@
import com.google.android.icing.proto.SetSchemaResultProto;
import com.google.android.icing.proto.StatusProto;
import com.google.android.icing.proto.TermMatchType;
-
+import com.google.android.icing.IcingSearchEngine;
+import java.util.HashMap;
+import java.util.Map;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -55,17 +55,16 @@
/**
* This test is not intended to fully test the functionality of each API. But rather to test the JNI
- * wrapper of Icing library.
+ * wrapper and Java interfaces of Icing library {@link IcingSearchEngine}.
*/
@RunWith(JUnit4.class)
public final class IcingSearchEngineTest {
private static final String EMAIL_TYPE = "Email";
- private Context mContext;
- private String mFilesDir;
+ private String filesDir;
- static SchemaTypeConfigProto createEmailTypeConfig() {
+ private static SchemaTypeConfigProto createEmailTypeConfig() {
return SchemaTypeConfigProto.newBuilder()
.setSchemaType(EMAIL_TYPE)
.addProperties(
@@ -89,7 +88,7 @@
.build();
}
- static DocumentProto createEmailDocument(String namespace, String uri) {
+ private static DocumentProto createEmailDocument(String namespace, String uri) {
return DocumentProto.newBuilder()
.setNamespace(namespace)
.setUri(uri)
@@ -100,14 +99,13 @@
@Before
public void setUp() throws Exception {
- mContext = InstrumentationRegistry.getInstrumentation().getContext();
- mFilesDir = mContext.getFilesDir().getCanonicalPath();
+ filesDir = ApplicationProvider.getApplicationContext().getFilesDir().getCanonicalPath();
}
@Test
public void testInitialize() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
InitializeResultProto initializeResultProto = icing.initialize();
@@ -117,9 +115,9 @@
@Test
public void testSetAndGetSchema() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
@@ -140,14 +138,18 @@
@Test
public void testPutAndGetDocuments() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
- SetSchemaResultProto setSchemaResultProto =
- icing.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
PutResultProto putResultProto = icing.put(emailDocument);
@@ -161,20 +163,24 @@
@Test
public void testSearch() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
- SetSchemaResultProto setSchemaResultProto =
- icing.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument =
createEmailDocument("namespace", "uri").toBuilder()
.addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
.build();
- PutResultProto putResultProto = icing.put(emailDocument);
+ assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
SearchSpecProto searchSpec =
SearchSpecProto.newBuilder()
@@ -193,19 +199,80 @@
}
@Test
- public void testDelete() throws Exception {
+ public void testGetNextPage() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
- SetSchemaResultProto setSchemaResultProto =
- icing.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ Map<String, DocumentProto> documents = new HashMap<>();
+ for (int i = 0; i < 10; i++) {
+ DocumentProto emailDocument =
+ createEmailDocument("namespace", "uri:" + i).toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+ .build();
+ documents.put("uri:" + i, emailDocument);
+ assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ }
+
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("foo")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+ ResultSpecProto resultSpecProto = ResultSpecProto.newBuilder().setNumPerPage(1).build();
+
+ SearchResultProto searchResultProto =
+ icing.search(searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+ assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument).isEqualTo(documents.remove(resultDocument.getUri()));
+
+ // fetch rest pages
+ for (int i = 1; i < 5; i++) {
+ searchResultProto = icing.getNextPage(searchResultProto.getNextPageToken());
+ assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+ resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument).isEqualTo(documents.remove(resultDocument.getUri()));
+ }
+
+ // invalidate rest result
+ icing.invalidateNextPageToken(searchResultProto.getNextPageToken());
+
+ searchResultProto = icing.getNextPage(searchResultProto.getNextPageToken());
+ assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(0);
+ }
+
+ @Test
+ public void testDelete() throws Exception {
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
+ IcingSearchEngine icing = new IcingSearchEngine(options);
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
- PutResultProto putResultProto = icing.put(emailDocument);
+ assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
DeleteResultProto deleteResultProto = icing.delete("namespace", "uri");
assertThat(deleteResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
@@ -217,17 +284,21 @@
@Test
public void testDeleteByNamespace() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
- SetSchemaResultProto setSchemaResultProto =
- icing.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
- PutResultProto putResultProto = icing.put(emailDocument);
+ assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
DeleteByNamespaceResultProto deleteByNamespaceResultProto =
icing.deleteByNamespace("namespace");
@@ -240,17 +311,21 @@
@Test
public void testDeleteBySchemaType() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
- SetSchemaResultProto setSchemaResultProto =
- icing.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
- PutResultProto putResultProto = icing.put(emailDocument);
+ assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
DeleteBySchemaTypeResultProto deleteBySchemaTypeResultProto =
icing.deleteBySchemaType(EMAIL_TYPE);
@@ -263,9 +338,9 @@
@Test
public void testPersistToDisk() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
PersistToDiskResultProto persistToDiskResultProto = icing.persistToDisk();
assertThat(persistToDiskResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
@@ -274,11 +349,59 @@
@Test
public void testOptimize() throws Exception {
IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(mFilesDir).build();
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
IcingSearchEngine icing = new IcingSearchEngine(options);
- InitializeResultProto initializeResultProto = icing.initialize();
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
OptimizeResultProto optimizeResultProto = icing.optimize();
assertThat(optimizeResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
}
+
+ @Test
+ public void testGetOptimizeInfo() throws Exception {
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
+ IcingSearchEngine icing = new IcingSearchEngine(options);
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ GetOptimizeInfoResultProto getOptimizeInfoResultProto = icing.getOptimizeInfo();
+ assertThat(getOptimizeInfoResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(getOptimizeInfoResultProto.getOptimizableDocs()).isEqualTo(0);
+ assertThat(getOptimizeInfoResultProto.getEstimatedOptimizableBytes()).isEqualTo(0);
+ }
+
+ @Test
+ public void testGetAllNamespaces() throws Exception {
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
+ IcingSearchEngine icing = new IcingSearchEngine(options);
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument = createEmailDocument("namespace", "uri");
+ assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ GetAllNamespacesResultProto getAllNamespacesResultProto = icing.getAllNamespaces();
+ assertThat(getAllNamespacesResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(getAllNamespacesResultProto.getNamespacesList()).containsExactly("namespace");
+ }
+
+ @Test
+ public void testReset() throws Exception {
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
+ IcingSearchEngine icing = new IcingSearchEngine(options);
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ ResetResultProto resetResultProto = icing.reset();
+ assertThat(resetResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ }
}
diff --git a/nativeLib/build.gradle b/nativeLib/build.gradle
new file mode 100644
index 0000000..6b30451
--- /dev/null
+++ b/nativeLib/build.gradle
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO(b/161205849): We've had to move libicing.so compilation into appsearch:appsearch to get
+// it included into the exported aar. Find a proper solution for bundling libicing.so into
+// appsearch-release.aar and move compilation of libicing.so back into the external/icing tree.
diff --git a/proto/Android.bp b/proto/Android.bp
new file mode 100644
index 0000000..7bc1859
--- /dev/null
+++ b/proto/Android.bp
@@ -0,0 +1,37 @@
+// Copyright (C) 2019 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+java_library {
+ name: "icing-java-proto-lite",
+ proto: {
+ type: "lite",
+ include_dirs: ["external/protobuf/src"],
+ canonical_path_from_root: false,
+ },
+ srcs: ["icing/proto/*.proto"],
+ sdk_version: "core_current",
+}
+
+cc_library_static {
+ name: "icing-c-proto",
+ defaults: ["libicing_defaults"],
+ proto: {
+ type: "lite",
+ // Find protos relative from where they're specified (useful for external protos)
+ canonical_path_from_root: false,
+ // Need to be able to see the .pb.h files that are generated
+ export_proto_headers: true,
+ },
+ srcs: ["icing/**/*.proto"],
+}
diff --git a/icing/legacy/index/proto/icing-dynamic-trie-header.proto b/proto/icing/legacy/index/proto/icing-dynamic-trie-header.proto
similarity index 100%
rename from icing/legacy/index/proto/icing-dynamic-trie-header.proto
rename to proto/icing/legacy/index/proto/icing-dynamic-trie-header.proto
diff --git a/icing/proto/document.proto b/proto/icing/proto/document.proto
similarity index 88%
rename from icing/proto/document.proto
rename to proto/icing/proto/document.proto
index eea97da..1caf169 100644
--- a/icing/proto/document.proto
+++ b/proto/icing/proto/document.proto
@@ -20,6 +20,7 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Defines a unit of data understood by the IcingSearchEngine.
// Next tag: 9
@@ -91,12 +92,14 @@
message PutResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
+ // NOT_FOUND
// INTERNAL
// OUT_OF_SPACE
//
// See status.proto for more details.
//
- // TODO(b/147699081): Fix error codes: +ABORTED, +FAILED_PRECONDITION.
+ // TODO(b/147699081): Fix error codes: +ABORTED
// go/icing-library-apis.
optional StatusProto status = 1;
}
@@ -106,6 +109,7 @@
message GetResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
@@ -120,11 +124,27 @@
optional DocumentProto document = 2;
}
+// Result of a call to IcingSearchEngine.GetAllNamespaces
+// Next tag: 3
+message GetAllNamespacesResultProto {
+ // Status code can be one of:
+ // OK
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ // List of namespaces which have at least one existing document in it (not
+ // deleted and not expired). Order of namespaces is undefined.
+ repeated string namespaces = 2;
+}
+
// Result of a call to IcingSearchEngine.Delete
// Next tag: 2
message DeleteResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
+ // NOT_FOUND
// INTERNAL
//
// See status.proto for more details.
@@ -139,6 +159,8 @@
message DeleteByNamespaceResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
+ // NOT_FOUND
// INTERNAL
//
// See status.proto for more details.
@@ -153,6 +175,8 @@
message DeleteBySchemaTypeResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
+ // NOT_FOUND
// INTERNAL
//
// See status.proto for more details.
diff --git a/icing/proto/document_wrapper.proto b/proto/icing/proto/document_wrapper.proto
similarity index 96%
rename from icing/proto/document_wrapper.proto
rename to proto/icing/proto/document_wrapper.proto
index 0666e72..e8eb992 100644
--- a/icing/proto/document_wrapper.proto
+++ b/proto/icing/proto/document_wrapper.proto
@@ -21,6 +21,8 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// DocumentWrapper as a wrapper of the user-facing DocumentProto is meant to
// be used by icing team internally. It stores the original document provided
// by library users and metadata of the document which shouldn't be exposed to
diff --git a/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto
similarity index 98%
rename from icing/proto/initialize.proto
rename to proto/icing/proto/initialize.proto
index 813cdb5..eac88e6 100644
--- a/icing/proto/initialize.proto
+++ b/proto/icing/proto/initialize.proto
@@ -21,6 +21,8 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Next tag: 5
message IcingSearchEngineOptions {
// Directory to persist files for Icing. Required.
diff --git a/icing/proto/optimize.proto b/proto/icing/proto/optimize.proto
similarity index 62%
rename from icing/proto/optimize.proto
rename to proto/icing/proto/optimize.proto
index 2bf28e8..1baa64c 100644
--- a/icing/proto/optimize.proto
+++ b/proto/icing/proto/optimize.proto
@@ -20,12 +20,14 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Result of a call to IcingSearchEngine.Optimize
// Next tag: 2
message OptimizeResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// WARNING_DATA_LOSS
// ABORTED
// INTERNAL
@@ -36,3 +38,23 @@
// TODO(b/147699081): Add a field to indicate lost_schema and lost_documents.
// go/icing-library-apis.
}
+
+// Result of a call to IcingSearchEngine.GetOptimizeInfo
+// Next tag: 4
+message GetOptimizeInfoResultProto {
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION
+ // INTERNAL
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ // Documents that have expired or been deleted, but are still taking up space
+ // in IcingSearchEngine.
+ optional int64 optimizable_docs = 2;
+
+ // Estimated bytes that could be recovered. The exact size per document isn't
+ // tracked, so this is based off an average document size.
+ optional int64 estimated_optimizable_bytes = 3;
+}
diff --git a/icing/proto/persist.proto b/proto/icing/proto/persist.proto
similarity index 93%
rename from icing/proto/persist.proto
rename to proto/icing/proto/persist.proto
index 5b5a737..77cf987 100644
--- a/icing/proto/persist.proto
+++ b/proto/icing/proto/persist.proto
@@ -20,12 +20,14 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Result of a call to IcingSearchEngine.Persist
// Next tag: 2
message PersistToDiskResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// INTERNAL
//
// See status.proto for more details.
diff --git a/icing/proto/persist.proto b/proto/icing/proto/reset.proto
similarity index 87%
copy from icing/proto/persist.proto
copy to proto/icing/proto/reset.proto
index 5b5a737..5e8b9f5 100644
--- a/icing/proto/persist.proto
+++ b/proto/icing/proto/reset.proto
@@ -21,11 +21,14 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
-// Result of a call to IcingSearchEngine.Persist
+option objc_class_prefix = "ICNG";
+
+// Result of a call to IcingSearchEngine.Reset
// Next tag: 2
-message PersistToDiskResultProto {
+message ResetResultProto {
// Status code can be one of:
// OK
+ // ABORTED
// INTERNAL
//
// See status.proto for more details.
diff --git a/icing/proto/schema.proto b/proto/icing/proto/schema.proto
similarity index 97%
rename from icing/proto/schema.proto
rename to proto/icing/proto/schema.proto
index 80d6901..3a7ee5d 100644
--- a/icing/proto/schema.proto
+++ b/proto/icing/proto/schema.proto
@@ -21,6 +21,7 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Defines the schema that every Document of a specific "type" should adhere
// to. These can be considered as definitions of rich structured types for
@@ -204,6 +205,7 @@
message GetSchemaResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
@@ -223,13 +225,14 @@
message GetSchemaTypeResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
// See status.proto for more details.
//
- // TODO(b/147699081): Fix error codes: +ABORTED, +FAILED_PRECONDITION,
- // -INTERNAL go/icing-library-apis.
+ // TODO(b/147699081): Fix error codes: +ABORTED, -INTERNAL
+ // go/icing-library-apis.
optional StatusProto status = 1;
// Copy of the SchemaTypeConfig proto with the specified schema_type.
diff --git a/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto
similarity index 90%
rename from icing/proto/scoring.proto
rename to proto/icing/proto/scoring.proto
index ad536b4..667ff4f 100644
--- a/icing/proto/scoring.proto
+++ b/proto/icing/proto/scoring.proto
@@ -19,6 +19,8 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Encapsulates the configurations on how Icing should score and rank the search
// results.
// Next tag: 3
@@ -26,9 +28,8 @@
// OPTIONAL: Indicates how the search results will be ranked.
message RankingStrategy {
enum Code {
- // No ranking strategy specified, documents will be returned in the
- // default order that the most recent document inserted into Icing comes
- // first.
+ // No ranking strategy specified, documents may be returned in an
+ // arbitrary order.
NONE = 0;
// Ranked by user-provided document scores.
diff --git a/icing/proto/search.proto b/proto/icing/proto/search.proto
similarity index 90%
rename from icing/proto/search.proto
rename to proto/icing/proto/search.proto
index 7241e7e..8ea5036 100644
--- a/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -22,6 +22,7 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Client-supplied specifications on what documents to retrieve.
// Next tag: 5
@@ -65,10 +66,9 @@
// results.
// Next tag: 4
message ResultSpecProto {
- // The maximum number of documents to return in SearchResultProto. Even if
- // more documents have matched the SearchSpecProto, this may be used to limit
- // the number to actually return to the client.
- optional int32 num_to_retrieve = 1 [default = 10];
+ // The results will be returned in pages, and num_per_page specifies the
+ // number of documents in one page.
+ optional int32 num_per_page = 1 [default = 10];
// Whether to collect and return debug_info in the SearchResultProto.
optional bool debug_info = 2;
@@ -145,10 +145,13 @@
}
// Icing lib-supplied results from a search results.
-// Next tag: 4
+// Next tag: 5
message SearchResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
+ // INVALID_ARGUMENT
+ // ABORTED
// INTERNAL
//
// See status.proto for more details.
@@ -185,4 +188,11 @@
optional string executed_query = 3;
}
optional DebugInfoProto debug_info = 3;
+
+ // An opaque token used internally to keep track of information needed for
+ // pagination. A valid pagination token is required to fetch other pages of
+ // results. The default value 0 means that there're no more pages.
+ // LINT.IfChange(next_page_token)
+ optional uint64 next_page_token = 4 [default = 0];
+ // LINT.ThenChange(//depot/google3/icing/result/result-state-manager.h:kInvalidNextPageToken)
}
diff --git a/icing/proto/status.proto b/proto/icing/proto/status.proto
similarity index 98%
rename from icing/proto/status.proto
rename to proto/icing/proto/status.proto
index 418b2e8..2733a15 100644
--- a/icing/proto/status.proto
+++ b/proto/icing/proto/status.proto
@@ -19,6 +19,8 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Canonical status to indicate the results of API calls.
// Next tag: 3
message StatusProto {
diff --git a/icing/proto/term.proto b/proto/icing/proto/term.proto
similarity index 96%
rename from icing/proto/term.proto
rename to proto/icing/proto/term.proto
index 30cd1bc..adf2ad6 100644
--- a/icing/proto/term.proto
+++ b/proto/icing/proto/term.proto
@@ -19,6 +19,8 @@
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Encapsulates the configurations on how Icing should query/index these terms.
// Next tag: 0
message TermMatchType {