Merge 8353f56116066bfcceb6a2deb1d9a691c3c4c1f1 on remote branch
Change-Id: I3e99bdc1bcc69a31946397b749554d4f28d202a8
diff --git a/TEST_MAPPING b/TEST_MAPPING
index 2b02610..3c8e10b 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -7,6 +7,9 @@
"exclude-annotation": "androidx.test.filters.FlakyTest"
}
]
+ },
+ {
+ "name": "libtextclassifier_tests"
}
]
}
\ No newline at end of file
diff --git a/java/res/values-es-rUS/strings.xml b/java/res/values-b+es+419/strings.xml
similarity index 100%
rename from java/res/values-es-rUS/strings.xml
rename to java/res/values-b+es+419/strings.xml
diff --git a/java/res/values-in/strings.xml b/java/res/values-b+sr+Latn/strings.xml
similarity index 100%
copy from java/res/values-in/strings.xml
copy to java/res/values-b+sr+Latn/strings.xml
diff --git a/java/res/values-in/strings.xml b/java/res/values-id/strings.xml
similarity index 100%
rename from java/res/values-in/strings.xml
rename to java/res/values-id/strings.xml
diff --git a/java/res/values-nb/strings.xml b/java/res/values-no/strings.xml
similarity index 100%
rename from java/res/values-nb/strings.xml
rename to java/res/values-no/strings.xml
diff --git a/java/res/values-zh-rCN/strings.xml b/java/res/values-zh-rCN/strings.xml
deleted file mode 100755
index 56d9f67..0000000
--- a/java/res/values-zh-rCN/strings.xml
+++ /dev/null
@@ -1,3 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<resources xmlns:xliff="urn:oasis:names:tc:xliff:document:1.2">
-</resources>
diff --git a/java/res/values-in/strings.xml b/java/res/values-zh/strings.xml
similarity index 100%
copy from java/res/values-in/strings.xml
copy to java/res/values-zh/strings.xml
diff --git a/java/src/com/android/textclassifier/ActionsSuggestionsHelper.java b/java/src/com/android/textclassifier/ActionsSuggestionsHelper.java
index 82411dd..a51c95d 100644
--- a/java/src/com/android/textclassifier/ActionsSuggestionsHelper.java
+++ b/java/src/com/android/textclassifier/ActionsSuggestionsHelper.java
@@ -31,18 +31,18 @@
import com.android.textclassifier.common.base.TcLog;
import com.android.textclassifier.common.intent.LabeledIntent;
import com.android.textclassifier.common.intent.TemplateIntentFactory;
-import com.android.textclassifier.common.statsd.ResultIdUtils;
+import com.android.textclassifier.common.logging.ResultIdUtils;
import com.google.android.textclassifier.ActionsSuggestionsModel;
import com.google.android.textclassifier.RemoteActionTemplate;
import com.google.common.base.Equivalence;
import com.google.common.base.Equivalence.Wrapper;
+import com.google.common.base.Optional;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.Objects;
-import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
diff --git a/java/src/com/android/textclassifier/ModelFileManager.java b/java/src/com/android/textclassifier/ModelFileManager.java
index d48c1b3..a6f64d8 100644
--- a/java/src/com/android/textclassifier/ModelFileManager.java
+++ b/java/src/com/android/textclassifier/ModelFileManager.java
@@ -21,7 +21,8 @@
import android.text.TextUtils;
import androidx.annotation.GuardedBy;
import com.android.textclassifier.common.base.TcLog;
-import com.android.textclassifier.common.statsd.ResultIdUtils.ModelInfo;
+import com.android.textclassifier.common.logging.ResultIdUtils.ModelInfo;
+import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
@@ -34,8 +35,6 @@
import java.util.List;
import java.util.Locale;
import java.util.Objects;
-import java.util.Optional;
-import java.util.StringJoiner;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.regex.Matcher;
@@ -288,28 +287,24 @@
}
public ModelInfo toModelInfo() {
- return new ModelInfo(getVersion(), getSupportedLocales());
+ return new ModelInfo(getVersion(), supportedLocalesStr);
}
@Override
public String toString() {
- final StringJoiner localesJoiner = new StringJoiner(",");
- for (Locale locale : supportedLocales) {
- localesJoiner.add(locale.toLanguageTag());
- }
return String.format(
Locale.US,
"ModelFile { path=%s name=%s version=%d locales=%s }",
getPath(),
getName(),
version,
- localesJoiner);
+ supportedLocalesStr);
}
public static ImmutableList<Optional<ModelInfo>> toModelInfos(
Optional<ModelFile>... modelFiles) {
return Arrays.stream(modelFiles)
- .map(modelFile -> modelFile.map(ModelFile::toModelInfo))
+ .map(modelFile -> modelFile.transform(ModelFile::toModelInfo))
.collect(Collectors.collectingAndThen(Collectors.toList(), ImmutableList::copyOf));
}
}
diff --git a/java/src/com/android/textclassifier/TextClassifierImpl.java b/java/src/com/android/textclassifier/TextClassifierImpl.java
index 480d7cc..5c028ef 100644
--- a/java/src/com/android/textclassifier/TextClassifierImpl.java
+++ b/java/src/com/android/textclassifier/TextClassifierImpl.java
@@ -46,9 +46,9 @@
import com.android.textclassifier.common.base.TcLog;
import com.android.textclassifier.common.intent.LabeledIntent;
import com.android.textclassifier.common.intent.TemplateIntentFactory;
+import com.android.textclassifier.common.logging.ResultIdUtils;
+import com.android.textclassifier.common.logging.ResultIdUtils.ModelInfo;
import com.android.textclassifier.common.statsd.GenerateLinksLogger;
-import com.android.textclassifier.common.statsd.ResultIdUtils;
-import com.android.textclassifier.common.statsd.ResultIdUtils.ModelInfo;
import com.android.textclassifier.common.statsd.SelectionEventConverter;
import com.android.textclassifier.common.statsd.TextClassificationSessionIdConverter;
import com.android.textclassifier.common.statsd.TextClassifierEventConverter;
@@ -57,7 +57,9 @@
import com.google.android.textclassifier.ActionsSuggestionsModel;
import com.google.android.textclassifier.AnnotatorModel;
import com.google.android.textclassifier.LangIdModel;
+import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
+import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import java.io.File;
import java.io.FileNotFoundException;
@@ -69,7 +71,6 @@
import java.util.List;
import java.util.Map;
import java.util.Objects;
-import java.util.Optional;
import javax.annotation.Nullable;
/**
@@ -340,8 +341,9 @@
Optional<ModelInfo> annotatorModelInfo;
Optional<ModelInfo> langIdModelInfo;
synchronized (lock) {
- annotatorModelInfo = Optional.ofNullable(annotatorModelInUse).map(ModelFile::toModelInfo);
- langIdModelInfo = Optional.ofNullable(langIdModelInUse).map(ModelFile::toModelInfo);
+ annotatorModelInfo =
+ Optional.fromNullable(annotatorModelInUse).transform(ModelFile::toModelInfo);
+ langIdModelInfo = Optional.fromNullable(langIdModelInUse).transform(ModelFile::toModelInfo);
}
generateLinksLogger.logGenerateLinks(
request.getText(),
@@ -495,9 +497,9 @@
ActionsSuggestionsHelper.createResultId(
context,
request.getConversation(),
- Optional.ofNullable(actionModelInUse),
- Optional.ofNullable(annotatorModelInUse),
- Optional.ofNullable(langIdModelInUse));
+ Optional.fromNullable(actionModelInUse),
+ Optional.fromNullable(annotatorModelInUse),
+ Optional.fromNullable(langIdModelInUse));
return new ConversationActions(conversationActions, resultId);
}
}
@@ -547,7 +549,7 @@
synchronized (lock) {
final ModelFileManager.ModelFile bestModel = langIdModelFileManager.findBestModelFile(null);
if (bestModel == null) {
- return Optional.empty();
+ return Optional.absent();
}
if (langIdImpl == null || !Objects.equals(langIdModelInUse, bestModel)) {
TcLog.d(TAG, "Loading " + bestModel);
@@ -558,7 +560,7 @@
new File(bestModel.getPath()), ParcelFileDescriptor.MODE_READ_ONLY);
} catch (FileNotFoundException e) {
TcLog.e(TAG, "Failed to open the LangID model file", e);
- return Optional.empty();
+ return Optional.absent();
}
try {
if (pfd != null) {
@@ -610,7 +612,7 @@
start,
end,
ModelFile.toModelInfos(
- Optional.ofNullable(annotatorModelInUse), Optional.ofNullable(langIdModelInUse)));
+ Optional.fromNullable(annotatorModelInUse), Optional.fromNullable(langIdModelInUse)));
}
}
@@ -670,13 +672,13 @@
actionIntents.add(intent);
}
Bundle extras = new Bundle();
- langId.ifPresent(
- model -> {
- maybeCreateExtrasForTranslate(actionIntents, model)
- .ifPresent(
- foreignLanguageExtra ->
- ExtrasUtils.putForeignLanguageExtra(extras, foreignLanguageExtra));
- });
+ Optional<Bundle> foreignLanguageExtra =
+ langId
+ .transform(model -> maybeCreateExtrasForTranslate(actionIntents, model))
+ .or(Optional.<Bundle>absent());
+ if (foreignLanguageExtra.isPresent()) {
+ ExtrasUtils.putForeignLanguageExtra(extras, foreignLanguageExtra.get());
+ }
if (actionIntents.stream().anyMatch(Objects::nonNull)) {
ArrayList<Intent> strippedIntents =
actionIntents.stream()
@@ -704,16 +706,16 @@
private static Optional<Bundle> maybeCreateExtrasForTranslate(
List<Intent> intents, LangIdModel langId) {
Optional<Intent> translateIntent =
- intents.stream()
+ FluentIterable.from(intents)
.filter(Objects::nonNull)
.filter(intent -> Intent.ACTION_TRANSLATE.equals(intent.getAction()))
- .findFirst();
+ .first();
if (!translateIntent.isPresent()) {
- return Optional.empty();
+ return Optional.absent();
}
Pair<String, Float> topLanguageWithScore = ExtrasUtils.getTopLanguage(translateIntent.get());
if (topLanguageWithScore == null) {
- return Optional.empty();
+ return Optional.absent();
}
return Optional.of(
ExtrasUtils.createForeignLanguageExtra(
@@ -723,13 +725,13 @@
private ImmutableList<String> detectLanguageTags(
Optional<LangIdModel> langId, CharSequence text) {
return langId
- .map(
+ .transform(
model -> {
float threshold = getLangIdThreshold(model);
EntityConfidence languagesConfidence = detectLanguages(model, text, threshold);
return ImmutableList.copyOf(languagesConfidence.getEntities());
})
- .orElse(ImmutableList.of());
+ .or(ImmutableList.of());
}
/**
diff --git a/java/src/com/android/textclassifier/common/base/LocaleCompat.java b/java/src/com/android/textclassifier/common/base/LocaleCompat.java
new file mode 100644
index 0000000..baaaf67
--- /dev/null
+++ b/java/src/com/android/textclassifier/common/base/LocaleCompat.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.textclassifier.common.base;
+
+import android.content.Context;
+import android.os.Build;
+import java.util.Locale;
+
+/** Helper for accessing locale related stuff that works across different platform versions. */
+public final class LocaleCompat {
+
+ private LocaleCompat() {}
+
+ /**
+ * Returns a well-formed IETF BCP 47 language tag representing this locale. In older platforms,
+ * only the ISO 639 language code will be returned.
+ *
+ * @see Locale#toLanguageTag()
+ */
+ public static String toLanguageTag(Locale locale) {
+ if (Build.VERSION.SDK_INT >= 24) {
+ return Api24Impl.toLanguageTag(locale);
+ }
+ return ApiBaseImpl.toLanguageTag(locale);
+ }
+
+ /** Returns the language tags in string for the current resources configuration. */
+ public static String getResourceLanguageTags(Context context) {
+ if (Build.VERSION.SDK_INT >= 24) {
+ return Api24Impl.getResourceLanguageTags(context);
+ } else if (Build.VERSION.SDK_INT >= 21) {
+ return Api21Impl.getResourceLanguageTags(context);
+ }
+ return ApiBaseImpl.getResourceLanguageTags(context);
+ }
+
+ private static class Api24Impl {
+ private Api24Impl() {}
+
+ static String toLanguageTag(Locale locale) {
+ return locale.toLanguageTag();
+ }
+
+ static String getResourceLanguageTags(Context context) {
+ return context.getResources().getConfiguration().getLocales().toLanguageTags();
+ }
+ }
+
+ private static class Api21Impl {
+ private Api21Impl() {}
+
+ static String getResourceLanguageTags(Context context) {
+ return context.getResources().getConfiguration().locale.toLanguageTag();
+ }
+ }
+
+ private static class ApiBaseImpl {
+ private ApiBaseImpl() {}
+
+ static String toLanguageTag(Locale locale) {
+ return locale.getLanguage();
+ }
+
+ static String getResourceLanguageTags(Context context) {
+ return context.getResources().getConfiguration().locale.getLanguage();
+ }
+ }
+}
diff --git a/java/src/com/android/textclassifier/common/statsd/ResultIdUtils.java b/java/src/com/android/textclassifier/common/logging/ResultIdUtils.java
similarity index 64%
rename from java/src/com/android/textclassifier/common/statsd/ResultIdUtils.java
rename to java/src/com/android/textclassifier/common/logging/ResultIdUtils.java
index 27f68d9..dae0442 100644
--- a/java/src/com/android/textclassifier/common/statsd/ResultIdUtils.java
+++ b/java/src/com/android/textclassifier/common/logging/ResultIdUtils.java
@@ -14,18 +14,20 @@
* limitations under the License.
*/
-package com.android.textclassifier.common.statsd;
+package com.android.textclassifier.common.logging;
import android.content.Context;
import android.text.TextUtils;
+import com.android.textclassifier.common.base.LocaleCompat;
+import com.google.common.base.Joiner;
+import com.google.common.base.Objects;
+import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
+import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
-import java.util.Objects;
-import java.util.Optional;
-import java.util.StringJoiner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
@@ -44,18 +46,23 @@
Preconditions.checkNotNull(text);
Preconditions.checkNotNull(context);
Preconditions.checkNotNull(modelInfos);
- final int hash = Objects.hash(text, start, end, context.getPackageName());
+ final int hash = Objects.hashCode(text, start, end, context.getPackageName());
return createId(hash, modelInfos);
}
/** Creates a string id that may be used to identify a TextClassifier result. */
public static String createId(int hash, List<Optional<ModelInfo>> modelInfos) {
Preconditions.checkNotNull(modelInfos);
- final StringJoiner modelJoiner = new StringJoiner(SEPARATOR_MODEL_NAME);
+ final List<String> modelNames = new ArrayList<>();
for (Optional<ModelInfo> modelInfo : modelInfos) {
- modelJoiner.add(modelInfo.map(ModelInfo::toModelName).orElse(""));
+ modelNames.add(modelInfo.transform(ModelInfo::toModelName).or(""));
}
- return String.format(Locale.US, "%s|%s|%d", CLASSIFIER_ID, modelJoiner, hash);
+ return String.format(
+ Locale.US,
+ "%s|%s|%d",
+ CLASSIFIER_ID,
+ Joiner.on(SEPARATOR_MODEL_NAME).join(modelNames),
+ hash);
}
/** Returns if the result id was generated from the default text classifier. */
@@ -64,7 +71,7 @@
}
/** Returns all the model names encoded in the signature. */
- static ImmutableList<String> getModelNames(@Nullable String signature) {
+ public static ImmutableList<String> getModelNames(@Nullable String signature) {
if (TextUtils.isEmpty(signature)) {
return ImmutableList.of();
}
@@ -79,21 +86,38 @@
/** Model information of a model file. */
public static class ModelInfo {
- private final int version;
- private final ImmutableList<Locale> locales;
+ private final String modelName;
public ModelInfo(int version, List<Locale> locales) {
- this.version = version;
- this.locales = ImmutableList.copyOf(locales);
+ this(version, createSupportedLanguageTagsString(locales));
+ }
+
+ /**
+ * Creates a {@link ModelInfo} object.
+ *
+ * @param version model version
+ * @param supportedLanguageTags a comma-separated string of bcp47 language tags of supported
+ * languages
+ */
+ public ModelInfo(int version, String supportedLanguageTags) {
+ this.modelName = createModelName(version, supportedLanguageTags);
+ }
+
+ private static String createSupportedLanguageTagsString(List<Locale> locales) {
+ List<String> languageTags = new ArrayList<>();
+ for (Locale locale : locales) {
+ languageTags.add(LocaleCompat.toLanguageTag(locale));
+ }
+ return Joiner.on(SEPARATOR_LOCALES).join(languageTags);
+ }
+
+ private static String createModelName(int version, String supportedLanguageTags) {
+ return String.format(Locale.US, "%s_v%d", supportedLanguageTags, version);
}
/** Returns a string representation of the model info. */
public String toModelName() {
- final StringJoiner localesJoiner = new StringJoiner(SEPARATOR_LOCALES);
- for (Locale locale : locales) {
- localesJoiner.add(locale.toLanguageTag());
- }
- return String.format(Locale.US, "%s_v%d", localesJoiner, version);
+ return modelName;
}
}
}
diff --git a/java/src/com/android/textclassifier/common/statsd/GenerateLinksLogger.java b/java/src/com/android/textclassifier/common/statsd/GenerateLinksLogger.java
index ed54206..c132749 100644
--- a/java/src/com/android/textclassifier/common/statsd/GenerateLinksLogger.java
+++ b/java/src/com/android/textclassifier/common/statsd/GenerateLinksLogger.java
@@ -22,14 +22,14 @@
import android.view.textclassifier.TextLinks;
import androidx.collection.ArrayMap;
import com.android.textclassifier.common.base.TcLog;
+import com.android.textclassifier.common.logging.ResultIdUtils.ModelInfo;
import com.android.textclassifier.common.logging.TextClassifierEvent;
-import com.android.textclassifier.common.statsd.ResultIdUtils.ModelInfo;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import java.util.Locale;
import java.util.Map;
-import java.util.Optional;
import java.util.Random;
import java.util.UUID;
import java.util.function.Supplier;
@@ -140,8 +140,8 @@
long latencyMs,
Optional<ModelInfo> annotatorModel,
Optional<ModelInfo> langIdModel) {
- String annotatorModelName = annotatorModel.map(ModelInfo::toModelName).orElse(null);
- String langIdModelName = langIdModel.map(ModelInfo::toModelName).orElse(null);
+ String annotatorModelName = annotatorModel.transform(ModelInfo::toModelName).or("");
+ String langIdModelName = langIdModel.transform(ModelInfo::toModelName).or("");
StatsEvent statsEvent =
StatsEvent.newBuilder()
.setAtomId(TextClassifierEventLogger.TEXT_LINKIFY_EVENT_ATOM_ID)
diff --git a/java/src/com/android/textclassifier/common/statsd/TextClassifierEventLogger.java b/java/src/com/android/textclassifier/common/statsd/TextClassifierEventLogger.java
index b5e780c..41f546c 100644
--- a/java/src/com/android/textclassifier/common/statsd/TextClassifierEventLogger.java
+++ b/java/src/com/android/textclassifier/common/statsd/TextClassifierEventLogger.java
@@ -23,6 +23,7 @@
import android.util.StatsLog;
import android.view.textclassifier.TextClassifier;
import com.android.textclassifier.common.base.TcLog;
+import com.android.textclassifier.common.logging.ResultIdUtils;
import com.android.textclassifier.common.logging.TextClassificationContext;
import com.android.textclassifier.common.logging.TextClassificationSessionId;
import com.android.textclassifier.common.logging.TextClassifierEvent;
diff --git a/java/tests/instrumentation/AndroidTest.xml b/java/tests/instrumentation/AndroidTest.xml
new file mode 100644
index 0000000..e02a338
--- /dev/null
+++ b/java/tests/instrumentation/AndroidTest.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Copyright (C) 2020 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- This test config file is auto-generated. -->
+<configuration description="Runs TextClassifierServiceTest.">
+ <option name="test-suite-tag" value="apct" />
+ <option name="test-suite-tag" value="apct-instrumentation" />
+ <target_preparer class="com.android.tradefed.targetprep.suite.SuiteApkInstaller">
+ <option name="cleanup-apks" value="true" />
+ <option name="test-file-name" value="TextClassifierServiceTest.apk" />
+ </target_preparer>
+
+ <test class="com.android.tradefed.testtype.AndroidJUnitTest" >
+ <option name="package" value="com.android.textclassifier.tests" />
+ <option name="runner" value="androidx.test.runner.AndroidJUnitRunner" />
+ </test>
+
+ <object type="module_controller" class="com.android.tradefed.testtype.suite.module.MainlineTestModuleController">
+ <option name="mainline-module-package-name" value="com.google.android.extservices" />
+ </object>
+</configuration>
diff --git a/java/tests/instrumentation/src/com/android/textclassifier/ModelFileManagerTest.java b/java/tests/instrumentation/src/com/android/textclassifier/ModelFileManagerTest.java
index 3930bb7..06d47d6 100644
--- a/java/tests/instrumentation/src/com/android/textclassifier/ModelFileManagerTest.java
+++ b/java/tests/instrumentation/src/com/android/textclassifier/ModelFileManagerTest.java
@@ -24,14 +24,14 @@
import androidx.test.ext.junit.runners.AndroidJUnit4;
import androidx.test.filters.SmallTest;
import com.android.textclassifier.ModelFileManager.ModelFile;
-import com.android.textclassifier.common.statsd.ResultIdUtils.ModelInfo;
+import com.android.textclassifier.common.logging.ResultIdUtils.ModelInfo;
+import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
-import java.util.Optional;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.junit.After;
@@ -341,8 +341,7 @@
assertThat(
modelInfos.stream()
- .map(modelFile -> modelFile.map(ModelInfo::toModelName))
- .map(Optional::get)
+ .map(modelFile -> modelFile.transform(ModelInfo::toModelName).or(""))
.collect(Collectors.toList()))
.containsExactly("en_v1", "ja_v2")
.inOrder();
diff --git a/java/tests/instrumentation/src/com/android/textclassifier/common/base/LocaleCompatTest.java b/java/tests/instrumentation/src/com/android/textclassifier/common/base/LocaleCompatTest.java
new file mode 100644
index 0000000..9e1f5a8
--- /dev/null
+++ b/java/tests/instrumentation/src/com/android/textclassifier/common/base/LocaleCompatTest.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.textclassifier.common.base;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import android.os.LocaleList;
+import androidx.test.core.app.ApplicationProvider;
+import androidx.test.ext.junit.runners.AndroidJUnit4;
+import androidx.test.filters.SdkSuppress;
+import androidx.test.filters.SmallTest;
+import java.util.Locale;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+@SmallTest
+@RunWith(AndroidJUnit4.class)
+public class LocaleCompatTest {
+ @SdkSuppress(minSdkVersion = 24)
+ @Test
+ public void toLanguageTag_minApi24() {
+ Locale locale = Locale.TRADITIONAL_CHINESE;
+
+ String languageTags = LocaleCompat.toLanguageTag(locale);
+
+ assertThat(languageTags).isEqualTo("zh-TW");
+ }
+
+ @SdkSuppress(maxSdkVersion = 23)
+ @Test
+ public void toLanguageTag_base() {
+ Locale locale = Locale.TRADITIONAL_CHINESE;
+
+ String languageTags = LocaleCompat.toLanguageTag(locale);
+
+ assertThat(languageTags).isEqualTo("zh");
+ }
+
+ @SdkSuppress(minSdkVersion = 24)
+ @Test
+ public void getResourceLanguageTags_minApi24() {
+ ApplicationProvider.getApplicationContext()
+ .getResources()
+ .getConfiguration()
+ .setLocales(LocaleList.forLanguageTags("zh-TW"));
+
+ String resourceLanguageTags =
+ LocaleCompat.getResourceLanguageTags(ApplicationProvider.getApplicationContext());
+
+ assertThat(resourceLanguageTags).isEqualTo("zh-TW");
+ }
+
+ @SdkSuppress(minSdkVersion = 21, maxSdkVersion = 23)
+ @Test
+ public void getResourceLanguageTags_minApi21() {
+ ApplicationProvider.getApplicationContext()
+ .getResources()
+ .getConfiguration()
+ .setLocale(Locale.TAIWAN);
+
+ String resourceLanguageTags =
+ LocaleCompat.getResourceLanguageTags(ApplicationProvider.getApplicationContext());
+
+ assertThat(resourceLanguageTags).isEqualTo("zh-TW");
+ }
+
+ @SdkSuppress(maxSdkVersion = 20)
+ @Test
+ public void getResourceLanguageTags_base() {
+ ApplicationProvider.getApplicationContext().getResources().getConfiguration().locale =
+ Locale.TAIWAN;
+
+ String resourceLanguageTags =
+ LocaleCompat.getResourceLanguageTags(ApplicationProvider.getApplicationContext());
+
+ assertThat(resourceLanguageTags).isEqualTo("zh");
+ }
+}
diff --git a/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/ResultIdUtilsTest.java b/java/tests/instrumentation/src/com/android/textclassifier/common/logging/ResultIdUtilsTest.java
similarity index 86%
rename from java/tests/instrumentation/src/com/android/textclassifier/common/statsd/ResultIdUtilsTest.java
rename to java/tests/instrumentation/src/com/android/textclassifier/common/logging/ResultIdUtilsTest.java
index 7aac3a9..3a85061 100644
--- a/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/ResultIdUtilsTest.java
+++ b/java/tests/instrumentation/src/com/android/textclassifier/common/logging/ResultIdUtilsTest.java
@@ -14,17 +14,17 @@
* limitations under the License.
*/
-package com.android.textclassifier.common.statsd;
+package com.android.textclassifier.common.logging;
import static com.google.common.truth.Truth.assertThat;
import androidx.test.core.app.ApplicationProvider;
import androidx.test.ext.junit.runners.AndroidJUnit4;
import androidx.test.filters.SmallTest;
-import com.android.textclassifier.common.statsd.ResultIdUtils.ModelInfo;
+import com.android.textclassifier.common.logging.ResultIdUtils.ModelInfo;
+import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import java.util.Locale;
-import java.util.Optional;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -38,12 +38,12 @@
public void createId_customHash() {
ImmutableList<Optional<ModelInfo>> modelInfos =
ImmutableList.of(
- Optional.empty(),
+ Optional.absent(),
Optional.of(
new ModelInfo(/* version= */ 1, ImmutableList.of(Locale.ENGLISH, Locale.FRENCH))),
- Optional.empty(),
+ Optional.absent(),
Optional.of(new ModelInfo(/* version= */ 2, ImmutableList.of(Locale.CHINESE))),
- Optional.empty());
+ Optional.absent());
String resultId = ResultIdUtils.createId(HASH, modelInfos);
@@ -90,6 +90,13 @@
}
@Test
+ public void modelInfo_toModelName_supportedLanguageTags() {
+ ModelInfo modelInfo = new ModelInfo(700, "en,fr");
+
+ assertThat(modelInfo.toModelName()).isEqualTo("en,fr_v700");
+ }
+
+ @Test
public void isFromDefaultTextClassifier_true() {
assertThat(ResultIdUtils.isFromDefaultTextClassifier("androidtc|en_v703|12344")).isTrue();
}
diff --git a/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/GenerateLinksLoggerTest.java b/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/GenerateLinksLoggerTest.java
index c2512c6..c2a911a 100644
--- a/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/GenerateLinksLoggerTest.java
+++ b/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/GenerateLinksLoggerTest.java
@@ -29,12 +29,12 @@
import com.android.os.AtomsProto;
import com.android.os.AtomsProto.Atom;
import com.android.os.AtomsProto.TextLinkifyEvent;
-import com.android.textclassifier.common.statsd.ResultIdUtils.ModelInfo;
+import com.android.textclassifier.common.logging.ResultIdUtils.ModelInfo;
+import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.Locale;
import java.util.Map;
-import java.util.Optional;
import java.util.stream.Collectors;
import org.junit.After;
import org.junit.Before;
diff --git a/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/SelectionEventConverterTest.java b/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/SelectionEventConverterTest.java
index e317120..ecdc1f4 100644
--- a/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/SelectionEventConverterTest.java
+++ b/java/tests/instrumentation/src/com/android/textclassifier/common/statsd/SelectionEventConverterTest.java
@@ -28,12 +28,13 @@
import androidx.test.core.app.ApplicationProvider;
import androidx.test.ext.junit.runners.AndroidJUnit4;
import androidx.test.filters.SmallTest;
-import com.android.textclassifier.common.statsd.ResultIdUtils.ModelInfo;
+import com.android.textclassifier.common.logging.ResultIdUtils;
+import com.android.textclassifier.common.logging.ResultIdUtils.ModelInfo;
+import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Locale;
-import java.util.Optional;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
diff --git a/jni/com/google/android/textclassifier/AnnotatorModel.java b/jni/com/google/android/textclassifier/AnnotatorModel.java
index 7c72446..7658bf5 100644
--- a/jni/com/google/android/textclassifier/AnnotatorModel.java
+++ b/jni/com/google/android/textclassifier/AnnotatorModel.java
@@ -432,18 +432,18 @@
private final String referenceTimezone;
private final Long referenceTimeMsUtc;
- DatetimeOptions(String referenceTimezone, Long referenceTimeMsUtc) {
+ public DatetimeOptions(String referenceTimezone, Long referenceTimeMsUtc) {
this.referenceTimeMsUtc = referenceTimeMsUtc;
this.referenceTimezone = referenceTimezone;
}
}
- InputFragment(String text) {
+ public InputFragment(String text) {
this.text = text;
this.datetimeOptionsNullable = null;
}
- InputFragment(String text, DatetimeOptions datetimeOptions) {
+ public InputFragment(String text, DatetimeOptions datetimeOptions) {
this.text = text;
this.datetimeOptionsNullable = datetimeOptions;
}
diff --git a/native/Android.bp b/native/Android.bp
index 7d1ed15..9a79285 100644
--- a/native/Android.bp
+++ b/native/Android.bp
@@ -54,6 +54,7 @@
"//apex_available:platform",
"com.android.neuralnetworks",
"test_com.android.neuralnetworks",
+ "com.android.extservices",
],
}
@@ -106,6 +107,7 @@
"libtextclassifier_fbgen_resources_extra",
"libtextclassifier_fbgen_intent_config",
"libtextclassifier_fbgen_annotator_model",
+ "libtextclassifier_fbgen_annotator_experimental_model",
"libtextclassifier_fbgen_actions_model",
"libtextclassifier_fbgen_tflite_text_encoder_config",
"libtextclassifier_fbgen_lang_id_embedded_network",
@@ -205,6 +207,13 @@
}
genrule {
+ name: "libtextclassifier_fbgen_annotator_experimental_model",
+ srcs: ["annotator/experimental/experimental.fbs"],
+ out: ["annotator/experimental/experimental_generated.h"],
+ defaults: ["fbgen"],
+}
+
+genrule {
name: "libtextclassifier_fbgen_actions_model",
srcs: ["actions/actions_model.fbs"],
out: ["actions/actions_model_generated.h"],
@@ -292,13 +301,19 @@
exclude_srcs: [
"**/*_test.cc",
"**/*-test-lib.cc",
- "utils/testing/*.cc",
+ "**/testing/*.cc",
"**/*test-util.*",
"**/*test-utils.*",
"**/*_test-include.*",
+ "**/*unittest.cc",
],
version_script: "jni.lds",
+
+ apex_available: [
+ "//apex_available:platform",
+ "com.android.extservices",
+ ],
}
// -----------------------
@@ -308,7 +323,7 @@
name: "libtextclassifier_tests",
defaults: ["libtextclassifier_defaults"],
- test_suites: ["device-tests"],
+ test_suites: ["device-tests", "mts"],
data: [
"annotator/test_data/**/*",
@@ -316,21 +331,19 @@
],
srcs: ["**/*.cc"],
- // TODO: Do not filter out tflite test once the dependency issue is resolved.
- exclude_srcs: [
- "utils/tflite/*_test.cc",
- "utils/flatbuffers_test.cc",
- "utils/calendar/*_test-include.*",
- "utils/utf8/*_test-include.*"
- ],
- static_libs: ["libgmock_ndk"],
+ static_libs: [
+ "libgmock_ndk",
+ "libgtest_ndk_c++",
+ ],
multilib: {
lib32: {
+ suffix: "32",
cppflags: ["-DTC3_TEST_DATA_DIR=\"/data/nativetest/libtextclassifier_tests/test_data/\""],
},
lib64: {
+ suffix: "64",
cppflags: ["-DTC3_TEST_DATA_DIR=\"/data/nativetest64/libtextclassifier_tests/test_data/\""],
},
},
diff --git a/native/AndroidTest.xml b/native/AndroidTest.xml
index fd0c609..cee26dd 100644
--- a/native/AndroidTest.xml
+++ b/native/AndroidTest.xml
@@ -14,13 +14,21 @@
limitations under the License.
-->
<configuration description="Config for libtextclassifier_tests">
- <target_preparer class="com.android.tradefed.targetprep.PushFilePreparer">
+ <option name="test-suite-tag" value="apct" />
+ <option name="test-suite-tag" value="mts" />
+
+ <target_preparer class="com.android.compatibility.common.tradefed.targetprep.FilePusher">
<option name="cleanup" value="true" />
<option name="push" value="libtextclassifier_tests->/data/local/tmp/libtextclassifier_tests" />
+ <option name="append-bitness" value="true" />
</target_preparer>
- <option name="test-suite-tag" value="apct" />
+
<test class="com.android.tradefed.testtype.GTest" >
<option name="native-test-device-path" value="/data/local/tmp" />
<option name="module-name" value="libtextclassifier_tests" />
</test>
+
+ <object type="module_controller" class="com.android.tradefed.testtype.suite.module.MainlineTestModuleController">
+ <option name="mainline-module-package-name" value="com.google.android.extservices" />
+ </object>
</configuration>
diff --git a/native/actions/actions-suggestions.cc b/native/actions/actions-suggestions.cc
index a84f2cd..1fcd35c 100644
--- a/native/actions/actions-suggestions.cc
+++ b/native/actions/actions-suggestions.cc
@@ -22,6 +22,7 @@
#include "actions/types.h"
#include "actions/utils.h"
#include "actions/zlib-utils.h"
+#include "annotator/collections.h"
#include "utils/base/logging.h"
#include "utils/flatbuffers.h"
#include "utils/lua-utils.h"
@@ -50,6 +51,11 @@
*[]() { return new std::string("send_email"); }();
const std::string& ActionsSuggestions::kShareLocation =
*[]() { return new std::string("share_location"); }();
+
+// Name for a datetime annotation that only includes time but no date.
+const std::string& kTimeAnnotation =
+ *[]() { return new std::string("time"); }();
+
constexpr float kDefaultFloat = 0.0;
constexpr bool kDefaultBool = false;
constexpr int kDefaultInt = 1;
@@ -260,6 +266,7 @@
}
}
+ // Gather annotation entities for the rules.
if (model_->annotation_actions_spec() != nullptr &&
model_->annotation_actions_spec()->annotation_mapping() != nullptr) {
for (const AnnotationActionsSpec_::AnnotationMapping* mapping :
@@ -300,6 +307,18 @@
grammar_actions_.reset(new GrammarActions(
unilib_, model_->rules()->grammar_rules(), entity_data_builder_.get(),
model_->smart_reply_action_type()->str()));
+
+ // Gather annotation entities for the grammars.
+ if (auto annotation_nt = model_->rules()
+ ->grammar_rules()
+ ->rules()
+ ->nonterminals()
+ ->annotation_nt()) {
+ for (const grammar::RulesSet_::Nonterminals_::AnnotationNtEntry* entry :
+ *annotation_nt) {
+ annotation_entity_types_.insert(entry->key()->str());
+ }
+ }
}
std::string actions_script;
@@ -689,47 +708,41 @@
interpreter->tensor(interpreter->inputs()[param_index])->type;
const auto param_value_it = model_parameters.find(param_name);
const bool has_value = param_value_it != model_parameters.end();
- /*
- case kTfLiteInt16:
- *tflite::GetTensorData<int16_t>(input_tensor) = input_value;
- break;
- case kTfLiteInt8:
- */
switch (param_type) {
case kTfLiteFloat32:
model_executor_->SetInput<float>(
param_index,
- has_value ? param_value_it->second.FloatValue() : kDefaultFloat,
+ has_value ? param_value_it->second.Value<float>() : kDefaultFloat,
interpreter);
break;
case kTfLiteInt32:
model_executor_->SetInput<int32_t>(
param_index,
- has_value ? param_value_it->second.IntValue() : kDefaultInt,
+ has_value ? param_value_it->second.Value<int>() : kDefaultInt,
interpreter);
break;
case kTfLiteInt64:
model_executor_->SetInput<int64_t>(
param_index,
- has_value ? param_value_it->second.Int64Value() : kDefaultInt,
+ has_value ? param_value_it->second.Value<int64>() : kDefaultInt,
interpreter);
break;
case kTfLiteUInt8:
model_executor_->SetInput<uint8_t>(
param_index,
- has_value ? param_value_it->second.UInt8Value() : kDefaultInt,
+ has_value ? param_value_it->second.Value<uint8>() : kDefaultInt,
interpreter);
break;
case kTfLiteInt8:
model_executor_->SetInput<int8_t>(
param_index,
- has_value ? param_value_it->second.Int8Value() : kDefaultInt,
+ has_value ? param_value_it->second.Value<int8>() : kDefaultInt,
interpreter);
break;
case kTfLiteBool:
model_executor_->SetInput<bool>(
param_index,
- has_value ? param_value_it->second.BoolValue() : kDefaultBool,
+ has_value ? param_value_it->second.Value<bool>() : kDefaultBool,
interpreter);
break;
default:
@@ -1023,6 +1036,30 @@
if (message->annotations.empty()) {
message->annotations = annotator->Annotate(
message->text, AnnotationOptionsForMessage(*message));
+ for (int i = 0; i < message->annotations.size(); i++) {
+ ClassificationResult* classification =
+ &message->annotations[i].classification.front();
+
+ // Specialize datetime annotation to time annotation if no date
+ // component is present.
+ if (classification->collection == Collections::DateTime() &&
+ classification->datetime_parse_result.IsSet()) {
+ bool has_only_time = true;
+ for (const DatetimeComponent& component :
+ classification->datetime_parse_result.datetime_components) {
+ if (component.component_type !=
+ DatetimeComponent::ComponentType::UNSPECIFIED &&
+ component.component_type <
+ DatetimeComponent::ComponentType::HOUR) {
+ has_only_time = false;
+ break;
+ }
+ }
+ if (has_only_time) {
+ classification->collection = kTimeAnnotation;
+ }
+ }
+ }
}
}
return annotated_conversation;
@@ -1224,6 +1261,13 @@
SuggestActionsFromAnnotations(annotated_conversation, &response->actions);
+ if (grammar_actions_ != nullptr &&
+ !grammar_actions_->SuggestActions(annotated_conversation,
+ &response->actions)) {
+ TC3_LOG(ERROR) << "Could not suggest actions from grammar rules.";
+ return false;
+ }
+
int input_text_length = 0;
int num_matching_locales = 0;
for (int i = annotated_conversation.messages.size() - num_messages;
@@ -1299,13 +1343,6 @@
return false;
}
- if (grammar_actions_ != nullptr &&
- !grammar_actions_->SuggestActions(annotated_conversation,
- &response->actions)) {
- TC3_LOG(ERROR) << "Could not suggest actions from grammar rules.";
- return false;
- }
-
if (preconditions_.suppress_on_low_confidence_input &&
!regex_actions_->FilterConfidenceOutput(post_check_rules,
&response->actions)) {
diff --git a/native/actions/feature-processor_test.cc b/native/actions/feature-processor_test.cc
new file mode 100644
index 0000000..969bbf7
--- /dev/null
+++ b/native/actions/feature-processor_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "actions/feature-processor.h"
+
+#include "actions/actions_model_generated.h"
+#include "annotator/model-executor.h"
+#include "utils/tensor-view.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using ::testing::FloatEq;
+using ::testing::SizeIs;
+
+// EmbeddingExecutor that always returns features based on
+// the id of the sparse features.
+class FakeEmbeddingExecutor : public EmbeddingExecutor {
+ public:
+ bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
+ const int dest_size) const override {
+ TC3_CHECK_GE(dest_size, 4);
+ EXPECT_THAT(sparse_features, SizeIs(1));
+ dest[0] = sparse_features.data()[0];
+ dest[1] = sparse_features.data()[0];
+ dest[2] = -sparse_features.data()[0];
+ dest[3] = -sparse_features.data()[0];
+ return true;
+ }
+
+ private:
+ std::vector<float> storage_;
+};
+
+class FeatureProcessorTest : public ::testing::Test {
+ protected:
+ FeatureProcessorTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
+
+ flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
+ ActionsTokenFeatureProcessorOptionsT* options) const {
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(CreateActionsTokenFeatureProcessorOptions(builder, options));
+ return builder.Release();
+ }
+
+ FakeEmbeddingExecutor embedding_executor_;
+ UniLib unilib_;
+};
+
+TEST_F(FeatureProcessorTest, TokenEmbeddings) {
+ ActionsTokenFeatureProcessorOptionsT options;
+ options.embedding_size = 4;
+ options.tokenizer_options.reset(new ActionsTokenizerOptionsT);
+
+ flatbuffers::DetachedBuffer options_fb =
+ PackFeatureProcessorOptions(&options);
+ ActionsFeatureProcessor feature_processor(
+ flatbuffers::GetRoot<ActionsTokenFeatureProcessorOptions>(
+ options_fb.data()),
+ &unilib_);
+
+ Token token("aaa", 0, 3);
+ std::vector<float> token_features;
+ EXPECT_TRUE(feature_processor.AppendTokenFeatures(token, &embedding_executor_,
+ &token_features));
+ EXPECT_THAT(token_features, SizeIs(4));
+}
+
+TEST_F(FeatureProcessorTest, TokenEmbeddingsCaseFeature) {
+ ActionsTokenFeatureProcessorOptionsT options;
+ options.embedding_size = 4;
+ options.extract_case_feature = true;
+ options.tokenizer_options.reset(new ActionsTokenizerOptionsT);
+
+ flatbuffers::DetachedBuffer options_fb =
+ PackFeatureProcessorOptions(&options);
+ ActionsFeatureProcessor feature_processor(
+ flatbuffers::GetRoot<ActionsTokenFeatureProcessorOptions>(
+ options_fb.data()),
+ &unilib_);
+
+ Token token("Aaa", 0, 3);
+ std::vector<float> token_features;
+ EXPECT_TRUE(feature_processor.AppendTokenFeatures(token, &embedding_executor_,
+ &token_features));
+ EXPECT_THAT(token_features, SizeIs(5));
+ EXPECT_THAT(token_features[4], FloatEq(1.0));
+}
+
+TEST_F(FeatureProcessorTest, MultipleTokenEmbeddingsCaseFeature) {
+ ActionsTokenFeatureProcessorOptionsT options;
+ options.embedding_size = 4;
+ options.extract_case_feature = true;
+ options.tokenizer_options.reset(new ActionsTokenizerOptionsT);
+
+ flatbuffers::DetachedBuffer options_fb =
+ PackFeatureProcessorOptions(&options);
+ ActionsFeatureProcessor feature_processor(
+ flatbuffers::GetRoot<ActionsTokenFeatureProcessorOptions>(
+ options_fb.data()),
+ &unilib_);
+
+ const std::vector<Token> tokens = {Token("Aaa", 0, 3), Token("bbb", 4, 7),
+ Token("Cccc", 8, 12)};
+ std::vector<float> token_features;
+ EXPECT_TRUE(feature_processor.AppendTokenFeatures(
+ tokens, &embedding_executor_, &token_features));
+ EXPECT_THAT(token_features, SizeIs(15));
+ EXPECT_THAT(token_features[4], FloatEq(1.0));
+ EXPECT_THAT(token_features[9], FloatEq(-1.0));
+ EXPECT_THAT(token_features[14], FloatEq(1.0));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/actions/flatbuffer-utils.cc b/native/actions/flatbuffer-utils.cc
deleted file mode 100644
index 6d60c2f..0000000
--- a/native/actions/flatbuffer-utils.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "actions/flatbuffer-utils.h"
-
-#include <memory>
-
-#include "utils/base/logging.h"
-#include "utils/flatbuffers.h"
-#include "flatbuffers/reflection.h"
-
-namespace libtextclassifier3 {
-
-bool SwapFieldNamesForOffsetsInPathInActionsModel(ActionsModelT* model) {
- if (model->actions_entity_data_schema.empty()) {
- // Nothing to do.
- return true;
- }
-
- const reflection::Schema* schema =
- LoadAndVerifyFlatbuffer<reflection::Schema>(
- model->actions_entity_data_schema.data(),
- model->actions_entity_data_schema.size());
-
- // Resolve offsets in regex rules.
- if (model->rules != nullptr) {
- for (std::unique_ptr<RulesModel_::RegexRuleT>& rule :
- model->rules->regex_rule) {
- for (std::unique_ptr<RulesModel_::RuleActionSpecT>& rule_action :
- rule->actions) {
- for (std::unique_ptr<RulesModel_::RuleActionSpec_::RuleCapturingGroupT>&
- capturing_group : rule_action->capturing_group) {
- if (capturing_group->entity_field == nullptr) {
- continue;
- }
- if (!SwapFieldNamesForOffsetsInPath(
- schema, capturing_group->entity_field.get())) {
- return false;
- }
- }
- }
- }
- }
-
- // Resolve offsets in annotation action mapping.
- if (model->annotation_actions_spec != nullptr) {
- for (std::unique_ptr<AnnotationActionsSpec_::AnnotationMappingT>& mapping :
- model->annotation_actions_spec->annotation_mapping) {
- if (mapping->entity_field == nullptr) {
- continue;
- }
- if (!SwapFieldNamesForOffsetsInPath(schema,
- mapping->entity_field.get())) {
- return false;
- }
- }
- }
-
- return true;
-}
-
-std::string SwapFieldNamesForOffsetsInPathInSerializedActionsModel(
- const std::string& model) {
- std::unique_ptr<ActionsModelT> unpacked_model =
- UnPackActionsModel(model.c_str());
- TC3_CHECK(unpacked_model != nullptr);
- TC3_CHECK(SwapFieldNamesForOffsetsInPathInActionsModel(unpacked_model.get()));
- flatbuffers::FlatBufferBuilder builder;
- FinishActionsModelBuffer(builder,
- ActionsModel::Pack(builder, unpacked_model.get()));
- return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
- builder.GetSize());
-}
-
-} // namespace libtextclassifier3
diff --git a/native/actions/flatbuffer-utils.h b/native/actions/flatbuffer-utils.h
deleted file mode 100644
index 2479599..0000000
--- a/native/actions/flatbuffer-utils.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Utility functions for working with FlatBuffers in the actions model.
-
-#ifndef LIBTEXTCLASSIFIER_ACTIONS_FLATBUFFER_UTILS_H_
-#define LIBTEXTCLASSIFIER_ACTIONS_FLATBUFFER_UTILS_H_
-
-#include <string>
-
-#include "actions/actions_model_generated.h"
-
-namespace libtextclassifier3 {
-
-// Resolves field lookups by name to the concrete field offsets in the regex
-// rules of the model.
-bool SwapFieldNamesForOffsetsInPathInActionsModel(ActionsModelT* model);
-
-// Same as above but for a serialized model.
-std::string SwapFieldNamesForOffsetsInPathInSerializedActionsModel(
- const std::string& model);
-
-} // namespace libtextclassifier3
-
-#endif // LIBTEXTCLASSIFIER_ACTIONS_FLATBUFFER_UTILS_H_
diff --git a/native/actions/grammar-actions.cc b/native/actions/grammar-actions.cc
index 4995eaa..7f3e71f 100644
--- a/native/actions/grammar-actions.cc
+++ b/native/actions/grammar-actions.cc
@@ -64,8 +64,8 @@
codepoint_offsets.push_back(it);
}
codepoint_offsets.push_back(message_unicode.end());
- for (const grammar::RuleMatch& candidate :
- grammar::DeduplicateMatches(candidates_)) {
+ for (const grammar::Derivation& candidate :
+ grammar::DeduplicateDerivations(candidates_)) {
// Check that assertions are fulfilled.
if (!VerifyAssertions(candidate.match)) {
continue;
@@ -84,7 +84,7 @@
private:
// Handles action rule matches.
void HandleRuleMatch(const grammar::Match* match, const int64 rule_id) {
- candidates_.push_back(grammar::RuleMatch{match, rule_id});
+ candidates_.push_back(grammar::Derivation{match, rule_id});
}
// Instantiates action suggestions from verified and deduplicated rule matches
@@ -94,7 +94,7 @@
bool InstantiateActionsFromMatch(
const std::vector<UnicodeText::const_iterator>& message_codepoint_offsets,
int message_index, const std::string& smart_reply_action_type,
- const grammar::RuleMatch& candidate,
+ const grammar::Derivation& candidate,
const ReflectiveFlatbufferBuilder* entity_data_builder,
std::vector<ActionSuggestion>* result) const {
const RulesModel_::GrammarRules_::RuleMatch* rule_match =
@@ -194,7 +194,7 @@
// All action rule match candidates.
// Grammar rule matches are recorded, deduplicated, verified and then
// instantiated.
- std::vector<grammar::RuleMatch> candidates_;
+ std::vector<grammar::Derivation> candidates_;
};
} // namespace
diff --git a/native/actions/lua-actions_test.cc b/native/actions/lua-actions_test.cc
new file mode 100644
index 0000000..72cae2c
--- /dev/null
+++ b/native/actions/lua-actions_test.cc
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "actions/lua-actions.h"
+
+#include <map>
+#include <string>
+
+#include "actions/test-utils.h"
+#include "actions/types.h"
+#include "utils/tflite-model-executor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using testing::ElementsAre;
+
+TEST(LuaActions, SimpleAction) {
+ Conversation conversation;
+ const std::string test_snippet = R"(
+ return {{ type = "test_action" }}
+ )";
+ std::vector<ActionSuggestion> actions;
+ EXPECT_TRUE(LuaActionsSuggestions::CreateLuaActionsSuggestions(
+ test_snippet, conversation,
+ /*model_executor=*/nullptr,
+ /*model_spec=*/nullptr,
+ /*interpreter=*/nullptr,
+ /*actions_entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr)
+ ->SuggestActions(&actions));
+ EXPECT_THAT(actions, ElementsAre(IsActionOfType("test_action")));
+}
+
+TEST(LuaActions, ConversationActions) {
+ Conversation conversation;
+ conversation.messages.push_back({/*user_id=*/0, "hello there!"});
+ conversation.messages.push_back({/*user_id=*/1, "general kenobi!"});
+ const std::string test_snippet = R"(
+ local actions = {}
+ for i, message in pairs(messages) do
+ if i < #messages then
+ if message.text == "hello there!" and
+ messages[i+1].text == "general kenobi!" then
+ table.insert(actions, {
+ type = "text_reply",
+ response_text = "you are a bold one!"
+ })
+ end
+ if message.text == "i am the senate!" and
+ messages[i+1].text == "not yet!" then
+ table.insert(actions, {
+ type = "text_reply",
+ response_text = "it's treason then"
+ })
+ end
+ end
+ end
+ return actions;
+ )";
+ std::vector<ActionSuggestion> actions;
+ EXPECT_TRUE(LuaActionsSuggestions::CreateLuaActionsSuggestions(
+ test_snippet, conversation,
+ /*model_executor=*/nullptr,
+ /*model_spec=*/nullptr,
+ /*interpreter=*/nullptr,
+ /*actions_entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr)
+ ->SuggestActions(&actions));
+ EXPECT_THAT(actions, ElementsAre(IsSmartReply("you are a bold one!")));
+}
+
+TEST(LuaActions, SimpleModelAction) {
+ Conversation conversation;
+ const std::string test_snippet = R"(
+ if #model.actions_scores == 0 then
+ return {{ type = "test_action" }}
+ end
+ return {}
+ )";
+ std::vector<ActionSuggestion> actions;
+ EXPECT_TRUE(LuaActionsSuggestions::CreateLuaActionsSuggestions(
+ test_snippet, conversation,
+ /*model_executor=*/nullptr,
+ /*model_spec=*/nullptr,
+ /*interpreter=*/nullptr,
+ /*actions_entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr)
+ ->SuggestActions(&actions));
+ EXPECT_THAT(actions, ElementsAre(IsActionOfType("test_action")));
+}
+
+TEST(LuaActions, SimpleModelRepliesAction) {
+ Conversation conversation;
+ const std::string test_snippet = R"(
+ if #model.reply == 0 then
+ return {{ type = "test_action" }}
+ end
+ return {}
+ )";
+ std::vector<ActionSuggestion> actions;
+ EXPECT_TRUE(LuaActionsSuggestions::CreateLuaActionsSuggestions(
+ test_snippet, conversation,
+ /*model_executor=*/nullptr,
+ /*model_spec=*/nullptr,
+ /*interpreter=*/nullptr,
+ /*actions_entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr)
+ ->SuggestActions(&actions));
+ EXPECT_THAT(actions, ElementsAre(IsActionOfType("test_action")));
+}
+
+TEST(LuaActions, AnnotationActions) {
+ AnnotatedSpan annotation;
+ annotation.span = {11, 15};
+ annotation.classification = {ClassificationResult("address", 1.0)};
+ Conversation conversation = {{{/*user_id=*/1, "are you at home?",
+ /*reference_time_ms_utc=*/0,
+ /*reference_timezone=*/"Europe/Zurich",
+ /*annotations=*/{annotation},
+ /*locales=*/"en"}}};
+ const std::string test_snippet = R"(
+ local actions = {}
+ local last_message = messages[#messages]
+ for i, annotation in pairs(last_message.annotation) do
+ if #annotation.classification > 0 then
+ if annotation.classification[1].collection == "address" then
+ local text = string.sub(last_message.text,
+ annotation.span["begin"] + 1,
+ annotation.span["end"])
+ table.insert(actions, {
+ type = "text_reply",
+ response_text = "i am at " .. text,
+ annotation = {{
+ name = "location",
+ span = {
+ text = text
+ },
+ entity = annotation.classification[1]
+ }},
+ })
+ end
+ end
+ end
+ return actions;
+ )";
+ std::vector<ActionSuggestion> actions;
+ EXPECT_TRUE(LuaActionsSuggestions::CreateLuaActionsSuggestions(
+ test_snippet, conversation,
+ /*model_executor=*/nullptr,
+ /*model_spec=*/nullptr,
+ /*interpreter=*/nullptr,
+ /*actions_entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr)
+ ->SuggestActions(&actions));
+ EXPECT_THAT(actions, ElementsAre(IsSmartReply("i am at home")));
+ EXPECT_EQ("address", actions[0].annotations[0].entity.collection);
+}
+
+TEST(LuaActions, EntityData) {
+ std::string test_schema = TestEntityDataSchema();
+ Conversation conversation = {{{/*user_id=*/1, "hello there"}}};
+ const std::string test_snippet = R"(
+ return {{
+ type = "test",
+ entity = {
+ greeting = "hello",
+ location = "there",
+ person = "Kenobi",
+ },
+ }};
+ )";
+ std::vector<ActionSuggestion> actions;
+ EXPECT_TRUE(LuaActionsSuggestions::CreateLuaActionsSuggestions(
+ test_snippet, conversation,
+ /*model_executor=*/nullptr,
+ /*model_spec=*/nullptr,
+ /*interpreter=*/nullptr,
+ /*actions_entity_data_schema=*/
+ flatbuffers::GetRoot<reflection::Schema>(test_schema.data()),
+ /*annotations_entity_data_schema=*/nullptr)
+ ->SuggestActions(&actions));
+ EXPECT_THAT(actions, testing::SizeIs(1));
+ EXPECT_EQ("test", actions.front().type);
+ const flatbuffers::Table* entity =
+ flatbuffers::GetAnyRoot(reinterpret_cast<const unsigned char*>(
+ actions.front().serialized_entity_data.data()));
+ EXPECT_EQ(entity->GetPointer<const flatbuffers::String*>(/*field=*/4)->str(),
+ "hello");
+ EXPECT_EQ(entity->GetPointer<const flatbuffers::String*>(/*field=*/6)->str(),
+ "there");
+ EXPECT_EQ(entity->GetPointer<const flatbuffers::String*>(/*field=*/8)->str(),
+ "Kenobi");
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/actions/lua-ranker_test.cc b/native/actions/lua-ranker_test.cc
new file mode 100644
index 0000000..a790042
--- /dev/null
+++ b/native/actions/lua-ranker_test.cc
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "actions/lua-ranker.h"
+
+#include <string>
+
+#include "actions/types.h"
+#include "utils/flatbuffers.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+MATCHER_P2(IsAction, type, response_text, "") {
+ return testing::Value(arg.type, type) &&
+ testing::Value(arg.response_text, response_text);
+}
+
+MATCHER_P(IsActionType, type, "") { return testing::Value(arg.type, type); }
+
+std::string TestEntitySchema() {
+ // Create fake entity data schema meta data.
+ // Cannot use object oriented API here as that is not available for the
+ // reflection schema.
+ flatbuffers::FlatBufferBuilder schema_builder;
+ std::vector<flatbuffers::Offset<reflection::Field>> fields = {
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("test"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::String),
+ /*id=*/0,
+ /*offset=*/4)};
+ std::vector<flatbuffers::Offset<reflection::Enum>> enums;
+ std::vector<flatbuffers::Offset<reflection::Object>> objects = {
+ reflection::CreateObject(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("EntityData"),
+ /*fields=*/
+ schema_builder.CreateVectorOfSortedTables(&fields))};
+ schema_builder.Finish(reflection::CreateSchema(
+ schema_builder, schema_builder.CreateVectorOfSortedTables(&objects),
+ schema_builder.CreateVectorOfSortedTables(&enums),
+ /*(unused) file_ident=*/0,
+ /*(unused) file_ext=*/0,
+ /*root_table*/ objects[0]));
+ return std::string(
+ reinterpret_cast<const char*>(schema_builder.GetBufferPointer()),
+ schema_builder.GetSize());
+}
+
+TEST(LuaRankingTest, PassThrough) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ local result = {}
+ for i=1,#actions do
+ table.insert(result, i)
+ end
+ return result
+ )";
+
+ EXPECT_TRUE(ActionsSuggestionsLuaRanker::Create(
+ conversation, test_snippet, /*entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr, &response)
+ ->RankActions());
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsActionType("text_reply"),
+ IsActionType("share_location"),
+ IsActionType("add_to_collection")}));
+}
+
+TEST(LuaRankingTest, Filtering) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ return {}
+ )";
+
+ EXPECT_TRUE(ActionsSuggestionsLuaRanker::Create(
+ conversation, test_snippet, /*entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr, &response)
+ ->RankActions());
+ EXPECT_THAT(response.actions, testing::IsEmpty());
+}
+
+TEST(LuaRankingTest, Duplication) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ local result = {}
+ for i=1,#actions do
+ table.insert(result, 1)
+ end
+ return result
+ )";
+
+ EXPECT_TRUE(ActionsSuggestionsLuaRanker::Create(
+ conversation, test_snippet, /*entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr, &response)
+ ->RankActions());
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsActionType("text_reply"),
+ IsActionType("text_reply"),
+ IsActionType("text_reply")}));
+}
+
+TEST(LuaRankingTest, SortByScore) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ function testScoreSorter(a, b)
+ return actions[a].score < actions[b].score
+ end
+ local result = {}
+ for i=1,#actions do
+ result[i] = i
+ end
+ table.sort(result, testScoreSorter)
+ return result
+ )";
+
+ EXPECT_TRUE(ActionsSuggestionsLuaRanker::Create(
+ conversation, test_snippet, /*entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr, &response)
+ ->RankActions());
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsActionType("add_to_collection"),
+ IsActionType("share_location"),
+ IsActionType("text_reply")}));
+}
+
+TEST(LuaRankingTest, SuppressType) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ local result = {}
+ for id, action in pairs(actions) do
+ if action.type ~= "text_reply" then
+ table.insert(result, id)
+ end
+ end
+ return result
+ )";
+
+ EXPECT_TRUE(ActionsSuggestionsLuaRanker::Create(
+ conversation, test_snippet, /*entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr, &response)
+ ->RankActions());
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsActionType("share_location"),
+ IsActionType("add_to_collection")}));
+}
+
+TEST(LuaRankingTest, HandlesConversation) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ local result = {}
+ if messages[1].text ~= "hello hello" then
+ return result
+ end
+ for id, action in pairs(actions) do
+ if action.type ~= "text_reply" then
+ table.insert(result, id)
+ end
+ end
+ return result
+ )";
+
+ EXPECT_TRUE(ActionsSuggestionsLuaRanker::Create(
+ conversation, test_snippet, /*entity_data_schema=*/nullptr,
+ /*annotations_entity_data_schema=*/nullptr, &response)
+ ->RankActions());
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsActionType("share_location"),
+ IsActionType("add_to_collection")}));
+}
+
+TEST(LuaRankingTest, HandlesEntityData) {
+ std::string serialized_schema = TestEntitySchema();
+ const reflection::Schema* entity_data_schema =
+ flatbuffers::GetRoot<reflection::Schema>(serialized_schema.data());
+
+ // Create test entity data.
+ ReflectiveFlatbufferBuilder builder(entity_data_schema);
+ std::unique_ptr<ReflectiveFlatbuffer> buffer = builder.NewRoot();
+ buffer->Set("test", "value_a");
+ const std::string serialized_entity_data_a = buffer->Serialize();
+ buffer->Set("test", "value_b");
+ const std::string serialized_entity_data_b = buffer->Serialize();
+
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"", /*type=*/"test",
+ /*score=*/1.0, /*priority_score=*/1.0, /*annotations=*/{},
+ /*serialized_entity_data=*/serialized_entity_data_a},
+ {/*response_text=*/"", /*type=*/"test",
+ /*score=*/1.0, /*priority_score=*/1.0, /*annotations=*/{},
+ /*serialized_entity_data=*/serialized_entity_data_b},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ local result = {}
+ for id, action in pairs(actions) do
+ if action.type == "test" and action.test == "value_a" then
+ table.insert(result, id)
+ end
+ end
+ return result
+ )";
+
+ EXPECT_TRUE(ActionsSuggestionsLuaRanker::Create(
+ conversation, test_snippet, entity_data_schema,
+ /*annotations_entity_data_schema=*/nullptr, &response)
+ ->RankActions());
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsActionType("test")}));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/actions/ranker_test.cc b/native/actions/ranker_test.cc
new file mode 100644
index 0000000..b52cf45
--- /dev/null
+++ b/native/actions/ranker_test.cc
@@ -0,0 +1,382 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "actions/ranker.h"
+
+#include <string>
+
+#include "actions/types.h"
+#include "utils/zlib/zlib.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+MATCHER_P3(IsAction, type, response_text, score, "") {
+ return testing::Value(arg.type, type) &&
+ testing::Value(arg.response_text, response_text) &&
+ testing::Value(arg.score, score);
+}
+
+MATCHER_P(IsActionType, type, "") { return testing::Value(arg.type, type); }
+
+TEST(RankingTest, DeduplicationSmartReply) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"hello there", /*type=*/"text_reply", /*score=*/0.5}};
+
+ RankingOptionsT options;
+ options.deduplicate_suggestions = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+ EXPECT_THAT(
+ response.actions,
+ testing::ElementsAreArray({IsAction("text_reply", "hello there", 1.0)}));
+}
+
+TEST(RankingTest, DeduplicationExtraData) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0, /*priority_score=*/0.0},
+ {/*response_text=*/"hello there", /*type=*/"text_reply", /*score=*/0.5,
+ /*priority_score=*/0.0},
+ {/*response_text=*/"hello there", /*type=*/"text_reply", /*score=*/0.6,
+ /*priority_score=*/0.0,
+ /*annotations=*/{}, /*serialized_entity_data=*/"test"},
+ };
+
+ RankingOptionsT options;
+ options.deduplicate_suggestions = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+ EXPECT_THAT(
+ response.actions,
+ testing::ElementsAreArray({IsAction("text_reply", "hello there", 1.0),
+ // Is kept as it has different entity data.
+ IsAction("text_reply", "hello there", 0.6)}));
+}
+
+TEST(RankingTest, DeduplicationAnnotations) {
+ const Conversation conversation = {
+ {{/*user_id=*/1, "742 Evergreen Terrace, the number is 1-800-TESTING"}}};
+ ActionsSuggestionsResponse response;
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{0, 21},
+ /*text=*/"742 Evergreen Terrace"};
+ annotation.entity = ClassificationResult("address", 0.5);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"view_map",
+ /*score=*/0.5,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ }
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{0, 21},
+ /*text=*/"742 Evergreen Terrace"};
+ annotation.entity = ClassificationResult("address", 1.0);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"view_map",
+ /*score=*/1.0,
+ /*priority_score=*/2.0,
+ /*annotations=*/{annotation}});
+ }
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{37, 50},
+ /*text=*/"1-800-TESTING"};
+ annotation.entity = ClassificationResult("phone", 0.5);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"call_phone",
+ /*score=*/0.5,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ }
+
+ RankingOptionsT options;
+ options.deduplicate_suggestions = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsAction("view_map", "", 1.0),
+ IsAction("call_phone", "", 0.5)}));
+}
+
+TEST(RankingTest, DeduplicationAnnotationsByPriorityScore) {
+ const Conversation conversation = {
+ {{/*user_id=*/1, "742 Evergreen Terrace, the number is 1-800-TESTING"}}};
+ ActionsSuggestionsResponse response;
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{0, 21},
+ /*text=*/"742 Evergreen Terrace"};
+ annotation.entity = ClassificationResult("address", 0.5);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"view_map",
+ /*score=*/0.6,
+ /*priority_score=*/2.0,
+ /*annotations=*/{annotation}});
+ }
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{0, 21},
+ /*text=*/"742 Evergreen Terrace"};
+ annotation.entity = ClassificationResult("address", 1.0);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"view_map",
+ /*score=*/1.0,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ }
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{37, 50},
+ /*text=*/"1-800-TESTING"};
+ annotation.entity = ClassificationResult("phone", 0.5);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"call_phone",
+ /*score=*/0.5,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ }
+
+ RankingOptionsT options;
+ options.deduplicate_suggestions = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+ EXPECT_THAT(
+ response.actions,
+ testing::ElementsAreArray(
+ {IsAction("view_map", "",
+ 0.6), // lower score wins, as priority score is higher
+ IsAction("call_phone", "", 0.5)}));
+}
+
+TEST(RankingTest, DeduplicatesConflictingActions) {
+ const Conversation conversation = {{{/*user_id=*/1, "code A-911"}}};
+ ActionsSuggestionsResponse response;
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{7, 10},
+ /*text=*/"911"};
+ annotation.entity = ClassificationResult("phone", 1.0);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"call_phone",
+ /*score=*/1.0,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ }
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{5, 10},
+ /*text=*/"A-911"};
+ annotation.entity = ClassificationResult("code", 1.0);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"copy_code",
+ /*score=*/1.0,
+ /*priority_score=*/2.0,
+ /*annotations=*/{annotation}});
+ }
+ RankingOptionsT options;
+ options.deduplicate_suggestions = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsAction("copy_code", "", 1.0)}));
+}
+
+TEST(RankingTest, HandlesCompressedLuaScript) {
+ const Conversation conversation = {{{/*user_id=*/1, "hello hello"}}};
+ ActionsSuggestionsResponse response;
+ response.actions = {
+ {/*response_text=*/"hello there", /*type=*/"text_reply",
+ /*score=*/1.0},
+ {/*response_text=*/"", /*type=*/"share_location", /*score=*/0.5},
+ {/*response_text=*/"", /*type=*/"add_to_collection", /*score=*/0.1}};
+ const std::string test_snippet = R"(
+ local result = {}
+ for id, action in pairs(actions) do
+ if action.type ~= "text_reply" then
+ table.insert(result, id)
+ end
+ end
+ return result
+ )";
+ RankingOptionsT options;
+ options.compressed_lua_ranking_script.reset(new CompressedBufferT);
+ std::unique_ptr<ZlibCompressor> compressor = ZlibCompressor::Instance();
+ compressor->Compress(test_snippet,
+ options.compressed_lua_ranking_script.get());
+ options.deduplicate_suggestions = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+
+ std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ decompressor.get(), /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsActionType("share_location"),
+ IsActionType("add_to_collection")}));
+}
+
+TEST(RankingTest, SuppressSmartRepliesWithAction) {
+ const Conversation conversation = {{{/*user_id=*/1, "should i call 911"}}};
+ ActionsSuggestionsResponse response;
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{5, 8},
+ /*text=*/"911"};
+ annotation.entity = ClassificationResult("phone", 1.0);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"call_phone",
+ /*score=*/1.0,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ }
+ response.actions.push_back({/*response_text=*/"How are you?",
+ /*type=*/"text_reply"});
+ RankingOptionsT options;
+ options.suppress_smart_replies_with_actions = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+
+ EXPECT_THAT(response.actions,
+ testing::ElementsAreArray({IsAction("call_phone", "", 1.0)}));
+}
+
+TEST(RankingTest, GroupsActionsByAnnotations) {
+ const Conversation conversation = {{{/*user_id=*/1, "should i call 911"}}};
+ ActionsSuggestionsResponse response;
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{5, 8},
+ /*text=*/"911"};
+ annotation.entity = ClassificationResult("phone", 1.0);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"call_phone",
+ /*score=*/1.0,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"add_contact",
+ /*score=*/0.0,
+ /*priority_score=*/0.0,
+ /*annotations=*/{annotation}});
+ }
+ response.actions.push_back({/*response_text=*/"How are you?",
+ /*type=*/"text_reply",
+ /*score=*/0.5});
+ RankingOptionsT options;
+ options.group_by_annotations = true;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+
+ // The text reply should be last, even though it has a higher score than the
+ // `add_contact` action.
+ EXPECT_THAT(
+ response.actions,
+ testing::ElementsAreArray({IsAction("call_phone", "", 1.0),
+ IsAction("add_contact", "", 0.0),
+ IsAction("text_reply", "How are you?", 0.5)}));
+}
+
+TEST(RankingTest, SortsActionsByScore) {
+ const Conversation conversation = {{{/*user_id=*/1, "should i call 911"}}};
+ ActionsSuggestionsResponse response;
+ {
+ ActionSuggestionAnnotation annotation;
+ annotation.span = {/*message_index=*/0, /*span=*/{5, 8},
+ /*text=*/"911"};
+ annotation.entity = ClassificationResult("phone", 1.0);
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"call_phone",
+ /*score=*/1.0,
+ /*priority_score=*/1.0,
+ /*annotations=*/{annotation}});
+ response.actions.push_back({/*response_text=*/"",
+ /*type=*/"add_contact",
+ /*score=*/0.0,
+ /*priority_score=*/0.0,
+ /*annotations=*/{annotation}});
+ }
+ response.actions.push_back({/*response_text=*/"How are you?",
+ /*type=*/"text_reply",
+ /*score=*/0.5});
+ RankingOptionsT options;
+ // Don't group by annotation.
+ options.group_by_annotations = false;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(RankingOptions::Pack(builder, &options));
+ auto ranker = ActionsSuggestionsRanker::CreateActionsSuggestionsRanker(
+ flatbuffers::GetRoot<RankingOptions>(builder.GetBufferPointer()),
+ /*decompressor=*/nullptr, /*smart_reply_action_type=*/"text_reply");
+
+ ranker->RankActions(conversation, &response);
+
+ EXPECT_THAT(
+ response.actions,
+ testing::ElementsAreArray({IsAction("call_phone", "", 1.0),
+ IsAction("text_reply", "How are you?", 0.5),
+ IsAction("add_contact", "", 0.0)}));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/actions/test-utils.cc b/native/actions/test-utils.cc
new file mode 100644
index 0000000..9b003dd
--- /dev/null
+++ b/native/actions/test-utils.cc
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "actions/test-utils.h"
+
+namespace libtextclassifier3 {
+
+std::string TestEntityDataSchema() {
+ // Create fake entity data schema meta data.
+ // Cannot use object oriented API here as that is not available for the
+ // reflection schema.
+ flatbuffers::FlatBufferBuilder schema_builder;
+ std::vector<flatbuffers::Offset<reflection::Field>> fields = {
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("greeting"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::String),
+ /*id=*/0,
+ /*offset=*/4),
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("location"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::String),
+ /*id=*/1,
+ /*offset=*/6),
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("person"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::String),
+ /*id=*/2,
+ /*offset=*/8)};
+ std::vector<flatbuffers::Offset<reflection::Enum>> enums;
+ std::vector<flatbuffers::Offset<reflection::Object>> objects = {
+ reflection::CreateObject(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("EntityData"),
+ /*fields=*/
+ schema_builder.CreateVectorOfSortedTables(&fields))};
+ schema_builder.Finish(reflection::CreateSchema(
+ schema_builder, schema_builder.CreateVectorOfSortedTables(&objects),
+ schema_builder.CreateVectorOfSortedTables(&enums),
+ /*(unused) file_ident=*/0,
+ /*(unused) file_ext=*/0,
+ /*root_table*/ objects[0]));
+
+ return std::string(
+ reinterpret_cast<const char*>(schema_builder.GetBufferPointer()),
+ schema_builder.GetSize());
+}
+
+void SetTestEntityDataSchema(ActionsModelT* test_model) {
+ const std::string serialized_schema = TestEntityDataSchema();
+
+ test_model->actions_entity_data_schema.assign(
+ serialized_schema.data(),
+ serialized_schema.data() + serialized_schema.size());
+}
+
+} // namespace libtextclassifier3
diff --git a/native/actions/test-utils.h b/native/actions/test-utils.h
new file mode 100644
index 0000000..c05d6a9
--- /dev/null
+++ b/native/actions/test-utils.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBTEXTCLASSIFIER_ACTIONS_TEST_UTILS_H_
+#define LIBTEXTCLASSIFIER_ACTIONS_TEST_UTILS_H_
+
+#include <string>
+
+#include "actions/actions_model_generated.h"
+#include "utils/flatbuffers.h"
+#include "gmock/gmock.h"
+
+namespace libtextclassifier3 {
+
+using testing::ExplainMatchResult;
+using testing::Value;
+
+// Create test entity data schema.
+std::string TestEntityDataSchema();
+void SetTestEntityDataSchema(ActionsModelT* test_model);
+
+MATCHER_P(IsActionOfType, type, "") { return Value(arg.type, type); }
+MATCHER_P(IsSmartReply, response_text, "") {
+ return ExplainMatchResult(IsActionOfType("text_reply"), arg,
+ result_listener) &&
+ Value(arg.response_text, response_text);
+}
+MATCHER_P(IsSpan, span, "") {
+ return Value(arg.first, span.first) && Value(arg.second, span.second);
+}
+MATCHER_P3(IsActionSuggestionAnnotation, name, text, span, "") {
+ return Value(arg.name, name) && Value(arg.span.text, text) &&
+ ExplainMatchResult(IsSpan(span), arg.span.span, result_listener);
+}
+
+} // namespace libtextclassifier3
+
+#endif // LIBTEXTCLASSIFIER_ACTIONS_TEST_UTILS_H_
diff --git a/native/actions/zlib-utils_test.cc b/native/actions/zlib-utils_test.cc
new file mode 100644
index 0000000..75e4c78
--- /dev/null
+++ b/native/actions/zlib-utils_test.cc
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "actions/zlib-utils.h"
+
+#include <memory>
+
+#include "actions/actions_model_generated.h"
+#include "utils/zlib/zlib.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using testing::ElementsAre;
+using testing::Field;
+using testing::Pointee;
+
+TEST(ActionsZlibUtilsTest, CompressModel) {
+ ActionsModelT model;
+ constexpr char kTestPattern1[] = "this is a test pattern";
+ constexpr char kTestPattern2[] = "this is a second test pattern";
+ constexpr char kTestOutputPattern[] = "this is an output pattern";
+ model.rules.reset(new RulesModelT);
+ model.rules->regex_rule.emplace_back(new RulesModel_::RegexRuleT);
+ model.rules->regex_rule.back()->pattern = kTestPattern1;
+ model.rules->regex_rule.emplace_back(new RulesModel_::RegexRuleT);
+ model.rules->regex_rule.back()->pattern = kTestPattern2;
+ model.rules->regex_rule.back()->output_pattern = kTestOutputPattern;
+
+ // Compress the model.
+ EXPECT_TRUE(CompressActionsModel(&model));
+
+ // Sanity check that uncompressed field is removed.
+ const auto is_empty_pattern =
+ Pointee(Field(&libtextclassifier3::RulesModel_::RegexRuleT::pattern,
+ testing::IsEmpty()));
+ EXPECT_THAT(model.rules->regex_rule,
+ ElementsAre(is_empty_pattern, is_empty_pattern));
+ // Pack and load the model.
+ flatbuffers::FlatBufferBuilder builder;
+ FinishActionsModelBuffer(builder, ActionsModel::Pack(builder, &model));
+ const ActionsModel* compressed_model = GetActionsModel(
+ reinterpret_cast<const char*>(builder.GetBufferPointer()));
+ ASSERT_TRUE(compressed_model != nullptr);
+
+ // Decompress the fields again and check that they match the original.
+ std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();
+ ASSERT_TRUE(decompressor != nullptr);
+ std::string uncompressed_pattern;
+ EXPECT_TRUE(decompressor->MaybeDecompress(
+ compressed_model->rules()->regex_rule()->Get(0)->compressed_pattern(),
+ &uncompressed_pattern));
+ EXPECT_EQ(uncompressed_pattern, kTestPattern1);
+ EXPECT_TRUE(decompressor->MaybeDecompress(
+ compressed_model->rules()->regex_rule()->Get(1)->compressed_pattern(),
+ &uncompressed_pattern));
+ EXPECT_EQ(uncompressed_pattern, kTestPattern2);
+ EXPECT_TRUE(DecompressActionsModel(&model));
+ EXPECT_EQ(model.rules->regex_rule[0]->pattern, kTestPattern1);
+ EXPECT_EQ(model.rules->regex_rule[1]->pattern, kTestPattern2);
+ EXPECT_EQ(model.rules->regex_rule[1]->output_pattern, kTestOutputPattern);
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/annotator/annotator.cc b/native/annotator/annotator.cc
index 93a3270..6ee983f 100644
--- a/native/annotator/annotator.cc
+++ b/native/annotator/annotator.cc
@@ -556,6 +556,11 @@
->do_conflict_resolution_in_raw_mode();
}
+#ifdef TC3_EXPERIMENTAL
+ TC3_LOG(WARNING) << "Enabling experimental annotators.";
+ InitializeExperimentalAnnotators();
+#endif
+
initialized_ = true;
}
@@ -694,7 +699,8 @@
bool Annotator::InitializeExperimentalAnnotators() {
if (ExperimentalAnnotator::IsEnabled()) {
- experimental_annotator_.reset(new ExperimentalAnnotator(*unilib_));
+ experimental_annotator_.reset(new ExperimentalAnnotator(
+ model_->experimental_model(), *selection_feature_processor_, *unilib_));
return true;
}
return false;
@@ -2490,13 +2496,22 @@
LoadAndVerifyMutableFlatbuffer<libtextclassifier3::EntityData>(
*serialized_entity_data);
if (data == nullptr) {
- TC3_LOG(ERROR)
- << "Data field is null when trying to parse Money Entity Data";
+ if (model_->version() >= 706) {
+ // This way of parsing money entity data is enabled for models newer than
+ // v706, consequently logging errors only for them (b/156634162).
+ TC3_LOG(ERROR)
+ << "Data field is null when trying to parse Money Entity Data";
+ }
return false;
}
if (data->money->unnormalized_amount.empty()) {
- TC3_LOG(ERROR) << "Data unnormalized_amount is empty when trying to parse "
- "Money Entity Data";
+ if (model_->version() >= 706) {
+ // This way of parsing money entity data is enabled for models newer than
+ // v706, consequently logging errors only for them (b/156634162).
+ TC3_LOG(ERROR)
+ << "Data unnormalized_amount is empty when trying to parse "
+ "Money Entity Data";
+ }
return false;
}
@@ -2587,7 +2602,11 @@
if (regex_pattern.config->collection_name()->str() ==
Collections::Money()) {
if (!ParseAndFillInMoneyAmount(&serialized_entity_data)) {
- TC3_LOG(ERROR) << "Could not parse and fill in money amount.";
+ if (model_->version() >= 706) {
+ // This way of parsing money entity data is enabled for models
+ // newer than v706 => logging errors only for them (b/156634162).
+ TC3_LOG(ERROR) << "Could not parse and fill in money amount.";
+ }
}
}
}
diff --git a/native/annotator/annotator_jni_test.cc b/native/annotator/annotator_jni_test.cc
new file mode 100644
index 0000000..929fb59
--- /dev/null
+++ b/native/annotator/annotator_jni_test.cc
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/annotator_jni.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(Annotator, ConvertIndicesBMPUTF8) {
+ // Test boundary cases.
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("hello", {0, 5}), std::make_pair(0, 5));
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("hello", {0, 5}), std::make_pair(0, 5));
+
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("hello world", {0, 5}),
+ std::make_pair(0, 5));
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("hello world", {0, 5}),
+ std::make_pair(0, 5));
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("😁ello world", {0, 6}),
+ std::make_pair(0, 5));
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁ello world", {0, 5}),
+ std::make_pair(0, 6));
+
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("hello world", {6, 11}),
+ std::make_pair(6, 11));
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("hello world", {6, 11}),
+ std::make_pair(6, 11));
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("hello worl😁", {6, 12}),
+ std::make_pair(6, 11));
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("hello worl😁", {6, 11}),
+ std::make_pair(6, 12));
+
+ // Simple example where the longer character is before the selection.
+ // character 😁 is 0x1f601
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hello World.", {3, 8}),
+ std::make_pair(2, 7));
+
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hello World.", {2, 7}),
+ std::make_pair(3, 8));
+
+ // Longer character is before and in selection.
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hell😁 World.", {3, 9}),
+ std::make_pair(2, 7));
+
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hell😁 World.", {2, 7}),
+ std::make_pair(3, 9));
+
+ // Longer character is before and after selection.
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hello😁World.", {3, 8}),
+ std::make_pair(2, 7));
+
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hello😁World.", {2, 7}),
+ std::make_pair(3, 8));
+
+ // Longer character is before in after selection.
+ EXPECT_EQ(ConvertIndicesBMPToUTF8("😁 Hell😁😁World.", {3, 9}),
+ std::make_pair(2, 7));
+
+ EXPECT_EQ(ConvertIndicesUTF8ToBMP("😁 Hell😁😁World.", {2, 7}),
+ std::make_pair(3, 9));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/annotator/cached-features_test.cc b/native/annotator/cached-features_test.cc
new file mode 100644
index 0000000..702f3ca
--- /dev/null
+++ b/native/annotator/cached-features_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/cached-features.h"
+
+#include "annotator/model-executor.h"
+#include "utils/tensor-view.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using testing::ElementsAreArray;
+using testing::FloatEq;
+using testing::Matcher;
+
+namespace libtextclassifier3 {
+namespace {
+
+Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
+ std::vector<Matcher<float>> matchers;
+ for (const float value : values) {
+ matchers.push_back(FloatEq(value));
+ }
+ return ElementsAreArray(matchers);
+}
+
+std::unique_ptr<std::vector<float>> MakeFeatures(int num_tokens) {
+ std::unique_ptr<std::vector<float>> features(new std::vector<float>());
+ for (int i = 1; i <= num_tokens; ++i) {
+ features->push_back(i * 11.0f);
+ features->push_back(-i * 11.0f);
+ features->push_back(i * 0.1f);
+ }
+ return features;
+}
+
+std::vector<float> GetCachedClickContextFeatures(
+ const CachedFeatures& cached_features, int click_pos) {
+ std::vector<float> output_features;
+ cached_features.AppendClickContextFeaturesForClick(click_pos,
+ &output_features);
+ return output_features;
+}
+
+std::vector<float> GetCachedBoundsSensitiveFeatures(
+ const CachedFeatures& cached_features, TokenSpan selected_span) {
+ std::vector<float> output_features;
+ cached_features.AppendBoundsSensitiveFeaturesForSpan(selected_span,
+ &output_features);
+ return output_features;
+}
+
+TEST(CachedFeaturesTest, ClickContext) {
+ FeatureProcessorOptionsT options;
+ options.context_size = 2;
+ options.feature_version = 1;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(CreateFeatureProcessorOptions(builder, &options));
+ flatbuffers::DetachedBuffer options_fb = builder.Release();
+
+ std::unique_ptr<std::vector<float>> features = MakeFeatures(9);
+ std::unique_ptr<std::vector<float>> padding_features(
+ new std::vector<float>{112233.0, -112233.0, 321.0});
+
+ const std::unique_ptr<CachedFeatures> cached_features =
+ CachedFeatures::Create(
+ {3, 10}, std::move(features), std::move(padding_features),
+ flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
+ /*feature_vector_size=*/3);
+ ASSERT_TRUE(cached_features);
+
+ EXPECT_THAT(GetCachedClickContextFeatures(*cached_features, 5),
+ ElementsAreFloat({11.0, -11.0, 0.1, 22.0, -22.0, 0.2, 33.0, -33.0,
+ 0.3, 44.0, -44.0, 0.4, 55.0, -55.0, 0.5}));
+
+ EXPECT_THAT(GetCachedClickContextFeatures(*cached_features, 6),
+ ElementsAreFloat({22.0, -22.0, 0.2, 33.0, -33.0, 0.3, 44.0, -44.0,
+ 0.4, 55.0, -55.0, 0.5, 66.0, -66.0, 0.6}));
+
+ EXPECT_THAT(GetCachedClickContextFeatures(*cached_features, 7),
+ ElementsAreFloat({33.0, -33.0, 0.3, 44.0, -44.0, 0.4, 55.0, -55.0,
+ 0.5, 66.0, -66.0, 0.6, 77.0, -77.0, 0.7}));
+}
+
+TEST(CachedFeaturesTest, BoundsSensitive) {
+ std::unique_ptr<FeatureProcessorOptions_::BoundsSensitiveFeaturesT> config(
+ new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
+ config->enabled = true;
+ config->num_tokens_before = 2;
+ config->num_tokens_inside_left = 2;
+ config->num_tokens_inside_right = 2;
+ config->num_tokens_after = 2;
+ config->include_inside_bag = true;
+ config->include_inside_length = true;
+ FeatureProcessorOptionsT options;
+ options.bounds_sensitive_features = std::move(config);
+ options.feature_version = 2;
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(CreateFeatureProcessorOptions(builder, &options));
+ flatbuffers::DetachedBuffer options_fb = builder.Release();
+
+ std::unique_ptr<std::vector<float>> features = MakeFeatures(9);
+ std::unique_ptr<std::vector<float>> padding_features(
+ new std::vector<float>{112233.0, -112233.0, 321.0});
+
+ const std::unique_ptr<CachedFeatures> cached_features =
+ CachedFeatures::Create(
+ {3, 9}, std::move(features), std::move(padding_features),
+ flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
+ /*feature_vector_size=*/3);
+ ASSERT_TRUE(cached_features);
+
+ EXPECT_THAT(
+ GetCachedBoundsSensitiveFeatures(*cached_features, {5, 8}),
+ ElementsAreFloat({11.0, -11.0, 0.1, 22.0, -22.0, 0.2, 33.0,
+ -33.0, 0.3, 44.0, -44.0, 0.4, 44.0, -44.0,
+ 0.4, 55.0, -55.0, 0.5, 66.0, -66.0, 0.6,
+ 112233.0, -112233.0, 321.0, 44.0, -44.0, 0.4, 3.0}));
+
+ EXPECT_THAT(
+ GetCachedBoundsSensitiveFeatures(*cached_features, {5, 7}),
+ ElementsAreFloat({11.0, -11.0, 0.1, 22.0, -22.0, 0.2, 33.0,
+ -33.0, 0.3, 44.0, -44.0, 0.4, 33.0, -33.0,
+ 0.3, 44.0, -44.0, 0.4, 55.0, -55.0, 0.5,
+ 66.0, -66.0, 0.6, 38.5, -38.5, 0.35, 2.0}));
+
+ EXPECT_THAT(
+ GetCachedBoundsSensitiveFeatures(*cached_features, {6, 8}),
+ ElementsAreFloat({22.0, -22.0, 0.2, 33.0, -33.0, 0.3, 44.0,
+ -44.0, 0.4, 55.0, -55.0, 0.5, 44.0, -44.0,
+ 0.4, 55.0, -55.0, 0.5, 66.0, -66.0, 0.6,
+ 112233.0, -112233.0, 321.0, 49.5, -49.5, 0.45, 2.0}));
+
+ EXPECT_THAT(
+ GetCachedBoundsSensitiveFeatures(*cached_features, {6, 7}),
+ ElementsAreFloat({22.0, -22.0, 0.2, 33.0, -33.0, 0.3,
+ 44.0, -44.0, 0.4, 112233.0, -112233.0, 321.0,
+ 112233.0, -112233.0, 321.0, 44.0, -44.0, 0.4,
+ 55.0, -55.0, 0.5, 66.0, -66.0, 0.6,
+ 44.0, -44.0, 0.4, 1.0}));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/annotator/datetime/extractor.cc b/native/annotator/datetime/extractor.cc
index ebcf091..b8e1b7a 100644
--- a/native/annotator/datetime/extractor.cc
+++ b/native/annotator/datetime/extractor.cc
@@ -473,6 +473,7 @@
{DatetimeExtractorType_NEXT, 1},
{DatetimeExtractorType_NEXT_OR_SAME, 1},
{DatetimeExtractorType_LAST, -1},
+ {DatetimeExtractorType_PAST, -1},
},
relative_count);
}
diff --git a/native/annotator/duration/duration_test.cc b/native/annotator/duration/duration_test.cc
new file mode 100644
index 0000000..a0985a2
--- /dev/null
+++ b/native/annotator/duration/duration_test.cc
@@ -0,0 +1,567 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/duration/duration.h"
+
+#include <string>
+#include <vector>
+
+#include "annotator/collections.h"
+#include "annotator/model_generated.h"
+#include "annotator/types-test-util.h"
+#include "annotator/types.h"
+#include "utils/test-utils.h"
+#include "utils/utf8/unicodetext.h"
+#include "utils/utf8/unilib.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::Field;
+using testing::IsEmpty;
+
+const DurationAnnotatorOptions* TestingDurationAnnotatorOptions() {
+ static const flatbuffers::DetachedBuffer* options_data = []() {
+ DurationAnnotatorOptionsT options;
+ options.enabled = true;
+
+ options.week_expressions.push_back("week");
+ options.week_expressions.push_back("weeks");
+
+ options.day_expressions.push_back("day");
+ options.day_expressions.push_back("days");
+
+ options.hour_expressions.push_back("hour");
+ options.hour_expressions.push_back("hours");
+
+ options.minute_expressions.push_back("minute");
+ options.minute_expressions.push_back("minutes");
+
+ options.second_expressions.push_back("second");
+ options.second_expressions.push_back("seconds");
+
+ options.filler_expressions.push_back("and");
+ options.filler_expressions.push_back("a");
+ options.filler_expressions.push_back("an");
+ options.filler_expressions.push_back("one");
+
+ options.half_expressions.push_back("half");
+
+ options.sub_token_separator_codepoints.push_back('-');
+
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(DurationAnnotatorOptions::Pack(builder, &options));
+ return new flatbuffers::DetachedBuffer(builder.Release());
+ }();
+
+ return flatbuffers::GetRoot<DurationAnnotatorOptions>(options_data->data());
+}
+
+std::unique_ptr<FeatureProcessor> BuildFeatureProcessor(const UniLib* unilib) {
+ static const flatbuffers::DetachedBuffer* options_data = []() {
+ FeatureProcessorOptionsT options;
+ options.context_size = 1;
+ options.max_selection_span = 1;
+ options.snap_label_span_boundaries_to_containing_tokens = false;
+ options.ignored_span_boundary_codepoints.push_back(',');
+
+ options.tokenization_codepoint_config.emplace_back(
+ new TokenizationCodepointRangeT());
+ auto& config = options.tokenization_codepoint_config.back();
+ config->start = 32;
+ config->end = 33;
+ config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
+
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(FeatureProcessorOptions::Pack(builder, &options));
+ return new flatbuffers::DetachedBuffer(builder.Release());
+ }();
+
+ const FeatureProcessorOptions* feature_processor_options =
+ flatbuffers::GetRoot<FeatureProcessorOptions>(options_data->data());
+
+ return std::unique_ptr<FeatureProcessor>(
+ new FeatureProcessor(feature_processor_options, unilib));
+}
+
+class DurationAnnotatorTest : public ::testing::Test {
+ protected:
+ DurationAnnotatorTest()
+ : INIT_UNILIB_FOR_TESTING(unilib_),
+ feature_processor_(BuildFeatureProcessor(&unilib_)),
+ duration_annotator_(TestingDurationAnnotatorOptions(),
+ feature_processor_.get(), &unilib_) {}
+
+ std::vector<Token> Tokenize(const UnicodeText& text) {
+ return feature_processor_->Tokenize(text);
+ }
+
+ UniLib unilib_;
+ std::unique_ptr<FeatureProcessor> feature_processor_;
+ DurationAnnotator duration_annotator_;
+};
+
+TEST_F(DurationAnnotatorTest, ClassifiesSimpleDuration) {
+ ClassificationResult classification;
+ EXPECT_TRUE(duration_annotator_.ClassifyText(
+ UTF8ToUnicodeText("Wake me up in 15 minutes ok?"), {14, 24},
+ AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
+
+ EXPECT_THAT(classification,
+ AllOf(Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms, 15 * 60 * 1000)));
+}
+
+TEST_F(DurationAnnotatorTest, ClassifiesWhenTokensDontAlignWithSelection) {
+ ClassificationResult classification;
+ EXPECT_TRUE(duration_annotator_.ClassifyText(
+ UTF8ToUnicodeText("Wake me up in15 minutesok?"), {13, 23},
+ AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
+
+ EXPECT_THAT(classification,
+ AllOf(Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms, 15 * 60 * 1000)));
+}
+
+TEST_F(DurationAnnotatorTest, DoNotClassifyWhenInputIsInvalid) {
+ ClassificationResult classification;
+ EXPECT_FALSE(duration_annotator_.ClassifyText(
+ UTF8ToUnicodeText("Weird space"), {5, 6},
+ AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
+}
+
+TEST_F(DurationAnnotatorTest, FindsSimpleDuration) {
+ const UnicodeText text = UTF8ToUnicodeText("Wake me up in 15 minutes ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(14, 24)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 15 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, FindsDurationWithHalfExpression) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Set a timer for 3 and half minutes ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 34)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 3.5 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, FindsComposedDuration) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Wake me up in 3 hours and 5 seconds ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(14, 35)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 3 * 60 * 60 * 1000 + 5 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, AllUnitsAreCovered) {
+ const UnicodeText text = UTF8ToUnicodeText(
+ "See you in a week and a day and an hour and a minute and a second");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(13, 65)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 7 * 24 * 60 * 60 * 1000 + 24 * 60 * 60 * 1000 +
+ 60 * 60 * 1000 + 60 * 1000 + 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, FindsHalfAnHour) {
+ const UnicodeText text = UTF8ToUnicodeText("Set a timer for half an hour");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 28)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 0.5 * 60 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, FindsWhenHalfIsAfterGranularitySpecification) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Set a timer for 1 hour and a half");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 33)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 1.5 * 60 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, FindsAnHourAndAHalf) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Set a timer for an hour and a half");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(19, 34)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 1.5 * 60 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest,
+ FindsCorrectlyWhenSecondsComeSecondAndDontHaveNumber) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Set a timer for 10 minutes and a second ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 39)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 10 * 60 * 1000 + 1 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, DoesNotGreedilyTakeFillerWords) {
+ const UnicodeText text = UTF8ToUnicodeText(
+ "Set a timer for a a a 10 minutes and 2 seconds an and an ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(22, 46)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 10 * 60 * 1000 + 2 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, DoesNotCrashWhenJustHalfIsSaid) {
+ const UnicodeText text = UTF8ToUnicodeText("Set a timer for half ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ ASSERT_EQ(result.size(), 0);
+}
+
+TEST_F(DurationAnnotatorTest, StripsPunctuationFromTokens) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Set a timer for 10 ,minutes, ,and, ,2, seconds, ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 46)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 10 * 60 * 1000 + 2 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, FindsCorrectlyWithCombinedQuantityUnitToken) {
+ const UnicodeText text = UTF8ToUnicodeText("Show 5-minute timer.");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(5, 13)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 5 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest,
+ DoesNotIntOverflowWithDurationThatHasMoreThanInt32Millis) {
+ ClassificationResult classification;
+ EXPECT_TRUE(duration_annotator_.ClassifyText(
+ UTF8ToUnicodeText("1400 hours"), {0, 10},
+ AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification));
+
+ EXPECT_THAT(classification,
+ AllOf(Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 1400LL * 60LL * 60LL * 1000LL)));
+}
+
+TEST_F(DurationAnnotatorTest, FindsSimpleDurationIgnoringCase) {
+ const UnicodeText text = UTF8ToUnicodeText("Wake me up in 15 MiNuTeS ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(14, 24)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 15 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, FindsDurationWithHalfExpressionIgnoringCase) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Set a timer for 3 and HaLf minutes ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 34)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 3.5 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest,
+ FindsDurationWithHalfExpressionIgnoringFillerWordCase) {
+ const UnicodeText text =
+ UTF8ToUnicodeText("Set a timer for 3 AnD half minutes ok?");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 34)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 3.5 * 60 * 1000)))))));
+}
+
+TEST_F(DurationAnnotatorTest, CorrectlyAnnotatesSpanWithDanglingQuantity) {
+ const UnicodeText text = UTF8ToUnicodeText("20 minutes 10");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ // TODO(b/144752747) Include test for duration_ms.
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 13)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(Field(&ClassificationResult::collection,
+ "duration")))))));
+}
+
+const DurationAnnotatorOptions* TestingJapaneseDurationAnnotatorOptions() {
+ static const flatbuffers::DetachedBuffer* options_data = []() {
+ DurationAnnotatorOptionsT options;
+ options.enabled = true;
+
+ options.week_expressions.push_back("週間");
+
+ options.day_expressions.push_back("日間");
+
+ options.hour_expressions.push_back("時間");
+
+ options.minute_expressions.push_back("分");
+ options.minute_expressions.push_back("分間");
+
+ options.second_expressions.push_back("秒");
+ options.second_expressions.push_back("秒間");
+
+ options.half_expressions.push_back("半");
+
+ options.require_quantity = true;
+ options.enable_dangling_quantity_interpretation = false;
+
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(DurationAnnotatorOptions::Pack(builder, &options));
+ return new flatbuffers::DetachedBuffer(builder.Release());
+ }();
+
+ return flatbuffers::GetRoot<DurationAnnotatorOptions>(options_data->data());
+}
+
+class JapaneseDurationAnnotatorTest : public ::testing::Test {
+ protected:
+ JapaneseDurationAnnotatorTest()
+ : INIT_UNILIB_FOR_TESTING(unilib_),
+ feature_processor_(BuildFeatureProcessor(&unilib_)),
+ duration_annotator_(TestingJapaneseDurationAnnotatorOptions(),
+ feature_processor_.get(), &unilib_) {}
+
+ std::vector<Token> Tokenize(const UnicodeText& text) {
+ return feature_processor_->Tokenize(text);
+ }
+
+ UniLib unilib_;
+ std::unique_ptr<FeatureProcessor> feature_processor_;
+ DurationAnnotator duration_annotator_;
+};
+
+TEST_F(JapaneseDurationAnnotatorTest, FindsDuration) {
+ const UnicodeText text = UTF8ToUnicodeText("10 分 の アラーム");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 4)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 10 * 60 * 1000)))))));
+}
+
+TEST_F(JapaneseDurationAnnotatorTest, FindsDurationWithHalfExpression) {
+ const UnicodeText text = UTF8ToUnicodeText("2 分 半 の アラーム");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 5)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 2.5 * 60 * 1000)))))));
+}
+
+TEST_F(JapaneseDurationAnnotatorTest, IgnoresDurationWithoutQuantity) {
+ const UnicodeText text = UTF8ToUnicodeText("分 の アラーム");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(result, IsEmpty());
+}
+
+TEST_F(JapaneseDurationAnnotatorTest, IgnoresDanglingQuantity) {
+ const UnicodeText text = UTF8ToUnicodeText("2 分 10 の アラーム");
+ std::vector<Token> tokens = Tokenize(text);
+ std::vector<AnnotatedSpan> result;
+ EXPECT_TRUE(duration_annotator_.FindAll(
+ text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
+
+ EXPECT_THAT(
+ result,
+ ElementsAre(
+ AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 3)),
+ Field(&AnnotatedSpan::classification,
+ ElementsAre(AllOf(
+ Field(&ClassificationResult::collection, "duration"),
+ Field(&ClassificationResult::duration_ms,
+ 2 * 60 * 1000)))))));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/annotator/experimental/experimental-dummy.h b/native/annotator/experimental/experimental-dummy.h
index d245387..389aae1 100644
--- a/native/annotator/experimental/experimental-dummy.h
+++ b/native/annotator/experimental/experimental-dummy.h
@@ -20,6 +20,7 @@
#include <string>
#include <vector>
+#include "annotator/feature-processor.h"
#include "annotator/types.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
@@ -32,7 +33,9 @@
// always disabled;
static constexpr bool IsEnabled() { return false; }
- explicit ExperimentalAnnotator(const UniLib& unilib) {}
+ explicit ExperimentalAnnotator(const ExperimentalModel* model,
+ const FeatureProcessor& feature_processor,
+ const UniLib& unilib) {}
bool Annotate(const UnicodeText& context,
std::vector<AnnotatedSpan>* candidates) const {
diff --git a/native/annotator/experimental/experimental.fbs b/native/annotator/experimental/experimental.fbs
new file mode 100755
index 0000000..6e15d04
--- /dev/null
+++ b/native/annotator/experimental/experimental.fbs
@@ -0,0 +1,20 @@
+//
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+namespace libtextclassifier3;
+table ExperimentalModel {
+}
+
diff --git a/native/annotator/feature-processor.cc b/native/annotator/feature-processor.cc
index 1d3b8f5..8d08574 100644
--- a/native/annotator/feature-processor.cc
+++ b/native/annotator/feature-processor.cc
@@ -661,6 +661,10 @@
++num_total;
}
}
+ // Avoid division by zero.
+ if (num_total == 0) {
+ return 0.0;
+ }
return static_cast<float>(num_supported) / static_cast<float>(num_total);
}
diff --git a/native/annotator/flatbuffer-utils.cc b/native/annotator/flatbuffer-utils.cc
deleted file mode 100644
index d83d2bb..0000000
--- a/native/annotator/flatbuffer-utils.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "annotator/flatbuffer-utils.h"
-
-#include <memory>
-
-#include "utils/base/logging.h"
-#include "utils/flatbuffers.h"
-#include "flatbuffers/reflection.h"
-
-namespace libtextclassifier3 {
-
-bool SwapFieldNamesForOffsetsInPath(ModelT* model) {
- if (model->regex_model == nullptr || model->entity_data_schema.empty()) {
- // Nothing to do.
- return true;
- }
- const reflection::Schema* schema =
- LoadAndVerifyFlatbuffer<reflection::Schema>(
- model->entity_data_schema.data(), model->entity_data_schema.size());
-
- for (std::unique_ptr<RegexModel_::PatternT>& pattern :
- model->regex_model->patterns) {
- for (std::unique_ptr<CapturingGroupT>& group : pattern->capturing_group) {
- if (group->entity_field_path == nullptr) {
- continue;
- }
-
- if (!SwapFieldNamesForOffsetsInPath(schema,
- group->entity_field_path.get())) {
- return false;
- }
- }
- }
-
- return true;
-}
-
-std::string SwapFieldNamesForOffsetsInPathInSerializedModel(
- const std::string& model) {
- std::unique_ptr<ModelT> unpacked_model = UnPackModel(model.c_str());
- TC3_CHECK(unpacked_model != nullptr);
- TC3_CHECK(SwapFieldNamesForOffsetsInPath(unpacked_model.get()));
- flatbuffers::FlatBufferBuilder builder;
- FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
- return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
- builder.GetSize());
-}
-
-} // namespace libtextclassifier3
diff --git a/native/annotator/flatbuffer-utils.h b/native/annotator/flatbuffer-utils.h
deleted file mode 100644
index a7e5d64..0000000
--- a/native/annotator/flatbuffer-utils.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Utility functions for working with FlatBuffers in the annotator model.
-
-#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_FLATBUFFER_UTILS_H_
-#define LIBTEXTCLASSIFIER_ANNOTATOR_FLATBUFFER_UTILS_H_
-
-#include <string>
-
-#include "annotator/model_generated.h"
-
-namespace libtextclassifier3 {
-
-// Resolves field lookups by name to the concrete field offsets in the regex
-// rules of the model.
-bool SwapFieldNamesForOffsetsInPath(ModelT* model);
-
-// Same as above but for a serialized model.
-std::string SwapFieldNamesForOffsetsInPathInSerializedModel(
- const std::string& model);
-
-} // namespace libtextclassifier3
-
-#endif // LIBTEXTCLASSIFIER_ANNOTATOR_FLATBUFFER_UTILS_H_
diff --git a/native/annotator/grammar/dates/annotations/annotation-util_test.cc b/native/annotator/grammar/dates/annotations/annotation-util_test.cc
new file mode 100644
index 0000000..6d25d64
--- /dev/null
+++ b/native/annotator/grammar/dates/annotations/annotation-util_test.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/grammar/dates/annotations/annotation-util.h"
+
+#include "annotator/grammar/dates/annotations/annotation.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(AnnotationUtilTest, VerifyIntFunctions) {
+ Annotation annotation;
+
+ int index_key1 = AddIntProperty("key1", 1, &annotation);
+ int index_key2 = AddIntProperty("key2", 2, &annotation);
+
+ static const int kValuesKey3[] = {3, 4, 5};
+ int index_key3 =
+ AddRepeatedIntProperty("key3", kValuesKey3, /*size=*/3, &annotation);
+
+ EXPECT_EQ(2, GetIntProperty("key2", annotation));
+ EXPECT_EQ(1, GetIntProperty("key1", annotation));
+
+ EXPECT_EQ(index_key1, GetPropertyIndex("key1", annotation));
+ EXPECT_EQ(index_key2, GetPropertyIndex("key2", annotation));
+ EXPECT_EQ(index_key3, GetPropertyIndex("key3", annotation));
+ EXPECT_EQ(-1, GetPropertyIndex("invalid_key", annotation));
+}
+
+TEST(AnnotationUtilTest, VerifyAnnotationDataFunctions) {
+ Annotation annotation;
+
+ AnnotationData true_annotation_data;
+ Property true_property;
+ true_property.bool_values.push_back(true);
+ true_annotation_data.properties.push_back(true_property);
+ int index_key1 =
+ AddAnnotationDataProperty("key1", true_annotation_data, &annotation);
+
+ AnnotationData false_annotation_data;
+ Property false_property;
+ false_property.bool_values.push_back(false);
+  false_annotation_data.properties.push_back(false_property);
+ int index_key2 =
+ AddAnnotationDataProperty("key2", false_annotation_data, &annotation);
+
+ EXPECT_EQ(index_key1, GetPropertyIndex("key1", annotation));
+ EXPECT_EQ(index_key2, GetPropertyIndex("key2", annotation));
+ EXPECT_EQ(-1, GetPropertyIndex("invalid_key", annotation));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/annotator/grammar/dates/timezone-code.fbs b/native/annotator/grammar/dates/timezone-code.fbs
index ae74982..ff615ee 100755
--- a/native/annotator/grammar/dates/timezone-code.fbs
+++ b/native/annotator/grammar/dates/timezone-code.fbs
@@ -17,9 +17,7 @@
namespace libtextclassifier3.dates;
enum TimezoneCode : int {
TIMEZONE_CODE_NONE = -1,
-
ETC_UNKNOWN = 0,
-
PST8PDT = 1,
// Delegate.
diff --git a/native/annotator/grammar/dates/utils/date-match.cc b/native/annotator/grammar/dates/utils/date-match.cc
index 1ab1e6a..d9fca52 100644
--- a/native/annotator/grammar/dates/utils/date-match.cc
+++ b/native/annotator/grammar/dates/utils/date-match.cc
@@ -225,6 +225,18 @@
return DatetimeComponent::RelativeQualifier::UNSPECIFIED;
}
+// Embed RelativeQualifier information of DatetimeComponent as a sign of
+// relative counter field of datetime component i.e. relative counter is
+// negative when relative qualifier RelativeQualifier::PAST.
+int GetAdjustedRelativeCounter(
+ const DatetimeComponent::RelativeQualifier& relative_qualifier,
+ const int relative_counter) {
+ if (DatetimeComponent::RelativeQualifier::PAST == relative_qualifier) {
+ return -relative_counter;
+ }
+ return relative_counter;
+}
+
Optional<DatetimeComponent> CreateDatetimeComponent(
const DatetimeComponent::ComponentType& component_type,
const DatetimeComponent::RelativeQualifier& relative_qualifier,
@@ -232,13 +244,15 @@
if (absolute_value == NO_VAL && relative_value == NO_VAL) {
return Optional<DatetimeComponent>();
}
- return Optional<DatetimeComponent>(
- DatetimeComponent(component_type,
- (relative_value != NO_VAL)
- ? relative_qualifier
- : DatetimeComponent::RelativeQualifier::UNSPECIFIED,
- (absolute_value != NO_VAL) ? absolute_value : 0,
- (relative_value != NO_VAL) ? relative_value : 0));
+ return Optional<DatetimeComponent>(DatetimeComponent(
+ component_type,
+ (relative_value != NO_VAL)
+ ? relative_qualifier
+ : DatetimeComponent::RelativeQualifier::UNSPECIFIED,
+ (absolute_value != NO_VAL) ? absolute_value : 0,
+ (relative_value != NO_VAL)
+ ? GetAdjustedRelativeCounter(relative_qualifier, relative_value)
+ : 0));
}
Optional<DatetimeComponent> CreateDayOfWeekComponent(
diff --git a/native/annotator/grammar/dates/utils/date-match_test.cc b/native/annotator/grammar/dates/utils/date-match_test.cc
new file mode 100644
index 0000000..f10f32a
--- /dev/null
+++ b/native/annotator/grammar/dates/utils/date-match_test.cc
@@ -0,0 +1,397 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/grammar/dates/utils/date-match.h"
+
+#include <stdint.h>
+
+#include <string>
+
+#include "annotator/grammar/dates/dates_generated.h"
+#include "annotator/grammar/dates/timezone-code_generated.h"
+#include "annotator/grammar/dates/utils/date-utils.h"
+#include "utils/strings/append.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace dates {
+namespace {
+
+class DateMatchTest : public ::testing::Test {
+ protected:
+ enum {
+ X = NO_VAL,
+ };
+
+ static DayOfWeek DOW_X() { return DayOfWeek_DOW_NONE; }
+ static DayOfWeek SUN() { return DayOfWeek_SUNDAY; }
+
+ static BCAD BCAD_X() { return BCAD_BCAD_NONE; }
+ static BCAD BC() { return BCAD_BC; }
+
+ DateMatch& SetDate(DateMatch* date, int year, int8 month, int8 day,
+ DayOfWeek day_of_week = DOW_X(), BCAD bc_ad = BCAD_X()) {
+ date->year = year;
+ date->month = month;
+ date->day = day;
+ date->day_of_week = day_of_week;
+ date->bc_ad = bc_ad;
+ return *date;
+ }
+
+ DateMatch& SetTimeValue(DateMatch* date, int8 hour, int8 minute = X,
+ int8 second = X, double fraction_second = X) {
+ date->hour = hour;
+ date->minute = minute;
+ date->second = second;
+ date->fraction_second = fraction_second;
+ return *date;
+ }
+
+ DateMatch& SetTimeSpan(DateMatch* date, TimespanCode time_span_code) {
+ date->time_span_code = time_span_code;
+ return *date;
+ }
+
+ DateMatch& SetTimeZone(DateMatch* date, TimezoneCode time_zone_code,
+ int16 time_zone_offset = INT16_MIN) {
+ date->time_zone_code = time_zone_code;
+ date->time_zone_offset = time_zone_offset;
+ return *date;
+ }
+
+ bool SameDate(const DateMatch& a, const DateMatch& b) {
+ return (a.day == b.day && a.month == b.month && a.year == b.year &&
+ a.day_of_week == b.day_of_week);
+ }
+
+ DateMatch& SetDayOfWeek(DateMatch* date, DayOfWeek dow) {
+ date->day_of_week = dow;
+ return *date;
+ }
+};
+
+TEST_F(DateMatchTest, BitFieldWidth) {
+ // For DateMatch::day_of_week (:8).
+ EXPECT_GE(DayOfWeek_MIN, INT8_MIN);
+ EXPECT_LE(DayOfWeek_MAX, INT8_MAX);
+
+ // For DateMatch::bc_ad (:8).
+ EXPECT_GE(BCAD_MIN, INT8_MIN);
+ EXPECT_LE(BCAD_MAX, INT8_MAX);
+
+ // For DateMatch::time_span_code (:16).
+ EXPECT_GE(TimespanCode_MIN, INT16_MIN);
+ EXPECT_LE(TimespanCode_MAX, INT16_MAX);
+}
+
+TEST_F(DateMatchTest, IsValid) {
+ // Valid: dates.
+ {
+ DateMatch d;
+ SetDate(&d, 2014, 1, 26);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, 2014, 1, X);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, 2014, X, X);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, 1, 26);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, 1, X);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, X, 26);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, 2014, 1, 26, SUN());
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, 1, 26, SUN());
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, X, 26, SUN());
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, 2014, 1, 26, DOW_X(), BC());
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ // Valid: times.
+ {
+ DateMatch d;
+ SetTimeValue(&d, 12, 30, 59, 0.99);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetTimeValue(&d, 12, 30, 59);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetTimeValue(&d, 12, 30);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetTimeValue(&d, 12);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ // Valid: mixed.
+ {
+ DateMatch d;
+ SetDate(&d, 2014, 1, 26);
+ SetTimeValue(&d, 12, 30, 59, 0.99);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, 1, 26);
+ SetTimeValue(&d, 12, 30, 59);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, X, X, SUN());
+ SetTimeValue(&d, 12, 30);
+ EXPECT_TRUE(d.IsValid()) << d.DebugString();
+ }
+ // Invalid: dates.
+ {
+ DateMatch d;
+ SetDate(&d, X, 1, 26, DOW_X(), BC());
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, 2014, X, 26);
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, 2014, X, X, SUN());
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetDate(&d, X, 1, X, SUN());
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ // Invalid: times.
+ {
+ DateMatch d;
+ SetTimeValue(&d, 12, X, 59);
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetTimeValue(&d, 12, X, X, 0.99);
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetTimeValue(&d, 12, 30, X, 0.99);
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ {
+ DateMatch d;
+ SetTimeValue(&d, X, 30);
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ // Invalid: mixed.
+ {
+ DateMatch d;
+ SetDate(&d, 2014, 1, X);
+ SetTimeValue(&d, 12);
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+ // Invalid: empty.
+ {
+ DateMatch d;
+ EXPECT_FALSE(d.IsValid()) << d.DebugString();
+ }
+}
+
+std::string DebugStrings(const std::vector<DateMatch>& instances) {
+ std::string res;
+ for (int i = 0; i < instances.size(); ++i) {
+ ::libtextclassifier3::strings::SStringAppendF(
+ &res, 0, "[%d] == %s\n", i, instances[i].DebugString().c_str());
+ }
+ return res;
+}
+
+TEST_F(DateMatchTest, IsRefinement) {
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, X);
+ DateMatch b;
+ SetDate(&b, 2014, X, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ DateMatch b;
+ SetDate(&b, 2014, 2, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ DateMatch b;
+ SetDate(&b, X, 2, 24);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, X, X);
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, 0, X);
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ SetTimeValue(&b, 9, X, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, 0, 0);
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ SetTimeValue(&b, 9, 0, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, X, X);
+ SetTimeSpan(&a, TimespanCode_AM);
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ SetTimeValue(&b, 9, X, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, X, X);
+ SetTimeZone(&a, TimezoneCode_PST8PDT);
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ SetTimeValue(&b, 9, X, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, X, X);
+ a.priority += 10;
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ SetTimeValue(&b, 9, X, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, X, X);
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ SetTimeValue(&b, 9, X, X);
+ EXPECT_TRUE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 24);
+ SetTimeValue(&a, 9, X, X);
+ DateMatch b;
+ SetDate(&b, X, 2, 24);
+ SetTimeValue(&b, 9, 0, X);
+ EXPECT_FALSE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetDate(&a, X, 2, 24);
+ SetTimeValue(&a, 9, X, X);
+ DateMatch b;
+ SetDate(&b, 2014, 2, 24);
+ EXPECT_FALSE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+ {
+ DateMatch a;
+ SetTimeValue(&a, 9, 0, 0);
+ DateMatch b;
+ SetTimeValue(&b, 9, X, X);
+ SetTimeSpan(&b, TimespanCode_AM);
+ EXPECT_FALSE(IsRefinement(a, b)) << DebugStrings({a, b});
+ }
+}
+
+TEST_F(DateMatchTest, FillDateInstance_AnnotatorPriorityScore) {
+ DateMatch date_match;
+ SetDate(&date_match, 2014, 2, X);
+ date_match.annotator_priority_score = 0.5;
+ DatetimeParseResultSpan datetime_parse_result_span;
+ FillDateInstance(date_match, &datetime_parse_result_span);
+ EXPECT_FLOAT_EQ(datetime_parse_result_span.priority_score, 0.5)
+ << DebugStrings({date_match});
+}
+
+TEST_F(DateMatchTest, MergeDateMatch_AnnotatorPriorityScore) {
+ DateMatch a;
+ SetDate(&a, 2014, 2, 4);
+ a.annotator_priority_score = 0.5;
+
+ DateMatch b;
+ SetTimeValue(&b, 10, 45, 23);
+ b.annotator_priority_score = 1.0;
+
+ MergeDateMatch(b, &a, false);
+ EXPECT_FLOAT_EQ(a.annotator_priority_score, 1.0);
+}
+
+} // namespace
+} // namespace dates
+} // namespace libtextclassifier3
diff --git a/native/annotator/grammar/grammar-annotator.cc b/native/annotator/grammar/grammar-annotator.cc
index 3acc3ce..baa3fac 100644
--- a/native/annotator/grammar/grammar-annotator.cc
+++ b/native/annotator/grammar/grammar-annotator.cc
@@ -72,8 +72,8 @@
// Deduplicate and populate annotations from grammar matches.
bool GetAnnotations(const std::vector<UnicodeText::const_iterator>& text,
std::vector<AnnotatedSpan>* annotations) const {
- for (const grammar::RuleMatch& candidate :
- grammar::DeduplicateMatches(candidates_)) {
+ for (const grammar::Derivation& candidate :
+ grammar::DeduplicateDerivations(candidates_)) {
// Check that assertions are fulfilled.
if (!grammar::VerifyAssertions(candidate.match)) {
continue;
@@ -87,10 +87,10 @@
bool GetTextSelection(const std::vector<UnicodeText::const_iterator>& text,
const CodepointSpan& selection, AnnotatedSpan* result) {
- std::vector<grammar::RuleMatch> selection_candidates;
+ std::vector<grammar::Derivation> selection_candidates;
// Deduplicate and verify matches.
auto maybe_interpretation = GetBestValidInterpretation(
- grammar::DeduplicateMatches(GetOverlappingRuleMatches(
+ grammar::DeduplicateDerivations(GetOverlappingRuleMatches(
selection, candidates_, /*only_exact_overlap=*/false)));
if (!maybe_interpretation.has_value()) {
return false;
@@ -108,7 +108,7 @@
ClassificationResult* classification) const {
// Deduplicate and verify matches.
auto maybe_interpretation = GetBestValidInterpretation(
- grammar::DeduplicateMatches(GetOverlappingRuleMatches(
+ grammar::DeduplicateDerivations(GetOverlappingRuleMatches(
selection, candidates_, /*only_exact_overlap=*/true)));
if (!maybe_interpretation.has_value()) {
return false;
@@ -127,7 +127,7 @@
void HandleRuleMatch(const grammar::Match* match, const int64 rule_id) {
if ((model_->rule_classification_result()->Get(rule_id)->enabled_modes() &
mode_) != 0) {
- candidates_.push_back(grammar::RuleMatch{match, rule_id});
+ candidates_.push_back(grammar::Derivation{match, rule_id});
}
}
@@ -172,12 +172,12 @@
}
// Filters out results that do not overlap with a reference span.
- std::vector<grammar::RuleMatch> GetOverlappingRuleMatches(
+ std::vector<grammar::Derivation> GetOverlappingRuleMatches(
const CodepointSpan& selection,
- const std::vector<grammar::RuleMatch>& candidates,
+ const std::vector<grammar::Derivation>& candidates,
const bool only_exact_overlap) const {
- std::vector<grammar::RuleMatch> result;
- for (const grammar::RuleMatch& candidate : candidates) {
+ std::vector<grammar::Derivation> result;
+ for (const grammar::Derivation& candidate : candidates) {
// Discard matches that do not match the selection.
// Simple check.
if (!SpansOverlap(selection, candidate.match->codepoint_span)) {
@@ -202,11 +202,11 @@
Optional<std::pair<const GrammarModel_::RuleClassificationResult*,
const grammar::Match*>>
GetBestValidInterpretation(
- const std::vector<grammar::RuleMatch>& candidates) const {
+ const std::vector<grammar::Derivation>& candidates) const {
const GrammarModel_::RuleClassificationResult* best_interpretation =
nullptr;
const grammar::Match* best_match = nullptr;
- for (const grammar::RuleMatch& candidate : candidates) {
+ for (const grammar::Derivation& candidate : candidates) {
if (!grammar::VerifyAssertions(candidate.match)) {
continue;
}
@@ -235,7 +235,7 @@
// result.
bool AddAnnotatedSpanFromMatch(
const std::vector<UnicodeText::const_iterator>& text,
- const grammar::RuleMatch& candidate,
+ const grammar::Derivation& candidate,
std::vector<AnnotatedSpan>* result) const {
if (candidate.rule_id < 0 ||
candidate.rule_id >= model_->rule_classification_result()->size()) {
@@ -347,7 +347,7 @@
// All annotation/selection/classification rule match candidates.
// Grammar rule matches are recorded, deduplicated and then instantiated.
- std::vector<grammar::RuleMatch> candidates_;
+ std::vector<grammar::Derivation> candidates_;
};
GrammarAnnotator::GrammarAnnotator(
diff --git a/native/annotator/grammar/utils.cc b/native/annotator/grammar/utils.cc
index 167f3fd..8b9363d 100644
--- a/native/annotator/grammar/utils.cc
+++ b/native/annotator/grammar/utils.cc
@@ -16,7 +16,14 @@
#include "annotator/grammar/utils.h"
+#include "utils/grammar/utils/rules.h"
+
namespace libtextclassifier3 {
+namespace {
+
+using ::libtextclassifier3::GrammarModel_::RuleClassificationResultT;
+
+} // namespace
Tokenizer BuildTokenizer(const UniLib* unilib,
const GrammarTokenizerOptions* options) {
@@ -44,4 +51,16 @@
/*icu_preserve_whitespace_tokens=*/false);
}
+int AddRuleClassificationResult(const std::string& collection,
+ const ModeFlag& enabled_modes,
+ GrammarModelT* model) {
+ const int result_id = model->rule_classification_result.size();
+ model->rule_classification_result.emplace_back(new RuleClassificationResultT);
+ RuleClassificationResultT* result =
+ model->rule_classification_result.back().get();
+ result->collection_name = collection;
+ result->enabled_modes = enabled_modes;
+ return result_id;
+}
+
} // namespace libtextclassifier3
diff --git a/native/annotator/grammar/utils.h b/native/annotator/grammar/utils.h
index a86a4b2..4d870fd 100644
--- a/native/annotator/grammar/utils.h
+++ b/native/annotator/grammar/utils.h
@@ -29,6 +29,14 @@
Tokenizer BuildTokenizer(const UniLib* unilib,
const GrammarTokenizerOptions* options);
+// Adds a rule classification result to the |model|.
+// collection: the classification entity detected.
+// enabled_modes: the target to apply the given rule.
+// Returns the ID associated with the created classification rule.
+int AddRuleClassificationResult(const std::string& collection,
+ const ModeFlag& enabled_modes,
+ GrammarModelT* model);
+
} // namespace libtextclassifier3
#endif // LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_UTILS_H_
diff --git a/native/annotator/model.fbs b/native/annotator/model.fbs
index 95e692d..bdb7a17 100755
--- a/native/annotator/model.fbs
+++ b/native/annotator/model.fbs
@@ -15,6 +15,7 @@
//
include "annotator/entity-data.fbs";
+include "annotator/experimental/experimental.fbs";
include "annotator/grammar/dates/dates.fbs";
include "utils/codepoint-range.fbs";
include "utils/flatbuffers.fbs";
@@ -657,6 +658,7 @@
translate_annotator_options:TranslateAnnotatorOptions;
grammar_model:GrammarModel;
conflict_resolution_options:Model_.ConflictResolutionOptions;
+ experimental_model:ExperimentalModel;
}
// Method for selecting the center token.
diff --git a/native/annotator/quantization_test.cc b/native/annotator/quantization_test.cc
new file mode 100644
index 0000000..b995096
--- /dev/null
+++ b/native/annotator/quantization_test.cc
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/quantization.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using testing::ElementsAreArray;
+using testing::FloatEq;
+using testing::Matcher;
+
+namespace libtextclassifier3 {
+namespace {
+
+Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
+ std::vector<Matcher<float>> matchers;
+ for (const float value : values) {
+ matchers.push_back(FloatEq(value));
+ }
+ return ElementsAreArray(matchers);
+}
+
+TEST(QuantizationTest, DequantizeAdd8bit) {
+ std::vector<float> scales{{0.1, 9.0, -7.0}};
+ std::vector<uint8> embeddings{{/*0: */ 0x00, 0xFF, 0x09, 0x00,
+ /*1: */ 0xFF, 0x09, 0x00, 0xFF,
+ /*2: */ 0x09, 0x00, 0xFF, 0x09}};
+
+ const int quantization_bits = 8;
+ const int bytes_per_embedding = 4;
+ const int num_sparse_features = 7;
+ {
+ const int bucket_id = 0;
+ std::vector<float> dest(4, 0.0);
+ DequantizeAdd(scales.data(), embeddings.data(), bytes_per_embedding,
+ num_sparse_features, quantization_bits, bucket_id,
+ dest.data(), dest.size());
+
+ EXPECT_THAT(dest,
+ ElementsAreFloat(std::vector<float>{
+ // clang-format off
+ {1.0 / 7 * 0.1 * (0x00 - 128),
+ 1.0 / 7 * 0.1 * (0xFF - 128),
+ 1.0 / 7 * 0.1 * (0x09 - 128),
+ 1.0 / 7 * 0.1 * (0x00 - 128)}
+ // clang-format on
+ }));
+ }
+
+ {
+ const int bucket_id = 1;
+ std::vector<float> dest(4, 0.0);
+ DequantizeAdd(scales.data(), embeddings.data(), bytes_per_embedding,
+ num_sparse_features, quantization_bits, bucket_id,
+ dest.data(), dest.size());
+
+ EXPECT_THAT(dest,
+ ElementsAreFloat(std::vector<float>{
+ // clang-format off
+ {1.0 / 7 * 9.0 * (0xFF - 128),
+ 1.0 / 7 * 9.0 * (0x09 - 128),
+ 1.0 / 7 * 9.0 * (0x00 - 128),
+ 1.0 / 7 * 9.0 * (0xFF - 128)}
+ // clang-format on
+ }));
+ }
+}
+
+TEST(QuantizationTest, DequantizeAdd1bitZeros) {
+ const int bytes_per_embedding = 4;
+ const int num_buckets = 3;
+ const int num_sparse_features = 7;
+ const int quantization_bits = 1;
+ const int bucket_id = 1;
+
+ std::vector<float> scales(num_buckets);
+ std::vector<uint8> embeddings(bytes_per_embedding * num_buckets);
+ std::fill(scales.begin(), scales.end(), 1);
+ std::fill(embeddings.begin(), embeddings.end(), 0);
+
+ std::vector<float> dest(32);
+ DequantizeAdd(scales.data(), embeddings.data(), bytes_per_embedding,
+ num_sparse_features, quantization_bits, bucket_id, dest.data(),
+ dest.size());
+
+ std::vector<float> expected(32);
+ std::fill(expected.begin(), expected.end(),
+ 1.0 / num_sparse_features * (0 - 1));
+ EXPECT_THAT(dest, ElementsAreFloat(expected));
+}
+
+TEST(QuantizationTest, DequantizeAdd1bitOnes) {
+ const int bytes_per_embedding = 4;
+ const int num_buckets = 3;
+ const int num_sparse_features = 7;
+ const int quantization_bits = 1;
+ const int bucket_id = 1;
+
+ std::vector<float> scales(num_buckets, 1.0);
+ std::vector<uint8> embeddings(bytes_per_embedding * num_buckets, 0xFF);
+
+ std::vector<float> dest(32);
+ DequantizeAdd(scales.data(), embeddings.data(), bytes_per_embedding,
+ num_sparse_features, quantization_bits, bucket_id, dest.data(),
+ dest.size());
+ std::vector<float> expected(32);
+ std::fill(expected.begin(), expected.end(),
+ 1.0 / num_sparse_features * (1 - 1));
+ EXPECT_THAT(dest, ElementsAreFloat(expected));
+}
+
+TEST(QuantizationTest, DequantizeAdd3bit) {
+ const int bytes_per_embedding = 4;
+ const int num_buckets = 3;
+ const int num_sparse_features = 7;
+ const int quantization_bits = 3;
+ const int bucket_id = 1;
+
+ std::vector<float> scales(num_buckets, 1.0);
+ scales[1] = 9.0;
+ std::vector<uint8> embeddings(bytes_per_embedding * num_buckets, 0);
+ // For bucket_id=1, the embedding has values 0..9 for indices 0..9:
+ embeddings[4] = (1 << 7) | (1 << 6) | (1 << 4) | 1;
+ embeddings[5] = (1 << 6) | (1 << 4) | (1 << 3);
+ embeddings[6] = (1 << 4) | (1 << 3) | (1 << 2) | (1 << 1) | 1;
+
+ std::vector<float> dest(10);
+ DequantizeAdd(scales.data(), embeddings.data(), bytes_per_embedding,
+ num_sparse_features, quantization_bits, bucket_id, dest.data(),
+ dest.size());
+
+ std::vector<float> expected;
+ expected.push_back(1.0 / num_sparse_features * (1 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (2 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (3 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (4 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (5 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (6 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (7 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (0 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (0 - 4) * scales[bucket_id]);
+ expected.push_back(1.0 / num_sparse_features * (0 - 4) * scales[bucket_id]);
+ EXPECT_THAT(dest, ElementsAreFloat(expected));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/annotator/strip-unpaired-brackets_test.cc b/native/annotator/strip-unpaired-brackets_test.cc
new file mode 100644
index 0000000..32585ce
--- /dev/null
+++ b/native/annotator/strip-unpaired-brackets_test.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/strip-unpaired-brackets.h"
+
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+class StripUnpairedBracketsTest : public ::testing::Test {
+ protected:
+ StripUnpairedBracketsTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
+ UniLib unilib_;
+};
+
+TEST_F(StripUnpairedBracketsTest, StripUnpairedBrackets) {
+ // If the brackets match, nothing gets stripped.
+ EXPECT_EQ(StripUnpairedBrackets("call me (123) 456 today", {8, 17}, unilib_),
+ std::make_pair(8, 17));
+ EXPECT_EQ(StripUnpairedBrackets("call me (123 456) today", {8, 17}, unilib_),
+ std::make_pair(8, 17));
+
+ // If the brackets don't match, they get stripped.
+ EXPECT_EQ(StripUnpairedBrackets("call me (123 456 today", {8, 16}, unilib_),
+ std::make_pair(9, 16));
+ EXPECT_EQ(StripUnpairedBrackets("call me )123 456 today", {8, 16}, unilib_),
+ std::make_pair(9, 16));
+ EXPECT_EQ(StripUnpairedBrackets("call me 123 456) today", {8, 16}, unilib_),
+ std::make_pair(8, 15));
+ EXPECT_EQ(StripUnpairedBrackets("call me 123 456( today", {8, 16}, unilib_),
+ std::make_pair(8, 15));
+
+ // Strips brackets correctly from length-1 selections that consist of
+ // a bracket only.
+ EXPECT_EQ(StripUnpairedBrackets("call me at ) today", {11, 12}, unilib_),
+ std::make_pair(12, 12));
+ EXPECT_EQ(StripUnpairedBrackets("call me at ( today", {11, 12}, unilib_),
+ std::make_pair(12, 12));
+
+ // Handles invalid spans gracefully.
+ EXPECT_EQ(StripUnpairedBrackets("call me at today", {11, 11}, unilib_),
+ std::make_pair(11, 11));
+ EXPECT_EQ(StripUnpairedBrackets("hello world", {0, 0}, unilib_),
+ std::make_pair(0, 0));
+ EXPECT_EQ(StripUnpairedBrackets("hello world", {11, 11}, unilib_),
+ std::make_pair(11, 11));
+ EXPECT_EQ(StripUnpairedBrackets("hello world", {-1, -1}, unilib_),
+ std::make_pair(-1, -1));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/annotator/test_data/test_grammar_model.fb b/native/annotator/test_data/test_grammar_model.fb
deleted file mode 100644
index 30f133e..0000000
--- a/native/annotator/test_data/test_grammar_model.fb
+++ /dev/null
Binary files differ
diff --git a/native/annotator/test_data/test_model.fb b/native/annotator/test_data/test_model.fb
deleted file mode 100644
index 8b6390e..0000000
--- a/native/annotator/test_data/test_model.fb
+++ /dev/null
Binary files differ
diff --git a/native/annotator/test_data/test_person_name_model.fb b/native/annotator/test_data/test_person_name_model.fb
deleted file mode 100644
index 4752a23..0000000
--- a/native/annotator/test_data/test_person_name_model.fb
+++ /dev/null
Binary files differ
diff --git a/native/annotator/test_data/wrong_embeddings.fb b/native/annotator/test_data/wrong_embeddings.fb
deleted file mode 100644
index dd1cf59..0000000
--- a/native/annotator/test_data/wrong_embeddings.fb
+++ /dev/null
Binary files differ
diff --git a/native/annotator/types-test-util.h b/native/annotator/types-test-util.h
new file mode 100644
index 0000000..1d018a1
--- /dev/null
+++ b/native/annotator/types-test-util.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_TYPES_TEST_UTIL_H_
+#define LIBTEXTCLASSIFIER_ANNOTATOR_TYPES_TEST_UTIL_H_
+
+#include <ostream>
+
+#include "annotator/types.h"
+#include "utils/base/logging.h"
+
+namespace libtextclassifier3 {
+
+#define TC3_DECLARE_PRINT_OPERATOR(TYPE_NAME) \
+ inline std::ostream& operator<<(std::ostream& stream, \
+ const TYPE_NAME& value) { \
+ logging::LoggingStringStream tmp_stream; \
+ tmp_stream << value; \
+ return stream << tmp_stream.message; \
+ }
+
+TC3_DECLARE_PRINT_OPERATOR(AnnotatedSpan)
+TC3_DECLARE_PRINT_OPERATOR(ClassificationResult)
+TC3_DECLARE_PRINT_OPERATOR(DatetimeParsedData)
+TC3_DECLARE_PRINT_OPERATOR(DatetimeParseResultSpan)
+TC3_DECLARE_PRINT_OPERATOR(Token)
+
+#undef TC3_DECLARE_PRINT_OPERATOR
+
+} // namespace libtextclassifier3
+
+#endif // LIBTEXTCLASSIFIER_ANNOTATOR_TYPES_TEST_UTIL_H_
diff --git a/native/annotator/zlib-utils_test.cc b/native/annotator/zlib-utils_test.cc
new file mode 100644
index 0000000..df33ea1
--- /dev/null
+++ b/native/annotator/zlib-utils_test.cc
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/zlib-utils.h"
+
+#include <memory>
+
+#include "annotator/model_generated.h"
+#include "utils/zlib/zlib.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+
+TEST(AnnotatorZlibUtilsTest, CompressModel) {
+ ModelT model;
+ model.regex_model.reset(new RegexModelT);
+ model.regex_model->patterns.emplace_back(new RegexModel_::PatternT);
+ model.regex_model->patterns.back()->pattern = "this is a test pattern";
+ model.regex_model->patterns.emplace_back(new RegexModel_::PatternT);
+ model.regex_model->patterns.back()->pattern = "this is a second test pattern";
+
+ model.datetime_model.reset(new DatetimeModelT);
+ model.datetime_model->patterns.emplace_back(new DatetimeModelPatternT);
+ model.datetime_model->patterns.back()->regexes.emplace_back(
+ new DatetimeModelPattern_::RegexT);
+ model.datetime_model->patterns.back()->regexes.back()->pattern =
+ "an example datetime pattern";
+ model.datetime_model->extractors.emplace_back(new DatetimeModelExtractorT);
+ model.datetime_model->extractors.back()->pattern =
+ "an example datetime extractor";
+
+ model.intent_options.reset(new IntentFactoryModelT);
+ model.intent_options->generator.emplace_back(
+ new IntentFactoryModel_::IntentGeneratorT);
+ const std::string intent_generator1 = "lua generator 1";
+ model.intent_options->generator.back()->lua_template_generator =
+ std::vector<uint8_t>(intent_generator1.begin(), intent_generator1.end());
+ model.intent_options->generator.emplace_back(
+ new IntentFactoryModel_::IntentGeneratorT);
+ const std::string intent_generator2 = "lua generator 2";
+ model.intent_options->generator.back()->lua_template_generator =
+ std::vector<uint8_t>(intent_generator2.begin(), intent_generator2.end());
+
+ // NOTE: The resource strings contain some repetition, so that the compressed
+ // version is smaller than the uncompressed one. Because the compression code
+ // looks at that as well.
+ model.resources.reset(new ResourcePoolT);
+ model.resources->resource_entry.emplace_back(new ResourceEntryT);
+ model.resources->resource_entry.back()->resource.emplace_back(new ResourceT);
+ model.resources->resource_entry.back()->resource.back()->content =
+ "rrrrrrrrrrrrr1.1";
+ model.resources->resource_entry.back()->resource.emplace_back(new ResourceT);
+ model.resources->resource_entry.back()->resource.back()->content =
+ "rrrrrrrrrrrrr1.2";
+ model.resources->resource_entry.emplace_back(new ResourceEntryT);
+ model.resources->resource_entry.back()->resource.emplace_back(new ResourceT);
+ model.resources->resource_entry.back()->resource.back()->content =
+ "rrrrrrrrrrrrr2.1";
+ model.resources->resource_entry.back()->resource.emplace_back(new ResourceT);
+ model.resources->resource_entry.back()->resource.back()->content =
+ "rrrrrrrrrrrrr2.2";
+
+ // Compress the model.
+ EXPECT_TRUE(CompressModel(&model));
+
+ // Sanity check that uncompressed field is removed.
+ EXPECT_TRUE(model.regex_model->patterns[0]->pattern.empty());
+ EXPECT_TRUE(model.regex_model->patterns[1]->pattern.empty());
+ EXPECT_TRUE(model.datetime_model->patterns[0]->regexes[0]->pattern.empty());
+ EXPECT_TRUE(model.datetime_model->extractors[0]->pattern.empty());
+ EXPECT_TRUE(
+ model.intent_options->generator[0]->lua_template_generator.empty());
+ EXPECT_TRUE(
+ model.intent_options->generator[1]->lua_template_generator.empty());
+ EXPECT_TRUE(model.resources->resource_entry[0]->resource[0]->content.empty());
+ EXPECT_TRUE(model.resources->resource_entry[0]->resource[1]->content.empty());
+ EXPECT_TRUE(model.resources->resource_entry[1]->resource[0]->content.empty());
+ EXPECT_TRUE(model.resources->resource_entry[1]->resource[1]->content.empty());
+
+ // Pack and load the model.
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(Model::Pack(builder, &model));
+ const Model* compressed_model =
+ GetModel(reinterpret_cast<const char*>(builder.GetBufferPointer()));
+ ASSERT_TRUE(compressed_model != nullptr);
+
+ // Decompress the fields again and check that they match the original.
+ std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();
+ ASSERT_TRUE(decompressor != nullptr);
+ std::string uncompressed_pattern;
+ EXPECT_TRUE(decompressor->MaybeDecompress(
+ compressed_model->regex_model()->patterns()->Get(0)->compressed_pattern(),
+ &uncompressed_pattern));
+ EXPECT_EQ(uncompressed_pattern, "this is a test pattern");
+ EXPECT_TRUE(decompressor->MaybeDecompress(
+ compressed_model->regex_model()->patterns()->Get(1)->compressed_pattern(),
+ &uncompressed_pattern));
+ EXPECT_EQ(uncompressed_pattern, "this is a second test pattern");
+ EXPECT_TRUE(decompressor->MaybeDecompress(compressed_model->datetime_model()
+ ->patterns()
+ ->Get(0)
+ ->regexes()
+ ->Get(0)
+ ->compressed_pattern(),
+ &uncompressed_pattern));
+ EXPECT_EQ(uncompressed_pattern, "an example datetime pattern");
+ EXPECT_TRUE(decompressor->MaybeDecompress(compressed_model->datetime_model()
+ ->extractors()
+ ->Get(0)
+ ->compressed_pattern(),
+ &uncompressed_pattern));
+ EXPECT_EQ(uncompressed_pattern, "an example datetime extractor");
+
+ EXPECT_TRUE(DecompressModel(&model));
+ EXPECT_EQ(model.regex_model->patterns[0]->pattern, "this is a test pattern");
+ EXPECT_EQ(model.regex_model->patterns[1]->pattern,
+ "this is a second test pattern");
+ EXPECT_EQ(model.datetime_model->patterns[0]->regexes[0]->pattern,
+ "an example datetime pattern");
+ EXPECT_EQ(model.datetime_model->extractors[0]->pattern,
+ "an example datetime extractor");
+ EXPECT_EQ(
+ model.intent_options->generator[0]->lua_template_generator,
+ std::vector<uint8_t>(intent_generator1.begin(), intent_generator1.end()));
+ EXPECT_EQ(
+ model.intent_options->generator[1]->lua_template_generator,
+ std::vector<uint8_t>(intent_generator2.begin(), intent_generator2.end()));
+ EXPECT_EQ(model.resources->resource_entry[0]->resource[0]->content,
+ "rrrrrrrrrrrrr1.1");
+ EXPECT_EQ(model.resources->resource_entry[0]->resource[1]->content,
+ "rrrrrrrrrrrrr1.2");
+ EXPECT_EQ(model.resources->resource_entry[1]->resource[0]->content,
+ "rrrrrrrrrrrrr2.1");
+ EXPECT_EQ(model.resources->resource_entry[1]->resource[1]->content,
+ "rrrrrrrrrrrrr2.2");
+}
+
+} // namespace libtextclassifier3
diff --git a/native/lang_id/script/approx-script-data.cc b/native/lang_id/script/approx-script-data.cc
index e11d7b7..233653f 100755
--- a/native/lang_id/script/approx-script-data.cc
+++ b/native/lang_id/script/approx-script-data.cc
@@ -27,7 +27,7 @@
namespace mobile {
namespace approx_script_internal {
-const int kNumRanges = 367;
+const int kNumRanges = 376;
const uint32 kRangeFirst[] = {
65, // Range #0: [65, 90, Latin]
@@ -48,355 +48,364 @@
1008, // Range #15: [1008, 1023, Greek]
1024, // Range #16: [1024, 1156, Cyrillic]
1159, // Range #17: [1159, 1327, Cyrillic]
- 1329, // Range #18: [1329, 1416, Armenian]
- 1418, // Range #19: [1418, 1423, Armenian]
- 1425, // Range #20: [1425, 1479, Hebrew]
- 1488, // Range #21: [1488, 1524, Hebrew]
- 1536, // Range #22: [1536, 1540, Arabic]
- 1542, // Range #23: [1542, 1547, Arabic]
- 1549, // Range #24: [1549, 1562, Arabic]
- 1564, // Range #25: [1564, 1566, Arabic]
- 1568, // Range #26: [1568, 1599, Arabic]
- 1601, // Range #27: [1601, 1610, Arabic]
- 1622, // Range #28: [1622, 1647, Arabic]
- 1649, // Range #29: [1649, 1756, Arabic]
- 1758, // Range #30: [1758, 1791, Arabic]
- 1792, // Range #31: [1792, 1871, Syriac]
- 1872, // Range #32: [1872, 1919, Arabic]
- 1920, // Range #33: [1920, 1969, Thaana]
- 1984, // Range #34: [1984, 2047, Nko]
- 2048, // Range #35: [2048, 2110, Samaritan]
- 2112, // Range #36: [2112, 2142, Mandaic]
- 2144, // Range #37: [2144, 2154, Syriac]
- 2208, // Range #38: [2208, 2237, Arabic]
- 2259, // Range #39: [2259, 2273, Arabic]
- 2275, // Range #40: [2275, 2303, Arabic]
- 2304, // Range #41: [2304, 2384, Devanagari]
- 2389, // Range #42: [2389, 2403, Devanagari]
- 2406, // Range #43: [2406, 2431, Devanagari]
- 2432, // Range #44: [2432, 2510, Bengali]
- 2519, // Range #45: [2519, 2558, Bengali]
- 2561, // Range #46: [2561, 2641, Gurmukhi]
- 2649, // Range #47: [2649, 2654, Gurmukhi]
- 2662, // Range #48: [2662, 2678, Gurmukhi]
- 2689, // Range #49: [2689, 2768, Gujarati]
- 2784, // Range #50: [2784, 2801, Gujarati]
- 2809, // Range #51: [2809, 2815, Gujarati]
- 2817, // Range #52: [2817, 2893, Oriya]
- 2902, // Range #53: [2902, 2935, Oriya]
- 2946, // Range #54: [2946, 3024, Tamil]
- 3031, // Range #55: [3031, 3031, Tamil]
- 3046, // Range #56: [3046, 3066, Tamil]
- 3072, // Range #57: [3072, 3149, Telugu]
- 3157, // Range #58: [3157, 3162, Telugu]
- 3168, // Range #59: [3168, 3183, Telugu]
- 3191, // Range #60: [3191, 3199, Telugu]
- 3200, // Range #61: [3200, 3277, Kannada]
- 3285, // Range #62: [3285, 3286, Kannada]
- 3294, // Range #63: [3294, 3314, Kannada]
- 3328, // Range #64: [3328, 3455, Malayalam]
- 3458, // Range #65: [3458, 3551, Sinhala]
- 3558, // Range #66: [3558, 3572, Sinhala]
- 3585, // Range #67: [3585, 3642, Thai]
- 3648, // Range #68: [3648, 3675, Thai]
- 3713, // Range #69: [3713, 3807, Lao]
- 3840, // Range #70: [3840, 4052, Tibetan]
- 4057, // Range #71: [4057, 4058, Tibetan]
- 4096, // Range #72: [4096, 4255, Myanmar]
- 4256, // Range #73: [4256, 4295, Georgian]
- 4301, // Range #74: [4301, 4346, Georgian]
- 4348, // Range #75: [4348, 4351, Georgian]
- 4352, // Range #76: [4352, 4607, Hangul]
- 4608, // Range #77: [4608, 5017, Ethiopic]
- 5024, // Range #78: [5024, 5117, Cherokee]
- 5120, // Range #79: [5120, 5759, Canadian_Aboriginal]
- 5760, // Range #80: [5760, 5788, Ogham]
- 5792, // Range #81: [5792, 5866, Runic]
- 5870, // Range #82: [5870, 5880, Runic]
- 5888, // Range #83: [5888, 5908, Tagalog]
- 5920, // Range #84: [5920, 5940, Hanunoo]
- 5952, // Range #85: [5952, 5971, Buhid]
- 5984, // Range #86: [5984, 6003, Tagbanwa]
- 6016, // Range #87: [6016, 6121, Khmer]
- 6128, // Range #88: [6128, 6137, Khmer]
- 6144, // Range #89: [6144, 6145, Mongolian]
- 6148, // Range #90: [6148, 6148, Mongolian]
- 6150, // Range #91: [6150, 6169, Mongolian]
- 6176, // Range #92: [6176, 6264, Mongolian]
- 6272, // Range #93: [6272, 6314, Mongolian]
- 6320, // Range #94: [6320, 6389, Canadian_Aboriginal]
- 6400, // Range #95: [6400, 6479, Limbu]
- 6480, // Range #96: [6480, 6516, Tai_Le]
- 6528, // Range #97: [6528, 6601, New_Tai_Lue]
- 6608, // Range #98: [6608, 6623, New_Tai_Lue]
- 6624, // Range #99: [6624, 6655, Khmer]
- 6656, // Range #100: [6656, 6687, Buginese]
- 6688, // Range #101: [6688, 6793, Tai_Tham]
- 6800, // Range #102: [6800, 6809, Tai_Tham]
- 6816, // Range #103: [6816, 6829, Tai_Tham]
- 6912, // Range #104: [6912, 7036, Balinese]
- 7040, // Range #105: [7040, 7103, Sundanese]
- 7104, // Range #106: [7104, 7155, Batak]
- 7164, // Range #107: [7164, 7167, Batak]
- 7168, // Range #108: [7168, 7247, Lepcha]
- 7248, // Range #109: [7248, 7295, Ol_Chiki]
- 7296, // Range #110: [7296, 7304, Cyrillic]
- 7312, // Range #111: [7312, 7359, Georgian]
- 7360, // Range #112: [7360, 7367, Sundanese]
- 7424, // Range #113: [7424, 7461, Latin]
- 7462, // Range #114: [7462, 7466, Greek]
- 7467, // Range #115: [7467, 7467, Cyrillic]
- 7468, // Range #116: [7468, 7516, Latin]
- 7517, // Range #117: [7517, 7521, Greek]
- 7522, // Range #118: [7522, 7525, Latin]
- 7526, // Range #119: [7526, 7530, Greek]
- 7531, // Range #120: [7531, 7543, Latin]
- 7544, // Range #121: [7544, 7544, Cyrillic]
- 7545, // Range #122: [7545, 7614, Latin]
- 7615, // Range #123: [7615, 7615, Greek]
- 7680, // Range #124: [7680, 7935, Latin]
- 7936, // Range #125: [7936, 8190, Greek]
- 8305, // Range #126: [8305, 8305, Latin]
- 8319, // Range #127: [8319, 8319, Latin]
- 8336, // Range #128: [8336, 8348, Latin]
- 8486, // Range #129: [8486, 8486, Greek]
- 8490, // Range #130: [8490, 8491, Latin]
- 8498, // Range #131: [8498, 8498, Latin]
- 8526, // Range #132: [8526, 8526, Latin]
- 8544, // Range #133: [8544, 8584, Latin]
- 10240, // Range #134: [10240, 10495, Braille]
- 11264, // Range #135: [11264, 11358, Glagolitic]
- 11360, // Range #136: [11360, 11391, Latin]
- 11392, // Range #137: [11392, 11507, Coptic]
- 11513, // Range #138: [11513, 11519, Coptic]
- 11520, // Range #139: [11520, 11559, Georgian]
- 11565, // Range #140: [11565, 11565, Georgian]
- 11568, // Range #141: [11568, 11623, Tifinagh]
- 11631, // Range #142: [11631, 11632, Tifinagh]
- 11647, // Range #143: [11647, 11647, Tifinagh]
- 11648, // Range #144: [11648, 11670, Ethiopic]
- 11680, // Range #145: [11680, 11742, Ethiopic]
- 11744, // Range #146: [11744, 11775, Cyrillic]
- 11904, // Range #147: [11904, 12019, Han]
- 12032, // Range #148: [12032, 12245, Han]
- 12293, // Range #149: [12293, 12293, Han]
- 12295, // Range #150: [12295, 12295, Han]
- 12321, // Range #151: [12321, 12329, Han]
- 12334, // Range #152: [12334, 12335, Hangul]
- 12344, // Range #153: [12344, 12347, Han]
- 12353, // Range #154: [12353, 12438, Hiragana]
- 12445, // Range #155: [12445, 12447, Hiragana]
- 12449, // Range #156: [12449, 12538, Katakana]
- 12541, // Range #157: [12541, 12543, Katakana]
- 12549, // Range #158: [12549, 12591, Bopomofo]
- 12593, // Range #159: [12593, 12686, Hangul]
- 12704, // Range #160: [12704, 12730, Bopomofo]
- 12784, // Range #161: [12784, 12799, Katakana]
- 12800, // Range #162: [12800, 12830, Hangul]
- 12896, // Range #163: [12896, 12926, Hangul]
- 13008, // Range #164: [13008, 13054, Katakana]
- 13056, // Range #165: [13056, 13143, Katakana]
- 13312, // Range #166: [13312, 19893, Han]
- 19968, // Range #167: [19968, 40943, Han]
- 40960, // Range #168: [40960, 42182, Yi]
- 42192, // Range #169: [42192, 42239, Lisu]
- 42240, // Range #170: [42240, 42539, Vai]
- 42560, // Range #171: [42560, 42655, Cyrillic]
- 42656, // Range #172: [42656, 42743, Bamum]
- 42786, // Range #173: [42786, 42887, Latin]
- 42891, // Range #174: [42891, 42950, Latin]
- 42999, // Range #175: [42999, 43007, Latin]
- 43008, // Range #176: [43008, 43051, Syloti_Nagri]
- 43072, // Range #177: [43072, 43127, Phags_Pa]
- 43136, // Range #178: [43136, 43205, Saurashtra]
- 43214, // Range #179: [43214, 43225, Saurashtra]
- 43232, // Range #180: [43232, 43263, Devanagari]
- 43264, // Range #181: [43264, 43309, Kayah_Li]
- 43311, // Range #182: [43311, 43311, Kayah_Li]
- 43312, // Range #183: [43312, 43347, Rejang]
- 43359, // Range #184: [43359, 43359, Rejang]
- 43360, // Range #185: [43360, 43388, Hangul]
- 43392, // Range #186: [43392, 43469, Javanese]
- 43472, // Range #187: [43472, 43487, Javanese]
- 43488, // Range #188: [43488, 43518, Myanmar]
- 43520, // Range #189: [43520, 43574, Cham]
- 43584, // Range #190: [43584, 43615, Cham]
- 43616, // Range #191: [43616, 43647, Myanmar]
- 43648, // Range #192: [43648, 43714, Tai_Viet]
- 43739, // Range #193: [43739, 43743, Tai_Viet]
- 43744, // Range #194: [43744, 43766, Meetei_Mayek]
- 43777, // Range #195: [43777, 43798, Ethiopic]
- 43808, // Range #196: [43808, 43822, Ethiopic]
- 43824, // Range #197: [43824, 43866, Latin]
- 43868, // Range #198: [43868, 43876, Latin]
- 43877, // Range #199: [43877, 43877, Greek]
- 43878, // Range #200: [43878, 43879, Latin]
- 43888, // Range #201: [43888, 43967, Cherokee]
- 43968, // Range #202: [43968, 44025, Meetei_Mayek]
- 44032, // Range #203: [44032, 55203, Hangul]
- 55216, // Range #204: [55216, 55291, Hangul]
- 63744, // Range #205: [63744, 64217, Han]
- 64256, // Range #206: [64256, 64262, Latin]
- 64275, // Range #207: [64275, 64279, Armenian]
- 64285, // Range #208: [64285, 64335, Hebrew]
- 64336, // Range #209: [64336, 64449, Arabic]
- 64467, // Range #210: [64467, 64829, Arabic]
- 64848, // Range #211: [64848, 64967, Arabic]
- 65008, // Range #212: [65008, 65021, Arabic]
- 65070, // Range #213: [65070, 65071, Cyrillic]
- 65136, // Range #214: [65136, 65276, Arabic]
- 65313, // Range #215: [65313, 65338, Latin]
- 65345, // Range #216: [65345, 65370, Latin]
- 65382, // Range #217: [65382, 65391, Katakana]
- 65393, // Range #218: [65393, 65437, Katakana]
- 65440, // Range #219: [65440, 65500, Hangul]
- 65536, // Range #220: [65536, 65629, Linear_B]
- 65664, // Range #221: [65664, 65786, Linear_B]
- 65856, // Range #222: [65856, 65934, Greek]
- 65952, // Range #223: [65952, 65952, Greek]
- 66176, // Range #224: [66176, 66204, Lycian]
- 66208, // Range #225: [66208, 66256, Carian]
- 66304, // Range #226: [66304, 66339, Old_Italic]
- 66349, // Range #227: [66349, 66351, Old_Italic]
- 66352, // Range #228: [66352, 66378, Gothic]
- 66384, // Range #229: [66384, 66426, Old_Permic]
- 66432, // Range #230: [66432, 66463, Ugaritic]
- 66464, // Range #231: [66464, 66517, Old_Persian]
- 66560, // Range #232: [66560, 66639, Deseret]
- 66640, // Range #233: [66640, 66687, Shavian]
- 66688, // Range #234: [66688, 66729, Osmanya]
- 66736, // Range #235: [66736, 66811, Osage]
- 66816, // Range #236: [66816, 66855, Elbasan]
- 66864, // Range #237: [66864, 66915, Caucasian_Albanian]
- 66927, // Range #238: [66927, 66927, Caucasian_Albanian]
- 67072, // Range #239: [67072, 67382, Linear_A]
- 67392, // Range #240: [67392, 67413, Linear_A]
- 67424, // Range #241: [67424, 67431, Linear_A]
- 67584, // Range #242: [67584, 67647, Cypriot]
- 67648, // Range #243: [67648, 67679, Imperial_Aramaic]
- 67680, // Range #244: [67680, 67711, Palmyrene]
- 67712, // Range #245: [67712, 67742, Nabataean]
- 67751, // Range #246: [67751, 67759, Nabataean]
- 67808, // Range #247: [67808, 67829, Hatran]
- 67835, // Range #248: [67835, 67839, Hatran]
- 67840, // Range #249: [67840, 67871, Phoenician]
- 67872, // Range #250: [67872, 67897, Lydian]
- 67903, // Range #251: [67903, 67903, Lydian]
- 67968, // Range #252: [67968, 67999, Meroitic_Hieroglyphs]
- 68000, // Range #253: [68000, 68095, Meroitic_Cursive]
- 68096, // Range #254: [68096, 68102, Kharoshthi]
- 68108, // Range #255: [68108, 68168, Kharoshthi]
- 68176, // Range #256: [68176, 68184, Kharoshthi]
- 68192, // Range #257: [68192, 68223, Old_South_Arabian]
- 68224, // Range #258: [68224, 68255, Old_North_Arabian]
- 68288, // Range #259: [68288, 68342, Manichaean]
- 68352, // Range #260: [68352, 68415, Avestan]
- 68416, // Range #261: [68416, 68447, Inscriptional_Parthian]
- 68448, // Range #262: [68448, 68466, Inscriptional_Pahlavi]
- 68472, // Range #263: [68472, 68479, Inscriptional_Pahlavi]
- 68480, // Range #264: [68480, 68497, Psalter_Pahlavi]
- 68505, // Range #265: [68505, 68508, Psalter_Pahlavi]
- 68521, // Range #266: [68521, 68527, Psalter_Pahlavi]
- 68608, // Range #267: [68608, 68680, Old_Turkic]
- 68736, // Range #268: [68736, 68786, Old_Hungarian]
- 68800, // Range #269: [68800, 68850, Old_Hungarian]
- 68858, // Range #270: [68858, 68863, Old_Hungarian]
- 68864, // Range #271: [68864, 68903, Hanifi_Rohingya]
- 68912, // Range #272: [68912, 68921, Hanifi_Rohingya]
- 69216, // Range #273: [69216, 69246, Arabic]
+ 1329, // Range #18: [1329, 1423, Armenian]
+ 1425, // Range #19: [1425, 1479, Hebrew]
+ 1488, // Range #20: [1488, 1524, Hebrew]
+ 1536, // Range #21: [1536, 1540, Arabic]
+ 1542, // Range #22: [1542, 1547, Arabic]
+ 1549, // Range #23: [1549, 1562, Arabic]
+ 1564, // Range #24: [1564, 1566, Arabic]
+ 1568, // Range #25: [1568, 1599, Arabic]
+ 1601, // Range #26: [1601, 1610, Arabic]
+ 1622, // Range #27: [1622, 1647, Arabic]
+ 1649, // Range #28: [1649, 1756, Arabic]
+ 1758, // Range #29: [1758, 1791, Arabic]
+ 1792, // Range #30: [1792, 1871, Syriac]
+ 1872, // Range #31: [1872, 1919, Arabic]
+ 1920, // Range #32: [1920, 1969, Thaana]
+ 1984, // Range #33: [1984, 2047, Nko]
+ 2048, // Range #34: [2048, 2110, Samaritan]
+ 2112, // Range #35: [2112, 2142, Mandaic]
+ 2144, // Range #36: [2144, 2154, Syriac]
+ 2208, // Range #37: [2208, 2247, Arabic]
+ 2259, // Range #38: [2259, 2273, Arabic]
+ 2275, // Range #39: [2275, 2303, Arabic]
+ 2304, // Range #40: [2304, 2384, Devanagari]
+ 2389, // Range #41: [2389, 2403, Devanagari]
+ 2406, // Range #42: [2406, 2431, Devanagari]
+ 2432, // Range #43: [2432, 2510, Bengali]
+ 2519, // Range #44: [2519, 2558, Bengali]
+ 2561, // Range #45: [2561, 2641, Gurmukhi]
+ 2649, // Range #46: [2649, 2654, Gurmukhi]
+ 2662, // Range #47: [2662, 2678, Gurmukhi]
+ 2689, // Range #48: [2689, 2768, Gujarati]
+ 2784, // Range #49: [2784, 2801, Gujarati]
+ 2809, // Range #50: [2809, 2815, Gujarati]
+ 2817, // Range #51: [2817, 2893, Oriya]
+ 2901, // Range #52: [2901, 2935, Oriya]
+ 2946, // Range #53: [2946, 3024, Tamil]
+ 3031, // Range #54: [3031, 3031, Tamil]
+ 3046, // Range #55: [3046, 3066, Tamil]
+ 3072, // Range #56: [3072, 3149, Telugu]
+ 3157, // Range #57: [3157, 3162, Telugu]
+ 3168, // Range #58: [3168, 3183, Telugu]
+ 3191, // Range #59: [3191, 3199, Telugu]
+ 3200, // Range #60: [3200, 3277, Kannada]
+ 3285, // Range #61: [3285, 3286, Kannada]
+ 3294, // Range #62: [3294, 3314, Kannada]
+ 3328, // Range #63: [3328, 3455, Malayalam]
+ 3457, // Range #64: [3457, 3551, Sinhala]
+ 3558, // Range #65: [3558, 3572, Sinhala]
+ 3585, // Range #66: [3585, 3642, Thai]
+ 3648, // Range #67: [3648, 3675, Thai]
+ 3713, // Range #68: [3713, 3807, Lao]
+ 3840, // Range #69: [3840, 4052, Tibetan]
+ 4057, // Range #70: [4057, 4058, Tibetan]
+ 4096, // Range #71: [4096, 4255, Myanmar]
+ 4256, // Range #72: [4256, 4295, Georgian]
+ 4301, // Range #73: [4301, 4346, Georgian]
+ 4348, // Range #74: [4348, 4351, Georgian]
+ 4352, // Range #75: [4352, 4607, Hangul]
+ 4608, // Range #76: [4608, 5017, Ethiopic]
+ 5024, // Range #77: [5024, 5117, Cherokee]
+ 5120, // Range #78: [5120, 5759, Canadian_Aboriginal]
+ 5760, // Range #79: [5760, 5788, Ogham]
+ 5792, // Range #80: [5792, 5866, Runic]
+ 5870, // Range #81: [5870, 5880, Runic]
+ 5888, // Range #82: [5888, 5908, Tagalog]
+ 5920, // Range #83: [5920, 5940, Hanunoo]
+ 5952, // Range #84: [5952, 5971, Buhid]
+ 5984, // Range #85: [5984, 6003, Tagbanwa]
+ 6016, // Range #86: [6016, 6121, Khmer]
+ 6128, // Range #87: [6128, 6137, Khmer]
+ 6144, // Range #88: [6144, 6145, Mongolian]
+ 6148, // Range #89: [6148, 6148, Mongolian]
+ 6150, // Range #90: [6150, 6169, Mongolian]
+ 6176, // Range #91: [6176, 6264, Mongolian]
+ 6272, // Range #92: [6272, 6314, Mongolian]
+ 6320, // Range #93: [6320, 6389, Canadian_Aboriginal]
+ 6400, // Range #94: [6400, 6479, Limbu]
+ 6480, // Range #95: [6480, 6516, Tai_Le]
+ 6528, // Range #96: [6528, 6601, New_Tai_Lue]
+ 6608, // Range #97: [6608, 6623, New_Tai_Lue]
+ 6624, // Range #98: [6624, 6655, Khmer]
+ 6656, // Range #99: [6656, 6687, Buginese]
+ 6688, // Range #100: [6688, 6793, Tai_Tham]
+ 6800, // Range #101: [6800, 6809, Tai_Tham]
+ 6816, // Range #102: [6816, 6829, Tai_Tham]
+ 6912, // Range #103: [6912, 7036, Balinese]
+ 7040, // Range #104: [7040, 7103, Sundanese]
+ 7104, // Range #105: [7104, 7155, Batak]
+ 7164, // Range #106: [7164, 7167, Batak]
+ 7168, // Range #107: [7168, 7247, Lepcha]
+ 7248, // Range #108: [7248, 7295, Ol_Chiki]
+ 7296, // Range #109: [7296, 7304, Cyrillic]
+ 7312, // Range #110: [7312, 7359, Georgian]
+ 7360, // Range #111: [7360, 7367, Sundanese]
+ 7424, // Range #112: [7424, 7461, Latin]
+ 7462, // Range #113: [7462, 7466, Greek]
+ 7467, // Range #114: [7467, 7467, Cyrillic]
+ 7468, // Range #115: [7468, 7516, Latin]
+ 7517, // Range #116: [7517, 7521, Greek]
+ 7522, // Range #117: [7522, 7525, Latin]
+ 7526, // Range #118: [7526, 7530, Greek]
+ 7531, // Range #119: [7531, 7543, Latin]
+ 7544, // Range #120: [7544, 7544, Cyrillic]
+ 7545, // Range #121: [7545, 7614, Latin]
+ 7615, // Range #122: [7615, 7615, Greek]
+ 7680, // Range #123: [7680, 7935, Latin]
+ 7936, // Range #124: [7936, 8190, Greek]
+ 8305, // Range #125: [8305, 8305, Latin]
+ 8319, // Range #126: [8319, 8319, Latin]
+ 8336, // Range #127: [8336, 8348, Latin]
+ 8486, // Range #128: [8486, 8486, Greek]
+ 8490, // Range #129: [8490, 8491, Latin]
+ 8498, // Range #130: [8498, 8498, Latin]
+ 8526, // Range #131: [8526, 8526, Latin]
+ 8544, // Range #132: [8544, 8584, Latin]
+ 10240, // Range #133: [10240, 10495, Braille]
+ 11264, // Range #134: [11264, 11358, Glagolitic]
+ 11360, // Range #135: [11360, 11391, Latin]
+ 11392, // Range #136: [11392, 11507, Coptic]
+ 11513, // Range #137: [11513, 11519, Coptic]
+ 11520, // Range #138: [11520, 11559, Georgian]
+ 11565, // Range #139: [11565, 11565, Georgian]
+ 11568, // Range #140: [11568, 11623, Tifinagh]
+ 11631, // Range #141: [11631, 11632, Tifinagh]
+ 11647, // Range #142: [11647, 11647, Tifinagh]
+ 11648, // Range #143: [11648, 11670, Ethiopic]
+ 11680, // Range #144: [11680, 11742, Ethiopic]
+ 11744, // Range #145: [11744, 11775, Cyrillic]
+ 11904, // Range #146: [11904, 12019, Han]
+ 12032, // Range #147: [12032, 12245, Han]
+ 12293, // Range #148: [12293, 12293, Han]
+ 12295, // Range #149: [12295, 12295, Han]
+ 12321, // Range #150: [12321, 12329, Han]
+ 12334, // Range #151: [12334, 12335, Hangul]
+ 12344, // Range #152: [12344, 12347, Han]
+ 12353, // Range #153: [12353, 12438, Hiragana]
+ 12445, // Range #154: [12445, 12447, Hiragana]
+ 12449, // Range #155: [12449, 12538, Katakana]
+ 12541, // Range #156: [12541, 12543, Katakana]
+ 12549, // Range #157: [12549, 12591, Bopomofo]
+ 12593, // Range #158: [12593, 12686, Hangul]
+ 12704, // Range #159: [12704, 12735, Bopomofo]
+ 12784, // Range #160: [12784, 12799, Katakana]
+ 12800, // Range #161: [12800, 12830, Hangul]
+ 12896, // Range #162: [12896, 12926, Hangul]
+ 13008, // Range #163: [13008, 13054, Katakana]
+ 13056, // Range #164: [13056, 13143, Katakana]
+ 13312, // Range #165: [13312, 19903, Han]
+ 19968, // Range #166: [19968, 40956, Han]
+ 40960, // Range #167: [40960, 42182, Yi]
+ 42192, // Range #168: [42192, 42239, Lisu]
+ 42240, // Range #169: [42240, 42539, Vai]
+ 42560, // Range #170: [42560, 42655, Cyrillic]
+ 42656, // Range #171: [42656, 42743, Bamum]
+ 42786, // Range #172: [42786, 42887, Latin]
+ 42891, // Range #173: [42891, 42954, Latin]
+ 42997, // Range #174: [42997, 43007, Latin]
+ 43008, // Range #175: [43008, 43052, Syloti_Nagri]
+ 43072, // Range #176: [43072, 43127, Phags_Pa]
+ 43136, // Range #177: [43136, 43205, Saurashtra]
+ 43214, // Range #178: [43214, 43225, Saurashtra]
+ 43232, // Range #179: [43232, 43263, Devanagari]
+ 43264, // Range #180: [43264, 43309, Kayah_Li]
+ 43311, // Range #181: [43311, 43311, Kayah_Li]
+ 43312, // Range #182: [43312, 43347, Rejang]
+ 43359, // Range #183: [43359, 43359, Rejang]
+ 43360, // Range #184: [43360, 43388, Hangul]
+ 43392, // Range #185: [43392, 43469, Javanese]
+ 43472, // Range #186: [43472, 43487, Javanese]
+ 43488, // Range #187: [43488, 43518, Myanmar]
+ 43520, // Range #188: [43520, 43574, Cham]
+ 43584, // Range #189: [43584, 43615, Cham]
+ 43616, // Range #190: [43616, 43647, Myanmar]
+ 43648, // Range #191: [43648, 43714, Tai_Viet]
+ 43739, // Range #192: [43739, 43743, Tai_Viet]
+ 43744, // Range #193: [43744, 43766, Meetei_Mayek]
+ 43777, // Range #194: [43777, 43798, Ethiopic]
+ 43808, // Range #195: [43808, 43822, Ethiopic]
+ 43824, // Range #196: [43824, 43866, Latin]
+ 43868, // Range #197: [43868, 43876, Latin]
+ 43877, // Range #198: [43877, 43877, Greek]
+ 43878, // Range #199: [43878, 43881, Latin]
+ 43888, // Range #200: [43888, 43967, Cherokee]
+ 43968, // Range #201: [43968, 44025, Meetei_Mayek]
+ 44032, // Range #202: [44032, 55203, Hangul]
+ 55216, // Range #203: [55216, 55291, Hangul]
+ 63744, // Range #204: [63744, 64217, Han]
+ 64256, // Range #205: [64256, 64262, Latin]
+ 64275, // Range #206: [64275, 64279, Armenian]
+ 64285, // Range #207: [64285, 64335, Hebrew]
+ 64336, // Range #208: [64336, 64449, Arabic]
+ 64467, // Range #209: [64467, 64829, Arabic]
+ 64848, // Range #210: [64848, 64967, Arabic]
+ 65008, // Range #211: [65008, 65021, Arabic]
+ 65070, // Range #212: [65070, 65071, Cyrillic]
+ 65136, // Range #213: [65136, 65276, Arabic]
+ 65313, // Range #214: [65313, 65338, Latin]
+ 65345, // Range #215: [65345, 65370, Latin]
+ 65382, // Range #216: [65382, 65391, Katakana]
+ 65393, // Range #217: [65393, 65437, Katakana]
+ 65440, // Range #218: [65440, 65500, Hangul]
+ 65536, // Range #219: [65536, 65629, Linear_B]
+ 65664, // Range #220: [65664, 65786, Linear_B]
+ 65856, // Range #221: [65856, 65934, Greek]
+ 65952, // Range #222: [65952, 65952, Greek]
+ 66176, // Range #223: [66176, 66204, Lycian]
+ 66208, // Range #224: [66208, 66256, Carian]
+ 66304, // Range #225: [66304, 66339, Old_Italic]
+ 66349, // Range #226: [66349, 66351, Old_Italic]
+ 66352, // Range #227: [66352, 66378, Gothic]
+ 66384, // Range #228: [66384, 66426, Old_Permic]
+ 66432, // Range #229: [66432, 66463, Ugaritic]
+ 66464, // Range #230: [66464, 66517, Old_Persian]
+ 66560, // Range #231: [66560, 66639, Deseret]
+ 66640, // Range #232: [66640, 66687, Shavian]
+ 66688, // Range #233: [66688, 66729, Osmanya]
+ 66736, // Range #234: [66736, 66811, Osage]
+ 66816, // Range #235: [66816, 66855, Elbasan]
+ 66864, // Range #236: [66864, 66915, Caucasian_Albanian]
+ 66927, // Range #237: [66927, 66927, Caucasian_Albanian]
+ 67072, // Range #238: [67072, 67382, Linear_A]
+ 67392, // Range #239: [67392, 67413, Linear_A]
+ 67424, // Range #240: [67424, 67431, Linear_A]
+ 67584, // Range #241: [67584, 67647, Cypriot]
+ 67648, // Range #242: [67648, 67679, Imperial_Aramaic]
+ 67680, // Range #243: [67680, 67711, Palmyrene]
+ 67712, // Range #244: [67712, 67742, Nabataean]
+ 67751, // Range #245: [67751, 67759, Nabataean]
+ 67808, // Range #246: [67808, 67829, Hatran]
+ 67835, // Range #247: [67835, 67839, Hatran]
+ 67840, // Range #248: [67840, 67871, Phoenician]
+ 67872, // Range #249: [67872, 67897, Lydian]
+ 67903, // Range #250: [67903, 67903, Lydian]
+ 67968, // Range #251: [67968, 67999, Meroitic_Hieroglyphs]
+ 68000, // Range #252: [68000, 68095, Meroitic_Cursive]
+ 68096, // Range #253: [68096, 68102, Kharoshthi]
+ 68108, // Range #254: [68108, 68168, Kharoshthi]
+ 68176, // Range #255: [68176, 68184, Kharoshthi]
+ 68192, // Range #256: [68192, 68223, Old_South_Arabian]
+ 68224, // Range #257: [68224, 68255, Old_North_Arabian]
+ 68288, // Range #258: [68288, 68342, Manichaean]
+ 68352, // Range #259: [68352, 68415, Avestan]
+ 68416, // Range #260: [68416, 68447, Inscriptional_Parthian]
+ 68448, // Range #261: [68448, 68466, Inscriptional_Pahlavi]
+ 68472, // Range #262: [68472, 68479, Inscriptional_Pahlavi]
+ 68480, // Range #263: [68480, 68497, Psalter_Pahlavi]
+ 68505, // Range #264: [68505, 68508, Psalter_Pahlavi]
+ 68521, // Range #265: [68521, 68527, Psalter_Pahlavi]
+ 68608, // Range #266: [68608, 68680, Old_Turkic]
+ 68736, // Range #267: [68736, 68786, Old_Hungarian]
+ 68800, // Range #268: [68800, 68850, Old_Hungarian]
+ 68858, // Range #269: [68858, 68863, Old_Hungarian]
+ 68864, // Range #270: [68864, 68903, Hanifi_Rohingya]
+ 68912, // Range #271: [68912, 68921, Hanifi_Rohingya]
+ 69216, // Range #272: [69216, 69246, Arabic]
+ 69248, // Range #273: [69248, 69297, Yezidi]
69376, // Range #274: [69376, 69415, Old_Sogdian]
69424, // Range #275: [69424, 69465, Sogdian]
- 69600, // Range #276: [69600, 69622, Elymaic]
- 69632, // Range #277: [69632, 69743, Brahmi]
- 69759, // Range #278: [69759, 69759, Brahmi]
- 69760, // Range #279: [69760, 69825, Kaithi]
- 69837, // Range #280: [69837, 69837, Kaithi]
- 69840, // Range #281: [69840, 69864, Sora_Sompeng]
- 69872, // Range #282: [69872, 69881, Sora_Sompeng]
- 69888, // Range #283: [69888, 69958, Chakma]
- 69968, // Range #284: [69968, 70006, Mahajani]
- 70016, // Range #285: [70016, 70111, Sharada]
- 70113, // Range #286: [70113, 70132, Sinhala]
- 70144, // Range #287: [70144, 70206, Khojki]
- 70272, // Range #288: [70272, 70313, Multani]
- 70320, // Range #289: [70320, 70378, Khudawadi]
- 70384, // Range #290: [70384, 70393, Khudawadi]
- 70400, // Range #291: [70400, 70457, Grantha]
- 70460, // Range #292: [70460, 70480, Grantha]
- 70487, // Range #293: [70487, 70487, Grantha]
- 70493, // Range #294: [70493, 70516, Grantha]
- 70656, // Range #295: [70656, 70751, Newa]
- 70784, // Range #296: [70784, 70855, Tirhuta]
- 70864, // Range #297: [70864, 70873, Tirhuta]
- 71040, // Range #298: [71040, 71133, Siddham]
- 71168, // Range #299: [71168, 71236, Modi]
- 71248, // Range #300: [71248, 71257, Modi]
- 71264, // Range #301: [71264, 71276, Mongolian]
- 71296, // Range #302: [71296, 71352, Takri]
- 71360, // Range #303: [71360, 71369, Takri]
- 71424, // Range #304: [71424, 71487, Ahom]
- 71680, // Range #305: [71680, 71739, Dogra]
- 71840, // Range #306: [71840, 71922, Warang_Citi]
- 71935, // Range #307: [71935, 71935, Warang_Citi]
- 72096, // Range #308: [72096, 72164, Nandinagari]
- 72192, // Range #309: [72192, 72263, Zanabazar_Square]
- 72272, // Range #310: [72272, 72354, Soyombo]
- 72384, // Range #311: [72384, 72440, Pau_Cin_Hau]
- 72704, // Range #312: [72704, 72773, Bhaiksuki]
- 72784, // Range #313: [72784, 72812, Bhaiksuki]
- 72816, // Range #314: [72816, 72886, Marchen]
- 72960, // Range #315: [72960, 73031, Masaram_Gondi]
- 73040, // Range #316: [73040, 73049, Masaram_Gondi]
- 73056, // Range #317: [73056, 73112, Gunjala_Gondi]
- 73120, // Range #318: [73120, 73129, Gunjala_Gondi]
- 73440, // Range #319: [73440, 73464, Makasar]
- 73664, // Range #320: [73664, 73713, Tamil]
- 73727, // Range #321: [73727, 73727, Tamil]
- 73728, // Range #322: [73728, 74649, Cuneiform]
- 74752, // Range #323: [74752, 74868, Cuneiform]
- 74880, // Range #324: [74880, 75075, Cuneiform]
- 77824, // Range #325: [77824, 78904, Egyptian_Hieroglyphs]
- 82944, // Range #326: [82944, 83526, Anatolian_Hieroglyphs]
- 92160, // Range #327: [92160, 92728, Bamum]
- 92736, // Range #328: [92736, 92783, Mro]
- 92880, // Range #329: [92880, 92917, Bassa_Vah]
- 92928, // Range #330: [92928, 92997, Pahawh_Hmong]
- 93008, // Range #331: [93008, 93047, Pahawh_Hmong]
- 93053, // Range #332: [93053, 93071, Pahawh_Hmong]
- 93760, // Range #333: [93760, 93850, Medefaidrin]
- 93952, // Range #334: [93952, 94087, Miao]
- 94095, // Range #335: [94095, 94111, Miao]
- 94176, // Range #336: [94176, 94176, Tangut]
- 94177, // Range #337: [94177, 94177, Nushu]
- 94208, // Range #338: [94208, 100343, Tangut]
- 100352, // Range #339: [100352, 101106, Tangut]
- 110592, // Range #340: [110592, 110592, Katakana]
- 110593, // Range #341: [110593, 110878, Hiragana]
- 110928, // Range #342: [110928, 110930, Hiragana]
- 110948, // Range #343: [110948, 110951, Katakana]
- 110960, // Range #344: [110960, 111355, Nushu]
- 113664, // Range #345: [113664, 113770, Duployan]
- 113776, // Range #346: [113776, 113800, Duployan]
- 113808, // Range #347: [113808, 113823, Duployan]
- 119296, // Range #348: [119296, 119365, Greek]
- 120832, // Range #349: [120832, 121483, SignWriting]
- 121499, // Range #350: [121499, 121519, SignWriting]
- 122880, // Range #351: [122880, 122922, Glagolitic]
- 123136, // Range #352: [123136, 123215, Nyiakeng_Puachue_Hmong]
- 123584, // Range #353: [123584, 123641, Wancho]
- 123647, // Range #354: [123647, 123647, Wancho]
- 124928, // Range #355: [124928, 125142, Mende_Kikakui]
- 125184, // Range #356: [125184, 125279, Adlam]
- 126464, // Range #357: [126464, 126523, Arabic]
- 126530, // Range #358: [126530, 126619, Arabic]
- 126625, // Range #359: [126625, 126651, Arabic]
- 126704, // Range #360: [126704, 126705, Arabic]
- 127488, // Range #361: [127488, 127488, Hiragana]
- 131072, // Range #362: [131072, 173782, Han]
- 173824, // Range #363: [173824, 177972, Han]
- 177984, // Range #364: [177984, 183969, Han]
- 183984, // Range #365: [183984, 191456, Han]
- 194560, // Range #366: [194560, 195101, Han]
+ 69552, // Range #276: [69552, 69579, Chorasmian]
+ 69600, // Range #277: [69600, 69622, Elymaic]
+ 69632, // Range #278: [69632, 69743, Brahmi]
+ 69759, // Range #279: [69759, 69759, Brahmi]
+ 69760, // Range #280: [69760, 69825, Kaithi]
+ 69837, // Range #281: [69837, 69837, Kaithi]
+ 69840, // Range #282: [69840, 69864, Sora_Sompeng]
+ 69872, // Range #283: [69872, 69881, Sora_Sompeng]
+ 69888, // Range #284: [69888, 69959, Chakma]
+ 69968, // Range #285: [69968, 70006, Mahajani]
+ 70016, // Range #286: [70016, 70111, Sharada]
+ 70113, // Range #287: [70113, 70132, Sinhala]
+ 70144, // Range #288: [70144, 70206, Khojki]
+ 70272, // Range #289: [70272, 70313, Multani]
+ 70320, // Range #290: [70320, 70378, Khudawadi]
+ 70384, // Range #291: [70384, 70393, Khudawadi]
+ 70400, // Range #292: [70400, 70457, Grantha]
+ 70460, // Range #293: [70460, 70480, Grantha]
+ 70487, // Range #294: [70487, 70487, Grantha]
+ 70493, // Range #295: [70493, 70516, Grantha]
+ 70656, // Range #296: [70656, 70753, Newa]
+ 70784, // Range #297: [70784, 70855, Tirhuta]
+ 70864, // Range #298: [70864, 70873, Tirhuta]
+ 71040, // Range #299: [71040, 71133, Siddham]
+ 71168, // Range #300: [71168, 71236, Modi]
+ 71248, // Range #301: [71248, 71257, Modi]
+ 71264, // Range #302: [71264, 71276, Mongolian]
+ 71296, // Range #303: [71296, 71352, Takri]
+ 71360, // Range #304: [71360, 71369, Takri]
+ 71424, // Range #305: [71424, 71487, Ahom]
+ 71680, // Range #306: [71680, 71739, Dogra]
+ 71840, // Range #307: [71840, 71922, Warang_Citi]
+ 71935, // Range #308: [71935, 71935, Warang_Citi]
+ 71936, // Range #309: [71936, 72006, Dives_Akuru]
+ 72016, // Range #310: [72016, 72025, Dives_Akuru]
+ 72096, // Range #311: [72096, 72164, Nandinagari]
+ 72192, // Range #312: [72192, 72263, Zanabazar_Square]
+ 72272, // Range #313: [72272, 72354, Soyombo]
+ 72384, // Range #314: [72384, 72440, Pau_Cin_Hau]
+ 72704, // Range #315: [72704, 72773, Bhaiksuki]
+ 72784, // Range #316: [72784, 72812, Bhaiksuki]
+ 72816, // Range #317: [72816, 72886, Marchen]
+ 72960, // Range #318: [72960, 73031, Masaram_Gondi]
+ 73040, // Range #319: [73040, 73049, Masaram_Gondi]
+ 73056, // Range #320: [73056, 73112, Gunjala_Gondi]
+ 73120, // Range #321: [73120, 73129, Gunjala_Gondi]
+ 73440, // Range #322: [73440, 73464, Makasar]
+ 73648, // Range #323: [73648, 73648, Lisu]
+ 73664, // Range #324: [73664, 73713, Tamil]
+ 73727, // Range #325: [73727, 73727, Tamil]
+ 73728, // Range #326: [73728, 74649, Cuneiform]
+ 74752, // Range #327: [74752, 74868, Cuneiform]
+ 74880, // Range #328: [74880, 75075, Cuneiform]
+ 77824, // Range #329: [77824, 78904, Egyptian_Hieroglyphs]
+ 82944, // Range #330: [82944, 83526, Anatolian_Hieroglyphs]
+ 92160, // Range #331: [92160, 92728, Bamum]
+ 92736, // Range #332: [92736, 92783, Mro]
+ 92880, // Range #333: [92880, 92917, Bassa_Vah]
+ 92928, // Range #334: [92928, 92997, Pahawh_Hmong]
+ 93008, // Range #335: [93008, 93047, Pahawh_Hmong]
+ 93053, // Range #336: [93053, 93071, Pahawh_Hmong]
+ 93760, // Range #337: [93760, 93850, Medefaidrin]
+ 93952, // Range #338: [93952, 94087, Miao]
+ 94095, // Range #339: [94095, 94111, Miao]
+ 94176, // Range #340: [94176, 94176, Tangut]
+ 94177, // Range #341: [94177, 94177, Nushu]
+ 94180, // Range #342: [94180, 94180, Khitan_Small_Script]
+ 94192, // Range #343: [94192, 94193, Han]
+ 94208, // Range #344: [94208, 100343, Tangut]
+ 100352, // Range #345: [100352, 101119, Tangut]
+ 101120, // Range #346: [101120, 101589, Khitan_Small_Script]
+ 101632, // Range #347: [101632, 101640, Tangut]
+ 110592, // Range #348: [110592, 110592, Katakana]
+ 110593, // Range #349: [110593, 110878, Hiragana]
+ 110928, // Range #350: [110928, 110930, Hiragana]
+ 110948, // Range #351: [110948, 110951, Katakana]
+ 110960, // Range #352: [110960, 111355, Nushu]
+ 113664, // Range #353: [113664, 113770, Duployan]
+ 113776, // Range #354: [113776, 113800, Duployan]
+ 113808, // Range #355: [113808, 113823, Duployan]
+ 119296, // Range #356: [119296, 119365, Greek]
+ 120832, // Range #357: [120832, 121483, SignWriting]
+ 121499, // Range #358: [121499, 121519, SignWriting]
+ 122880, // Range #359: [122880, 122922, Glagolitic]
+ 123136, // Range #360: [123136, 123215, Nyiakeng_Puachue_Hmong]
+ 123584, // Range #361: [123584, 123641, Wancho]
+ 123647, // Range #362: [123647, 123647, Wancho]
+ 124928, // Range #363: [124928, 125142, Mende_Kikakui]
+ 125184, // Range #364: [125184, 125279, Adlam]
+ 126464, // Range #365: [126464, 126523, Arabic]
+ 126530, // Range #366: [126530, 126619, Arabic]
+ 126625, // Range #367: [126625, 126651, Arabic]
+ 126704, // Range #368: [126704, 126705, Arabic]
+ 127488, // Range #369: [127488, 127488, Hiragana]
+ 131072, // Range #370: [131072, 173789, Han]
+ 173824, // Range #371: [173824, 177972, Han]
+ 177984, // Range #372: [177984, 183969, Han]
+ 183984, // Range #373: [183984, 191456, Han]
+ 194560, // Range #374: [194560, 195101, Han]
+ 196608, // Range #375: [196608, 201546, Han]
};
const uint16 kRangeSizeMinusOne[] = {
@@ -418,355 +427,364 @@
15, // Range #15: [1008, 1023, Greek]
132, // Range #16: [1024, 1156, Cyrillic]
168, // Range #17: [1159, 1327, Cyrillic]
- 87, // Range #18: [1329, 1416, Armenian]
- 5, // Range #19: [1418, 1423, Armenian]
- 54, // Range #20: [1425, 1479, Hebrew]
- 36, // Range #21: [1488, 1524, Hebrew]
- 4, // Range #22: [1536, 1540, Arabic]
- 5, // Range #23: [1542, 1547, Arabic]
- 13, // Range #24: [1549, 1562, Arabic]
- 2, // Range #25: [1564, 1566, Arabic]
- 31, // Range #26: [1568, 1599, Arabic]
- 9, // Range #27: [1601, 1610, Arabic]
- 25, // Range #28: [1622, 1647, Arabic]
- 107, // Range #29: [1649, 1756, Arabic]
- 33, // Range #30: [1758, 1791, Arabic]
- 79, // Range #31: [1792, 1871, Syriac]
- 47, // Range #32: [1872, 1919, Arabic]
- 49, // Range #33: [1920, 1969, Thaana]
- 63, // Range #34: [1984, 2047, Nko]
- 62, // Range #35: [2048, 2110, Samaritan]
- 30, // Range #36: [2112, 2142, Mandaic]
- 10, // Range #37: [2144, 2154, Syriac]
- 29, // Range #38: [2208, 2237, Arabic]
- 14, // Range #39: [2259, 2273, Arabic]
- 28, // Range #40: [2275, 2303, Arabic]
- 80, // Range #41: [2304, 2384, Devanagari]
- 14, // Range #42: [2389, 2403, Devanagari]
- 25, // Range #43: [2406, 2431, Devanagari]
- 78, // Range #44: [2432, 2510, Bengali]
- 39, // Range #45: [2519, 2558, Bengali]
- 80, // Range #46: [2561, 2641, Gurmukhi]
- 5, // Range #47: [2649, 2654, Gurmukhi]
- 16, // Range #48: [2662, 2678, Gurmukhi]
- 79, // Range #49: [2689, 2768, Gujarati]
- 17, // Range #50: [2784, 2801, Gujarati]
- 6, // Range #51: [2809, 2815, Gujarati]
- 76, // Range #52: [2817, 2893, Oriya]
- 33, // Range #53: [2902, 2935, Oriya]
- 78, // Range #54: [2946, 3024, Tamil]
- 0, // Range #55: [3031, 3031, Tamil]
- 20, // Range #56: [3046, 3066, Tamil]
- 77, // Range #57: [3072, 3149, Telugu]
- 5, // Range #58: [3157, 3162, Telugu]
- 15, // Range #59: [3168, 3183, Telugu]
- 8, // Range #60: [3191, 3199, Telugu]
- 77, // Range #61: [3200, 3277, Kannada]
- 1, // Range #62: [3285, 3286, Kannada]
- 20, // Range #63: [3294, 3314, Kannada]
- 127, // Range #64: [3328, 3455, Malayalam]
- 93, // Range #65: [3458, 3551, Sinhala]
- 14, // Range #66: [3558, 3572, Sinhala]
- 57, // Range #67: [3585, 3642, Thai]
- 27, // Range #68: [3648, 3675, Thai]
- 94, // Range #69: [3713, 3807, Lao]
- 212, // Range #70: [3840, 4052, Tibetan]
- 1, // Range #71: [4057, 4058, Tibetan]
- 159, // Range #72: [4096, 4255, Myanmar]
- 39, // Range #73: [4256, 4295, Georgian]
- 45, // Range #74: [4301, 4346, Georgian]
- 3, // Range #75: [4348, 4351, Georgian]
- 255, // Range #76: [4352, 4607, Hangul]
- 409, // Range #77: [4608, 5017, Ethiopic]
- 93, // Range #78: [5024, 5117, Cherokee]
- 639, // Range #79: [5120, 5759, Canadian_Aboriginal]
- 28, // Range #80: [5760, 5788, Ogham]
- 74, // Range #81: [5792, 5866, Runic]
- 10, // Range #82: [5870, 5880, Runic]
- 20, // Range #83: [5888, 5908, Tagalog]
- 20, // Range #84: [5920, 5940, Hanunoo]
- 19, // Range #85: [5952, 5971, Buhid]
- 19, // Range #86: [5984, 6003, Tagbanwa]
- 105, // Range #87: [6016, 6121, Khmer]
- 9, // Range #88: [6128, 6137, Khmer]
- 1, // Range #89: [6144, 6145, Mongolian]
- 0, // Range #90: [6148, 6148, Mongolian]
- 19, // Range #91: [6150, 6169, Mongolian]
- 88, // Range #92: [6176, 6264, Mongolian]
- 42, // Range #93: [6272, 6314, Mongolian]
- 69, // Range #94: [6320, 6389, Canadian_Aboriginal]
- 79, // Range #95: [6400, 6479, Limbu]
- 36, // Range #96: [6480, 6516, Tai_Le]
- 73, // Range #97: [6528, 6601, New_Tai_Lue]
- 15, // Range #98: [6608, 6623, New_Tai_Lue]
- 31, // Range #99: [6624, 6655, Khmer]
- 31, // Range #100: [6656, 6687, Buginese]
- 105, // Range #101: [6688, 6793, Tai_Tham]
- 9, // Range #102: [6800, 6809, Tai_Tham]
- 13, // Range #103: [6816, 6829, Tai_Tham]
- 124, // Range #104: [6912, 7036, Balinese]
- 63, // Range #105: [7040, 7103, Sundanese]
- 51, // Range #106: [7104, 7155, Batak]
- 3, // Range #107: [7164, 7167, Batak]
- 79, // Range #108: [7168, 7247, Lepcha]
- 47, // Range #109: [7248, 7295, Ol_Chiki]
- 8, // Range #110: [7296, 7304, Cyrillic]
- 47, // Range #111: [7312, 7359, Georgian]
- 7, // Range #112: [7360, 7367, Sundanese]
- 37, // Range #113: [7424, 7461, Latin]
- 4, // Range #114: [7462, 7466, Greek]
- 0, // Range #115: [7467, 7467, Cyrillic]
- 48, // Range #116: [7468, 7516, Latin]
- 4, // Range #117: [7517, 7521, Greek]
- 3, // Range #118: [7522, 7525, Latin]
- 4, // Range #119: [7526, 7530, Greek]
- 12, // Range #120: [7531, 7543, Latin]
- 0, // Range #121: [7544, 7544, Cyrillic]
- 69, // Range #122: [7545, 7614, Latin]
- 0, // Range #123: [7615, 7615, Greek]
- 255, // Range #124: [7680, 7935, Latin]
- 254, // Range #125: [7936, 8190, Greek]
- 0, // Range #126: [8305, 8305, Latin]
- 0, // Range #127: [8319, 8319, Latin]
- 12, // Range #128: [8336, 8348, Latin]
- 0, // Range #129: [8486, 8486, Greek]
- 1, // Range #130: [8490, 8491, Latin]
- 0, // Range #131: [8498, 8498, Latin]
- 0, // Range #132: [8526, 8526, Latin]
- 40, // Range #133: [8544, 8584, Latin]
- 255, // Range #134: [10240, 10495, Braille]
- 94, // Range #135: [11264, 11358, Glagolitic]
- 31, // Range #136: [11360, 11391, Latin]
- 115, // Range #137: [11392, 11507, Coptic]
- 6, // Range #138: [11513, 11519, Coptic]
- 39, // Range #139: [11520, 11559, Georgian]
- 0, // Range #140: [11565, 11565, Georgian]
- 55, // Range #141: [11568, 11623, Tifinagh]
- 1, // Range #142: [11631, 11632, Tifinagh]
- 0, // Range #143: [11647, 11647, Tifinagh]
- 22, // Range #144: [11648, 11670, Ethiopic]
- 62, // Range #145: [11680, 11742, Ethiopic]
- 31, // Range #146: [11744, 11775, Cyrillic]
- 115, // Range #147: [11904, 12019, Han]
- 213, // Range #148: [12032, 12245, Han]
- 0, // Range #149: [12293, 12293, Han]
- 0, // Range #150: [12295, 12295, Han]
- 8, // Range #151: [12321, 12329, Han]
- 1, // Range #152: [12334, 12335, Hangul]
- 3, // Range #153: [12344, 12347, Han]
- 85, // Range #154: [12353, 12438, Hiragana]
- 2, // Range #155: [12445, 12447, Hiragana]
- 89, // Range #156: [12449, 12538, Katakana]
- 2, // Range #157: [12541, 12543, Katakana]
- 42, // Range #158: [12549, 12591, Bopomofo]
- 93, // Range #159: [12593, 12686, Hangul]
- 26, // Range #160: [12704, 12730, Bopomofo]
- 15, // Range #161: [12784, 12799, Katakana]
- 30, // Range #162: [12800, 12830, Hangul]
- 30, // Range #163: [12896, 12926, Hangul]
- 46, // Range #164: [13008, 13054, Katakana]
- 87, // Range #165: [13056, 13143, Katakana]
- 6581, // Range #166: [13312, 19893, Han]
- 20975, // Range #167: [19968, 40943, Han]
- 1222, // Range #168: [40960, 42182, Yi]
- 47, // Range #169: [42192, 42239, Lisu]
- 299, // Range #170: [42240, 42539, Vai]
- 95, // Range #171: [42560, 42655, Cyrillic]
- 87, // Range #172: [42656, 42743, Bamum]
- 101, // Range #173: [42786, 42887, Latin]
- 59, // Range #174: [42891, 42950, Latin]
- 8, // Range #175: [42999, 43007, Latin]
- 43, // Range #176: [43008, 43051, Syloti_Nagri]
- 55, // Range #177: [43072, 43127, Phags_Pa]
- 69, // Range #178: [43136, 43205, Saurashtra]
- 11, // Range #179: [43214, 43225, Saurashtra]
- 31, // Range #180: [43232, 43263, Devanagari]
- 45, // Range #181: [43264, 43309, Kayah_Li]
- 0, // Range #182: [43311, 43311, Kayah_Li]
- 35, // Range #183: [43312, 43347, Rejang]
- 0, // Range #184: [43359, 43359, Rejang]
- 28, // Range #185: [43360, 43388, Hangul]
- 77, // Range #186: [43392, 43469, Javanese]
- 15, // Range #187: [43472, 43487, Javanese]
- 30, // Range #188: [43488, 43518, Myanmar]
- 54, // Range #189: [43520, 43574, Cham]
- 31, // Range #190: [43584, 43615, Cham]
- 31, // Range #191: [43616, 43647, Myanmar]
- 66, // Range #192: [43648, 43714, Tai_Viet]
- 4, // Range #193: [43739, 43743, Tai_Viet]
- 22, // Range #194: [43744, 43766, Meetei_Mayek]
- 21, // Range #195: [43777, 43798, Ethiopic]
- 14, // Range #196: [43808, 43822, Ethiopic]
- 42, // Range #197: [43824, 43866, Latin]
- 8, // Range #198: [43868, 43876, Latin]
- 0, // Range #199: [43877, 43877, Greek]
- 1, // Range #200: [43878, 43879, Latin]
- 79, // Range #201: [43888, 43967, Cherokee]
- 57, // Range #202: [43968, 44025, Meetei_Mayek]
- 11171, // Range #203: [44032, 55203, Hangul]
- 75, // Range #204: [55216, 55291, Hangul]
- 473, // Range #205: [63744, 64217, Han]
- 6, // Range #206: [64256, 64262, Latin]
- 4, // Range #207: [64275, 64279, Armenian]
- 50, // Range #208: [64285, 64335, Hebrew]
- 113, // Range #209: [64336, 64449, Arabic]
- 362, // Range #210: [64467, 64829, Arabic]
- 119, // Range #211: [64848, 64967, Arabic]
- 13, // Range #212: [65008, 65021, Arabic]
- 1, // Range #213: [65070, 65071, Cyrillic]
- 140, // Range #214: [65136, 65276, Arabic]
- 25, // Range #215: [65313, 65338, Latin]
- 25, // Range #216: [65345, 65370, Latin]
- 9, // Range #217: [65382, 65391, Katakana]
- 44, // Range #218: [65393, 65437, Katakana]
- 60, // Range #219: [65440, 65500, Hangul]
- 93, // Range #220: [65536, 65629, Linear_B]
- 122, // Range #221: [65664, 65786, Linear_B]
- 78, // Range #222: [65856, 65934, Greek]
- 0, // Range #223: [65952, 65952, Greek]
- 28, // Range #224: [66176, 66204, Lycian]
- 48, // Range #225: [66208, 66256, Carian]
- 35, // Range #226: [66304, 66339, Old_Italic]
- 2, // Range #227: [66349, 66351, Old_Italic]
- 26, // Range #228: [66352, 66378, Gothic]
- 42, // Range #229: [66384, 66426, Old_Permic]
- 31, // Range #230: [66432, 66463, Ugaritic]
- 53, // Range #231: [66464, 66517, Old_Persian]
- 79, // Range #232: [66560, 66639, Deseret]
- 47, // Range #233: [66640, 66687, Shavian]
- 41, // Range #234: [66688, 66729, Osmanya]
- 75, // Range #235: [66736, 66811, Osage]
- 39, // Range #236: [66816, 66855, Elbasan]
- 51, // Range #237: [66864, 66915, Caucasian_Albanian]
- 0, // Range #238: [66927, 66927, Caucasian_Albanian]
- 310, // Range #239: [67072, 67382, Linear_A]
- 21, // Range #240: [67392, 67413, Linear_A]
- 7, // Range #241: [67424, 67431, Linear_A]
- 63, // Range #242: [67584, 67647, Cypriot]
- 31, // Range #243: [67648, 67679, Imperial_Aramaic]
- 31, // Range #244: [67680, 67711, Palmyrene]
- 30, // Range #245: [67712, 67742, Nabataean]
- 8, // Range #246: [67751, 67759, Nabataean]
- 21, // Range #247: [67808, 67829, Hatran]
- 4, // Range #248: [67835, 67839, Hatran]
- 31, // Range #249: [67840, 67871, Phoenician]
- 25, // Range #250: [67872, 67897, Lydian]
- 0, // Range #251: [67903, 67903, Lydian]
- 31, // Range #252: [67968, 67999, Meroitic_Hieroglyphs]
- 95, // Range #253: [68000, 68095, Meroitic_Cursive]
- 6, // Range #254: [68096, 68102, Kharoshthi]
- 60, // Range #255: [68108, 68168, Kharoshthi]
- 8, // Range #256: [68176, 68184, Kharoshthi]
- 31, // Range #257: [68192, 68223, Old_South_Arabian]
- 31, // Range #258: [68224, 68255, Old_North_Arabian]
- 54, // Range #259: [68288, 68342, Manichaean]
- 63, // Range #260: [68352, 68415, Avestan]
- 31, // Range #261: [68416, 68447, Inscriptional_Parthian]
- 18, // Range #262: [68448, 68466, Inscriptional_Pahlavi]
- 7, // Range #263: [68472, 68479, Inscriptional_Pahlavi]
- 17, // Range #264: [68480, 68497, Psalter_Pahlavi]
- 3, // Range #265: [68505, 68508, Psalter_Pahlavi]
- 6, // Range #266: [68521, 68527, Psalter_Pahlavi]
- 72, // Range #267: [68608, 68680, Old_Turkic]
- 50, // Range #268: [68736, 68786, Old_Hungarian]
- 50, // Range #269: [68800, 68850, Old_Hungarian]
- 5, // Range #270: [68858, 68863, Old_Hungarian]
- 39, // Range #271: [68864, 68903, Hanifi_Rohingya]
- 9, // Range #272: [68912, 68921, Hanifi_Rohingya]
- 30, // Range #273: [69216, 69246, Arabic]
+ 94, // Range #18: [1329, 1423, Armenian]
+ 54, // Range #19: [1425, 1479, Hebrew]
+ 36, // Range #20: [1488, 1524, Hebrew]
+ 4, // Range #21: [1536, 1540, Arabic]
+ 5, // Range #22: [1542, 1547, Arabic]
+ 13, // Range #23: [1549, 1562, Arabic]
+ 2, // Range #24: [1564, 1566, Arabic]
+ 31, // Range #25: [1568, 1599, Arabic]
+ 9, // Range #26: [1601, 1610, Arabic]
+ 25, // Range #27: [1622, 1647, Arabic]
+ 107, // Range #28: [1649, 1756, Arabic]
+ 33, // Range #29: [1758, 1791, Arabic]
+ 79, // Range #30: [1792, 1871, Syriac]
+ 47, // Range #31: [1872, 1919, Arabic]
+ 49, // Range #32: [1920, 1969, Thaana]
+ 63, // Range #33: [1984, 2047, Nko]
+ 62, // Range #34: [2048, 2110, Samaritan]
+ 30, // Range #35: [2112, 2142, Mandaic]
+ 10, // Range #36: [2144, 2154, Syriac]
+ 39, // Range #37: [2208, 2247, Arabic]
+ 14, // Range #38: [2259, 2273, Arabic]
+ 28, // Range #39: [2275, 2303, Arabic]
+ 80, // Range #40: [2304, 2384, Devanagari]
+ 14, // Range #41: [2389, 2403, Devanagari]
+ 25, // Range #42: [2406, 2431, Devanagari]
+ 78, // Range #43: [2432, 2510, Bengali]
+ 39, // Range #44: [2519, 2558, Bengali]
+ 80, // Range #45: [2561, 2641, Gurmukhi]
+ 5, // Range #46: [2649, 2654, Gurmukhi]
+ 16, // Range #47: [2662, 2678, Gurmukhi]
+ 79, // Range #48: [2689, 2768, Gujarati]
+ 17, // Range #49: [2784, 2801, Gujarati]
+ 6, // Range #50: [2809, 2815, Gujarati]
+ 76, // Range #51: [2817, 2893, Oriya]
+ 34, // Range #52: [2901, 2935, Oriya]
+ 78, // Range #53: [2946, 3024, Tamil]
+ 0, // Range #54: [3031, 3031, Tamil]
+ 20, // Range #55: [3046, 3066, Tamil]
+ 77, // Range #56: [3072, 3149, Telugu]
+ 5, // Range #57: [3157, 3162, Telugu]
+ 15, // Range #58: [3168, 3183, Telugu]
+ 8, // Range #59: [3191, 3199, Telugu]
+ 77, // Range #60: [3200, 3277, Kannada]
+ 1, // Range #61: [3285, 3286, Kannada]
+ 20, // Range #62: [3294, 3314, Kannada]
+ 127, // Range #63: [3328, 3455, Malayalam]
+ 94, // Range #64: [3457, 3551, Sinhala]
+ 14, // Range #65: [3558, 3572, Sinhala]
+ 57, // Range #66: [3585, 3642, Thai]
+ 27, // Range #67: [3648, 3675, Thai]
+ 94, // Range #68: [3713, 3807, Lao]
+ 212, // Range #69: [3840, 4052, Tibetan]
+ 1, // Range #70: [4057, 4058, Tibetan]
+ 159, // Range #71: [4096, 4255, Myanmar]
+ 39, // Range #72: [4256, 4295, Georgian]
+ 45, // Range #73: [4301, 4346, Georgian]
+ 3, // Range #74: [4348, 4351, Georgian]
+ 255, // Range #75: [4352, 4607, Hangul]
+ 409, // Range #76: [4608, 5017, Ethiopic]
+ 93, // Range #77: [5024, 5117, Cherokee]
+ 639, // Range #78: [5120, 5759, Canadian_Aboriginal]
+ 28, // Range #79: [5760, 5788, Ogham]
+ 74, // Range #80: [5792, 5866, Runic]
+ 10, // Range #81: [5870, 5880, Runic]
+ 20, // Range #82: [5888, 5908, Tagalog]
+ 20, // Range #83: [5920, 5940, Hanunoo]
+ 19, // Range #84: [5952, 5971, Buhid]
+ 19, // Range #85: [5984, 6003, Tagbanwa]
+ 105, // Range #86: [6016, 6121, Khmer]
+ 9, // Range #87: [6128, 6137, Khmer]
+ 1, // Range #88: [6144, 6145, Mongolian]
+ 0, // Range #89: [6148, 6148, Mongolian]
+ 19, // Range #90: [6150, 6169, Mongolian]
+ 88, // Range #91: [6176, 6264, Mongolian]
+ 42, // Range #92: [6272, 6314, Mongolian]
+ 69, // Range #93: [6320, 6389, Canadian_Aboriginal]
+ 79, // Range #94: [6400, 6479, Limbu]
+ 36, // Range #95: [6480, 6516, Tai_Le]
+ 73, // Range #96: [6528, 6601, New_Tai_Lue]
+ 15, // Range #97: [6608, 6623, New_Tai_Lue]
+ 31, // Range #98: [6624, 6655, Khmer]
+ 31, // Range #99: [6656, 6687, Buginese]
+ 105, // Range #100: [6688, 6793, Tai_Tham]
+ 9, // Range #101: [6800, 6809, Tai_Tham]
+ 13, // Range #102: [6816, 6829, Tai_Tham]
+ 124, // Range #103: [6912, 7036, Balinese]
+ 63, // Range #104: [7040, 7103, Sundanese]
+ 51, // Range #105: [7104, 7155, Batak]
+ 3, // Range #106: [7164, 7167, Batak]
+ 79, // Range #107: [7168, 7247, Lepcha]
+ 47, // Range #108: [7248, 7295, Ol_Chiki]
+ 8, // Range #109: [7296, 7304, Cyrillic]
+ 47, // Range #110: [7312, 7359, Georgian]
+ 7, // Range #111: [7360, 7367, Sundanese]
+ 37, // Range #112: [7424, 7461, Latin]
+ 4, // Range #113: [7462, 7466, Greek]
+ 0, // Range #114: [7467, 7467, Cyrillic]
+ 48, // Range #115: [7468, 7516, Latin]
+ 4, // Range #116: [7517, 7521, Greek]
+ 3, // Range #117: [7522, 7525, Latin]
+ 4, // Range #118: [7526, 7530, Greek]
+ 12, // Range #119: [7531, 7543, Latin]
+ 0, // Range #120: [7544, 7544, Cyrillic]
+ 69, // Range #121: [7545, 7614, Latin]
+ 0, // Range #122: [7615, 7615, Greek]
+ 255, // Range #123: [7680, 7935, Latin]
+ 254, // Range #124: [7936, 8190, Greek]
+ 0, // Range #125: [8305, 8305, Latin]
+ 0, // Range #126: [8319, 8319, Latin]
+ 12, // Range #127: [8336, 8348, Latin]
+ 0, // Range #128: [8486, 8486, Greek]
+ 1, // Range #129: [8490, 8491, Latin]
+ 0, // Range #130: [8498, 8498, Latin]
+ 0, // Range #131: [8526, 8526, Latin]
+ 40, // Range #132: [8544, 8584, Latin]
+ 255, // Range #133: [10240, 10495, Braille]
+ 94, // Range #134: [11264, 11358, Glagolitic]
+ 31, // Range #135: [11360, 11391, Latin]
+ 115, // Range #136: [11392, 11507, Coptic]
+ 6, // Range #137: [11513, 11519, Coptic]
+ 39, // Range #138: [11520, 11559, Georgian]
+ 0, // Range #139: [11565, 11565, Georgian]
+ 55, // Range #140: [11568, 11623, Tifinagh]
+ 1, // Range #141: [11631, 11632, Tifinagh]
+ 0, // Range #142: [11647, 11647, Tifinagh]
+ 22, // Range #143: [11648, 11670, Ethiopic]
+ 62, // Range #144: [11680, 11742, Ethiopic]
+ 31, // Range #145: [11744, 11775, Cyrillic]
+ 115, // Range #146: [11904, 12019, Han]
+ 213, // Range #147: [12032, 12245, Han]
+ 0, // Range #148: [12293, 12293, Han]
+ 0, // Range #149: [12295, 12295, Han]
+ 8, // Range #150: [12321, 12329, Han]
+ 1, // Range #151: [12334, 12335, Hangul]
+ 3, // Range #152: [12344, 12347, Han]
+ 85, // Range #153: [12353, 12438, Hiragana]
+ 2, // Range #154: [12445, 12447, Hiragana]
+ 89, // Range #155: [12449, 12538, Katakana]
+ 2, // Range #156: [12541, 12543, Katakana]
+ 42, // Range #157: [12549, 12591, Bopomofo]
+ 93, // Range #158: [12593, 12686, Hangul]
+ 31, // Range #159: [12704, 12735, Bopomofo]
+ 15, // Range #160: [12784, 12799, Katakana]
+ 30, // Range #161: [12800, 12830, Hangul]
+ 30, // Range #162: [12896, 12926, Hangul]
+ 46, // Range #163: [13008, 13054, Katakana]
+ 87, // Range #164: [13056, 13143, Katakana]
+ 6591, // Range #165: [13312, 19903, Han]
+ 20988, // Range #166: [19968, 40956, Han]
+ 1222, // Range #167: [40960, 42182, Yi]
+ 47, // Range #168: [42192, 42239, Lisu]
+ 299, // Range #169: [42240, 42539, Vai]
+ 95, // Range #170: [42560, 42655, Cyrillic]
+ 87, // Range #171: [42656, 42743, Bamum]
+ 101, // Range #172: [42786, 42887, Latin]
+ 63, // Range #173: [42891, 42954, Latin]
+ 10, // Range #174: [42997, 43007, Latin]
+ 44, // Range #175: [43008, 43052, Syloti_Nagri]
+ 55, // Range #176: [43072, 43127, Phags_Pa]
+ 69, // Range #177: [43136, 43205, Saurashtra]
+ 11, // Range #178: [43214, 43225, Saurashtra]
+ 31, // Range #179: [43232, 43263, Devanagari]
+ 45, // Range #180: [43264, 43309, Kayah_Li]
+ 0, // Range #181: [43311, 43311, Kayah_Li]
+ 35, // Range #182: [43312, 43347, Rejang]
+ 0, // Range #183: [43359, 43359, Rejang]
+ 28, // Range #184: [43360, 43388, Hangul]
+ 77, // Range #185: [43392, 43469, Javanese]
+ 15, // Range #186: [43472, 43487, Javanese]
+ 30, // Range #187: [43488, 43518, Myanmar]
+ 54, // Range #188: [43520, 43574, Cham]
+ 31, // Range #189: [43584, 43615, Cham]
+ 31, // Range #190: [43616, 43647, Myanmar]
+ 66, // Range #191: [43648, 43714, Tai_Viet]
+ 4, // Range #192: [43739, 43743, Tai_Viet]
+ 22, // Range #193: [43744, 43766, Meetei_Mayek]
+ 21, // Range #194: [43777, 43798, Ethiopic]
+ 14, // Range #195: [43808, 43822, Ethiopic]
+ 42, // Range #196: [43824, 43866, Latin]
+ 8, // Range #197: [43868, 43876, Latin]
+ 0, // Range #198: [43877, 43877, Greek]
+ 3, // Range #199: [43878, 43881, Latin]
+ 79, // Range #200: [43888, 43967, Cherokee]
+ 57, // Range #201: [43968, 44025, Meetei_Mayek]
+ 11171, // Range #202: [44032, 55203, Hangul]
+ 75, // Range #203: [55216, 55291, Hangul]
+ 473, // Range #204: [63744, 64217, Han]
+ 6, // Range #205: [64256, 64262, Latin]
+ 4, // Range #206: [64275, 64279, Armenian]
+ 50, // Range #207: [64285, 64335, Hebrew]
+ 113, // Range #208: [64336, 64449, Arabic]
+ 362, // Range #209: [64467, 64829, Arabic]
+ 119, // Range #210: [64848, 64967, Arabic]
+ 13, // Range #211: [65008, 65021, Arabic]
+ 1, // Range #212: [65070, 65071, Cyrillic]
+ 140, // Range #213: [65136, 65276, Arabic]
+ 25, // Range #214: [65313, 65338, Latin]
+ 25, // Range #215: [65345, 65370, Latin]
+ 9, // Range #216: [65382, 65391, Katakana]
+ 44, // Range #217: [65393, 65437, Katakana]
+ 60, // Range #218: [65440, 65500, Hangul]
+ 93, // Range #219: [65536, 65629, Linear_B]
+ 122, // Range #220: [65664, 65786, Linear_B]
+ 78, // Range #221: [65856, 65934, Greek]
+ 0, // Range #222: [65952, 65952, Greek]
+ 28, // Range #223: [66176, 66204, Lycian]
+ 48, // Range #224: [66208, 66256, Carian]
+ 35, // Range #225: [66304, 66339, Old_Italic]
+ 2, // Range #226: [66349, 66351, Old_Italic]
+ 26, // Range #227: [66352, 66378, Gothic]
+ 42, // Range #228: [66384, 66426, Old_Permic]
+ 31, // Range #229: [66432, 66463, Ugaritic]
+ 53, // Range #230: [66464, 66517, Old_Persian]
+ 79, // Range #231: [66560, 66639, Deseret]
+ 47, // Range #232: [66640, 66687, Shavian]
+ 41, // Range #233: [66688, 66729, Osmanya]
+ 75, // Range #234: [66736, 66811, Osage]
+ 39, // Range #235: [66816, 66855, Elbasan]
+ 51, // Range #236: [66864, 66915, Caucasian_Albanian]
+ 0, // Range #237: [66927, 66927, Caucasian_Albanian]
+ 310, // Range #238: [67072, 67382, Linear_A]
+ 21, // Range #239: [67392, 67413, Linear_A]
+ 7, // Range #240: [67424, 67431, Linear_A]
+ 63, // Range #241: [67584, 67647, Cypriot]
+ 31, // Range #242: [67648, 67679, Imperial_Aramaic]
+ 31, // Range #243: [67680, 67711, Palmyrene]
+ 30, // Range #244: [67712, 67742, Nabataean]
+ 8, // Range #245: [67751, 67759, Nabataean]
+ 21, // Range #246: [67808, 67829, Hatran]
+ 4, // Range #247: [67835, 67839, Hatran]
+ 31, // Range #248: [67840, 67871, Phoenician]
+ 25, // Range #249: [67872, 67897, Lydian]
+ 0, // Range #250: [67903, 67903, Lydian]
+ 31, // Range #251: [67968, 67999, Meroitic_Hieroglyphs]
+ 95, // Range #252: [68000, 68095, Meroitic_Cursive]
+ 6, // Range #253: [68096, 68102, Kharoshthi]
+ 60, // Range #254: [68108, 68168, Kharoshthi]
+ 8, // Range #255: [68176, 68184, Kharoshthi]
+ 31, // Range #256: [68192, 68223, Old_South_Arabian]
+ 31, // Range #257: [68224, 68255, Old_North_Arabian]
+ 54, // Range #258: [68288, 68342, Manichaean]
+ 63, // Range #259: [68352, 68415, Avestan]
+ 31, // Range #260: [68416, 68447, Inscriptional_Parthian]
+ 18, // Range #261: [68448, 68466, Inscriptional_Pahlavi]
+ 7, // Range #262: [68472, 68479, Inscriptional_Pahlavi]
+ 17, // Range #263: [68480, 68497, Psalter_Pahlavi]
+ 3, // Range #264: [68505, 68508, Psalter_Pahlavi]
+ 6, // Range #265: [68521, 68527, Psalter_Pahlavi]
+ 72, // Range #266: [68608, 68680, Old_Turkic]
+ 50, // Range #267: [68736, 68786, Old_Hungarian]
+ 50, // Range #268: [68800, 68850, Old_Hungarian]
+ 5, // Range #269: [68858, 68863, Old_Hungarian]
+ 39, // Range #270: [68864, 68903, Hanifi_Rohingya]
+ 9, // Range #271: [68912, 68921, Hanifi_Rohingya]
+ 30, // Range #272: [69216, 69246, Arabic]
+ 49, // Range #273: [69248, 69297, Yezidi]
39, // Range #274: [69376, 69415, Old_Sogdian]
41, // Range #275: [69424, 69465, Sogdian]
- 22, // Range #276: [69600, 69622, Elymaic]
- 111, // Range #277: [69632, 69743, Brahmi]
- 0, // Range #278: [69759, 69759, Brahmi]
- 65, // Range #279: [69760, 69825, Kaithi]
- 0, // Range #280: [69837, 69837, Kaithi]
- 24, // Range #281: [69840, 69864, Sora_Sompeng]
- 9, // Range #282: [69872, 69881, Sora_Sompeng]
- 70, // Range #283: [69888, 69958, Chakma]
- 38, // Range #284: [69968, 70006, Mahajani]
- 95, // Range #285: [70016, 70111, Sharada]
- 19, // Range #286: [70113, 70132, Sinhala]
- 62, // Range #287: [70144, 70206, Khojki]
- 41, // Range #288: [70272, 70313, Multani]
- 58, // Range #289: [70320, 70378, Khudawadi]
- 9, // Range #290: [70384, 70393, Khudawadi]
- 57, // Range #291: [70400, 70457, Grantha]
- 20, // Range #292: [70460, 70480, Grantha]
- 0, // Range #293: [70487, 70487, Grantha]
- 23, // Range #294: [70493, 70516, Grantha]
- 95, // Range #295: [70656, 70751, Newa]
- 71, // Range #296: [70784, 70855, Tirhuta]
- 9, // Range #297: [70864, 70873, Tirhuta]
- 93, // Range #298: [71040, 71133, Siddham]
- 68, // Range #299: [71168, 71236, Modi]
- 9, // Range #300: [71248, 71257, Modi]
- 12, // Range #301: [71264, 71276, Mongolian]
- 56, // Range #302: [71296, 71352, Takri]
- 9, // Range #303: [71360, 71369, Takri]
- 63, // Range #304: [71424, 71487, Ahom]
- 59, // Range #305: [71680, 71739, Dogra]
- 82, // Range #306: [71840, 71922, Warang_Citi]
- 0, // Range #307: [71935, 71935, Warang_Citi]
- 68, // Range #308: [72096, 72164, Nandinagari]
- 71, // Range #309: [72192, 72263, Zanabazar_Square]
- 82, // Range #310: [72272, 72354, Soyombo]
- 56, // Range #311: [72384, 72440, Pau_Cin_Hau]
- 69, // Range #312: [72704, 72773, Bhaiksuki]
- 28, // Range #313: [72784, 72812, Bhaiksuki]
- 70, // Range #314: [72816, 72886, Marchen]
- 71, // Range #315: [72960, 73031, Masaram_Gondi]
- 9, // Range #316: [73040, 73049, Masaram_Gondi]
- 56, // Range #317: [73056, 73112, Gunjala_Gondi]
- 9, // Range #318: [73120, 73129, Gunjala_Gondi]
- 24, // Range #319: [73440, 73464, Makasar]
- 49, // Range #320: [73664, 73713, Tamil]
- 0, // Range #321: [73727, 73727, Tamil]
- 921, // Range #322: [73728, 74649, Cuneiform]
- 116, // Range #323: [74752, 74868, Cuneiform]
- 195, // Range #324: [74880, 75075, Cuneiform]
- 1080, // Range #325: [77824, 78904, Egyptian_Hieroglyphs]
- 582, // Range #326: [82944, 83526, Anatolian_Hieroglyphs]
- 568, // Range #327: [92160, 92728, Bamum]
- 47, // Range #328: [92736, 92783, Mro]
- 37, // Range #329: [92880, 92917, Bassa_Vah]
- 69, // Range #330: [92928, 92997, Pahawh_Hmong]
- 39, // Range #331: [93008, 93047, Pahawh_Hmong]
- 18, // Range #332: [93053, 93071, Pahawh_Hmong]
- 90, // Range #333: [93760, 93850, Medefaidrin]
- 135, // Range #334: [93952, 94087, Miao]
- 16, // Range #335: [94095, 94111, Miao]
- 0, // Range #336: [94176, 94176, Tangut]
- 0, // Range #337: [94177, 94177, Nushu]
- 6135, // Range #338: [94208, 100343, Tangut]
- 754, // Range #339: [100352, 101106, Tangut]
- 0, // Range #340: [110592, 110592, Katakana]
- 285, // Range #341: [110593, 110878, Hiragana]
- 2, // Range #342: [110928, 110930, Hiragana]
- 3, // Range #343: [110948, 110951, Katakana]
- 395, // Range #344: [110960, 111355, Nushu]
- 106, // Range #345: [113664, 113770, Duployan]
- 24, // Range #346: [113776, 113800, Duployan]
- 15, // Range #347: [113808, 113823, Duployan]
- 69, // Range #348: [119296, 119365, Greek]
- 651, // Range #349: [120832, 121483, SignWriting]
- 20, // Range #350: [121499, 121519, SignWriting]
- 42, // Range #351: [122880, 122922, Glagolitic]
- 79, // Range #352: [123136, 123215, Nyiakeng_Puachue_Hmong]
- 57, // Range #353: [123584, 123641, Wancho]
- 0, // Range #354: [123647, 123647, Wancho]
- 214, // Range #355: [124928, 125142, Mende_Kikakui]
- 95, // Range #356: [125184, 125279, Adlam]
- 59, // Range #357: [126464, 126523, Arabic]
- 89, // Range #358: [126530, 126619, Arabic]
- 26, // Range #359: [126625, 126651, Arabic]
- 1, // Range #360: [126704, 126705, Arabic]
- 0, // Range #361: [127488, 127488, Hiragana]
- 42710, // Range #362: [131072, 173782, Han]
- 4148, // Range #363: [173824, 177972, Han]
- 5985, // Range #364: [177984, 183969, Han]
- 7472, // Range #365: [183984, 191456, Han]
- 541, // Range #366: [194560, 195101, Han]
+ 27, // Range #276: [69552, 69579, Chorasmian]
+ 22, // Range #277: [69600, 69622, Elymaic]
+ 111, // Range #278: [69632, 69743, Brahmi]
+ 0, // Range #279: [69759, 69759, Brahmi]
+ 65, // Range #280: [69760, 69825, Kaithi]
+ 0, // Range #281: [69837, 69837, Kaithi]
+ 24, // Range #282: [69840, 69864, Sora_Sompeng]
+ 9, // Range #283: [69872, 69881, Sora_Sompeng]
+ 71, // Range #284: [69888, 69959, Chakma]
+ 38, // Range #285: [69968, 70006, Mahajani]
+ 95, // Range #286: [70016, 70111, Sharada]
+ 19, // Range #287: [70113, 70132, Sinhala]
+ 62, // Range #288: [70144, 70206, Khojki]
+ 41, // Range #289: [70272, 70313, Multani]
+ 58, // Range #290: [70320, 70378, Khudawadi]
+ 9, // Range #291: [70384, 70393, Khudawadi]
+ 57, // Range #292: [70400, 70457, Grantha]
+ 20, // Range #293: [70460, 70480, Grantha]
+ 0, // Range #294: [70487, 70487, Grantha]
+ 23, // Range #295: [70493, 70516, Grantha]
+ 97, // Range #296: [70656, 70753, Newa]
+ 71, // Range #297: [70784, 70855, Tirhuta]
+ 9, // Range #298: [70864, 70873, Tirhuta]
+ 93, // Range #299: [71040, 71133, Siddham]
+ 68, // Range #300: [71168, 71236, Modi]
+ 9, // Range #301: [71248, 71257, Modi]
+ 12, // Range #302: [71264, 71276, Mongolian]
+ 56, // Range #303: [71296, 71352, Takri]
+ 9, // Range #304: [71360, 71369, Takri]
+ 63, // Range #305: [71424, 71487, Ahom]
+ 59, // Range #306: [71680, 71739, Dogra]
+ 82, // Range #307: [71840, 71922, Warang_Citi]
+ 0, // Range #308: [71935, 71935, Warang_Citi]
+ 70, // Range #309: [71936, 72006, Dives_Akuru]
+ 9, // Range #310: [72016, 72025, Dives_Akuru]
+ 68, // Range #311: [72096, 72164, Nandinagari]
+ 71, // Range #312: [72192, 72263, Zanabazar_Square]
+ 82, // Range #313: [72272, 72354, Soyombo]
+ 56, // Range #314: [72384, 72440, Pau_Cin_Hau]
+ 69, // Range #315: [72704, 72773, Bhaiksuki]
+ 28, // Range #316: [72784, 72812, Bhaiksuki]
+ 70, // Range #317: [72816, 72886, Marchen]
+ 71, // Range #318: [72960, 73031, Masaram_Gondi]
+ 9, // Range #319: [73040, 73049, Masaram_Gondi]
+ 56, // Range #320: [73056, 73112, Gunjala_Gondi]
+ 9, // Range #321: [73120, 73129, Gunjala_Gondi]
+ 24, // Range #322: [73440, 73464, Makasar]
+ 0, // Range #323: [73648, 73648, Lisu]
+ 49, // Range #324: [73664, 73713, Tamil]
+ 0, // Range #325: [73727, 73727, Tamil]
+ 921, // Range #326: [73728, 74649, Cuneiform]
+ 116, // Range #327: [74752, 74868, Cuneiform]
+ 195, // Range #328: [74880, 75075, Cuneiform]
+ 1080, // Range #329: [77824, 78904, Egyptian_Hieroglyphs]
+ 582, // Range #330: [82944, 83526, Anatolian_Hieroglyphs]
+ 568, // Range #331: [92160, 92728, Bamum]
+ 47, // Range #332: [92736, 92783, Mro]
+ 37, // Range #333: [92880, 92917, Bassa_Vah]
+ 69, // Range #334: [92928, 92997, Pahawh_Hmong]
+ 39, // Range #335: [93008, 93047, Pahawh_Hmong]
+ 18, // Range #336: [93053, 93071, Pahawh_Hmong]
+ 90, // Range #337: [93760, 93850, Medefaidrin]
+ 135, // Range #338: [93952, 94087, Miao]
+ 16, // Range #339: [94095, 94111, Miao]
+ 0, // Range #340: [94176, 94176, Tangut]
+ 0, // Range #341: [94177, 94177, Nushu]
+ 0, // Range #342: [94180, 94180, Khitan_Small_Script]
+ 1, // Range #343: [94192, 94193, Han]
+ 6135, // Range #344: [94208, 100343, Tangut]
+ 767, // Range #345: [100352, 101119, Tangut]
+ 469, // Range #346: [101120, 101589, Khitan_Small_Script]
+ 8, // Range #347: [101632, 101640, Tangut]
+ 0, // Range #348: [110592, 110592, Katakana]
+ 285, // Range #349: [110593, 110878, Hiragana]
+ 2, // Range #350: [110928, 110930, Hiragana]
+ 3, // Range #351: [110948, 110951, Katakana]
+ 395, // Range #352: [110960, 111355, Nushu]
+ 106, // Range #353: [113664, 113770, Duployan]
+ 24, // Range #354: [113776, 113800, Duployan]
+ 15, // Range #355: [113808, 113823, Duployan]
+ 69, // Range #356: [119296, 119365, Greek]
+ 651, // Range #357: [120832, 121483, SignWriting]
+ 20, // Range #358: [121499, 121519, SignWriting]
+ 42, // Range #359: [122880, 122922, Glagolitic]
+ 79, // Range #360: [123136, 123215, Nyiakeng_Puachue_Hmong]
+ 57, // Range #361: [123584, 123641, Wancho]
+ 0, // Range #362: [123647, 123647, Wancho]
+ 214, // Range #363: [124928, 125142, Mende_Kikakui]
+ 95, // Range #364: [125184, 125279, Adlam]
+ 59, // Range #365: [126464, 126523, Arabic]
+ 89, // Range #366: [126530, 126619, Arabic]
+ 26, // Range #367: [126625, 126651, Arabic]
+ 1, // Range #368: [126704, 126705, Arabic]
+ 0, // Range #369: [127488, 127488, Hiragana]
+ 42717, // Range #370: [131072, 173789, Han]
+ 4148, // Range #371: [173824, 177972, Han]
+ 5985, // Range #372: [177984, 183969, Han]
+ 7472, // Range #373: [183984, 191456, Han]
+ 541, // Range #374: [194560, 195101, Han]
+ 4938, // Range #375: [196608, 201546, Han]
};
const uint8 kRangeScript[] = {
@@ -788,358 +806,367 @@
14, // Range #15: [1008, 1023, Greek]
8, // Range #16: [1024, 1156, Cyrillic]
8, // Range #17: [1159, 1327, Cyrillic]
- 3, // Range #18: [1329, 1416, Armenian]
- 3, // Range #19: [1418, 1423, Armenian]
- 19, // Range #20: [1425, 1479, Hebrew]
- 19, // Range #21: [1488, 1524, Hebrew]
- 2, // Range #22: [1536, 1540, Arabic]
- 2, // Range #23: [1542, 1547, Arabic]
- 2, // Range #24: [1549, 1562, Arabic]
- 2, // Range #25: [1564, 1566, Arabic]
- 2, // Range #26: [1568, 1599, Arabic]
- 2, // Range #27: [1601, 1610, Arabic]
- 2, // Range #28: [1622, 1647, Arabic]
- 2, // Range #29: [1649, 1756, Arabic]
- 2, // Range #30: [1758, 1791, Arabic]
- 34, // Range #31: [1792, 1871, Syriac]
- 2, // Range #32: [1872, 1919, Arabic]
- 37, // Range #33: [1920, 1969, Thaana]
- 87, // Range #34: [1984, 2047, Nko]
- 126, // Range #35: [2048, 2110, Samaritan]
- 84, // Range #36: [2112, 2142, Mandaic]
- 34, // Range #37: [2144, 2154, Syriac]
- 2, // Range #38: [2208, 2237, Arabic]
- 2, // Range #39: [2259, 2273, Arabic]
- 2, // Range #40: [2275, 2303, Arabic]
- 10, // Range #41: [2304, 2384, Devanagari]
- 10, // Range #42: [2389, 2403, Devanagari]
- 10, // Range #43: [2406, 2431, Devanagari]
- 4, // Range #44: [2432, 2510, Bengali]
- 4, // Range #45: [2519, 2558, Bengali]
- 16, // Range #46: [2561, 2641, Gurmukhi]
- 16, // Range #47: [2649, 2654, Gurmukhi]
- 16, // Range #48: [2662, 2678, Gurmukhi]
- 15, // Range #49: [2689, 2768, Gujarati]
- 15, // Range #50: [2784, 2801, Gujarati]
- 15, // Range #51: [2809, 2815, Gujarati]
- 31, // Range #52: [2817, 2893, Oriya]
- 31, // Range #53: [2902, 2935, Oriya]
- 35, // Range #54: [2946, 3024, Tamil]
- 35, // Range #55: [3031, 3031, Tamil]
- 35, // Range #56: [3046, 3066, Tamil]
- 36, // Range #57: [3072, 3149, Telugu]
- 36, // Range #58: [3157, 3162, Telugu]
- 36, // Range #59: [3168, 3183, Telugu]
- 36, // Range #60: [3191, 3199, Telugu]
- 21, // Range #61: [3200, 3277, Kannada]
- 21, // Range #62: [3285, 3286, Kannada]
- 21, // Range #63: [3294, 3314, Kannada]
- 26, // Range #64: [3328, 3455, Malayalam]
- 33, // Range #65: [3458, 3551, Sinhala]
- 33, // Range #66: [3558, 3572, Sinhala]
- 38, // Range #67: [3585, 3642, Thai]
- 38, // Range #68: [3648, 3675, Thai]
- 24, // Range #69: [3713, 3807, Lao]
- 39, // Range #70: [3840, 4052, Tibetan]
- 39, // Range #71: [4057, 4058, Tibetan]
- 28, // Range #72: [4096, 4255, Myanmar]
- 12, // Range #73: [4256, 4295, Georgian]
- 12, // Range #74: [4301, 4346, Georgian]
- 12, // Range #75: [4348, 4351, Georgian]
- 18, // Range #76: [4352, 4607, Hangul]
- 11, // Range #77: [4608, 5017, Ethiopic]
- 6, // Range #78: [5024, 5117, Cherokee]
- 40, // Range #79: [5120, 5759, Canadian_Aboriginal]
- 29, // Range #80: [5760, 5788, Ogham]
- 32, // Range #81: [5792, 5866, Runic]
- 32, // Range #82: [5870, 5880, Runic]
- 42, // Range #83: [5888, 5908, Tagalog]
- 43, // Range #84: [5920, 5940, Hanunoo]
- 44, // Range #85: [5952, 5971, Buhid]
- 45, // Range #86: [5984, 6003, Tagbanwa]
- 23, // Range #87: [6016, 6121, Khmer]
- 23, // Range #88: [6128, 6137, Khmer]
- 27, // Range #89: [6144, 6145, Mongolian]
- 27, // Range #90: [6148, 6148, Mongolian]
- 27, // Range #91: [6150, 6169, Mongolian]
- 27, // Range #92: [6176, 6264, Mongolian]
- 27, // Range #93: [6272, 6314, Mongolian]
- 40, // Range #94: [6320, 6389, Canadian_Aboriginal]
- 48, // Range #95: [6400, 6479, Limbu]
- 52, // Range #96: [6480, 6516, Tai_Le]
- 59, // Range #97: [6528, 6601, New_Tai_Lue]
- 59, // Range #98: [6608, 6623, New_Tai_Lue]
- 23, // Range #99: [6624, 6655, Khmer]
- 55, // Range #100: [6656, 6687, Buginese]
- 106, // Range #101: [6688, 6793, Tai_Tham]
- 106, // Range #102: [6800, 6809, Tai_Tham]
- 106, // Range #103: [6816, 6829, Tai_Tham]
- 62, // Range #104: [6912, 7036, Balinese]
- 113, // Range #105: [7040, 7103, Sundanese]
- 63, // Range #106: [7104, 7155, Batak]
- 63, // Range #107: [7164, 7167, Batak]
- 82, // Range #108: [7168, 7247, Lepcha]
- 109, // Range #109: [7248, 7295, Ol_Chiki]
- 8, // Range #110: [7296, 7304, Cyrillic]
- 12, // Range #111: [7312, 7359, Georgian]
- 113, // Range #112: [7360, 7367, Sundanese]
- 25, // Range #113: [7424, 7461, Latin]
- 14, // Range #114: [7462, 7466, Greek]
- 8, // Range #115: [7467, 7467, Cyrillic]
- 25, // Range #116: [7468, 7516, Latin]
- 14, // Range #117: [7517, 7521, Greek]
- 25, // Range #118: [7522, 7525, Latin]
- 14, // Range #119: [7526, 7530, Greek]
- 25, // Range #120: [7531, 7543, Latin]
- 8, // Range #121: [7544, 7544, Cyrillic]
- 25, // Range #122: [7545, 7614, Latin]
- 14, // Range #123: [7615, 7615, Greek]
- 25, // Range #124: [7680, 7935, Latin]
- 14, // Range #125: [7936, 8190, Greek]
- 25, // Range #126: [8305, 8305, Latin]
- 25, // Range #127: [8319, 8319, Latin]
- 25, // Range #128: [8336, 8348, Latin]
- 14, // Range #129: [8486, 8486, Greek]
- 25, // Range #130: [8490, 8491, Latin]
- 25, // Range #131: [8498, 8498, Latin]
- 25, // Range #132: [8526, 8526, Latin]
- 25, // Range #133: [8544, 8584, Latin]
- 46, // Range #134: [10240, 10495, Braille]
- 56, // Range #135: [11264, 11358, Glagolitic]
- 25, // Range #136: [11360, 11391, Latin]
- 7, // Range #137: [11392, 11507, Coptic]
- 7, // Range #138: [11513, 11519, Coptic]
- 12, // Range #139: [11520, 11559, Georgian]
- 12, // Range #140: [11565, 11565, Georgian]
- 60, // Range #141: [11568, 11623, Tifinagh]
- 60, // Range #142: [11631, 11632, Tifinagh]
- 60, // Range #143: [11647, 11647, Tifinagh]
- 11, // Range #144: [11648, 11670, Ethiopic]
- 11, // Range #145: [11680, 11742, Ethiopic]
- 8, // Range #146: [11744, 11775, Cyrillic]
- 17, // Range #147: [11904, 12019, Han]
- 17, // Range #148: [12032, 12245, Han]
- 17, // Range #149: [12293, 12293, Han]
- 17, // Range #150: [12295, 12295, Han]
- 17, // Range #151: [12321, 12329, Han]
- 18, // Range #152: [12334, 12335, Hangul]
- 17, // Range #153: [12344, 12347, Han]
- 20, // Range #154: [12353, 12438, Hiragana]
- 20, // Range #155: [12445, 12447, Hiragana]
- 22, // Range #156: [12449, 12538, Katakana]
- 22, // Range #157: [12541, 12543, Katakana]
- 5, // Range #158: [12549, 12591, Bopomofo]
- 18, // Range #159: [12593, 12686, Hangul]
- 5, // Range #160: [12704, 12730, Bopomofo]
- 22, // Range #161: [12784, 12799, Katakana]
- 18, // Range #162: [12800, 12830, Hangul]
- 18, // Range #163: [12896, 12926, Hangul]
- 22, // Range #164: [13008, 13054, Katakana]
- 22, // Range #165: [13056, 13143, Katakana]
- 17, // Range #166: [13312, 19893, Han]
- 17, // Range #167: [19968, 40943, Han]
- 41, // Range #168: [40960, 42182, Yi]
- 131, // Range #169: [42192, 42239, Lisu]
- 99, // Range #170: [42240, 42539, Vai]
- 8, // Range #171: [42560, 42655, Cyrillic]
- 130, // Range #172: [42656, 42743, Bamum]
- 25, // Range #173: [42786, 42887, Latin]
- 25, // Range #174: [42891, 42950, Latin]
- 25, // Range #175: [42999, 43007, Latin]
- 58, // Range #176: [43008, 43051, Syloti_Nagri]
- 90, // Range #177: [43072, 43127, Phags_Pa]
- 111, // Range #178: [43136, 43205, Saurashtra]
- 111, // Range #179: [43214, 43225, Saurashtra]
- 10, // Range #180: [43232, 43263, Devanagari]
- 79, // Range #181: [43264, 43309, Kayah_Li]
- 79, // Range #182: [43311, 43311, Kayah_Li]
- 110, // Range #183: [43312, 43347, Rejang]
- 110, // Range #184: [43359, 43359, Rejang]
- 18, // Range #185: [43360, 43388, Hangul]
- 78, // Range #186: [43392, 43469, Javanese]
- 78, // Range #187: [43472, 43487, Javanese]
- 28, // Range #188: [43488, 43518, Myanmar]
- 66, // Range #189: [43520, 43574, Cham]
- 66, // Range #190: [43584, 43615, Cham]
- 28, // Range #191: [43616, 43647, Myanmar]
- 127, // Range #192: [43648, 43714, Tai_Viet]
- 127, // Range #193: [43739, 43743, Tai_Viet]
- 115, // Range #194: [43744, 43766, Meetei_Mayek]
- 11, // Range #195: [43777, 43798, Ethiopic]
- 11, // Range #196: [43808, 43822, Ethiopic]
- 25, // Range #197: [43824, 43866, Latin]
- 25, // Range #198: [43868, 43876, Latin]
- 14, // Range #199: [43877, 43877, Greek]
- 25, // Range #200: [43878, 43879, Latin]
- 6, // Range #201: [43888, 43967, Cherokee]
- 115, // Range #202: [43968, 44025, Meetei_Mayek]
- 18, // Range #203: [44032, 55203, Hangul]
- 18, // Range #204: [55216, 55291, Hangul]
- 17, // Range #205: [63744, 64217, Han]
- 25, // Range #206: [64256, 64262, Latin]
- 3, // Range #207: [64275, 64279, Armenian]
- 19, // Range #208: [64285, 64335, Hebrew]
- 2, // Range #209: [64336, 64449, Arabic]
- 2, // Range #210: [64467, 64829, Arabic]
- 2, // Range #211: [64848, 64967, Arabic]
- 2, // Range #212: [65008, 65021, Arabic]
- 8, // Range #213: [65070, 65071, Cyrillic]
- 2, // Range #214: [65136, 65276, Arabic]
- 25, // Range #215: [65313, 65338, Latin]
- 25, // Range #216: [65345, 65370, Latin]
- 22, // Range #217: [65382, 65391, Katakana]
- 22, // Range #218: [65393, 65437, Katakana]
- 18, // Range #219: [65440, 65500, Hangul]
- 49, // Range #220: [65536, 65629, Linear_B]
- 49, // Range #221: [65664, 65786, Linear_B]
- 14, // Range #222: [65856, 65934, Greek]
- 14, // Range #223: [65952, 65952, Greek]
- 107, // Range #224: [66176, 66204, Lycian]
- 104, // Range #225: [66208, 66256, Carian]
- 30, // Range #226: [66304, 66339, Old_Italic]
- 30, // Range #227: [66349, 66351, Old_Italic]
- 13, // Range #228: [66352, 66378, Gothic]
- 89, // Range #229: [66384, 66426, Old_Permic]
- 53, // Range #230: [66432, 66463, Ugaritic]
- 61, // Range #231: [66464, 66517, Old_Persian]
- 9, // Range #232: [66560, 66639, Deseret]
- 51, // Range #233: [66640, 66687, Shavian]
- 50, // Range #234: [66688, 66729, Osmanya]
- 171, // Range #235: [66736, 66811, Osage]
- 136, // Range #236: [66816, 66855, Elbasan]
- 159, // Range #237: [66864, 66915, Caucasian_Albanian]
- 159, // Range #238: [66927, 66927, Caucasian_Albanian]
- 83, // Range #239: [67072, 67382, Linear_A]
- 83, // Range #240: [67392, 67413, Linear_A]
- 83, // Range #241: [67424, 67431, Linear_A]
- 47, // Range #242: [67584, 67647, Cypriot]
- 116, // Range #243: [67648, 67679, Imperial_Aramaic]
- 144, // Range #244: [67680, 67711, Palmyrene]
- 143, // Range #245: [67712, 67742, Nabataean]
- 143, // Range #246: [67751, 67759, Nabataean]
- 162, // Range #247: [67808, 67829, Hatran]
- 162, // Range #248: [67835, 67839, Hatran]
- 91, // Range #249: [67840, 67871, Phoenician]
- 108, // Range #250: [67872, 67897, Lydian]
- 108, // Range #251: [67903, 67903, Lydian]
- 86, // Range #252: [67968, 67999, Meroitic_Hieroglyphs]
- 141, // Range #253: [68000, 68095, Meroitic_Cursive]
- 57, // Range #254: [68096, 68102, Kharoshthi]
- 57, // Range #255: [68108, 68168, Kharoshthi]
- 57, // Range #256: [68176, 68184, Kharoshthi]
- 133, // Range #257: [68192, 68223, Old_South_Arabian]
- 142, // Range #258: [68224, 68255, Old_North_Arabian]
- 121, // Range #259: [68288, 68342, Manichaean]
- 117, // Range #260: [68352, 68415, Avestan]
- 125, // Range #261: [68416, 68447, Inscriptional_Parthian]
- 122, // Range #262: [68448, 68466, Inscriptional_Pahlavi]
- 122, // Range #263: [68472, 68479, Inscriptional_Pahlavi]
- 123, // Range #264: [68480, 68497, Psalter_Pahlavi]
- 123, // Range #265: [68505, 68508, Psalter_Pahlavi]
- 123, // Range #266: [68521, 68527, Psalter_Pahlavi]
- 88, // Range #267: [68608, 68680, Old_Turkic]
- 76, // Range #268: [68736, 68786, Old_Hungarian]
- 76, // Range #269: [68800, 68850, Old_Hungarian]
- 76, // Range #270: [68858, 68863, Old_Hungarian]
- 182, // Range #271: [68864, 68903, Hanifi_Rohingya]
- 182, // Range #272: [68912, 68921, Hanifi_Rohingya]
- 2, // Range #273: [69216, 69246, Arabic]
+ 3, // Range #18: [1329, 1423, Armenian]
+ 19, // Range #19: [1425, 1479, Hebrew]
+ 19, // Range #20: [1488, 1524, Hebrew]
+ 2, // Range #21: [1536, 1540, Arabic]
+ 2, // Range #22: [1542, 1547, Arabic]
+ 2, // Range #23: [1549, 1562, Arabic]
+ 2, // Range #24: [1564, 1566, Arabic]
+ 2, // Range #25: [1568, 1599, Arabic]
+ 2, // Range #26: [1601, 1610, Arabic]
+ 2, // Range #27: [1622, 1647, Arabic]
+ 2, // Range #28: [1649, 1756, Arabic]
+ 2, // Range #29: [1758, 1791, Arabic]
+ 34, // Range #30: [1792, 1871, Syriac]
+ 2, // Range #31: [1872, 1919, Arabic]
+ 37, // Range #32: [1920, 1969, Thaana]
+ 87, // Range #33: [1984, 2047, Nko]
+ 126, // Range #34: [2048, 2110, Samaritan]
+ 84, // Range #35: [2112, 2142, Mandaic]
+ 34, // Range #36: [2144, 2154, Syriac]
+ 2, // Range #37: [2208, 2247, Arabic]
+ 2, // Range #38: [2259, 2273, Arabic]
+ 2, // Range #39: [2275, 2303, Arabic]
+ 10, // Range #40: [2304, 2384, Devanagari]
+ 10, // Range #41: [2389, 2403, Devanagari]
+ 10, // Range #42: [2406, 2431, Devanagari]
+ 4, // Range #43: [2432, 2510, Bengali]
+ 4, // Range #44: [2519, 2558, Bengali]
+ 16, // Range #45: [2561, 2641, Gurmukhi]
+ 16, // Range #46: [2649, 2654, Gurmukhi]
+ 16, // Range #47: [2662, 2678, Gurmukhi]
+ 15, // Range #48: [2689, 2768, Gujarati]
+ 15, // Range #49: [2784, 2801, Gujarati]
+ 15, // Range #50: [2809, 2815, Gujarati]
+ 31, // Range #51: [2817, 2893, Oriya]
+ 31, // Range #52: [2901, 2935, Oriya]
+ 35, // Range #53: [2946, 3024, Tamil]
+ 35, // Range #54: [3031, 3031, Tamil]
+ 35, // Range #55: [3046, 3066, Tamil]
+ 36, // Range #56: [3072, 3149, Telugu]
+ 36, // Range #57: [3157, 3162, Telugu]
+ 36, // Range #58: [3168, 3183, Telugu]
+ 36, // Range #59: [3191, 3199, Telugu]
+ 21, // Range #60: [3200, 3277, Kannada]
+ 21, // Range #61: [3285, 3286, Kannada]
+ 21, // Range #62: [3294, 3314, Kannada]
+ 26, // Range #63: [3328, 3455, Malayalam]
+ 33, // Range #64: [3457, 3551, Sinhala]
+ 33, // Range #65: [3558, 3572, Sinhala]
+ 38, // Range #66: [3585, 3642, Thai]
+ 38, // Range #67: [3648, 3675, Thai]
+ 24, // Range #68: [3713, 3807, Lao]
+ 39, // Range #69: [3840, 4052, Tibetan]
+ 39, // Range #70: [4057, 4058, Tibetan]
+ 28, // Range #71: [4096, 4255, Myanmar]
+ 12, // Range #72: [4256, 4295, Georgian]
+ 12, // Range #73: [4301, 4346, Georgian]
+ 12, // Range #74: [4348, 4351, Georgian]
+ 18, // Range #75: [4352, 4607, Hangul]
+ 11, // Range #76: [4608, 5017, Ethiopic]
+ 6, // Range #77: [5024, 5117, Cherokee]
+ 40, // Range #78: [5120, 5759, Canadian_Aboriginal]
+ 29, // Range #79: [5760, 5788, Ogham]
+ 32, // Range #80: [5792, 5866, Runic]
+ 32, // Range #81: [5870, 5880, Runic]
+ 42, // Range #82: [5888, 5908, Tagalog]
+ 43, // Range #83: [5920, 5940, Hanunoo]
+ 44, // Range #84: [5952, 5971, Buhid]
+ 45, // Range #85: [5984, 6003, Tagbanwa]
+ 23, // Range #86: [6016, 6121, Khmer]
+ 23, // Range #87: [6128, 6137, Khmer]
+ 27, // Range #88: [6144, 6145, Mongolian]
+ 27, // Range #89: [6148, 6148, Mongolian]
+ 27, // Range #90: [6150, 6169, Mongolian]
+ 27, // Range #91: [6176, 6264, Mongolian]
+ 27, // Range #92: [6272, 6314, Mongolian]
+ 40, // Range #93: [6320, 6389, Canadian_Aboriginal]
+ 48, // Range #94: [6400, 6479, Limbu]
+ 52, // Range #95: [6480, 6516, Tai_Le]
+ 59, // Range #96: [6528, 6601, New_Tai_Lue]
+ 59, // Range #97: [6608, 6623, New_Tai_Lue]
+ 23, // Range #98: [6624, 6655, Khmer]
+ 55, // Range #99: [6656, 6687, Buginese]
+ 106, // Range #100: [6688, 6793, Tai_Tham]
+ 106, // Range #101: [6800, 6809, Tai_Tham]
+ 106, // Range #102: [6816, 6829, Tai_Tham]
+ 62, // Range #103: [6912, 7036, Balinese]
+ 113, // Range #104: [7040, 7103, Sundanese]
+ 63, // Range #105: [7104, 7155, Batak]
+ 63, // Range #106: [7164, 7167, Batak]
+ 82, // Range #107: [7168, 7247, Lepcha]
+ 109, // Range #108: [7248, 7295, Ol_Chiki]
+ 8, // Range #109: [7296, 7304, Cyrillic]
+ 12, // Range #110: [7312, 7359, Georgian]
+ 113, // Range #111: [7360, 7367, Sundanese]
+ 25, // Range #112: [7424, 7461, Latin]
+ 14, // Range #113: [7462, 7466, Greek]
+ 8, // Range #114: [7467, 7467, Cyrillic]
+ 25, // Range #115: [7468, 7516, Latin]
+ 14, // Range #116: [7517, 7521, Greek]
+ 25, // Range #117: [7522, 7525, Latin]
+ 14, // Range #118: [7526, 7530, Greek]
+ 25, // Range #119: [7531, 7543, Latin]
+ 8, // Range #120: [7544, 7544, Cyrillic]
+ 25, // Range #121: [7545, 7614, Latin]
+ 14, // Range #122: [7615, 7615, Greek]
+ 25, // Range #123: [7680, 7935, Latin]
+ 14, // Range #124: [7936, 8190, Greek]
+ 25, // Range #125: [8305, 8305, Latin]
+ 25, // Range #126: [8319, 8319, Latin]
+ 25, // Range #127: [8336, 8348, Latin]
+ 14, // Range #128: [8486, 8486, Greek]
+ 25, // Range #129: [8490, 8491, Latin]
+ 25, // Range #130: [8498, 8498, Latin]
+ 25, // Range #131: [8526, 8526, Latin]
+ 25, // Range #132: [8544, 8584, Latin]
+ 46, // Range #133: [10240, 10495, Braille]
+ 56, // Range #134: [11264, 11358, Glagolitic]
+ 25, // Range #135: [11360, 11391, Latin]
+ 7, // Range #136: [11392, 11507, Coptic]
+ 7, // Range #137: [11513, 11519, Coptic]
+ 12, // Range #138: [11520, 11559, Georgian]
+ 12, // Range #139: [11565, 11565, Georgian]
+ 60, // Range #140: [11568, 11623, Tifinagh]
+ 60, // Range #141: [11631, 11632, Tifinagh]
+ 60, // Range #142: [11647, 11647, Tifinagh]
+ 11, // Range #143: [11648, 11670, Ethiopic]
+ 11, // Range #144: [11680, 11742, Ethiopic]
+ 8, // Range #145: [11744, 11775, Cyrillic]
+ 17, // Range #146: [11904, 12019, Han]
+ 17, // Range #147: [12032, 12245, Han]
+ 17, // Range #148: [12293, 12293, Han]
+ 17, // Range #149: [12295, 12295, Han]
+ 17, // Range #150: [12321, 12329, Han]
+ 18, // Range #151: [12334, 12335, Hangul]
+ 17, // Range #152: [12344, 12347, Han]
+ 20, // Range #153: [12353, 12438, Hiragana]
+ 20, // Range #154: [12445, 12447, Hiragana]
+ 22, // Range #155: [12449, 12538, Katakana]
+ 22, // Range #156: [12541, 12543, Katakana]
+ 5, // Range #157: [12549, 12591, Bopomofo]
+ 18, // Range #158: [12593, 12686, Hangul]
+ 5, // Range #159: [12704, 12735, Bopomofo]
+ 22, // Range #160: [12784, 12799, Katakana]
+ 18, // Range #161: [12800, 12830, Hangul]
+ 18, // Range #162: [12896, 12926, Hangul]
+ 22, // Range #163: [13008, 13054, Katakana]
+ 22, // Range #164: [13056, 13143, Katakana]
+ 17, // Range #165: [13312, 19903, Han]
+ 17, // Range #166: [19968, 40956, Han]
+ 41, // Range #167: [40960, 42182, Yi]
+ 131, // Range #168: [42192, 42239, Lisu]
+ 99, // Range #169: [42240, 42539, Vai]
+ 8, // Range #170: [42560, 42655, Cyrillic]
+ 130, // Range #171: [42656, 42743, Bamum]
+ 25, // Range #172: [42786, 42887, Latin]
+ 25, // Range #173: [42891, 42954, Latin]
+ 25, // Range #174: [42997, 43007, Latin]
+ 58, // Range #175: [43008, 43052, Syloti_Nagri]
+ 90, // Range #176: [43072, 43127, Phags_Pa]
+ 111, // Range #177: [43136, 43205, Saurashtra]
+ 111, // Range #178: [43214, 43225, Saurashtra]
+ 10, // Range #179: [43232, 43263, Devanagari]
+ 79, // Range #180: [43264, 43309, Kayah_Li]
+ 79, // Range #181: [43311, 43311, Kayah_Li]
+ 110, // Range #182: [43312, 43347, Rejang]
+ 110, // Range #183: [43359, 43359, Rejang]
+ 18, // Range #184: [43360, 43388, Hangul]
+ 78, // Range #185: [43392, 43469, Javanese]
+ 78, // Range #186: [43472, 43487, Javanese]
+ 28, // Range #187: [43488, 43518, Myanmar]
+ 66, // Range #188: [43520, 43574, Cham]
+ 66, // Range #189: [43584, 43615, Cham]
+ 28, // Range #190: [43616, 43647, Myanmar]
+ 127, // Range #191: [43648, 43714, Tai_Viet]
+ 127, // Range #192: [43739, 43743, Tai_Viet]
+ 115, // Range #193: [43744, 43766, Meetei_Mayek]
+ 11, // Range #194: [43777, 43798, Ethiopic]
+ 11, // Range #195: [43808, 43822, Ethiopic]
+ 25, // Range #196: [43824, 43866, Latin]
+ 25, // Range #197: [43868, 43876, Latin]
+ 14, // Range #198: [43877, 43877, Greek]
+ 25, // Range #199: [43878, 43881, Latin]
+ 6, // Range #200: [43888, 43967, Cherokee]
+ 115, // Range #201: [43968, 44025, Meetei_Mayek]
+ 18, // Range #202: [44032, 55203, Hangul]
+ 18, // Range #203: [55216, 55291, Hangul]
+ 17, // Range #204: [63744, 64217, Han]
+ 25, // Range #205: [64256, 64262, Latin]
+ 3, // Range #206: [64275, 64279, Armenian]
+ 19, // Range #207: [64285, 64335, Hebrew]
+ 2, // Range #208: [64336, 64449, Arabic]
+ 2, // Range #209: [64467, 64829, Arabic]
+ 2, // Range #210: [64848, 64967, Arabic]
+ 2, // Range #211: [65008, 65021, Arabic]
+ 8, // Range #212: [65070, 65071, Cyrillic]
+ 2, // Range #213: [65136, 65276, Arabic]
+ 25, // Range #214: [65313, 65338, Latin]
+ 25, // Range #215: [65345, 65370, Latin]
+ 22, // Range #216: [65382, 65391, Katakana]
+ 22, // Range #217: [65393, 65437, Katakana]
+ 18, // Range #218: [65440, 65500, Hangul]
+ 49, // Range #219: [65536, 65629, Linear_B]
+ 49, // Range #220: [65664, 65786, Linear_B]
+ 14, // Range #221: [65856, 65934, Greek]
+ 14, // Range #222: [65952, 65952, Greek]
+ 107, // Range #223: [66176, 66204, Lycian]
+ 104, // Range #224: [66208, 66256, Carian]
+ 30, // Range #225: [66304, 66339, Old_Italic]
+ 30, // Range #226: [66349, 66351, Old_Italic]
+ 13, // Range #227: [66352, 66378, Gothic]
+ 89, // Range #228: [66384, 66426, Old_Permic]
+ 53, // Range #229: [66432, 66463, Ugaritic]
+ 61, // Range #230: [66464, 66517, Old_Persian]
+ 9, // Range #231: [66560, 66639, Deseret]
+ 51, // Range #232: [66640, 66687, Shavian]
+ 50, // Range #233: [66688, 66729, Osmanya]
+ 171, // Range #234: [66736, 66811, Osage]
+ 136, // Range #235: [66816, 66855, Elbasan]
+ 159, // Range #236: [66864, 66915, Caucasian_Albanian]
+ 159, // Range #237: [66927, 66927, Caucasian_Albanian]
+ 83, // Range #238: [67072, 67382, Linear_A]
+ 83, // Range #239: [67392, 67413, Linear_A]
+ 83, // Range #240: [67424, 67431, Linear_A]
+ 47, // Range #241: [67584, 67647, Cypriot]
+ 116, // Range #242: [67648, 67679, Imperial_Aramaic]
+ 144, // Range #243: [67680, 67711, Palmyrene]
+ 143, // Range #244: [67712, 67742, Nabataean]
+ 143, // Range #245: [67751, 67759, Nabataean]
+ 162, // Range #246: [67808, 67829, Hatran]
+ 162, // Range #247: [67835, 67839, Hatran]
+ 91, // Range #248: [67840, 67871, Phoenician]
+ 108, // Range #249: [67872, 67897, Lydian]
+ 108, // Range #250: [67903, 67903, Lydian]
+ 86, // Range #251: [67968, 67999, Meroitic_Hieroglyphs]
+ 141, // Range #252: [68000, 68095, Meroitic_Cursive]
+ 57, // Range #253: [68096, 68102, Kharoshthi]
+ 57, // Range #254: [68108, 68168, Kharoshthi]
+ 57, // Range #255: [68176, 68184, Kharoshthi]
+ 133, // Range #256: [68192, 68223, Old_South_Arabian]
+ 142, // Range #257: [68224, 68255, Old_North_Arabian]
+ 121, // Range #258: [68288, 68342, Manichaean]
+ 117, // Range #259: [68352, 68415, Avestan]
+ 125, // Range #260: [68416, 68447, Inscriptional_Parthian]
+ 122, // Range #261: [68448, 68466, Inscriptional_Pahlavi]
+ 122, // Range #262: [68472, 68479, Inscriptional_Pahlavi]
+ 123, // Range #263: [68480, 68497, Psalter_Pahlavi]
+ 123, // Range #264: [68505, 68508, Psalter_Pahlavi]
+ 123, // Range #265: [68521, 68527, Psalter_Pahlavi]
+ 88, // Range #266: [68608, 68680, Old_Turkic]
+ 76, // Range #267: [68736, 68786, Old_Hungarian]
+ 76, // Range #268: [68800, 68850, Old_Hungarian]
+ 76, // Range #269: [68858, 68863, Old_Hungarian]
+ 182, // Range #270: [68864, 68903, Hanifi_Rohingya]
+ 182, // Range #271: [68912, 68921, Hanifi_Rohingya]
+ 2, // Range #272: [69216, 69246, Arabic]
+ 192, // Range #273: [69248, 69297, Yezidi]
184, // Range #274: [69376, 69415, Old_Sogdian]
183, // Range #275: [69424, 69465, Sogdian]
- 185, // Range #276: [69600, 69622, Elymaic]
- 65, // Range #277: [69632, 69743, Brahmi]
- 65, // Range #278: [69759, 69759, Brahmi]
- 120, // Range #279: [69760, 69825, Kaithi]
- 120, // Range #280: [69837, 69837, Kaithi]
- 152, // Range #281: [69840, 69864, Sora_Sompeng]
- 152, // Range #282: [69872, 69881, Sora_Sompeng]
- 118, // Range #283: [69888, 69958, Chakma]
- 160, // Range #284: [69968, 70006, Mahajani]
- 151, // Range #285: [70016, 70111, Sharada]
- 33, // Range #286: [70113, 70132, Sinhala]
- 157, // Range #287: [70144, 70206, Khojki]
- 164, // Range #288: [70272, 70313, Multani]
- 145, // Range #289: [70320, 70378, Khudawadi]
- 145, // Range #290: [70384, 70393, Khudawadi]
- 137, // Range #291: [70400, 70457, Grantha]
- 137, // Range #292: [70460, 70480, Grantha]
- 137, // Range #293: [70487, 70487, Grantha]
- 137, // Range #294: [70493, 70516, Grantha]
- 170, // Range #295: [70656, 70751, Newa]
- 158, // Range #296: [70784, 70855, Tirhuta]
- 158, // Range #297: [70864, 70873, Tirhuta]
- 166, // Range #298: [71040, 71133, Siddham]
- 163, // Range #299: [71168, 71236, Modi]
- 163, // Range #300: [71248, 71257, Modi]
- 27, // Range #301: [71264, 71276, Mongolian]
- 153, // Range #302: [71296, 71352, Takri]
- 153, // Range #303: [71360, 71369, Takri]
- 161, // Range #304: [71424, 71487, Ahom]
- 178, // Range #305: [71680, 71739, Dogra]
- 146, // Range #306: [71840, 71922, Warang_Citi]
- 146, // Range #307: [71935, 71935, Warang_Citi]
- 187, // Range #308: [72096, 72164, Nandinagari]
- 177, // Range #309: [72192, 72263, Zanabazar_Square]
- 176, // Range #310: [72272, 72354, Soyombo]
- 165, // Range #311: [72384, 72440, Pau_Cin_Hau]
- 168, // Range #312: [72704, 72773, Bhaiksuki]
- 168, // Range #313: [72784, 72812, Bhaiksuki]
- 169, // Range #314: [72816, 72886, Marchen]
- 175, // Range #315: [72960, 73031, Masaram_Gondi]
- 175, // Range #316: [73040, 73049, Masaram_Gondi]
- 179, // Range #317: [73056, 73112, Gunjala_Gondi]
- 179, // Range #318: [73120, 73129, Gunjala_Gondi]
- 180, // Range #319: [73440, 73464, Makasar]
- 35, // Range #320: [73664, 73713, Tamil]
- 35, // Range #321: [73727, 73727, Tamil]
- 101, // Range #322: [73728, 74649, Cuneiform]
- 101, // Range #323: [74752, 74868, Cuneiform]
- 101, // Range #324: [74880, 75075, Cuneiform]
- 71, // Range #325: [77824, 78904, Egyptian_Hieroglyphs]
- 156, // Range #326: [82944, 83526, Anatolian_Hieroglyphs]
- 130, // Range #327: [92160, 92728, Bamum]
- 149, // Range #328: [92736, 92783, Mro]
- 134, // Range #329: [92880, 92917, Bassa_Vah]
- 75, // Range #330: [92928, 92997, Pahawh_Hmong]
- 75, // Range #331: [93008, 93047, Pahawh_Hmong]
- 75, // Range #332: [93053, 93071, Pahawh_Hmong]
- 181, // Range #333: [93760, 93850, Medefaidrin]
- 92, // Range #334: [93952, 94087, Miao]
- 92, // Range #335: [94095, 94111, Miao]
- 154, // Range #336: [94176, 94176, Tangut]
- 150, // Range #337: [94177, 94177, Nushu]
- 154, // Range #338: [94208, 100343, Tangut]
- 154, // Range #339: [100352, 101106, Tangut]
- 22, // Range #340: [110592, 110592, Katakana]
- 20, // Range #341: [110593, 110878, Hiragana]
- 20, // Range #342: [110928, 110930, Hiragana]
- 22, // Range #343: [110948, 110951, Katakana]
- 150, // Range #344: [110960, 111355, Nushu]
- 135, // Range #345: [113664, 113770, Duployan]
- 135, // Range #346: [113776, 113800, Duployan]
- 135, // Range #347: [113808, 113823, Duployan]
- 14, // Range #348: [119296, 119365, Greek]
- 112, // Range #349: [120832, 121483, SignWriting]
- 112, // Range #350: [121499, 121519, SignWriting]
- 56, // Range #351: [122880, 122922, Glagolitic]
- 186, // Range #352: [123136, 123215, Nyiakeng_Puachue_Hmong]
- 188, // Range #353: [123584, 123641, Wancho]
- 188, // Range #354: [123647, 123647, Wancho]
- 140, // Range #355: [124928, 125142, Mende_Kikakui]
- 167, // Range #356: [125184, 125279, Adlam]
- 2, // Range #357: [126464, 126523, Arabic]
- 2, // Range #358: [126530, 126619, Arabic]
- 2, // Range #359: [126625, 126651, Arabic]
- 2, // Range #360: [126704, 126705, Arabic]
- 20, // Range #361: [127488, 127488, Hiragana]
- 17, // Range #362: [131072, 173782, Han]
- 17, // Range #363: [173824, 177972, Han]
- 17, // Range #364: [177984, 183969, Han]
- 17, // Range #365: [183984, 191456, Han]
- 17, // Range #366: [194560, 195101, Han]
+ 189, // Range #276: [69552, 69579, Chorasmian]
+ 185, // Range #277: [69600, 69622, Elymaic]
+ 65, // Range #278: [69632, 69743, Brahmi]
+ 65, // Range #279: [69759, 69759, Brahmi]
+ 120, // Range #280: [69760, 69825, Kaithi]
+ 120, // Range #281: [69837, 69837, Kaithi]
+ 152, // Range #282: [69840, 69864, Sora_Sompeng]
+ 152, // Range #283: [69872, 69881, Sora_Sompeng]
+ 118, // Range #284: [69888, 69959, Chakma]
+ 160, // Range #285: [69968, 70006, Mahajani]
+ 151, // Range #286: [70016, 70111, Sharada]
+ 33, // Range #287: [70113, 70132, Sinhala]
+ 157, // Range #288: [70144, 70206, Khojki]
+ 164, // Range #289: [70272, 70313, Multani]
+ 145, // Range #290: [70320, 70378, Khudawadi]
+ 145, // Range #291: [70384, 70393, Khudawadi]
+ 137, // Range #292: [70400, 70457, Grantha]
+ 137, // Range #293: [70460, 70480, Grantha]
+ 137, // Range #294: [70487, 70487, Grantha]
+ 137, // Range #295: [70493, 70516, Grantha]
+ 170, // Range #296: [70656, 70753, Newa]
+ 158, // Range #297: [70784, 70855, Tirhuta]
+ 158, // Range #298: [70864, 70873, Tirhuta]
+ 166, // Range #299: [71040, 71133, Siddham]
+ 163, // Range #300: [71168, 71236, Modi]
+ 163, // Range #301: [71248, 71257, Modi]
+ 27, // Range #302: [71264, 71276, Mongolian]
+ 153, // Range #303: [71296, 71352, Takri]
+ 153, // Range #304: [71360, 71369, Takri]
+ 161, // Range #305: [71424, 71487, Ahom]
+ 178, // Range #306: [71680, 71739, Dogra]
+ 146, // Range #307: [71840, 71922, Warang_Citi]
+ 146, // Range #308: [71935, 71935, Warang_Citi]
+ 190, // Range #309: [71936, 72006, Dives_Akuru]
+ 190, // Range #310: [72016, 72025, Dives_Akuru]
+ 187, // Range #311: [72096, 72164, Nandinagari]
+ 177, // Range #312: [72192, 72263, Zanabazar_Square]
+ 176, // Range #313: [72272, 72354, Soyombo]
+ 165, // Range #314: [72384, 72440, Pau_Cin_Hau]
+ 168, // Range #315: [72704, 72773, Bhaiksuki]
+ 168, // Range #316: [72784, 72812, Bhaiksuki]
+ 169, // Range #317: [72816, 72886, Marchen]
+ 175, // Range #318: [72960, 73031, Masaram_Gondi]
+ 175, // Range #319: [73040, 73049, Masaram_Gondi]
+ 179, // Range #320: [73056, 73112, Gunjala_Gondi]
+ 179, // Range #321: [73120, 73129, Gunjala_Gondi]
+ 180, // Range #322: [73440, 73464, Makasar]
+ 131, // Range #323: [73648, 73648, Lisu]
+ 35, // Range #324: [73664, 73713, Tamil]
+ 35, // Range #325: [73727, 73727, Tamil]
+ 101, // Range #326: [73728, 74649, Cuneiform]
+ 101, // Range #327: [74752, 74868, Cuneiform]
+ 101, // Range #328: [74880, 75075, Cuneiform]
+ 71, // Range #329: [77824, 78904, Egyptian_Hieroglyphs]
+ 156, // Range #330: [82944, 83526, Anatolian_Hieroglyphs]
+ 130, // Range #331: [92160, 92728, Bamum]
+ 149, // Range #332: [92736, 92783, Mro]
+ 134, // Range #333: [92880, 92917, Bassa_Vah]
+ 75, // Range #334: [92928, 92997, Pahawh_Hmong]
+ 75, // Range #335: [93008, 93047, Pahawh_Hmong]
+ 75, // Range #336: [93053, 93071, Pahawh_Hmong]
+ 181, // Range #337: [93760, 93850, Medefaidrin]
+ 92, // Range #338: [93952, 94087, Miao]
+ 92, // Range #339: [94095, 94111, Miao]
+ 154, // Range #340: [94176, 94176, Tangut]
+ 150, // Range #341: [94177, 94177, Nushu]
+ 191, // Range #342: [94180, 94180, Khitan_Small_Script]
+ 17, // Range #343: [94192, 94193, Han]
+ 154, // Range #344: [94208, 100343, Tangut]
+ 154, // Range #345: [100352, 101119, Tangut]
+ 191, // Range #346: [101120, 101589, Khitan_Small_Script]
+ 154, // Range #347: [101632, 101640, Tangut]
+ 22, // Range #348: [110592, 110592, Katakana]
+ 20, // Range #349: [110593, 110878, Hiragana]
+ 20, // Range #350: [110928, 110930, Hiragana]
+ 22, // Range #351: [110948, 110951, Katakana]
+ 150, // Range #352: [110960, 111355, Nushu]
+ 135, // Range #353: [113664, 113770, Duployan]
+ 135, // Range #354: [113776, 113800, Duployan]
+ 135, // Range #355: [113808, 113823, Duployan]
+ 14, // Range #356: [119296, 119365, Greek]
+ 112, // Range #357: [120832, 121483, SignWriting]
+ 112, // Range #358: [121499, 121519, SignWriting]
+ 56, // Range #359: [122880, 122922, Glagolitic]
+ 186, // Range #360: [123136, 123215, Nyiakeng_Puachue_Hmong]
+ 188, // Range #361: [123584, 123641, Wancho]
+ 188, // Range #362: [123647, 123647, Wancho]
+ 140, // Range #363: [124928, 125142, Mende_Kikakui]
+ 167, // Range #364: [125184, 125279, Adlam]
+ 2, // Range #365: [126464, 126523, Arabic]
+ 2, // Range #366: [126530, 126619, Arabic]
+ 2, // Range #367: [126625, 126651, Arabic]
+ 2, // Range #368: [126704, 126705, Arabic]
+ 20, // Range #369: [127488, 127488, Hiragana]
+ 17, // Range #370: [131072, 173789, Han]
+ 17, // Range #371: [173824, 177972, Han]
+ 17, // Range #372: [177984, 183969, Han]
+ 17, // Range #373: [183984, 191456, Han]
+ 17, // Range #374: [194560, 195101, Han]
+ 17, // Range #375: [196608, 201546, Han]
};
-const uint8 kMaxScript = 188;
+const uint8 kMaxScript = 192;
} // namespace approx_script_internal
} // namespace mobile
diff --git a/native/models/actions_suggestions.en.model b/native/models/actions_suggestions.en.model
index 6604fcb..d4b0ced 100755
--- a/native/models/actions_suggestions.en.model
+++ b/native/models/actions_suggestions.en.model
Binary files differ
diff --git a/native/models/actions_suggestions.universal.model b/native/models/actions_suggestions.universal.model
index 6261d8f..2ee546c 100755
--- a/native/models/actions_suggestions.universal.model
+++ b/native/models/actions_suggestions.universal.model
Binary files differ
diff --git a/native/models/textclassifier.ar.model b/native/models/textclassifier.ar.model
index 2224598..dbd685b 100755
--- a/native/models/textclassifier.ar.model
+++ b/native/models/textclassifier.ar.model
Binary files differ
diff --git a/native/models/textclassifier.en.model b/native/models/textclassifier.en.model
index fbb5a6c..c930fe6 100755
--- a/native/models/textclassifier.en.model
+++ b/native/models/textclassifier.en.model
Binary files differ
diff --git a/native/models/textclassifier.es.model b/native/models/textclassifier.es.model
index 2ef143c..26e3908 100755
--- a/native/models/textclassifier.es.model
+++ b/native/models/textclassifier.es.model
Binary files differ
diff --git a/native/models/textclassifier.fr.model b/native/models/textclassifier.fr.model
index 76babd6..9746ec9 100755
--- a/native/models/textclassifier.fr.model
+++ b/native/models/textclassifier.fr.model
Binary files differ
diff --git a/native/models/textclassifier.it.model b/native/models/textclassifier.it.model
index c5cd3b6..1ce898c 100755
--- a/native/models/textclassifier.it.model
+++ b/native/models/textclassifier.it.model
Binary files differ
diff --git a/native/models/textclassifier.ja.model b/native/models/textclassifier.ja.model
index a1c3bed..bc61400 100755
--- a/native/models/textclassifier.ja.model
+++ b/native/models/textclassifier.ja.model
Binary files differ
diff --git a/native/models/textclassifier.ko.model b/native/models/textclassifier.ko.model
index 8fe96cd..59a9cde 100755
--- a/native/models/textclassifier.ko.model
+++ b/native/models/textclassifier.ko.model
Binary files differ
diff --git a/native/models/textclassifier.nl.model b/native/models/textclassifier.nl.model
index e97afd9..aa95ca4 100755
--- a/native/models/textclassifier.nl.model
+++ b/native/models/textclassifier.nl.model
Binary files differ
diff --git a/native/models/textclassifier.pl.model b/native/models/textclassifier.pl.model
index 5b25d5e..10e36e1 100755
--- a/native/models/textclassifier.pl.model
+++ b/native/models/textclassifier.pl.model
Binary files differ
diff --git a/native/models/textclassifier.pt.model b/native/models/textclassifier.pt.model
index 1d8fd1a..c76e430 100755
--- a/native/models/textclassifier.pt.model
+++ b/native/models/textclassifier.pt.model
Binary files differ
diff --git a/native/models/textclassifier.ru.model b/native/models/textclassifier.ru.model
index b579beb..b9a3ffd 100755
--- a/native/models/textclassifier.ru.model
+++ b/native/models/textclassifier.ru.model
Binary files differ
diff --git a/native/models/textclassifier.th.model b/native/models/textclassifier.th.model
index 78c2bc9..a67237a 100755
--- a/native/models/textclassifier.th.model
+++ b/native/models/textclassifier.th.model
Binary files differ
diff --git a/native/models/textclassifier.tr.model b/native/models/textclassifier.tr.model
index d56e5ce..e3cfd68 100755
--- a/native/models/textclassifier.tr.model
+++ b/native/models/textclassifier.tr.model
Binary files differ
diff --git a/native/models/textclassifier.universal.model b/native/models/textclassifier.universal.model
index 27f023d..7f7476c 100755
--- a/native/models/textclassifier.universal.model
+++ b/native/models/textclassifier.universal.model
Binary files differ
diff --git a/native/models/textclassifier.zh.model b/native/models/textclassifier.zh.model
index d700417..fe11975 100755
--- a/native/models/textclassifier.zh.model
+++ b/native/models/textclassifier.zh.model
Binary files differ
diff --git a/native/utils/base/arena.h b/native/utils/base/arena.h
index 7562917..28b6f6c 100644
--- a/native/utils/base/arena.h
+++ b/native/utils/base/arena.h
@@ -53,6 +53,7 @@
#include <assert.h>
#include <string.h>
+
#include <vector>
#ifdef ADDRESS_SANITIZER
#include <sanitizer/asan_interface.h>
@@ -67,7 +68,7 @@
// arena at the same time without locking, as long as they use only
// const methods.
class BaseArena {
- protected: // You can't make an arena directly; only a subclass of one
+ protected: // You can't make an arena directly; only a subclass of one
BaseArena(char* first_block, const size_t block_size, bool align_to_page);
public:
@@ -77,18 +78,17 @@
// they're "slow" only 'cause they're virtual (subclasses define "fast" ones)
virtual char* SlowAlloc(size_t size) = 0;
- virtual void SlowFree(void* memory, size_t size) = 0;
+ virtual void SlowFree(void* memory, size_t size) = 0;
virtual char* SlowRealloc(char* memory, size_t old_size, size_t new_size) = 0;
class Status {
private:
friend class BaseArena;
size_t bytes_allocated_;
+
public:
- Status() : bytes_allocated_(0) { }
- size_t bytes_allocated() const {
- return bytes_allocated_;
- }
+ Status() : bytes_allocated_(0) {}
+ size_t bytes_allocated() const { return bytes_allocated_; }
};
// Accessors and stats counters
@@ -96,8 +96,8 @@
// type-compatible with ArenaAllocator (in arena_allocator.h). That is,
// we define arena() because ArenaAllocator does, and that way you
// can template on either of these and know it's safe to call arena().
- virtual BaseArena* arena() { return this; }
- size_t block_size() const { return block_size_; }
+ virtual BaseArena* arena() { return this; }
+ size_t block_size() const { return block_size_; }
int block_count() const;
bool is_empty() const {
// must check block count in case we allocated a block larger than blksize
@@ -112,8 +112,8 @@
void MakeNewBlock(const uint32 alignment);
void* GetMemoryFallback(const size_t size, const int align);
void* GetMemory(const size_t size, const int align) {
- assert(remaining_ <= block_size_); // an invariant
- if ( size > 0 && size <= remaining_ && align == 1 ) { // common case
+ assert(remaining_ <= block_size_); // an invariant
+ if (size > 0 && size <= remaining_ && align == 1) { // common case
last_alloc_ = freestart_;
freestart_ += size;
remaining_ -= size;
@@ -161,18 +161,18 @@
const AllocatedBlock* IndexToBlock(int index) const;
const size_t block_size_;
- char* freestart_; // beginning of the free space in most recent block
+ char* freestart_; // beginning of the free space in most recent block
char* freestart_when_empty_; // beginning of the free space when we're empty
- char* last_alloc_; // used to make sure ReturnBytes() is safe
+ char* last_alloc_; // used to make sure ReturnBytes() is safe
// if the first_blocks_ aren't enough, expand into overflow_blocks_.
std::vector<AllocatedBlock>* overflow_blocks_;
// STL vector isn't as efficient as it could be, so we use an array at first
- const bool first_block_externally_owned_; // true if they pass in 1st block
+ const bool first_block_externally_owned_; // true if they pass in 1st block
const bool page_aligned_; // when true, all blocks need to be page aligned
int8_t blocks_alloced_; // how many of the first_blocks_ have been allocated
- AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
+ AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
- void FreeBlocks(); // Frees all except first block
+ void FreeBlocks(); // Frees all except first block
BaseArena(const BaseArena&) = delete;
BaseArena& operator=(const BaseArena&) = delete;
@@ -182,18 +182,18 @@
public:
// Allocates a thread-compatible arena with the specified block size.
explicit UnsafeArena(const size_t block_size)
- : BaseArena(nullptr, block_size, false) { }
+ : BaseArena(nullptr, block_size, false) {}
UnsafeArena(const size_t block_size, bool align)
- : BaseArena(nullptr, block_size, align) { }
+ : BaseArena(nullptr, block_size, align) {}
// Allocates a thread-compatible arena with the specified block
// size. "first_block" must have size "block_size". Memory is
// allocated from "first_block" until it is exhausted; after that
// memory is allocated by allocating new blocks from the heap.
UnsafeArena(char* first_block, const size_t block_size)
- : BaseArena(first_block, block_size, false) { }
+ : BaseArena(first_block, block_size, false) {}
UnsafeArena(char* first_block, const size_t block_size, bool align)
- : BaseArena(first_block, block_size, align) { }
+ : BaseArena(first_block, block_size, align) {}
char* Alloc(const size_t size) {
return reinterpret_cast<char*>(GetMemory(size, 1));
@@ -201,6 +201,14 @@
void* AllocAligned(const size_t size, const int align) {
return GetMemory(size, align);
}
+
+ // Allocates and initializes an object on the arena.
+ template <typename T, typename... Args>
+ T* AllocAndInit(Args... args) {
+ return new (reinterpret_cast<T*>(AllocAligned(sizeof(T), alignof(T))))
+ T(std::forward<Args>(args)...);
+ }
+
char* Calloc(const size_t size) {
void* return_value = Alloc(size);
memset(return_value, 0, size);
@@ -214,9 +222,7 @@
}
// Free does nothing except for the last piece allocated.
- void Free(void* memory, size_t size) {
- ReturnMemory(memory, size);
- }
+ void Free(void* memory, size_t size) { ReturnMemory(memory, size); }
char* SlowAlloc(size_t size) override { // "slow" 'cause it's virtual
return Alloc(size);
}
@@ -234,14 +240,12 @@
return newstr;
}
char* MemdupPlusNUL(const char* s, size_t bytes) { // like "string(s, len)"
- char* newstr = Alloc(bytes+1);
+ char* newstr = Alloc(bytes + 1);
memcpy(newstr, s, bytes);
newstr[bytes] = '\0';
return newstr;
}
- char* Strdup(const char* s) {
- return Memdup(s, strlen(s) + 1);
- }
+ char* Strdup(const char* s) { return Memdup(s, strlen(s) + 1); }
// Unlike libc's strncpy, I always NUL-terminate. libc's semantics are dumb.
// This will allocate at most n+1 bytes (+1 is for the nul terminator).
char* Strndup(const char* s, size_t n) {
@@ -261,8 +265,8 @@
// If you know the new size is smaller (or equal), you don't need to know
// oldsize. We don't check that newsize is smaller, so you'd better be sure!
char* Shrink(char* s, size_t newsize) {
- AdjustLastAlloc(s, newsize); // reclaim space if we can
- return s; // never need to move if we go smaller
+ AdjustLastAlloc(s, newsize); // reclaim space if we can
+ return s; // never need to move if we go smaller
}
// We make a copy so you can keep track of status at a given point in time
diff --git a/native/utils/base/arena_leakage_unittest.cc b/native/utils/base/arena_leakage_unittest.cc
new file mode 100644
index 0000000..642dacd
--- /dev/null
+++ b/native/utils/base/arena_leakage_unittest.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/base/arena.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+
+TEST(Arena, Leakage) {
+ UnsafeArena arena(32);
+ // Grab just 10 bytes.
+ EXPECT_EQ(arena.bytes_until_next_allocation(), 32);
+ const char* block = arena.Alloc(10);
+ EXPECT_NE(block, nullptr);
+ EXPECT_EQ(arena.bytes_until_next_allocation(), 22);
+ // Grab the rest.
+ const char* expected_next_block = block + 10;
+ const char* next_block = arena.Alloc(22);
+ // If the below test fails, a new block has been allocated for "next_block".
+ // This means that the last 22 bytes of the previous block have been lost.
+ EXPECT_EQ(next_block, expected_next_block);
+ EXPECT_EQ(arena.bytes_until_next_allocation(), 0);
+ // Try allocating a 0 bytes block. Arena should remain unchanged.
+ const char* null_block = arena.Alloc(0);
+ EXPECT_EQ(null_block, nullptr);
+ EXPECT_EQ(arena.bytes_until_next_allocation(), 0);
+}
+
+} // namespace libtextclassifier3
diff --git a/native/utils/base/prefixvarint.cc b/native/utils/base/prefixvarint.cc
deleted file mode 100644
index 5febbc5..0000000
--- a/native/utils/base/prefixvarint.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "utils/base/prefixvarint.h"
-
-#include "utils/base/integral_types.h"
-
-namespace libtextclassifier3 {
-
-const int PrefixVarint::kMax32;
-const int PrefixVarint::kMax64;
-const int PrefixVarint::kSlopBytes;
-const int PrefixVarint::kEncode32SlopBytes;
-const int PrefixVarint::kEncode64SlopBytes;
-
-char* PrefixVarint::SafeEncode32(char* ptr, uint32 val) {
- return SafeEncode32Inline(ptr, val);
-}
-
-char* PrefixVarint::SafeEncode64(char* ptr, uint64 val) {
- return SafeEncode64Inline(ptr, val);
-}
-
-void PrefixVarint::Append32Slow(std::string* s, uint32 value) {
- size_t start = s->size();
- s->resize(start + PrefixVarint::Length32(value));
- PrefixVarint::SafeEncode32(&((*s)[start]), value);
-}
-
-void PrefixVarint::Append64Slow(std::string* s, uint64 value) {
- size_t start = s->size();
- s->resize(start + PrefixVarint::Length64(value));
- PrefixVarint::SafeEncode64(&((*s)[start]), value);
-}
-
-const char* PrefixVarint::Parse32Fallback(uint32 code, const char* ptr,
- uint32* val) {
- return Parse32FallbackInline(code, ptr, val);
-}
-
-const char* PrefixVarint::Parse64Fallback(uint64 code, const char* ptr,
- uint64* val) {
- return Parse64FallbackInline(code, ptr, val);
-}
-
-#if 0
-const PrefixVarint::CodeInfo PrefixVarint::code_info_[8] = {
- {2, 0xff00}, {2, 0xff00},
- {2, 0xff00}, {2, 0xff00},
- {3, 0xffff00}, {3, 0xffff00},
- {4, 0xffffff00}, {5, 0xffffff00}
-};
-#endif
-
-} // namespace libtextclassifier3
diff --git a/native/utils/base/prefixvarint.h b/native/utils/base/prefixvarint.h
deleted file mode 100644
index 8e4f308..0000000
--- a/native/utils/base/prefixvarint.h
+++ /dev/null
@@ -1,609 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// PrefixVarint is an integer encoding method that has the exact same
-// compression size as Varint, but is faster to decode because all of the
-// length information is encoded in the first byte.
-// On a Warp 19 it can parse up to 42% faster than Varint, for the distributions
-// tested below.
-// On an Ilium it can parse up to 37% faster than Varint.
-//
-// But there are a few caveats:
-// - This is fastest if both the encoder and decoder are little endian.
-// Somewhat slower versions are provided for encoding and decoding on big
-// endian machines.
-// - This doesn't support backwards decoding.
-//
-// The PrefixVarint encoding uses a unary code in the high bits of the first
-// byte to encode the total number of bytes, as follows:
-// - 32bit encoding:
-// 1 byte: "0" + 7 value bits
-// 2 bytes: "10" + 6 value bits
-// 3 bytes: "110" + 5 value bits
-// 4 bytes: "1110" + 4 value bits
-// 5 bytes: "1111" + no value bits (value is in the next 4 bytes)
-//
-// - 64bit encoding:
-// 1 byte: "0" + 7 value bits
-// 2 bytes: "10" + 6 value bits
-// 3 bytes: "110" + 5 value bits
-// 4 bytes: "1110" + 4 value bits
-// 5 bytes: "11110" + 3 value bits
-// 6 bytes: "111110" + 2 value bits
-// 7 bytes: "1111110" + 1 value bits
-// 8 bytes: "11111110" + no value bits (value is in the next 7 bytes)
-// 9 bytes: "11111111" + no value bits (value is in the next 8 bytes)
-//
-// Note that 32bit and 64bit PrefixVarint encoding are same for values between
-// 0 and (1<<28)-1 (i.e., upto 4 byte-encodable value).
-//
-// The following are benchmark results (in cycles per operation, so lower is
-// better) on randomly generated sequences of values whose encodings have the
-// given distribution of byte lengths. The cycle counts include some overhead
-// (1-2 cycles) for the testing loop operation.
-//
-// UNIFORM 2^14 means the values are randomly generated in the range [0-2^14),
-// so the majority will require 2 bytes to encode. MIXED 60:20:10:6:4, on the
-// other hand, means 60% of the values encode to 1 byte, 20% to 2 bytes, and
-// so on. The MIXED 15:71:13:1.2:0.1 distribution simulates a power law with
-// median value of 1024.
-//
-// VI is Varint, PVI is PrefixVarint. In both cases, Parse32Inline was used.
-//
-// Warp 19 (Opteron):
-// Encode Parse Skip
-// Byte Len Dist VI PVI VI PVI VI PVI
-// UNIFORM 2^7 12.2 9.9 3.4 3.3 3.2 3.2
-// UNIFORM 2^14 18.2 14.0 8.8 6.0 5.4 6.4
-// UNIFORM 2^21 18.1 15.1 13.0 9.7 6.7 9.5
-// UNIFORM 2^28 18.9 14.9 15.4 12.1 9.8 10.7
-// UNIFORM 2^31 23.6 19.3 20.1 14.9 12.7 10.7
-// MIXED 50:50:0:0:0 19.4 19.8 15.0 12.7 11.8 12.6
-// MIXED 20:20:20:20:20 28.2 27.3 24.9 21.8 20.7 18.8
-// MIXED 60:20:10:6:4 23.5 23.3 29.7 17.3 16.7 16.3
-// MIXED 80:12:5:2:1 16.5 16.3 11.6 9.9 9.7 9.6
-// MIXED 90:7:2:1:0 12.9 12.9 8.2 6.2 6.1 6.1
-// MIXED 15:71:13:1.2:0.1 18.9 19.2 13.8 11.2 11.0 11.8
-//
-// Ilium:
-// Encode Parse Skip
-// Byte Len Dist VI PVI VI PVI VI PVI
-// UNIFORM 2^7 10.2 8.7 3.1 3.1 2.9 2.1
-// UNIFORM 2^14 15.8 13.2 7.1 4.5 4.2 3.4
-// UNIFORM 2^21 15.6 14.1 10.1 6.6 5.4 5.7
-// UNIFORM 2^28 18.1 15.2 12.7 8.8 7.3 8.3
-// UNIFORM 2^31 21.8 16.5 17.9 13.3 13.9 8.1
-// MIXED 50:50:0:0:0 19.8 20.7 14.2 13.0 12.4 12.2
-// MIXED 20:20:20:20:20 29.8 30.1 27.7 24.3 22.7 20.2
-// MIXED 60:20:10:6:4 24.2 24.9 20.1 18.9 18.7 17.2
-// MIXED 80:12:5:2:1 16.3 16.6 12.0 11.6 11.3 10.7
-// MIXED 90:7:2:1:0 12.1 12.3 7.2 7.0 6.8 6.5
-// MIXED 15:71:13:1.2:0.1 19.2 20.1 14.2 13.1 12.5 12.0
-//
-
-#ifndef LIBTEXTCLASSIFIER_UTILS_BASE_PREFIXVARINT_H_
-#define LIBTEXTCLASSIFIER_UTILS_BASE_PREFIXVARINT_H_
-
-#include <string>
-
-#include "utils/base/casts.h"
-#include "utils/base/endian.h"
-#include "utils/base/integral_types.h"
-#include "utils/base/unaligned_access.h"
-
-namespace libtextclassifier3 {
-
-class PrefixVarint {
- public:
- // The max bytes used to encode a uint32:
- static constexpr int kMax32 = 5;
- static constexpr int kMax64 = 9;
-
- // This decoder does not read past the encoded buffer.
- static constexpr int kSlopBytes = 0;
-
- // Returns the number of bytes used to encode the given value:
- static int Length32(uint32 val);
- static int Length64(uint64 val);
-
- // The Encode functions could reset up to the following bytes past the last
- // encoded byte. Use the slower SafeEncode equivalent if you want the encode
- // to not use any slop bytes.
- static constexpr int kEncode32SlopBytes = 1;
- static constexpr int kEncode64SlopBytes = 3;
-
- // The safer version of the Encode functions, which don't need any slop bytes.
- static char* SafeEncode32(char* ptr, uint32 val);
- static char* SafeEncode64(char* ptr, uint64 val);
- // Inlined version:
- static char* SafeEncode32Inline(char* ptr, uint32 val);
- static char* SafeEncode64Inline(char* ptr, uint64 val);
-
- // Appends the encoded value to *s.
- static void Append32(std::string* s, uint32 value);
- static void Append64(std::string* s, uint64 value);
-
- // Parses the next value in the ptr buffer and returns the pointer advanced
- // past the end of the encoded value.
- static const char* Parse32(const char* ptr, uint32* val);
- static const char* Parse64(const char* ptr, uint64* val);
- // Use this in time-critical code:
- static const char* Parse32Inline(const char* ptr, uint32* val);
- static const char* Parse64Inline(const char* ptr, uint64* val);
-
- private:
- static constexpr int kMin2Bytes = (1 << 7);
- static constexpr int kMin3Bytes = (1 << 14);
- static constexpr int kMin4Bytes = (1 << 21);
- static constexpr int kMin5Bytes = (1 << 28);
- static constexpr int64 kMin6Bytes = (1LL << 35);
- static constexpr int64 kMin7Bytes = (1LL << 42);
- static constexpr int64 kMin8Bytes = (1LL << 49);
- static constexpr int64 kMin9Bytes = (1LL << 56);
-
- static void Append32Slow(std::string* s, uint32 value);
- static void Append64Slow(std::string* s, uint64 value);
- static const char* Parse32Fallback(uint32 code, const char* ptr, uint32* val);
- static const char* Parse64Fallback(uint64 code, const char* ptr, uint64* val);
- static const char* Parse32FallbackInline(uint32 code, const char* ptr,
- uint32* val);
- static const char* Parse64FallbackInline(uint64 code, const char* ptr,
- uint64* val);
-
- // Casting helpers to aid in making this code signed-char-clean.
- static uint8* MakeUnsigned(char* p) { return bit_cast<uint8*>(p); }
- static const uint8* MakeUnsigned(const char* p) {
- return bit_cast<const uint8*>(p);
- }
-};
-
-inline int PrefixVarint::Length32(uint32 val) {
- if (val < kMin2Bytes) return 1;
- if (val < kMin3Bytes) return 2;
- if (val < kMin4Bytes) return 3;
- if (val < kMin5Bytes) return 4;
- return 5;
-}
-
-inline int PrefixVarint::Length64(uint64 val) {
- if (val < kMin2Bytes) return 1;
- if (val < kMin3Bytes) return 2;
- if (val < kMin4Bytes) return 3;
- if (val < kMin5Bytes) return 4;
- if (val < kMin6Bytes) return 5;
- if (val < kMin7Bytes) return 6;
- if (val < kMin8Bytes) return 7;
- if (val < kMin9Bytes) return 8;
- return 9;
-}
-
-inline char* PrefixVarint::SafeEncode32Inline(char* p, uint32 val) {
- uint8* const ptr = MakeUnsigned(p);
- if (val < kMin2Bytes) {
- ptr[0] = val;
- return p + 1;
- } else if (val < kMin3Bytes) {
- val <<= 2;
- uint8 low = val;
- ptr[0] = (low >> 2) | 128;
- ptr[1] = val >> 8;
- return p + 2;
- } else if (val < kMin4Bytes) {
- val <<= 3;
- uint8 low = val;
- ptr[0] = (low >> 3) | 192;
- ptr[1] = val >> 8;
- ptr[2] = val >> 16;
- return p + 3;
- } else if (val < kMin5Bytes) {
- val <<= 4;
- uint8 low = val;
- ptr[0] = (low >> 4) | 224;
- ptr[1] = val >> 8;
- ptr[2] = val >> 16;
- ptr[3] = val >> 24;
- return p + 4;
- } else {
- ptr[0] = 0xff;
- ptr[1] = val;
- ptr[2] = val >> 8;
- ptr[3] = val >> 16;
- ptr[4] = val >> 24;
- return p + 5;
- }
-}
-
-inline char* PrefixVarint::SafeEncode64Inline(char* p, uint64 val) {
- uint8* const ptr = MakeUnsigned(p);
- if (val < kMin2Bytes) {
- ptr[0] = val;
- return p + 1;
- } else if (val < kMin3Bytes) {
- val <<= 2;
- uint8 low = val;
- ptr[0] = (low >> 2) | 128;
- ptr[1] = val >> 8;
- return p + 2;
- } else if (val < kMin4Bytes) {
- val <<= 3;
- uint8 low = val;
- ptr[0] = (low >> 3) | 192;
- ptr[1] = val >> 8;
- ptr[2] = val >> 16;
- return p + 3;
- } else if (val < kMin5Bytes) {
- val <<= 4;
- uint8 low = val;
- ptr[0] = (low >> 4) | 224;
- ptr[1] = val >> 8;
- ptr[2] = val >> 16;
- ptr[3] = val >> 24;
- return p + 4;
- } else if (val < kMin6Bytes) {
- val <<= 5;
- uint8 low = val;
- ptr[0] = (low >> 5) | 240;
- ptr[1] = val >> 8;
- ptr[2] = val >> 16;
- ptr[3] = val >> 24;
- ptr[4] = val >> 32;
- return p + 5;
- } else if (val < kMin7Bytes) {
- val <<= 6;
- uint8 low = val;
- ptr[0] = (low >> 6) | 248;
- ptr[1] = val >> 8;
- ptr[2] = val >> 16;
- ptr[3] = val >> 24;
- ptr[4] = val >> 32;
- ptr[5] = val >> 40;
- return p + 6;
- } else if (val < kMin8Bytes) {
- val <<= 7;
- uint8 low = val;
- ptr[0] = (low >> 7) | 252;
- ptr[1] = val >> 8;
- ptr[2] = val >> 16;
- ptr[3] = val >> 24;
- ptr[4] = val >> 32;
- ptr[5] = val >> 40;
- ptr[6] = val >> 48;
- return p + 7;
- } else if (val < kMin9Bytes) {
- ptr[0] = 254;
- ptr[1] = val;
- ptr[2] = val >> 8;
- ptr[3] = val >> 16;
- ptr[4] = val >> 24;
- ptr[5] = val >> 32;
- ptr[6] = val >> 40;
- ptr[7] = val >> 48;
- return p + 8;
- } else {
- ptr[0] = 255;
- ptr[1] = val;
- ptr[2] = val >> 8;
- ptr[3] = val >> 16;
- ptr[4] = val >> 24;
- ptr[5] = val >> 32;
- ptr[6] = val >> 40;
- ptr[7] = val >> 48;
- ptr[8] = val >> 56;
- return p + 9;
- }
-}
-
-inline void PrefixVarint::Append32(std::string* s, uint32 value) {
- // Inline the fast-path for single-character output, but fall back to the .cc
- // file for the full version. The size<capacity check is so the compiler can
- // optimize out the string resize code.
- if (value < kMin2Bytes && s->size() < s->capacity()) {
- s->push_back(static_cast<unsigned char>(value));
- } else {
- Append32Slow(s, value);
- }
-}
-
-inline void PrefixVarint::Append64(std::string* s, uint64 value) {
- // Inline the fast-path for single-character output, but fall back to the .cc
- // file for the full version. The size<capacity check is so the compiler can
- // optimize out the string resize code.
- if (value < kMin2Bytes && s->size() < s->capacity()) {
- s->push_back(static_cast<unsigned char>(value));
- } else {
- Append64Slow(s, value);
- }
-}
-
-#ifdef IS_LITTLE_ENDIAN
-
-inline const char* PrefixVarint::Parse32(const char* p, uint32* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint32 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint32 v = ptr[1];
- *val = (code & 0x3f) | (v << 6);
- return p + 2;
- } else {
- return Parse32Fallback(code, p, val);
- }
-}
-
-inline const char* PrefixVarint::Parse64(const char* p, uint64* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint64 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint64 v = ptr[1];
- *val = (code & 0x3fLLU) | (v << 6);
- return p + 2;
- } else {
- return Parse64Fallback(code, p, val);
- }
-}
-
-inline const char* PrefixVarint::Parse32Inline(const char* p, uint32* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint32 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint32 v = ptr[1];
- *val = (code & 0x3f) | (v << 6);
- return p + 2;
- } else {
- return Parse32FallbackInline(code, p, val);
- }
-}
-
-inline const char* PrefixVarint::Parse64Inline(const char* p, uint64* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint64 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint64 v = ptr[1];
- *val = (code & 0x3f) | (v << 6);
- return p + 2;
- } else {
- return Parse64FallbackInline(code, p, val);
- }
-}
-
-// Only handles cases with 3-5 bytes
-inline const char* PrefixVarint::Parse32FallbackInline(uint32 code,
- const char* p,
- uint32* val) {
- const uint8* const ptr = MakeUnsigned(p);
- if (code < 224) {
- uint32 v = TC3_UNALIGNED_LOAD16(ptr + 1);
- *val = (code & 0x1f) | (v << 5);
- return p + 3;
- } else if (code < 240) {
- uint32 v = ptr[3];
- v = (v << 16) | TC3_UNALIGNED_LOAD16(ptr + 1);
- *val = (code & 0xf) | (v << 4);
- return p + 4;
- } else {
- *val = TC3_UNALIGNED_LOAD32(ptr + 1);
- return p + 5;
- }
-}
-
-// Only handles cases with 3-9 bytes
-inline const char* PrefixVarint::Parse64FallbackInline(uint64 code,
- const char* p,
- uint64* val) {
- const uint8* const ptr = MakeUnsigned(p);
- if (code < 224) {
- uint64 v = TC3_UNALIGNED_LOAD16(ptr + 1);
- *val = (code & 0x1fLLU) | (v << 5);
- return p + 3;
- } else if (code < 240) {
- uint64 v = ptr[3];
- v = (v << 16) | TC3_UNALIGNED_LOAD16(ptr + 1);
- *val = (code & 0xfLLU) | (v << 4);
- return p + 4;
- } else if (code < 248) {
- uint64 v = TC3_UNALIGNED_LOAD32(ptr + 1);
- *val = (code & 0x7LLU) | (v << 3);
- return p + 5;
- } else if (code < 252) {
- uint64 v = ptr[5];
- v = (v << 32) | TC3_UNALIGNED_LOAD32(ptr + 1);
- *val = (code & 0x3LLU) | (v << 2);
- return p + 6;
- } else if (code < 254) {
- uint64 v = TC3_UNALIGNED_LOAD16(ptr + 5);
- v = (v << 32) | TC3_UNALIGNED_LOAD32(ptr + 1);
- *val = (code & 0x1LLU) | (v << 1);
- return p + 7;
- } else if (code < 255) {
- uint64 v = TC3_UNALIGNED_LOAD64(ptr);
- *val = v >> 8;
- return p + 8;
- } else {
- *val = TC3_UNALIGNED_LOAD64(ptr + 1);
- return p + 9;
- }
-}
-
-#else // IS_BIG_ENDIAN
-
-// This works on big-endian machines. Performance is 1-16% slower, depending
-// on the data.
-inline const char* PrefixVarint::Parse32(const char* p, uint32* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint32 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint32 v = ptr[1];
- *val = (code & 0x3f) | (v << 6);
- return p + 2;
- } else {
- return Parse32Fallback(code, p, val);
- }
-}
-
-inline const char* PrefixVarint::Parse64(const char* p, uint64* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint64 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint64 v = ptr[1];
- *val = (code & 0x3fLLU) | (v << 6);
- return p + 2;
- } else {
- return Parse64Fallback(code, p, val);
- }
-}
-
-inline const char* PrefixVarint::Parse32Inline(const char* p, uint32* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint32 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint32 v = ptr[1];
- *val = (code & 0x3f) | (v << 6);
- return p + 2;
- } else {
- return Parse32FallbackInline(code, p, val);
- }
-}
-
-inline const char* PrefixVarint::Parse64Inline(const char* p, uint64* val) {
- const uint8* const ptr = MakeUnsigned(p);
- uint64 code = *ptr;
- if (code < 128) {
- *val = code;
- return p + 1;
- } else if (code < 192) {
- uint64 v = ptr[1];
- *val = (code & 0x3fLLU) | (v << 6);
- return p + 2;
- } else {
- return Parse64FallbackInline(code, p, val);
- }
-}
-
-// Only handles cases with 3-5 bytes
-inline const char* PrefixVarint::Parse32FallbackInline(uint32 code,
- const char* p,
- uint32* val) {
- const uint8* const ptr = MakeUnsigned(p);
- if (code < 224) {
- uint32 v = ptr[2];
- v = (v << 8) | ptr[1];
- *val = (code & 0x1f) | (v << 5);
- return p + 3;
- } else if (code < 240) {
- uint32 v = ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = (code & 0xf) | (v << 4);
- return p + 4;
- } else {
- uint32 v = ptr[4];
- v = (v << 8) | ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = v;
- return p + 5;
- }
-}
-
-// Only handles cases with 3-9 bytes
-inline const char* PrefixVarint::Parse64FallbackInline(uint64 code,
- const char* p,
- uint64* val) {
- const uint8* const ptr = MakeUnsigned(p);
- if (code < 224) {
- uint64 v = ptr[2];
- v = (v << 8) | ptr[1];
- *val = (code & 0x1f) | (v << 5);
- return p + 3;
- } else if (code < 240) {
- uint64 v = ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = (code & 0xf) | (v << 4);
- return p + 4;
- } else if (code < 248) {
- uint64 v = ptr[4];
- v = (v << 8) | ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = (code & 0x7) | (v << 3);
- return p + 5;
- } else if (code < 252) {
- uint64 v = ptr[5];
- v = (v << 8) | ptr[4];
- v = (v << 8) | ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = (code & 0x3) | (v << 2);
- return p + 6;
- } else if (code < 254) {
- uint64 v = ptr[6];
- v = (v << 8) | ptr[5];
- v = (v << 8) | ptr[4];
- v = (v << 8) | ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = (code & 0x1) | (v << 1);
- return p + 7;
- } else if (code < 255) {
- uint64 v = ptr[7];
- v = (v << 8) | ptr[6];
- v = (v << 8) | ptr[5];
- v = (v << 8) | ptr[4];
- v = (v << 8) | ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = v;
- return p + 8;
- } else {
- uint64 v = ptr[8];
- v = (v << 8) | ptr[7];
- v = (v << 8) | ptr[6];
- v = (v << 8) | ptr[5];
- v = (v << 8) | ptr[4];
- v = (v << 8) | ptr[3];
- v = (v << 8) | ptr[2];
- v = (v << 8) | ptr[1];
- *val = v;
- return p + 9;
- }
-}
-
-#endif // IS_LITTLE_ENDIAN
-
-} // namespace libtextclassifier3
-
-#endif // LIBTEXTCLASSIFIER_UTILS_BASE_PREFIXVARINT_H_
diff --git a/native/utils/base/status_test.cc b/native/utils/base/status_test.cc
new file mode 100644
index 0000000..82d5aad
--- /dev/null
+++ b/native/utils/base/status_test.cc
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/base/status.h"
+
+#include "utils/base/logging.h"
+#include "utils/base/status_macros.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(StatusTest, PrintsAbortedStatus) {
+ logging::LoggingStringStream stream;
+ stream << Status::UNKNOWN;
+ EXPECT_EQ(Status::UNKNOWN.error_code(), 2);
+ EXPECT_EQ(Status::UNKNOWN.CanonicalCode(), StatusCode::UNKNOWN);
+ EXPECT_EQ(Status::UNKNOWN.error_message(), "");
+ EXPECT_EQ(stream.message, "2");
+}
+
+TEST(StatusTest, PrintsOKStatus) {
+ logging::LoggingStringStream stream;
+ stream << Status::OK;
+ EXPECT_EQ(Status::OK.error_code(), 0);
+ EXPECT_EQ(Status::OK.CanonicalCode(), StatusCode::OK);
+ EXPECT_EQ(Status::OK.error_message(), "");
+ EXPECT_EQ(stream.message, "0");
+}
+
+TEST(StatusTest, UnknownStatusHasRightAttributes) {
+ EXPECT_EQ(Status::UNKNOWN.error_code(), 2);
+ EXPECT_EQ(Status::UNKNOWN.CanonicalCode(), StatusCode::UNKNOWN);
+ EXPECT_EQ(Status::UNKNOWN.error_message(), "");
+}
+
+TEST(StatusTest, OkStatusHasRightAttributes) {
+ EXPECT_EQ(Status::OK.error_code(), 0);
+ EXPECT_EQ(Status::OK.CanonicalCode(), StatusCode::OK);
+ EXPECT_EQ(Status::OK.error_message(), "");
+}
+
+TEST(StatusTest, CustomStatusHasRightAttributes) {
+ Status status(StatusCode::INVALID_ARGUMENT, "You can't put this here!");
+ EXPECT_EQ(status.error_code(), 3);
+ EXPECT_EQ(status.CanonicalCode(), StatusCode::INVALID_ARGUMENT);
+ EXPECT_EQ(status.error_message(), "You can't put this here!");
+}
+
+TEST(StatusTest, AssignmentPreservesMembers) {
+ Status status(StatusCode::INVALID_ARGUMENT, "You can't put this here!");
+
+ Status status2 = status;
+
+ EXPECT_EQ(status2.error_code(), 3);
+ EXPECT_EQ(status2.CanonicalCode(), StatusCode::INVALID_ARGUMENT);
+ EXPECT_EQ(status2.error_message(), "You can't put this here!");
+}
+
+TEST(StatusTest, ReturnIfErrorOkStatus) {
+ bool returned_due_to_error = true;
+ auto lambda = [&returned_due_to_error](const Status& s) {
+ TC3_RETURN_IF_ERROR(s);
+ returned_due_to_error = false;
+ return Status::OK;
+ };
+
+ // OK should allow execution to continue and the returned status should also
+ // be OK.
+ Status status = lambda(Status());
+ EXPECT_EQ(status.error_code(), 0);
+ EXPECT_EQ(status.CanonicalCode(), StatusCode::OK);
+ EXPECT_EQ(status.error_message(), "");
+ EXPECT_FALSE(returned_due_to_error);
+}
+
+TEST(StatusTest, ReturnIfErrorInvalidArgumentStatus) {
+ bool returned_due_to_error = true;
+ auto lambda = [&returned_due_to_error](const Status& s) {
+ TC3_RETURN_IF_ERROR(s);
+ returned_due_to_error = false;
+ return Status::OK;
+ };
+
+ // INVALID_ARGUMENT should cause an early return.
+ Status invalid_arg_status(StatusCode::INVALID_ARGUMENT, "You can't do that!");
+ Status status = lambda(invalid_arg_status);
+ EXPECT_EQ(status.error_code(), 3);
+ EXPECT_EQ(status.CanonicalCode(), StatusCode::INVALID_ARGUMENT);
+ EXPECT_EQ(status.error_message(), "You can't do that!");
+ EXPECT_TRUE(returned_due_to_error);
+}
+
+TEST(StatusTest, ReturnIfErrorUnknownStatus) {
+ bool returned_due_to_error = true;
+ auto lambda = [&returned_due_to_error](const Status& s) {
+ TC3_RETURN_IF_ERROR(s);
+ returned_due_to_error = false;
+ return Status::OK;
+ };
+
+ // UNKNOWN should cause an early return.
+ Status unknown_status(StatusCode::UNKNOWN,
+ "We also know there are known unknowns.");
+ libtextclassifier3::Status status = lambda(unknown_status);
+ EXPECT_EQ(status.error_code(), 2);
+ EXPECT_EQ(status.CanonicalCode(), StatusCode::UNKNOWN);
+ EXPECT_EQ(status.error_message(), "We also know there are known unknowns.");
+ EXPECT_TRUE(returned_due_to_error);
+}
+
+TEST(StatusTest, ReturnIfErrorOnlyInvokesExpressionOnce) {
+ int num_invocations = 0;
+ auto ok_internal_expr = [&num_invocations]() {
+ ++num_invocations;
+ return Status::OK;
+ };
+ auto ok_lambda = [&ok_internal_expr]() {
+ TC3_RETURN_IF_ERROR(ok_internal_expr());
+ return Status::OK;
+ };
+
+ libtextclassifier3::Status status = ok_lambda();
+ EXPECT_EQ(status.CanonicalCode(), StatusCode::OK);
+ EXPECT_EQ(num_invocations, 1);
+
+ num_invocations = 0;
+ auto error_internal_expr = [&num_invocations]() {
+ ++num_invocations;
+ return Status::UNKNOWN;
+ };
+ auto error_lambda = [&error_internal_expr]() {
+ TC3_RETURN_IF_ERROR(error_internal_expr());
+ return Status::OK;
+ };
+
+ status = error_lambda();
+ EXPECT_EQ(status.CanonicalCode(), StatusCode::UNKNOWN);
+ EXPECT_EQ(num_invocations, 1);
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/base/statusor_test.cc b/native/utils/base/statusor_test.cc
new file mode 100644
index 0000000..23165b0
--- /dev/null
+++ b/native/utils/base/statusor_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/base/statusor.h"
+
+#include "utils/base/logging.h"
+#include "utils/base/status.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(StatusOrTest, DoesntDieWhenOK) {
+ StatusOr<std::string> status_or_string = std::string("Hello World");
+ EXPECT_TRUE(status_or_string.ok());
+ EXPECT_EQ(status_or_string.ValueOrDie(), "Hello World");
+}
+
+TEST(StatusOrTest, DiesWhenNotOK) {
+ StatusOr<std::string> status_or_string = {Status::UNKNOWN};
+ EXPECT_FALSE(status_or_string.ok());
+ // Android does not print the error message to stderr, so we are not checking
+ // the error message here.
+ EXPECT_DEATH(status_or_string.ValueOrDie(), "");
+}
+
+// Foo is NOT default constructible and can be implicitly converted to from int.
+class Foo {
+ public:
+ // Copy value conversion
+ Foo(int i) : i_(i) {} // NOLINT
+ int i() const { return i_; }
+
+ private:
+ int i_;
+};
+
+TEST(StatusOrTest, HandlesNonDefaultConstructibleValues) {
+ StatusOr<Foo> foo_or(Foo(7));
+ EXPECT_TRUE(foo_or.ok());
+ EXPECT_EQ(foo_or.ValueOrDie().i(), 7);
+
+ StatusOr<Foo> error_or(Status::UNKNOWN);
+ EXPECT_FALSE(error_or.ok());
+ EXPECT_EQ(error_or.status().CanonicalCode(), StatusCode::UNKNOWN);
+}
+
+class Bar {
+ public:
+ // Move value conversion
+ Bar(Foo&& f) : i_(2 * f.i()) {} // NOLINT
+
+ // Movable, but not copyable.
+ Bar(const Bar& other) = delete;
+ Bar& operator=(const Bar& rhs) = delete;
+ Bar(Bar&& other) = default;
+ Bar& operator=(Bar&& rhs) = default;
+
+ int i() const { return i_; }
+
+ private:
+ int i_;
+};
+
+TEST(StatusOrTest, HandlesValueConversion) {
+ // Copy value conversion constructor : StatusOr<Foo>(const int&)
+ StatusOr<Foo> foo_status(19);
+ EXPECT_TRUE(foo_status.ok());
+ EXPECT_EQ(foo_status.ValueOrDie().i(), 19);
+
+ // Move value conversion constructor : StatusOr<Bar>(Foo&&)
+ StatusOr<Bar> bar_status(std::move(foo_status));
+ EXPECT_TRUE(bar_status.ok());
+ EXPECT_EQ(bar_status.ValueOrDie().i(), 38);
+
+ StatusOr<int> int_status(19);
+ // Copy conversion constructor : StatusOr<Foo>(const StatusOr<int>&)
+ StatusOr<Foo> copied_status(int_status);
+ EXPECT_TRUE(copied_status.ok());
+ EXPECT_EQ(copied_status.ValueOrDie().i(), 19);
+
+ // Move conversion constructor : StatusOr<Bar>(StatusOr<Foo>&&)
+ StatusOr<Bar> moved_status(std::move(copied_status));
+ EXPECT_TRUE(moved_status.ok());
+ EXPECT_EQ(moved_status.ValueOrDie().i(), 38);
+
+ // Move conversion constructor with error : StatusOr<Bar>(StatusOr<Foo>&&)
+ StatusOr<Foo> error_status(Status::UNKNOWN);
+ StatusOr<Bar> moved_error_status(std::move(error_status));
+ EXPECT_FALSE(moved_error_status.ok());
+}
+
+struct OkFn {
+ StatusOr<int> operator()() { return 42; }
+};
+TEST(StatusOrTest, AssignOrReturnValOk) {
+ auto lambda = []() {
+ TC3_ASSIGN_OR_RETURN(int i, OkFn()(), -1);
+ return i;
+ };
+
+ // OkFn() should return a valid integer, so lambda should return that integer.
+ EXPECT_EQ(lambda(), 42);
+}
+
+struct FailFn {
+ StatusOr<int> operator()() { return Status::UNKNOWN; }
+};
+TEST(StatusOrTest, AssignOrReturnValError) {
+ auto lambda = []() {
+ TC3_ASSIGN_OR_RETURN(int i, FailFn()(), -1);
+ return i;
+ };
+
+ // FailFn() should return an error, so lambda should return -1.
+ EXPECT_EQ(lambda(), -1);
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/base/unaligned_access.h b/native/utils/base/unaligned_access.h
deleted file mode 100644
index 68fe207..0000000
--- a/native/utils/base/unaligned_access.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBTEXTCLASSIFIER_UTILS_BASE_UNALIGNED_ACCESS_H_
-#define LIBTEXTCLASSIFIER_UTILS_BASE_UNALIGNED_ACCESS_H_
-
-#include <string.h>
-
-#include <cstdint>
-
-#include "utils/base/integral_types.h"
-#include "utils/base/macros.h"
-
-// unaligned APIs
-
-// Portable handling of unaligned loads, stores, and copies.
-// On some platforms, like ARM, the copy functions can be more efficient
-// then a load and a store.
-//
-// It is possible to implement all of these these using constant-length memcpy
-// calls, which is portable and will usually be inlined into simple loads and
-// stores if the architecture supports it. However, such inlining usually
-// happens in a pass that's quite late in compilation, which means the resulting
-// loads and stores cannot participate in many other optimizations, leading to
-// overall worse code.
-
-// The unaligned API is C++ only. The declarations use C++ features
-// (namespaces, inline) which are absent or incompatible in C.
-#if defined(__cplusplus)
-
-#if defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) ||\
- defined(MEMORY_SANITIZER)
-// Consider we have an unaligned load/store of 4 bytes from address 0x...05.
-// AddressSanitizer will treat it as a 3-byte access to the range 05:07 and
-// will miss a bug if 08 is the first unaddressable byte.
-// ThreadSanitizer will also treat this as a 3-byte access to 05:07 and will
-// miss a race between this access and some other accesses to 08.
-// MemorySanitizer will correctly propagate the shadow on unaligned stores
-// and correctly report bugs on unaligned loads, but it may not properly
-// update and report the origin of the uninitialized memory.
-// For all three tools, replacing an unaligned access with a tool-specific
-// callback solves the problem.
-
-// Make sure uint16_t/uint32_t/uint64_t are defined.
-#include <stdint.h>
-
-extern "C" {
-uint16_t __sanitizer_unaligned_load16(const void *p);
-uint32_t __sanitizer_unaligned_load32(const void *p);
-uint64_t __sanitizer_unaligned_load64(const void *p);
-void __sanitizer_unaligned_store16(void *p, uint16_t v);
-void __sanitizer_unaligned_store32(void *p, uint32_t v);
-void __sanitizer_unaligned_store64(void *p, uint64_t v);
-} // extern "C"
-
-namespace libtextclassifier3 {
-
-inline uint16_t UnalignedLoad16(const void *p) {
- return __sanitizer_unaligned_load16(p);
-}
-
-inline uint32_t UnalignedLoad32(const void *p) {
- return __sanitizer_unaligned_load32(p);
-}
-
-inline uint64 UnalignedLoad64(const void *p) {
- return __sanitizer_unaligned_load64(p);
-}
-
-inline void UnalignedStore16(void *p, uint16_t v) {
- __sanitizer_unaligned_store16(p, v);
-}
-
-inline void UnalignedStore32(void *p, uint32_t v) {
- __sanitizer_unaligned_store32(p, v);
-}
-
-inline void UnalignedStore64(void *p, uint64 v) {
- __sanitizer_unaligned_store64(p, v);
-}
-
-} // namespace libtextclassifier3
-
-#define TC3_UNALIGNED_LOAD16(_p) (::libtextclassifier3::UnalignedLoad16(_p))
-#define TC3_UNALIGNED_LOAD32(_p) (::libtextclassifier3::UnalignedLoad32(_p))
-#define TC3_UNALIGNED_LOAD64(_p) \
- (::libtextclassifier3::UnalignedLoad64(_p))
-
-#define TC3_UNALIGNED_STORE16(_p, _val) \
- (::libtextclassifier3::UnalignedStore16(_p, _val))
-#define TC3_UNALIGNED_STORE32(_p, _val) \
- (::libtextclassifier3::UnalignedStore32(_p, _val))
-#define TC3_UNALIGNED_STORE64(_p, _val) \
- (::libtextclassifier3::UnalignedStore64(_p, _val))
-
-#elif defined(UNDEFINED_BEHAVIOR_SANITIZER)
-
-namespace libtextclassifier3 {
-
-inline uint16_t UnalignedLoad16(const void *p) {
- uint16_t t;
- memcpy(&t, p, sizeof t);
- return t;
-}
-
-inline uint32_t UnalignedLoad32(const void *p) {
- uint32_t t;
- memcpy(&t, p, sizeof t);
- return t;
-}
-
-inline uint64 UnalignedLoad64(const void *p) {
- uint64 t;
- memcpy(&t, p, sizeof t);
- return t;
-}
-
-inline void UnalignedStore16(void *p, uint16_t v) { memcpy(p, &v, sizeof v); }
-
-inline void UnalignedStore32(void *p, uint32_t v) { memcpy(p, &v, sizeof v); }
-
-inline void UnalignedStore64(void *p, uint64 v) { memcpy(p, &v, sizeof v); }
-
-} // namespace libtextclassifier3
-
-#define TC3_UNALIGNED_LOAD16(_p) (::libtextclassifier3::UnalignedLoad16(_p))
-#define TC3_UNALIGNED_LOAD32(_p) (::libtextclassifier3::UnalignedLoad32(_p))
-#define TC3_UNALIGNED_LOAD64(_p) (::libtextclassifier3::UnalignedLoad64(_p))
-
-#define TC3_UNALIGNED_STORE16(_p, _val) \
- (::libtextclassifier3::UnalignedStore16(_p, _val))
-#define TC3_UNALIGNED_STORE32(_p, _val) \
- (::libtextclassifier3::UnalignedStore32(_p, _val))
-#define TC3_UNALIGNED_STORE64(_p, _val) \
- (::libtextclassifier3::UnalignedStore64(_p, _val))
-
-#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
- defined(_M_IX86) || defined(__ppc__) || defined(__PPC__) || \
- defined(__ppc64__) || defined(__PPC64__)
-
-// x86 and x86-64 can perform unaligned loads/stores directly;
-// modern PowerPC hardware can also do unaligned integer loads and stores;
-// but note: the FPU still sends unaligned loads and stores to a trap handler!
-
-#define TC3_UNALIGNED_LOAD16(_p) \
- (*reinterpret_cast<const uint16_t *>(_p))
-#define TC3_UNALIGNED_LOAD32(_p) \
- (*reinterpret_cast<const uint32_t *>(_p))
-#define TC3_UNALIGNED_LOAD64(_p) \
- (*reinterpret_cast<const uint64 *>(_p))
-
-#define TC3_UNALIGNED_STORE16(_p, _val) \
- (*reinterpret_cast<uint16_t *>(_p) = (_val))
-#define TC3_UNALIGNED_STORE32(_p, _val) \
- (*reinterpret_cast<uint32_t *>(_p) = (_val))
-#define TC3_UNALIGNED_STORE64(_p, _val) \
- (*reinterpret_cast<uint64 *>(_p) = (_val))
-
-#elif defined(__arm__) && \
- !defined(__ARM_ARCH_5__) && \
- !defined(__ARM_ARCH_5T__) && \
- !defined(__ARM_ARCH_5TE__) && \
- !defined(__ARM_ARCH_5TEJ__) && \
- !defined(__ARM_ARCH_6__) && \
- !defined(__ARM_ARCH_6J__) && \
- !defined(__ARM_ARCH_6K__) && \
- !defined(__ARM_ARCH_6Z__) && \
- !defined(__ARM_ARCH_6ZK__) && \
- !defined(__ARM_ARCH_6T2__)
-
-
-// ARMv7 and newer support native unaligned accesses, but only of 16-bit
-// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
-// do an unaligned read and rotate the words around a bit, or do the reads very
-// slowly (trip through kernel mode). There's no simple #define that says just
-// "ARMv7 or higher", so we have to filter away all ARMv5 and ARMv6
-// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_ALIGNED #define,
-// so in time, maybe we can move on to that.
-//
-// This is a mess, but there's not much we can do about it.
-//
-// To further complicate matters, only LDR instructions (single reads) are
-// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we
-// explicitly tell the compiler that these accesses can be unaligned, it can and
-// will combine accesses. On armcc, the way to signal this is done by accessing
-// through the type (uint32_t __packed *), but GCC has no such attribute
-// (it ignores __attribute__((packed)) on individual variables). However,
-// we can tell it that a _struct_ is unaligned, which has the same effect,
-// so we do that.
-
-namespace libtextclassifier3 {
-
-struct Unaligned16Struct {
- uint16_t value;
- uint8_t dummy; // To make the size non-power-of-two.
-} TC3_ATTRIBUTE_PACKED;
-
-struct Unaligned32Struct {
- uint32_t value;
- uint8_t dummy; // To make the size non-power-of-two.
-} TC3_ATTRIBUTE_PACKED;
-
-} // namespace libtextclassifier3
-
-#define TC3_UNALIGNED_LOAD16(_p) \
- ((reinterpret_cast<const ::libtextclassifier3::Unaligned16Struct *>(_p)) \
- ->value)
-#define TC3_UNALIGNED_LOAD32(_p) \
- ((reinterpret_cast<const ::libtextclassifier3::Unaligned32Struct *>(_p)) \
- ->value)
-
-#define TC3_UNALIGNED_STORE16(_p, _val) \
- ((reinterpret_cast< ::libtextclassifier3::Unaligned16Struct *>(_p)) \
- ->value = (_val))
-#define TC3_UNALIGNED_STORE32(_p, _val) \
- ((reinterpret_cast< ::libtextclassifier3::Unaligned32Struct *>(_p)) \
- ->value = (_val))
-
-namespace libtextclassifier3 {
-
-inline uint64 UnalignedLoad64(const void *p) {
- uint64 t;
- memcpy(&t, p, sizeof t);
- return t;
-}
-
-inline void UnalignedStore64(void *p, uint64 v) { memcpy(p, &v, sizeof v); }
-
-} // namespace libtextclassifier3
-
-#define TC3_UNALIGNED_LOAD64(_p) (::libtextclassifier3::UnalignedLoad64(_p))
-#define TC3_UNALIGNED_STORE64(_p, _val) \
- (::libtextclassifier3::UnalignedStore64(_p, _val))
-
-#else
-
-// TC3_NEED_ALIGNED_LOADS is defined when the underlying platform
-// doesn't support unaligned access.
-#define TC3_NEED_ALIGNED_LOADS
-
-// These functions are provided for architectures that don't support
-// unaligned loads and stores.
-
-namespace libtextclassifier3 {
-
-inline uint16_t UnalignedLoad16(const void *p) {
- uint16_t t;
- memcpy(&t, p, sizeof t);
- return t;
-}
-
-inline uint32_t UnalignedLoad32(const void *p) {
- uint32_t t;
- memcpy(&t, p, sizeof t);
- return t;
-}
-
-inline uint64 UnalignedLoad64(const void *p) {
- uint64 t;
- memcpy(&t, p, sizeof t);
- return t;
-}
-
-inline void UnalignedStore16(void *p, uint16_t v) { memcpy(p, &v, sizeof v); }
-
-inline void UnalignedStore32(void *p, uint32_t v) { memcpy(p, &v, sizeof v); }
-
-inline void UnalignedStore64(void *p, uint64 v) { memcpy(p, &v, sizeof v); }
-
-} // namespace libtextclassifier3
-
-#define TC3_UNALIGNED_LOAD16(_p) (::libtextclassifier3::UnalignedLoad16(_p))
-#define TC3_UNALIGNED_LOAD32(_p) (::libtextclassifier3::UnalignedLoad32(_p))
-#define TC3_UNALIGNED_LOAD64(_p) (::libtextclassifier3::UnalignedLoad64(_p))
-
-#define TC3_UNALIGNED_STORE16(_p, _val) \
- (::libtextclassifier3::UnalignedStore16(_p, _val))
-#define TC3_UNALIGNED_STORE32(_p, _val) \
- (::libtextclassifier3::UnalignedStore32(_p, _val))
-#define TC3_UNALIGNED_STORE64(_p, _val) \
- (::libtextclassifier3::UnalignedStore64(_p, _val))
-
-#endif
-
-#endif // defined(__cplusplus), end of unaligned API
-
-#endif // LIBTEXTCLASSIFIER_UTILS_BASE_UNALIGNED_ACCESS_H_
diff --git a/native/utils/calendar/calendar-common.h b/native/utils/calendar/calendar-common.h
index f842300..e6fd076 100644
--- a/native/utils/calendar/calendar-common.h
+++ b/native/utils/calendar/calendar-common.h
@@ -229,7 +229,7 @@
case DatetimeComponent::RelativeQualifier::PAST:
TC3_CALENDAR_CHECK(
AdjustByRelation(relative_date_time_component,
- -relative_date_time_component.relative_count,
+ relative_date_time_component.relative_count,
/*allow_today=*/false, calendar))
return true;
case DatetimeComponent::RelativeQualifier::FUTURE:
diff --git a/native/utils/checksum_test.cc b/native/utils/checksum_test.cc
new file mode 100644
index 0000000..dd04956
--- /dev/null
+++ b/native/utils/checksum_test.cc
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/checksum.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(LuhnTest, CorrectlyHandlesSimpleCases) {
+ EXPECT_TRUE(VerifyLuhnChecksum("3782 8224 6310 005"));
+ EXPECT_FALSE(VerifyLuhnChecksum("0"));
+ EXPECT_FALSE(VerifyLuhnChecksum("1"));
+ EXPECT_FALSE(VerifyLuhnChecksum("0A"));
+}
+
+TEST(LuhnTest, CorrectlyVerifiesPaymentCardNumbers) {
+ // Fake test numbers.
+ EXPECT_TRUE(VerifyLuhnChecksum("3782 8224 6310 005"));
+ EXPECT_TRUE(VerifyLuhnChecksum("371449635398431"));
+ EXPECT_TRUE(VerifyLuhnChecksum("5610591081018250"));
+ EXPECT_TRUE(VerifyLuhnChecksum("38520000023237"));
+ EXPECT_TRUE(VerifyLuhnChecksum("6011000990139424"));
+ EXPECT_TRUE(VerifyLuhnChecksum("3566002020360505"));
+ EXPECT_TRUE(VerifyLuhnChecksum("5105105105105100"));
+ EXPECT_TRUE(VerifyLuhnChecksum("4012 8888 8888 1881"));
+}
+
+TEST(LuhnTest, HandlesWhitespace) {
+ EXPECT_TRUE(
+ VerifyLuhnChecksum("3782 8224 6310 005 ", /*ignore_whitespace=*/true));
+ EXPECT_FALSE(
+ VerifyLuhnChecksum("3782 8224 6310 005 ", /*ignore_whitespace=*/false));
+}
+
+TEST(LuhnTest, HandlesEdgeCases) {
+ EXPECT_FALSE(VerifyLuhnChecksum(" ", /*ignore_whitespace=*/true));
+ EXPECT_FALSE(VerifyLuhnChecksum(" ", /*ignore_whitespace=*/false));
+ EXPECT_FALSE(VerifyLuhnChecksum("", /*ignore_whitespace=*/true));
+ EXPECT_FALSE(VerifyLuhnChecksum("", /*ignore_whitespace=*/false));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/container/sorted-strings-table_test.cc b/native/utils/container/sorted-strings-table_test.cc
new file mode 100644
index 0000000..a93b197
--- /dev/null
+++ b/native/utils/container/sorted-strings-table_test.cc
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/container/sorted-strings-table.h"
+
+#include <vector>
+
+#include "utils/base/integral_types.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(SortedStringsTest, Lookup) {
+ const char pieces[] = "hell\0hello\0o\0there\0";
+ const uint32 offsets[] = {0, 5, 11, 13};
+
+ SortedStringsTable table(/*num_pieces=*/4, offsets, StringPiece(pieces, 18),
+ /*use_linear_scan_threshold=*/1);
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(table.FindAllPrefixMatches("hello there", &matches));
+ EXPECT_EQ(matches.size(), 2);
+ EXPECT_EQ(matches[0].id, 0 /*hell*/);
+ EXPECT_EQ(matches[0].match_length, 4 /*hell*/);
+ EXPECT_EQ(matches[1].id, 1 /*hello*/);
+ EXPECT_EQ(matches[1].match_length, 5 /*hello*/);
+ }
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(table.FindAllPrefixMatches("he", &matches));
+ EXPECT_THAT(matches, testing::IsEmpty());
+ }
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(table.FindAllPrefixMatches("he", &matches));
+ EXPECT_THAT(matches, testing::IsEmpty());
+ }
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(table.FindAllPrefixMatches("abcd", &matches));
+ EXPECT_THAT(matches, testing::IsEmpty());
+ }
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(table.FindAllPrefixMatches("", &matches));
+ EXPECT_THAT(matches, testing::IsEmpty());
+ }
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(table.FindAllPrefixMatches("hi there", &matches));
+ EXPECT_THAT(matches, testing::IsEmpty());
+ }
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(table.FindAllPrefixMatches(StringPiece("\0", 1), &matches));
+ EXPECT_THAT(matches, testing::IsEmpty());
+ }
+
+ {
+ std::vector<StringSet::Match> matches;
+ EXPECT_TRUE(
+ table.FindAllPrefixMatches(StringPiece("\xff, \xfe", 2), &matches));
+ EXPECT_THAT(matches, testing::IsEmpty());
+ }
+
+ {
+ StringSet::Match match;
+ EXPECT_TRUE(table.LongestPrefixMatch("hella there", &match));
+ EXPECT_EQ(match.id, 0 /*hell*/);
+ }
+
+ {
+ StringSet::Match match;
+ EXPECT_TRUE(table.LongestPrefixMatch("hello there", &match));
+ EXPECT_EQ(match.id, 1 /*hello*/);
+ }
+
+ {
+ StringSet::Match match;
+ EXPECT_TRUE(table.LongestPrefixMatch("abcd", &match));
+ EXPECT_EQ(match.id, -1);
+ }
+
+ {
+ StringSet::Match match;
+ EXPECT_TRUE(table.LongestPrefixMatch("", &match));
+ EXPECT_EQ(match.id, -1);
+ }
+
+ {
+ int value;
+ EXPECT_TRUE(table.Find("hell", &value));
+ EXPECT_EQ(value, 0);
+ }
+
+ {
+ int value;
+ EXPECT_FALSE(table.Find("hella", &value));
+ }
+
+ {
+ int value;
+ EXPECT_TRUE(table.Find("hello", &value));
+ EXPECT_EQ(value, 1 /*hello*/);
+ }
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/flatbuffers.cc b/native/utils/flatbuffers.cc
index 73ce0cc..cf4c97f 100644
--- a/native/utils/flatbuffers.cc
+++ b/native/utils/flatbuffers.cc
@@ -24,49 +24,6 @@
namespace libtextclassifier3 {
namespace {
-bool CreateRepeatedField(const reflection::Schema* schema,
- const reflection::Type* type,
- std::unique_ptr<RepeatedField>* repeated_field) {
- switch (type->element()) {
- case reflection::Bool:
- repeated_field->reset(new TypedRepeatedField<bool>);
- return true;
- case reflection::Byte:
- repeated_field->reset(new TypedRepeatedField<char>);
- return true;
- case reflection::UByte:
- repeated_field->reset(new TypedRepeatedField<unsigned char>);
- return true;
- case reflection::Int:
- repeated_field->reset(new TypedRepeatedField<int>);
- return true;
- case reflection::UInt:
- repeated_field->reset(new TypedRepeatedField<uint>);
- return true;
- case reflection::Long:
- repeated_field->reset(new TypedRepeatedField<int64>);
- return true;
- case reflection::ULong:
- repeated_field->reset(new TypedRepeatedField<uint64>);
- return true;
- case reflection::Float:
- repeated_field->reset(new TypedRepeatedField<float>);
- return true;
- case reflection::Double:
- repeated_field->reset(new TypedRepeatedField<double>);
- return true;
- case reflection::String:
- repeated_field->reset(new TypedRepeatedField<std::string>);
- return true;
- case reflection::Obj:
- repeated_field->reset(
- new TypedRepeatedField<ReflectiveFlatbuffer>(schema, type));
- return true;
- default:
- TC3_LOG(ERROR) << "Unsupported type: " << type->element();
- return false;
- }
-}
// Gets the field information for a field name, returns nullptr if the
// field was not defined.
@@ -76,8 +33,8 @@
return type->fields()->LookupByKey(field_name.data());
}
-const reflection::Field* GetFieldByOffsetOrNull(const reflection::Object* type,
- const int field_offset) {
+const reflection::Field* GetFieldOrNull(const reflection::Object* type,
+ const int field_offset) {
if (type->fields() == nullptr) {
return nullptr;
}
@@ -97,14 +54,14 @@
if (!field_name.empty()) {
return GetFieldOrNull(type, field_name.data());
}
- return GetFieldByOffsetOrNull(type, field_offset);
+ return GetFieldOrNull(type, field_offset);
}
const reflection::Field* GetFieldOrNull(const reflection::Object* type,
const FlatbufferField* field) {
TC3_CHECK(type != nullptr && field != nullptr);
if (field->field_name() == nullptr) {
- return GetFieldByOffsetOrNull(type, field->field_offset());
+ return GetFieldOrNull(type, field->field_offset());
}
return GetFieldOrNull(
type,
@@ -118,6 +75,49 @@
return GetFieldOrNull(type, field->field_name, field->field_offset);
}
+bool Parse(const std::string& str_value, float* value) {
+ double double_value;
+ if (!ParseDouble(str_value.data(), &double_value)) {
+ return false;
+ }
+ *value = static_cast<float>(double_value);
+ return true;
+}
+
+bool Parse(const std::string& str_value, double* value) {
+ return ParseDouble(str_value.data(), value);
+}
+
+bool Parse(const std::string& str_value, int64* value) {
+ return ParseInt64(str_value.data(), value);
+}
+
+bool Parse(const std::string& str_value, int32* value) {
+ return ParseInt32(str_value.data(), value);
+}
+
+bool Parse(const std::string& str_value, std::string* value) {
+ *value = str_value;
+ return true;
+}
+
+template <typename T>
+bool ParseAndSetField(const reflection::Field* field,
+ const std::string& str_value,
+ ReflectiveFlatbuffer* buffer) {
+ T value;
+ if (!Parse(str_value, &value)) {
+ TC3_LOG(ERROR) << "Could not parse '" << str_value << "'";
+ return false;
+ }
+ if (field->type()->base_type() == reflection::Vector) {
+ buffer->Repeated(field)->Add(value);
+ return true;
+ } else {
+ return buffer->Set<T>(field, value);
+ }
+}
+
} // namespace
template <>
@@ -178,48 +178,26 @@
return true;
}
-const reflection::Field* ReflectiveFlatbuffer::GetFieldByOffsetOrNull(
+const reflection::Field* ReflectiveFlatbuffer::GetFieldOrNull(
const int field_offset) const {
- return libtextclassifier3::GetFieldByOffsetOrNull(type_, field_offset);
+ return libtextclassifier3::GetFieldOrNull(type_, field_offset);
}
bool ReflectiveFlatbuffer::ParseAndSet(const reflection::Field* field,
const std::string& value) {
- switch (field->type()->base_type()) {
+ switch (field->type()->base_type() == reflection::Vector
+ ? field->type()->element()
+ : field->type()->base_type()) {
case reflection::String:
- return Set(field, value);
- case reflection::Int: {
- int32 int_value;
- if (!ParseInt32(value.data(), &int_value)) {
- TC3_LOG(ERROR) << "Could not parse '" << value << "' as int32.";
- return false;
- }
- return Set(field, int_value);
- }
- case reflection::Long: {
- int64 int_value;
- if (!ParseInt64(value.data(), &int_value)) {
- TC3_LOG(ERROR) << "Could not parse '" << value << "' as int64.";
- return false;
- }
- return Set(field, int_value);
- }
- case reflection::Float: {
- double double_value;
- if (!ParseDouble(value.data(), &double_value)) {
- TC3_LOG(ERROR) << "Could not parse '" << value << "' as float.";
- return false;
- }
- return Set(field, static_cast<float>(double_value));
- }
- case reflection::Double: {
- double double_value;
- if (!ParseDouble(value.data(), &double_value)) {
- TC3_LOG(ERROR) << "Could not parse '" << value << "' as double.";
- return false;
- }
- return Set(field, double_value);
- }
+ return ParseAndSetField<std::string>(field, value, this);
+ case reflection::Int:
+ return ParseAndSetField<int32>(field, value, this);
+ case reflection::Long:
+ return ParseAndSetField<int64>(field, value, this);
+ case reflection::Float:
+ return ParseAndSetField<float>(field, value, this);
+ case reflection::Double:
+ return ParseAndSetField<double>(field, value, this);
default:
TC3_LOG(ERROR) << "Unhandled field type: " << field->type()->base_type();
return false;
@@ -236,6 +214,27 @@
return parent->ParseAndSet(field, value);
}
+ReflectiveFlatbuffer* ReflectiveFlatbuffer::Add(StringPiece field_name) {
+ const reflection::Field* field = GetFieldOrNull(field_name);
+ if (field == nullptr) {
+ return nullptr;
+ }
+
+ if (field->type()->base_type() != reflection::BaseType::Vector) {
+ return nullptr;
+ }
+
+ return Add(field);
+}
+
+ReflectiveFlatbuffer* ReflectiveFlatbuffer::Add(
+ const reflection::Field* field) {
+ if (field == nullptr) {
+ return nullptr;
+ }
+ return Repeated(field)->Add();
+}
+
ReflectiveFlatbuffer* ReflectiveFlatbuffer::Mutable(
const StringPiece field_name) {
if (const reflection::Field* field = GetFieldOrNull(field_name)) {
@@ -285,11 +284,8 @@
}
// Otherwise, create a new instance and store it.
- std::unique_ptr<RepeatedField> repeated_field;
- if (!CreateRepeatedField(schema_, field->type(), &repeated_field)) {
- TC3_LOG(ERROR) << "Could not create repeated field.";
- return nullptr;
- }
+ std::unique_ptr<RepeatedField> repeated_field(
+ new RepeatedField(schema_, field));
const auto it = repeated_fields_.insert(
/*hint=*/entry, std::make_pair(field, std::move(repeated_field)));
return it->second.get();
@@ -309,9 +305,10 @@
// Create strings.
for (const auto& it : fields_) {
- if (it.second.HasString()) {
- offsets.push_back({it.first->offset(),
- builder->CreateString(it.second.StringValue()).o});
+ if (it.second.Has<std::string>()) {
+ offsets.push_back(
+ {it.first->offset(),
+ builder->CreateString(it.second.ConstRefValue<std::string>()).o});
}
}
@@ -328,44 +325,46 @@
switch (it.second.GetType()) {
case Variant::TYPE_BOOL_VALUE:
builder->AddElement<uint8_t>(
- it.first->offset(), static_cast<uint8_t>(it.second.BoolValue()),
+ it.first->offset(), static_cast<uint8_t>(it.second.Value<bool>()),
static_cast<uint8_t>(it.first->default_integer()));
continue;
case Variant::TYPE_INT8_VALUE:
builder->AddElement<int8_t>(
- it.first->offset(), static_cast<int8_t>(it.second.Int8Value()),
+ it.first->offset(), static_cast<int8_t>(it.second.Value<int8>()),
static_cast<int8_t>(it.first->default_integer()));
continue;
case Variant::TYPE_UINT8_VALUE:
builder->AddElement<uint8_t>(
- it.first->offset(), static_cast<uint8_t>(it.second.UInt8Value()),
+ it.first->offset(), static_cast<uint8_t>(it.second.Value<uint8>()),
static_cast<uint8_t>(it.first->default_integer()));
continue;
case Variant::TYPE_INT_VALUE:
builder->AddElement<int32>(
- it.first->offset(), it.second.IntValue(),
+ it.first->offset(), it.second.Value<int>(),
static_cast<int32>(it.first->default_integer()));
continue;
case Variant::TYPE_UINT_VALUE:
builder->AddElement<uint32>(
- it.first->offset(), it.second.UIntValue(),
+ it.first->offset(), it.second.Value<uint>(),
static_cast<uint32>(it.first->default_integer()));
continue;
case Variant::TYPE_INT64_VALUE:
- builder->AddElement<int64>(it.first->offset(), it.second.Int64Value(),
+ builder->AddElement<int64>(it.first->offset(), it.second.Value<int64>(),
it.first->default_integer());
continue;
case Variant::TYPE_UINT64_VALUE:
- builder->AddElement<uint64>(it.first->offset(), it.second.UInt64Value(),
+ builder->AddElement<uint64>(it.first->offset(),
+ it.second.Value<uint64>(),
it.first->default_integer());
continue;
case Variant::TYPE_FLOAT_VALUE:
builder->AddElement<float>(
- it.first->offset(), it.second.FloatValue(),
+ it.first->offset(), it.second.Value<float>(),
static_cast<float>(it.first->default_real()));
continue;
case Variant::TYPE_DOUBLE_VALUE:
- builder->AddElement<double>(it.first->offset(), it.second.DoubleValue(),
+ builder->AddElement<double>(it.first->offset(),
+ it.second.Value<double>(),
it.first->default_real());
continue;
default:
@@ -398,7 +397,7 @@
return false;
}
- TypedRepeatedField<std::string>* to_repeated = Repeated<std::string>(field);
+ RepeatedField* to_repeated = Repeated(field);
for (const flatbuffers::String* element : *from_vector) {
to_repeated->Add(element->str());
}
@@ -414,8 +413,7 @@
return false;
}
- TypedRepeatedField<ReflectiveFlatbuffer>* to_repeated =
- Repeated<ReflectiveFlatbuffer>(field);
+ RepeatedField* to_repeated = Repeated(field);
for (const flatbuffers::Table* const from_element : *from_vector) {
ReflectiveFlatbuffer* to_element = to_repeated->Add();
if (to_element == nullptr) {
@@ -481,7 +479,9 @@
->str());
break;
case reflection::Obj:
- if (!Mutable(field)->MergeFrom(
+ if (ReflectiveFlatbuffer* nested_field = Mutable(field);
+ nested_field == nullptr ||
+ !nested_field->MergeFrom(
from->GetPointer<const flatbuffers::Table* const>(
field->offset()))) {
return false;
@@ -614,4 +614,96 @@
return true;
}
+//
+// Repeated field methods.
+//
+
+ReflectiveFlatbuffer* RepeatedField::Add() {
+ if (is_primitive_) {
+ TC3_LOG(ERROR) << "Trying to add sub-message on a primitive-typed field.";
+ return nullptr;
+ }
+
+ object_items_.emplace_back(new ReflectiveFlatbuffer(
+ schema_, schema_->objects()->Get(field_->type()->index())));
+ return object_items_.back().get();
+}
+
+namespace {
+
+template <typename T>
+flatbuffers::uoffset_t TypedSerialize(const std::vector<Variant>& values,
+ flatbuffers::FlatBufferBuilder* builder) {
+ std::vector<T> typed_values;
+ typed_values.reserve(values.size());
+ for (const Variant& item : values) {
+ typed_values.push_back(item.Value<T>());
+ }
+ return builder->CreateVector(typed_values).o;
+}
+
+} // namespace
+
+flatbuffers::uoffset_t RepeatedField::Serialize(
+ flatbuffers::FlatBufferBuilder* builder) const {
+ switch (field_->type()->element()) {
+ case reflection::String:
+ return SerializeString(builder);
+ break;
+ case reflection::Obj:
+ return SerializeObject(builder);
+ break;
+ case reflection::Bool:
+ return TypedSerialize<bool>(items_, builder);
+ break;
+ case reflection::Byte:
+ return TypedSerialize<int8_t>(items_, builder);
+ break;
+ case reflection::UByte:
+ return TypedSerialize<uint8_t>(items_, builder);
+ break;
+ case reflection::Int:
+ return TypedSerialize<int>(items_, builder);
+ break;
+ case reflection::UInt:
+ return TypedSerialize<uint>(items_, builder);
+ break;
+ case reflection::Long:
+ return TypedSerialize<int64>(items_, builder);
+ break;
+ case reflection::ULong:
+ return TypedSerialize<uint64>(items_, builder);
+ break;
+ case reflection::Float:
+ return TypedSerialize<float>(items_, builder);
+ break;
+ case reflection::Double:
+ return TypedSerialize<double>(items_, builder);
+ break;
+ default:
+ TC3_LOG(FATAL) << "Unsupported type: " << field_->type()->element();
+ break;
+ }
+ TC3_LOG(FATAL) << "Invalid state.";
+ return 0;
+}
+
+flatbuffers::uoffset_t RepeatedField::SerializeString(
+ flatbuffers::FlatBufferBuilder* builder) const {
+ std::vector<flatbuffers::Offset<flatbuffers::String>> offsets(items_.size());
+ for (int i = 0; i < items_.size(); i++) {
+ offsets[i] = builder->CreateString(items_[i].ConstRefValue<std::string>());
+ }
+ return builder->CreateVector(offsets).o;
+}
+
+flatbuffers::uoffset_t RepeatedField::SerializeObject(
+ flatbuffers::FlatBufferBuilder* builder) const {
+ std::vector<flatbuffers::Offset<void>> offsets(object_items_.size());
+ for (int i = 0; i < object_items_.size(); i++) {
+ offsets[i] = object_items_[i]->Serialize(builder);
+ }
+ return builder->CreateVector(offsets).o;
+}
+
} // namespace libtextclassifier3
diff --git a/native/utils/flatbuffers.h b/native/utils/flatbuffers.h
index 81bc0b5..aaf248e 100644
--- a/native/utils/flatbuffers.h
+++ b/native/utils/flatbuffers.h
@@ -19,9 +19,9 @@
#ifndef LIBTEXTCLASSIFIER_UTILS_FLATBUFFERS_H_
#define LIBTEXTCLASSIFIER_UTILS_FLATBUFFERS_H_
-#include <map>
#include <memory>
#include <string>
+#include <unordered_map>
#include "annotator/model_generated.h"
#include "utils/base/logging.h"
@@ -30,13 +30,12 @@
#include "utils/variant.h"
#include "flatbuffers/flatbuffers.h"
#include "flatbuffers/reflection.h"
+#include "flatbuffers/reflection_generated.h"
namespace libtextclassifier3 {
class ReflectiveFlatBuffer;
class RepeatedField;
-template <typename T>
-class TypedRepeatedField;
// Loads and interprets the buffer as 'FlatbufferMessage' and verifies its
// integrity.
@@ -104,6 +103,41 @@
builder.GetSize());
}
+class ReflectiveFlatbuffer;
+
+// Checks whether a variant value type agrees with a field type.
+template <typename T>
+bool IsMatchingType(const reflection::BaseType type) {
+ switch (type) {
+ case reflection::Bool:
+ return std::is_same<T, bool>::value;
+ case reflection::Byte:
+ return std::is_same<T, int8>::value;
+ case reflection::UByte:
+ return std::is_same<T, uint8>::value;
+ case reflection::Int:
+ return std::is_same<T, int32>::value;
+ case reflection::UInt:
+ return std::is_same<T, uint32>::value;
+ case reflection::Long:
+ return std::is_same<T, int64>::value;
+ case reflection::ULong:
+ return std::is_same<T, uint64>::value;
+ case reflection::Float:
+ return std::is_same<T, float>::value;
+ case reflection::Double:
+ return std::is_same<T, double>::value;
+ case reflection::String:
+ return std::is_same<T, std::string>::value ||
+ std::is_same<T, StringPiece>::value ||
+ std::is_same<T, const char*>::value;
+ case reflection::Obj:
+ return std::is_same<T, ReflectiveFlatbuffer>::value;
+ default:
+ return false;
+ }
+}
+
// A flatbuffer that can be built using flatbuffer reflection data of the
// schema.
// Normally, field information is hard-coded in code generated from a flatbuffer
@@ -122,119 +156,58 @@
// field was not defined.
const reflection::Field* GetFieldOrNull(const StringPiece field_name) const;
const reflection::Field* GetFieldOrNull(const FlatbufferField* field) const;
- const reflection::Field* GetFieldByOffsetOrNull(const int field_offset) const;
+ const reflection::Field* GetFieldOrNull(const int field_offset) const;
// Gets a nested field and the message it is defined on.
bool GetFieldWithParent(const FlatbufferFieldPath* field_path,
ReflectiveFlatbuffer** parent,
reflection::Field const** field);
- // Checks whether a variant value type agrees with a field type.
- template <typename T>
- bool IsMatchingType(const reflection::BaseType type) const {
- switch (type) {
- case reflection::Bool:
- return std::is_same<T, bool>::value;
- case reflection::Byte:
- return std::is_same<T, int8>::value;
- case reflection::UByte:
- return std::is_same<T, uint8>::value;
- case reflection::Int:
- return std::is_same<T, int32>::value;
- case reflection::UInt:
- return std::is_same<T, uint32>::value;
- case reflection::Long:
- return std::is_same<T, int64>::value;
- case reflection::ULong:
- return std::is_same<T, uint64>::value;
- case reflection::Float:
- return std::is_same<T, float>::value;
- case reflection::Double:
- return std::is_same<T, double>::value;
- case reflection::String:
- return std::is_same<T, std::string>::value ||
- std::is_same<T, StringPiece>::value ||
- std::is_same<T, const char*>::value;
- case reflection::Obj:
- return std::is_same<T, ReflectiveFlatbuffer>::value;
- default:
- return false;
- }
- }
-
- // Sets a (primitive) field to a specific value.
+ // Sets a field to a specific value.
// Returns true if successful, and false if the field was not found or the
// expected type doesn't match.
template <typename T>
- bool Set(StringPiece field_name, T value) {
- if (const reflection::Field* field = GetFieldOrNull(field_name)) {
- return Set<T>(field, value);
- }
- return false;
- }
+ bool Set(StringPiece field_name, T value);
- // Sets a (primitive) field to a specific value.
+ // Sets a field to a specific value.
// Returns true if successful, and false if the expected type doesn't match.
// Expects `field` to be non-null.
template <typename T>
- bool Set(const reflection::Field* field, T value) {
- if (field == nullptr) {
- TC3_LOG(ERROR) << "Expected non-null field.";
- return false;
- }
- Variant variant_value(value);
- if (!IsMatchingType<T>(field->type()->base_type())) {
- TC3_LOG(ERROR) << "Type mismatch for field `" << field->name()->str()
- << "`, expected: " << field->type()->base_type()
- << ", got: " << variant_value.GetType();
- return false;
- }
- fields_[field] = variant_value;
- return true;
- }
+ bool Set(const reflection::Field* field, T value);
+ // Sets a field to a specific value. Field is specified by path.
template <typename T>
- bool Set(const FlatbufferFieldPath* path, T value) {
- ReflectiveFlatbuffer* parent;
- const reflection::Field* field;
- if (!GetFieldWithParent(path, &parent, &field)) {
- return false;
- }
- return parent->Set<T>(field, value);
- }
+ bool Set(const FlatbufferFieldPath* path, T value);
- // Sets a (primitive) field to a specific value.
- // Parses the string value according to the field type.
- bool ParseAndSet(const reflection::Field* field, const std::string& value);
- bool ParseAndSet(const FlatbufferFieldPath* path, const std::string& value);
-
- // Gets the reflective flatbuffer for a table field.
+ // Sets sub-message field (if not set yet), and returns a pointer to it.
// Returns nullptr if the field was not found, or the field type was not a
// table.
ReflectiveFlatbuffer* Mutable(StringPiece field_name);
ReflectiveFlatbuffer* Mutable(const reflection::Field* field);
+ // Parses the value (according to the type) and sets a primitive field to the
+ // parsed value.
+ bool ParseAndSet(const reflection::Field* field, const std::string& value);
+ bool ParseAndSet(const FlatbufferFieldPath* path, const std::string& value);
+
+ // Adds a primitive value to the repeated field.
+ template <typename T>
+ bool Add(StringPiece field_name, T value);
+
+  // Adds a sub-message to the repeated field.
+ ReflectiveFlatbuffer* Add(StringPiece field_name);
+
+ template <typename T>
+ bool Add(const reflection::Field* field, T value);
+
+ ReflectiveFlatbuffer* Add(const reflection::Field* field);
+
// Gets the reflective flatbuffer for a repeated field.
// Returns nullptr if the field was not found, or the field type was not a
// vector.
RepeatedField* Repeated(StringPiece field_name);
RepeatedField* Repeated(const reflection::Field* field);
- template <typename T>
- TypedRepeatedField<T>* Repeated(const reflection::Field* field) {
- if (!IsMatchingType<T>(field->type()->element())) {
- TC3_LOG(ERROR) << "Type mismatch for field `" << field->name()->str()
- << "`";
- return nullptr;
- }
- return static_cast<TypedRepeatedField<T>*>(Repeated(field));
- }
-
- template <typename T>
- TypedRepeatedField<T>* Repeated(StringPiece field_name) {
- return static_cast<TypedRepeatedField<T>*>(Repeated(field_name));
- }
-
// Serializes the flatbuffer.
flatbuffers::uoffset_t Serialize(
flatbuffers::FlatBufferBuilder* builder) const;
@@ -274,14 +247,15 @@
const reflection::Object* const type_;
// Cached primitive fields (scalars and strings).
- std::map<const reflection::Field*, Variant> fields_;
+ std::unordered_map<const reflection::Field*, Variant> fields_;
// Cached sub-messages.
- std::map<const reflection::Field*, std::unique_ptr<ReflectiveFlatbuffer>>
+ std::unordered_map<const reflection::Field*,
+ std::unique_ptr<ReflectiveFlatbuffer>>
children_;
// Cached repeated fields.
- std::map<const reflection::Field*, std::unique_ptr<RepeatedField>>
+ std::unordered_map<const reflection::Field*, std::unique_ptr<RepeatedField>>
repeated_fields_;
// Flattens the flatbuffer as a flat map.
@@ -316,77 +290,132 @@
// Serves as a common base class for repeated fields.
class RepeatedField {
public:
- virtual ~RepeatedField() {}
+ RepeatedField(const reflection::Schema* const schema,
+ const reflection::Field* field)
+ : schema_(schema),
+ field_(field),
+ is_primitive_(field->type()->element() != reflection::BaseType::Obj) {}
- virtual flatbuffers::uoffset_t Serialize(
- flatbuffers::FlatBufferBuilder* builder) const = 0;
-};
+ template <typename T>
+ bool Add(const T value);
-// Represents a repeated field of particular type.
-template <typename T>
-class TypedRepeatedField : public RepeatedField {
- public:
- void Add(const T value) { items_.push_back(value); }
+ ReflectiveFlatbuffer* Add();
- flatbuffers::uoffset_t Serialize(
- flatbuffers::FlatBufferBuilder* builder) const override {
- return builder->CreateVector(items_).o;
+ template <typename T>
+ T Get(int index) const {
+ return items_.at(index).Value<T>();
}
- private:
- std::vector<T> items_;
-};
-
-// Specialization for strings.
-template <>
-class TypedRepeatedField<std::string> : public RepeatedField {
- public:
- void Add(const std::string& value) { items_.push_back(value); }
-
- flatbuffers::uoffset_t Serialize(
- flatbuffers::FlatBufferBuilder* builder) const override {
- std::vector<flatbuffers::Offset<flatbuffers::String>> offsets(
- items_.size());
- for (int i = 0; i < items_.size(); i++) {
- offsets[i] = builder->CreateString(items_[i]);
+ template <>
+ ReflectiveFlatbuffer* Get(int index) const {
+ if (is_primitive_) {
+      TC3_LOG(ERROR) << "Trying to get a sub-message out of a primitive "
+                        "repeated field.";
+ return nullptr;
}
- return builder->CreateVector(offsets).o;
+ return object_items_.at(index).get();
}
- private:
- std::vector<std::string> items_;
-};
-
-// Specialization for repeated sub-messages.
-template <>
-class TypedRepeatedField<ReflectiveFlatbuffer> : public RepeatedField {
- public:
- TypedRepeatedField<ReflectiveFlatbuffer>(
- const reflection::Schema* const schema,
- const reflection::Type* const type)
- : schema_(schema), type_(type) {}
-
- ReflectiveFlatbuffer* Add() {
- items_.emplace_back(new ReflectiveFlatbuffer(
- schema_, schema_->objects()->Get(type_->index())));
- return items_.back().get();
+ int Size() const {
+ if (is_primitive_) {
+ return items_.size();
+ } else {
+ return object_items_.size();
+ }
}
flatbuffers::uoffset_t Serialize(
- flatbuffers::FlatBufferBuilder* builder) const override {
- std::vector<flatbuffers::Offset<void>> offsets(items_.size());
- for (int i = 0; i < items_.size(); i++) {
- offsets[i] = items_[i]->Serialize(builder);
- }
- return builder->CreateVector(offsets).o;
- }
+ flatbuffers::FlatBufferBuilder* builder) const;
private:
+ flatbuffers::uoffset_t SerializeString(
+ flatbuffers::FlatBufferBuilder* builder) const;
+ flatbuffers::uoffset_t SerializeObject(
+ flatbuffers::FlatBufferBuilder* builder) const;
+
const reflection::Schema* const schema_;
- const reflection::Type* const type_;
- std::vector<std::unique_ptr<ReflectiveFlatbuffer>> items_;
+ const reflection::Field* field_;
+ bool is_primitive_;
+
+ std::vector<Variant> items_;
+ std::vector<std::unique_ptr<ReflectiveFlatbuffer>> object_items_;
};
+template <typename T>
+bool ReflectiveFlatbuffer::Set(StringPiece field_name, T value) {
+ if (const reflection::Field* field = GetFieldOrNull(field_name)) {
+ if (field->type()->base_type() == reflection::BaseType::Vector ||
+ field->type()->base_type() == reflection::BaseType::Obj) {
+ TC3_LOG(ERROR)
+ << "Trying to set a primitive value on a non-scalar field.";
+ return false;
+ }
+ return Set<T>(field, value);
+ }
+ TC3_LOG(ERROR) << "Couldn't find a field: " << field_name;
+ return false;
+}
+
+template <typename T>
+bool ReflectiveFlatbuffer::Set(const reflection::Field* field, T value) {
+ if (field == nullptr) {
+ TC3_LOG(ERROR) << "Expected non-null field.";
+ return false;
+ }
+ Variant variant_value(value);
+ if (!IsMatchingType<T>(field->type()->base_type())) {
+ TC3_LOG(ERROR) << "Type mismatch for field `" << field->name()->str()
+ << "`, expected: " << field->type()->base_type()
+ << ", got: " << variant_value.GetType();
+ return false;
+ }
+ fields_[field] = variant_value;
+ return true;
+}
+
+template <typename T>
+bool ReflectiveFlatbuffer::Set(const FlatbufferFieldPath* path, T value) {
+ ReflectiveFlatbuffer* parent;
+ const reflection::Field* field;
+ if (!GetFieldWithParent(path, &parent, &field)) {
+ return false;
+ }
+ return parent->Set<T>(field, value);
+}
+
+template <typename T>
+bool ReflectiveFlatbuffer::Add(StringPiece field_name, T value) {
+ const reflection::Field* field = GetFieldOrNull(field_name);
+ if (field == nullptr) {
+ return false;
+ }
+
+ if (field->type()->base_type() != reflection::BaseType::Vector) {
+ return false;
+ }
+
+ return Add<T>(field, value);
+}
+
+template <typename T>
+bool ReflectiveFlatbuffer::Add(const reflection::Field* field, T value) {
+ if (field == nullptr) {
+ return false;
+ }
+ Repeated(field)->Add(value);
+ return true;
+}
+
+template <typename T>
+bool RepeatedField::Add(const T value) {
+ if (!is_primitive_ || !IsMatchingType<T>(field_->type()->element())) {
+ TC3_LOG(ERROR) << "Trying to add value of unmatching type.";
+ return false;
+ }
+ items_.push_back(Variant{value});
+ return true;
+}
+
// Resolves field lookups by name to the concrete field offsets.
bool SwapFieldNamesForOffsetsInPath(const reflection::Schema* schema,
FlatbufferFieldPathT* path);
@@ -400,7 +429,7 @@
return false;
}
- TypedRepeatedField<T>* to_repeated = Repeated<T>(field);
+ RepeatedField* to_repeated = Repeated(field);
for (const T element : *from_vector) {
to_repeated->Add(element);
}
diff --git a/native/utils/flatbuffers_test_extended.fbs b/native/utils/flatbuffers_test_extended.fbs
deleted file mode 100644
index ca679dc..0000000
--- a/native/utils/flatbuffers_test_extended.fbs
+++ /dev/null
@@ -1,50 +0,0 @@
-//
-// Copyright (C) 2018 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-namespace libtextclassifier3.test;
-
-table FlightNumberInfo {
- carrier_code: string;
- flight_code: int;
-}
-
-table ContactInfo {
- first_name: string;
- last_name: string;
- phone_number: string;
- score: float;
-}
-
-table Reminder {
- title: string;
- notes: [string];
-}
-
-table EntityData {
- an_int_field: int;
- a_long_field: int64;
- a_bool_field: bool;
- a_float_field: float;
- a_double_field: double;
- flight_number: FlightNumberInfo;
- contact_info: ContactInfo;
- reminders: [Reminder];
- numbers: [int];
- strings: [string];
- mystic: string; // Extra field.
-}
-
-root_type libtextclassifier3.test.EntityData;
diff --git a/native/utils/grammar/rules-utils.cc b/native/utils/grammar/rules-utils.cc
index e60b9f6..56c928a 100644
--- a/native/utils/grammar/rules-utils.cc
+++ b/native/utils/grammar/rules-utils.cc
@@ -54,12 +54,12 @@
return shards;
}
-std::vector<RuleMatch> DeduplicateMatches(
- const std::vector<RuleMatch>& matches) {
- std::vector<RuleMatch> sorted_candidates = matches;
+std::vector<Derivation> DeduplicateDerivations(
+ const std::vector<Derivation>& derivations) {
+ std::vector<Derivation> sorted_candidates = derivations;
std::stable_sort(
sorted_candidates.begin(), sorted_candidates.end(),
- [](const RuleMatch& a, const RuleMatch& b) {
+ [](const Derivation& a, const Derivation& b) {
// Sort by id.
if (a.rule_id != b.rule_id) {
return a.rule_id < b.rule_id;
@@ -75,9 +75,9 @@
});
// Deduplicate by overlap.
- std::vector<RuleMatch> result;
+ std::vector<Derivation> result;
for (int i = 0; i < sorted_candidates.size(); i++) {
- const RuleMatch& candidate = sorted_candidates[i];
+ const Derivation& candidate = sorted_candidates[i];
bool eliminated = false;
// Due to the sorting above, the candidate can only be completely
diff --git a/native/utils/grammar/rules-utils.h b/native/utils/grammar/rules-utils.h
index 28ed5b3..e6ac541 100644
--- a/native/utils/grammar/rules-utils.h
+++ b/native/utils/grammar/rules-utils.h
@@ -37,18 +37,18 @@
const std::vector<std::vector<Locale>>& shard_locales,
const std::vector<Locale>& locales);
-// Deduplicates rule matches by containing overlap.
+// Deduplicates rule derivations by containing overlap.
// The grammar system can output multiple candidates for optional parts.
// For example if a rule has an optional suffix, we
-// will get two rule matches when the suffix is present: one with and one
+// will get two rule derivations when the suffix is present: one with and one
// without the suffix. We therefore deduplicate by containing overlap, viz. from
// two candidates we keep the longer one if it completely contains the shorter.
-struct RuleMatch {
+struct Derivation {
const Match* match;
int64 rule_id;
};
-std::vector<RuleMatch> DeduplicateMatches(
- const std::vector<RuleMatch>& matches);
+std::vector<Derivation> DeduplicateDerivations(
+ const std::vector<Derivation>& derivations);
// Checks that all assertions of a match tree are fulfilled.
bool VerifyAssertions(const Match* match);
diff --git a/native/utils/grammar/rules-utils_test.cc b/native/utils/grammar/rules-utils_test.cc
new file mode 100644
index 0000000..6391be1
--- /dev/null
+++ b/native/utils/grammar/rules-utils_test.cc
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/grammar/rules-utils.h"
+
+#include <vector>
+
+#include "utils/grammar/match.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3::grammar {
+namespace {
+
+using testing::ElementsAre;
+using testing::Value;
+
+// Create test match object.
+Match CreateMatch(const CodepointIndex begin, const CodepointIndex end) {
+ Match match;
+ match.Init(0, CodepointSpan{begin, end},
+ /*arg_match_offset=*/begin);
+ return match;
+}
+
+MATCHER_P(IsDerivation, candidate, "") {
+ return Value(arg.rule_id, candidate.rule_id) &&
+ Value(arg.match, candidate.match);
+}
+
+TEST(UtilsTest, DeduplicatesMatches) {
+ // Overlapping matches from the same rule.
+ Match matches[] = {CreateMatch(0, 1), CreateMatch(1, 2), CreateMatch(0, 2)};
+ const std::vector<Derivation> candidates = {{&matches[0], /*rule_id=*/0},
+ {&matches[1], /*rule_id=*/0},
+ {&matches[2], /*rule_id=*/0}};
+
+ // Keep longest.
+ EXPECT_THAT(DeduplicateDerivations(candidates),
+ ElementsAre(IsDerivation(candidates[2])));
+}
+
+TEST(UtilsTest, DeduplicatesMatchesPerRule) {
+ // Overlapping matches from different rules.
+ Match matches[] = {CreateMatch(0, 1), CreateMatch(1, 2), CreateMatch(0, 2)};
+ const std::vector<Derivation> candidates = {{&matches[0], /*rule_id=*/0},
+ {&matches[1], /*rule_id=*/0},
+ {&matches[2], /*rule_id=*/0},
+ {&matches[0], /*rule_id=*/1}};
+
+ // Keep longest for rule 0, but also keep match from rule 1.
+ EXPECT_THAT(
+ DeduplicateDerivations(candidates),
+ ElementsAre(IsDerivation(candidates[2]), IsDerivation(candidates[3])));
+}
+
+TEST(UtilsTest, KeepNonoverlapping) {
+ // Non-overlapping matches.
+ Match matches[] = {CreateMatch(0, 1), CreateMatch(1, 2), CreateMatch(2, 3)};
+ const std::vector<Derivation> candidates = {{&matches[0], /*rule_id=*/0},
+ {&matches[1], /*rule_id=*/0},
+ {&matches[2], /*rule_id=*/0}};
+
+ // Keep all matches.
+ EXPECT_THAT(
+ DeduplicateDerivations(candidates),
+ ElementsAre(IsDerivation(candidates[0]), IsDerivation(candidates[1]),
+ IsDerivation(candidates[2])));
+}
+
+} // namespace
+} // namespace libtextclassifier3::grammar
diff --git a/native/utils/grammar/types.h b/native/utils/grammar/types.h
index ae45931..a79532b 100644
--- a/native/utils/grammar/types.h
+++ b/native/utils/grammar/types.h
@@ -37,6 +37,7 @@
kAssertion = -2,
kMapping = -3,
kExclusion = -4,
+ kRootRule = 1,
};
// Special CallbackId indicating that there's no callback associated with a
diff --git a/native/utils/grammar/utils/ir.cc b/native/utils/grammar/utils/ir.cc
index 32c21c6..ce074b8 100644
--- a/native/utils/grammar/utils/ir.cc
+++ b/native/utils/grammar/utils/ir.cc
@@ -25,6 +25,18 @@
constexpr size_t kMaxHashTableSize = 100;
+template <typename T>
+void SortForBinarySearchLookup(T* entries) {
+ std::sort(entries->begin(), entries->end(),
+ [](const auto& a, const auto& b) { return a->key < b->key; });
+}
+
+template <typename T>
+void SortStructsForBinarySearchLookup(T* entries) {
+ std::sort(entries->begin(), entries->end(),
+ [](const auto& a, const auto& b) { return a.key() < b.key(); });
+}
+
bool IsSameLhs(const Ir::Lhs& lhs, const RulesSet_::Lhs& other) {
return (lhs.nonterminal == other.nonterminal() &&
lhs.callback.id == other.callback_id() &&
@@ -129,10 +141,7 @@
rules->unary_rules.push_back(RulesSet_::Rules_::UnaryRulesEntry(
it.first, AddLhsSet(it.second, rules_set)));
}
-
- // Sort for binary search lookup.
- std::sort(rules->unary_rules.begin(), rules->unary_rules.end(),
- [](const auto& a, const auto& b) { return a.key() < b.key(); });
+ SortStructsForBinarySearchLookup(&rules->unary_rules);
}
// // Serializes a binary rules table.
@@ -255,6 +264,10 @@
return lhs;
}
+void Ir::AddAnnotation(const Nonterm lhs, const std::string& annotation) {
+ annotations_.emplace_back(annotation, lhs);
+}
+
// Serializes the terminal rules table.
void Ir::SerializeTerminalRules(
RulesSetT* rules_set,
@@ -398,12 +411,7 @@
output->callback.push_back(RulesSet_::CallbackEntry(
filter_callback_id, RulesSet_::Callback(/*is_filter=*/true)));
}
- // Sort for binary search.
- std::sort(
- output->callback.begin(), output->callback.end(),
- [](const RulesSet_::CallbackEntry& a, const RulesSet_::CallbackEntry& b) {
- return a.key() < b.key();
- });
+ SortStructsForBinarySearchLookup(&output->callback);
// Add information about predefined nonterminal classes.
output->nonterminals.reset(new RulesSet_::NonterminalsT);
@@ -421,6 +429,13 @@
output->nonterminals->n_digits_nt[i - 1] = n_digits_nt;
}
}
+ for (const auto& [annotation, annotation_nt] : annotations_) {
+ output->nonterminals->annotation_nt.emplace_back(
+ new RulesSet_::Nonterminals_::AnnotationNtEntryT);
+ output->nonterminals->annotation_nt.back()->key = annotation;
+ output->nonterminals->annotation_nt.back()->value = annotation_nt;
+ }
+ SortForBinarySearchLookup(&output->nonterminals->annotation_nt);
if (include_debug_information) {
output->debug_information.reset(new RulesSet_::DebugInformationT);
@@ -431,10 +446,7 @@
output->debug_information->nonterminal_names.back()->key = it.first;
output->debug_information->nonterminal_names.back()->value = it.second;
}
- // Sort for binary search lookup.
- std::sort(output->debug_information->nonterminal_names.begin(),
- output->debug_information->nonterminal_names.end(),
- [](const auto& a, const auto& b) { return a->key < b->key; });
+ SortForBinarySearchLookup(&output->debug_information->nonterminal_names);
}
// Add regex rules.
diff --git a/native/utils/grammar/utils/ir.h b/native/utils/grammar/utils/ir.h
index e1fbc0a..b05b87f 100644
--- a/native/utils/grammar/utils/ir.h
+++ b/native/utils/grammar/utils/ir.h
@@ -172,6 +172,9 @@
// Adds a regex rule <lhs> ::= <regex_pattern>.
Nonterm AddRegex(Nonterm lhs, const std::string& regex_pattern);
+ // Adds a definition for a nonterminal provided by a text annotation.
+ void AddAnnotation(Nonterm lhs, const std::string& annotation);
+
// Serializes a rule set in the intermediate representation into the
// memory mappable inference format.
void Serialize(bool include_debug_information, RulesSetT* output) const;
@@ -220,6 +223,9 @@
// The regex rules.
std::vector<std::pair<std::string, Nonterm>> regex_rules_;
+ // Mapping from annotation name to nonterminal.
+ std::vector<std::pair<std::string, Nonterm>> annotations_;
+
// Debug information.
std::unordered_map<Nonterm, std::string> nonterminal_names_;
std::unordered_map<std::string, Nonterm> nonterminal_ids_;
diff --git a/native/utils/grammar/utils/ir_test.cc b/native/utils/grammar/utils/ir_test.cc
new file mode 100644
index 0000000..d2438dd
--- /dev/null
+++ b/native/utils/grammar/utils/ir_test.cc
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/grammar/utils/ir.h"
+
+#include "utils/grammar/rules_generated.h"
+#include "utils/grammar/types.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3::grammar {
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::Ne;
+using ::testing::SizeIs;
+
+TEST(IrTest, HandlesSharingWithTerminalRules) {
+ Ir ir;
+
+ // <t1> ::= the
+ const Nonterm t1 = ir.Add(kUnassignedNonterm, "the");
+
+ // <t2> ::= quick
+ const Nonterm t2 = ir.Add(kUnassignedNonterm, "quick");
+
+ // <t3> ::= quick -- should share with <t2>
+ const Nonterm t3 = ir.Add(kUnassignedNonterm, "quick");
+
+ // <t4> ::= quick -- specify unshareable <t4>
+ // <t4> ::= brown
+ const Nonterm t4_unshareable = ir.AddUnshareableNonterminal();
+ ir.Add(t4_unshareable, "quick");
+ ir.Add(t4_unshareable, "brown");
+
+ // <t5> ::= brown -- should not be shared with <t4>
+ const Nonterm t5 = ir.Add(kUnassignedNonterm, "brown");
+
+ // <t6> ::= brown -- specify unshareable <t6>
+ const Nonterm t6_unshareable = ir.AddUnshareableNonterminal();
+ ir.Add(t6_unshareable, "brown");
+
+ // <t7> ::= brown -- should share with <t5>
+ const Nonterm t7 = ir.Add(kUnassignedNonterm, "brown");
+
+ EXPECT_THAT(t1, Ne(kUnassignedNonterm));
+ EXPECT_THAT(t2, Ne(kUnassignedNonterm));
+ EXPECT_THAT(t1, Ne(t2));
+ EXPECT_THAT(t2, Eq(t3));
+ EXPECT_THAT(t4_unshareable, Ne(kUnassignedNonterm));
+ EXPECT_THAT(t4_unshareable, Ne(t3));
+ EXPECT_THAT(t4_unshareable, Ne(t5));
+ EXPECT_THAT(t6_unshareable, Ne(kUnassignedNonterm));
+ EXPECT_THAT(t6_unshareable, Ne(t4_unshareable));
+ EXPECT_THAT(t6_unshareable, Ne(t5));
+ EXPECT_THAT(t7, Eq(t5));
+}
+
+TEST(IrTest, HandlesSharingWithNonterminalRules) {
+ Ir ir;
+
+ // Setup a few terminal rules.
+ const std::vector<Nonterm> rhs = {
+ ir.Add(kUnassignedNonterm, "the"), ir.Add(kUnassignedNonterm, "quick"),
+ ir.Add(kUnassignedNonterm, "brown"), ir.Add(kUnassignedNonterm, "fox")};
+
+ // Check for proper sharing using nonterminal rules.
+ for (int rhs_length = 1; rhs_length <= rhs.size(); rhs_length++) {
+ std::vector<Nonterm> rhs_truncated = rhs;
+ rhs_truncated.resize(rhs_length);
+ const Nonterm nt_u = ir.AddUnshareableNonterminal();
+ ir.Add(nt_u, rhs_truncated);
+ const Nonterm nt_1 = ir.Add(kUnassignedNonterm, rhs_truncated);
+ const Nonterm nt_2 = ir.Add(kUnassignedNonterm, rhs_truncated);
+
+ EXPECT_THAT(nt_1, Eq(nt_2));
+ EXPECT_THAT(nt_1, Ne(nt_u));
+ }
+}
+
+TEST(IrTest, HandlesSharingWithCallbacksWithSameParameters) {
+ // Test sharing in the presence of callbacks.
+ constexpr CallbackId kOutput1 = 1;
+ constexpr CallbackId kOutput2 = 2;
+ constexpr CallbackId kFilter1 = 3;
+ constexpr CallbackId kFilter2 = 4;
+ Ir ir(/*filters=*/{kFilter1, kFilter2});
+
+ const Nonterm x1 = ir.Add(kUnassignedNonterm, "hello");
+ const Nonterm x2 =
+ ir.Add(Ir::Lhs{kUnassignedNonterm, {kOutput1, 0}}, "hello");
+ const Nonterm x3 =
+ ir.Add(Ir::Lhs{kUnassignedNonterm, {kFilter1, 0}}, "hello");
+ const Nonterm x4 =
+ ir.Add(Ir::Lhs{kUnassignedNonterm, {kOutput2, 0}}, "hello");
+ const Nonterm x5 =
+ ir.Add(Ir::Lhs{kUnassignedNonterm, {kFilter2, 0}}, "hello");
+
+ // Duplicate entry.
+ const Nonterm x6 =
+ ir.Add(Ir::Lhs{kUnassignedNonterm, {kOutput2, 0}}, "hello");
+
+ EXPECT_THAT(x2, Eq(x1));
+ EXPECT_THAT(x3, Ne(x1));
+ EXPECT_THAT(x4, Eq(x1));
+ EXPECT_THAT(x5, Ne(x1));
+ EXPECT_THAT(x5, Ne(x3));
+ EXPECT_THAT(x6, Ne(x3));
+}
+
+TEST(IrTest, HandlesSharingWithCallbacksWithDifferentParameters) {
+ // Test sharing in the presence of callbacks.
+ constexpr CallbackId kOutput = 1;
+ constexpr CallbackId kFilter = 2;
+ Ir ir(/*filters=*/{kFilter});
+
+ const Nonterm x1 = ir.Add(Ir::Lhs{kUnassignedNonterm, {kOutput, 0}}, "world");
+ const Nonterm x2 = ir.Add(Ir::Lhs{kUnassignedNonterm, {kOutput, 1}}, "world");
+ const Nonterm x3 = ir.Add(Ir::Lhs{kUnassignedNonterm, {kFilter, 0}}, "world");
+ const Nonterm x4 = ir.Add(Ir::Lhs{kUnassignedNonterm, {kFilter, 1}}, "world");
+
+ EXPECT_THAT(x2, Eq(x1));
+ EXPECT_THAT(x3, Ne(x1));
+ EXPECT_THAT(x4, Ne(x1));
+ EXPECT_THAT(x4, Ne(x3));
+}
+
+TEST(IrTest, SerializesRulesToFlatbufferFormat) {
+ constexpr CallbackId kOutput = 1;
+ Ir ir;
+ const Nonterm verb = ir.AddUnshareableNonterminal();
+ ir.Add(verb, "buy");
+ ir.Add(Ir::Lhs{verb, {kOutput}}, "bring");
+ ir.Add(verb, "upbring");
+ ir.Add(verb, "remind");
+ const Nonterm set_reminder = ir.AddUnshareableNonterminal();
+ ir.Add(set_reminder,
+ std::vector<Nonterm>{ir.Add(kUnassignedNonterm, "remind"),
+ ir.Add(kUnassignedNonterm, "me"),
+ ir.Add(kUnassignedNonterm, "to"), verb});
+ const Nonterm action = ir.AddUnshareableNonterminal();
+ ir.Add(action, set_reminder);
+ RulesSetT rules;
+ ir.Serialize(/*include_debug_information=*/false, &rules);
+
+ EXPECT_THAT(rules.rules, SizeIs(1));
+
+ // Only one rule uses a callback, the rest will be encoded directly.
+ EXPECT_THAT(rules.lhs, SizeIs(1));
+ EXPECT_THAT(rules.lhs.front().callback_id(), kOutput);
+
+ // 6 distinct terminals: "buy", "upbring", "bring", "remind", "me" and "to".
+ EXPECT_THAT(rules.rules.front()->lowercase_terminal_rules->terminal_offsets,
+ SizeIs(6));
+ EXPECT_THAT(rules.rules.front()->terminal_rules->terminal_offsets, IsEmpty());
+
+ // As "bring" is a suffix of "upbring" it is expected to be suffix merged in
+ // the string pool
+ EXPECT_THAT(rules.terminals,
+ Eq(std::string("buy\0me\0remind\0to\0upbring\0", 25)));
+
+ EXPECT_THAT(rules.rules.front()->binary_rules, SizeIs(3));
+
+ // One unary rule: <action> ::= <set_reminder>
+ EXPECT_THAT(rules.rules.front()->unary_rules, SizeIs(1));
+}
+
+TEST(IrTest, HandlesRulesSharding) {
+ Ir ir(/*filters=*/{}, /*num_shards=*/2);
+ const Nonterm verb = ir.AddUnshareableNonterminal();
+ const Nonterm set_reminder = ir.AddUnshareableNonterminal();
+
+ // Shard 0: en
+ ir.Add(verb, "buy");
+ ir.Add(verb, "bring");
+ ir.Add(verb, "remind");
+ ir.Add(set_reminder,
+ std::vector<Nonterm>{ir.Add(kUnassignedNonterm, "remind"),
+ ir.Add(kUnassignedNonterm, "me"),
+ ir.Add(kUnassignedNonterm, "to"), verb});
+
+ // Shard 1: de
+ ir.Add(verb, "kaufen", /*case_sensitive=*/false, /*shard=*/1);
+ ir.Add(verb, "bringen", /*case_sensitive=*/false, /*shard=*/1);
+ ir.Add(verb, "erinnern", /*case_sensitive=*/false, /*shard=*/1);
+ ir.Add(set_reminder,
+ std::vector<Nonterm>{ir.Add(kUnassignedNonterm, "erinnere",
+ /*case_sensitive=*/false, /*shard=*/1),
+ ir.Add(kUnassignedNonterm, "mich",
+ /*case_sensitive=*/false, /*shard=*/1),
+ ir.Add(kUnassignedNonterm, "zu",
+ /*case_sensitive=*/false, /*shard=*/1),
+ verb},
+ /*shard=*/1);
+
+ // Test that terminal strings are correctly merged into the shared
+ // string pool.
+ RulesSetT rules;
+ ir.Serialize(/*include_debug_information=*/false, &rules);
+
+ EXPECT_THAT(rules.rules, SizeIs(2));
+
+ // 5 distinct terminals: "buy", "bring", "remind", "me" and "to".
+ EXPECT_THAT(rules.rules[0]->lowercase_terminal_rules->terminal_offsets,
+ SizeIs(5));
+ EXPECT_THAT(rules.rules[0]->terminal_rules->terminal_offsets, IsEmpty());
+
+ // 6 distinct terminals: "kaufen", "bringen", "erinnern", "erinnere", "mich"
+ // and "zu".
+ EXPECT_THAT(rules.rules[1]->lowercase_terminal_rules->terminal_offsets,
+ SizeIs(6));
+ EXPECT_THAT(rules.rules[1]->terminal_rules->terminal_offsets, IsEmpty());
+
+ EXPECT_THAT(rules.terminals,
+ Eq(std::string("bring\0bringen\0buy\0erinnere\0erinnern\0kaufen\0"
+ "me\0mich\0remind\0to\0zu\0",
+ 64)));
+
+ EXPECT_THAT(rules.rules[0]->binary_rules, SizeIs(3));
+ EXPECT_THAT(rules.rules[1]->binary_rules, SizeIs(3));
+}
+
+} // namespace
+} // namespace libtextclassifier3::grammar
diff --git a/native/utils/grammar/utils/rules.cc b/native/utils/grammar/utils/rules.cc
index e6c01e0..d6e4b76 100644
--- a/native/utils/grammar/utils/rules.cc
+++ b/native/utils/grammar/utils/rules.cc
@@ -20,6 +20,7 @@
#include "utils/grammar/utils/ir.h"
#include "utils/strings/append.h"
+#include "utils/strings/stringpiece.h"
namespace libtextclassifier3::grammar {
namespace {
@@ -136,14 +137,18 @@
} // namespace
-int Rules::AddNonterminal(StringPiece nonterminal_name) {
- const std::string key = nonterminal_name.ToString();
+int Rules::AddNonterminal(const std::string& nonterminal_name) {
+ std::string key = nonterminal_name;
+ auto alias_it = nonterminal_alias_.find(key);
+ if (alias_it != nonterminal_alias_.end()) {
+ key = alias_it->second;
+ }
auto it = nonterminal_names_.find(key);
if (it != nonterminal_names_.end()) {
return it->second;
}
const int index = nonterminals_.size();
- nonterminals_.push_back(NontermInfo{nonterminal_name.ToString()});
+ nonterminals_.push_back(NontermInfo{key});
nonterminal_names_.insert(it, {key, index});
return index;
}
@@ -154,6 +159,39 @@
return index;
}
+void Rules::AddAlias(const std::string& nonterminal_name,
+ const std::string& alias) {
+ TC3_CHECK_EQ(nonterminal_alias_.insert_or_assign(alias, nonterminal_name)
+ .first->second,
+ nonterminal_name)
+ << "Cannot redefine alias: " << alias;
+}
+
+// Defines a nonterminal for an externally provided annotation.
+int Rules::AddAnnotation(const std::string& annotation_name) {
+ auto [it, inserted] =
+ annotation_nonterminals_.insert({annotation_name, nonterminals_.size()});
+ if (inserted) {
+ nonterminals_.push_back(NontermInfo{});
+ }
+ return it->second;
+}
+
+void Rules::BindAnnotation(const std::string& nonterminal_name,
+ const std::string& annotation_name) {
+ auto [_, inserted] = annotation_nonterminals_.insert(
+ {annotation_name, AddNonterminal(nonterminal_name)});
+ TC3_CHECK(inserted);
+}
+
+bool Rules::IsNonterminalOfName(const RhsElement& element,
+ const std::string& nonterminal) const {
+ if (element.is_terminal) {
+ return false;
+ }
+ return (nonterminals_[element.nonterminal].name == nonterminal);
+}
+
// Note: For k optional components this creates 2^k rules, but it would be
// possible to be smarter about this and only use 2k rules instead.
// However that might be slower as it requires an extra rule firing at match
@@ -199,30 +237,95 @@
optional_element_indices_end, omit_these);
}
+std::vector<Rules::RhsElement> Rules::ResolveAnchors(
+ const std::vector<RhsElement>& rhs) const {
+ if (rhs.size() <= 2) {
+ return rhs;
+ }
+ auto begin = rhs.begin();
+ auto end = rhs.end();
+ if (IsNonterminalOfName(rhs.front(), kStartNonterm) &&
+ IsNonterminalOfName(rhs[1], kFiller)) {
+ // Skip start anchor and filler.
+ begin += 2;
+ }
+ if (IsNonterminalOfName(rhs.back(), kEndNonterm) &&
+ IsNonterminalOfName(rhs[rhs.size() - 2], kFiller)) {
+ // Skip filler and end anchor.
+ end -= 2;
+ }
+ return std::vector<Rules::RhsElement>(begin, end);
+}
+
+std::vector<Rules::RhsElement> Rules::ResolveFillers(
+ const std::vector<RhsElement>& rhs) {
+ std::vector<RhsElement> result;
+ for (int i = 0; i < rhs.size();) {
+ if (i == rhs.size() - 1 || IsNonterminalOfName(rhs[i], kFiller) ||
+ rhs[i].is_optional || !IsNonterminalOfName(rhs[i + 1], kFiller)) {
+ result.push_back(rhs[i]);
+ i++;
+ continue;
+ }
+
+ // We have the case:
+ // <a> <filler>
+ // rewrite as:
+ // <a_with_tokens> ::= <a>
+ // <a_with_tokens> ::= <a_with_tokens> <token>
+ const int with_tokens_nonterminal = AddNewNonterminal();
+ const RhsElement token(AddNonterminal(kTokenNonterm),
+ /*is_optional=*/false);
+ if (rhs[i + 1].is_optional) {
+ // <a_with_tokens> ::= <a>
+ Add(with_tokens_nonterminal, {rhs[i]});
+ } else {
+ // <a_with_tokens> ::= <a> <token>
+ Add(with_tokens_nonterminal, {rhs[i], token});
+ }
+ // <a_with_tokens> ::= <a_with_tokens> <token>
+ const RhsElement with_tokens(with_tokens_nonterminal,
+ /*is_optional=*/false);
+ Add(with_tokens_nonterminal, {with_tokens, token});
+ result.push_back(with_tokens);
+ i += 2;
+ }
+ return result;
+}
+
+std::vector<Rules::RhsElement> Rules::OptimizeRhs(
+ const std::vector<RhsElement>& rhs) {
+ return ResolveFillers(ResolveAnchors(rhs));
+}
+
void Rules::Add(const int lhs, const std::vector<RhsElement>& rhs,
const CallbackId callback, const int64 callback_param,
const int8 max_whitespace_gap, const bool case_sensitive,
const int shard) {
+ // Resolve anchors and fillers.
+ const std::vector optimized_rhs = OptimizeRhs(rhs);
+
std::vector<int> optional_element_indices;
- TC3_CHECK_LT(optional_element_indices.size(), rhs.size())
+ TC3_CHECK_LT(optional_element_indices.size(), optimized_rhs.size())
<< "Rhs must contain at least one non-optional element.";
- for (int i = 0; i < rhs.size(); i++) {
- if (rhs[i].is_optional) {
+ for (int i = 0; i < optimized_rhs.size(); i++) {
+ if (optimized_rhs[i].is_optional) {
optional_element_indices.push_back(i);
}
}
- std::vector<bool> omit_these(rhs.size(), false);
- ExpandOptionals(lhs, rhs, callback, callback_param, max_whitespace_gap,
- case_sensitive, shard, optional_element_indices.begin(),
+ std::vector<bool> omit_these(optimized_rhs.size(), false);
+ ExpandOptionals(lhs, optimized_rhs, callback, callback_param,
+ max_whitespace_gap, case_sensitive, shard,
+ optional_element_indices.begin(),
optional_element_indices.end(), &omit_these);
}
-void Rules::Add(StringPiece lhs, const std::vector<std::string>& rhs,
+void Rules::Add(const std::string& lhs, const std::vector<std::string>& rhs,
const CallbackId callback, const int64 callback_param,
const int8 max_whitespace_gap, const bool case_sensitive,
const int shard) {
TC3_CHECK(!rhs.empty()) << "Rhs cannot be empty (Lhs=" << lhs << ")";
- TC3_CHECK(!IsPredefinedNonterminal(lhs.ToString()));
+ TC3_CHECK(!IsPredefinedNonterminal(lhs));
std::vector<RhsElement> rhs_elements;
rhs_elements.reserve(rhs.size());
for (StringPiece rhs_component : rhs) {
@@ -235,7 +338,7 @@
// Check whether this component is a non-terminal.
if (IsNonterminal(rhs_component)) {
rhs_elements.push_back(
- RhsElement(AddNonterminal(rhs_component), is_optional));
+ RhsElement(AddNonterminal(rhs_component.ToString()), is_optional));
} else {
// A terminal.
// Sanity check for common typos -- '<' or '>' in a terminal.
@@ -247,9 +350,9 @@
max_whitespace_gap, case_sensitive, shard);
}
-void Rules::AddWithExclusion(StringPiece lhs,
+void Rules::AddWithExclusion(const std::string& lhs,
const std::vector<std::string>& rhs,
- StringPiece excluded_nonterminal,
+ const std::string& excluded_nonterminal,
const int8 max_whitespace_gap,
const bool case_sensitive, const int shard) {
Add(lhs, rhs,
@@ -258,7 +361,8 @@
max_whitespace_gap, case_sensitive, shard);
}
-void Rules::AddAssertion(StringPiece lhs, const std::vector<std::string>& rhs,
+void Rules::AddAssertion(const std::string& lhs,
+ const std::vector<std::string>& rhs,
const bool negative, const int8 max_whitespace_gap,
const bool case_sensitive, const int shard) {
Add(lhs, rhs,
@@ -266,7 +370,7 @@
/*callback_param=*/negative, max_whitespace_gap, case_sensitive, shard);
}
-void Rules::AddValueMapping(StringPiece lhs,
+void Rules::AddValueMapping(const std::string& lhs,
const std::vector<std::string>& rhs,
const int64 value, const int8 max_whitespace_gap,
const bool case_sensitive, const int shard) {
@@ -275,7 +379,7 @@
/*callback_param=*/value, max_whitespace_gap, case_sensitive, shard);
}
-void Rules::AddRegex(StringPiece lhs, const std::string& regex_pattern) {
+void Rules::AddRegex(const std::string& lhs, const std::string& regex_pattern) {
AddRegex(AddNonterminal(lhs), regex_pattern);
}
@@ -305,7 +409,8 @@
for (int i = 0; i < nonterminals_.size(); i++) {
const NontermInfo& nonterminal = nonterminals_[i];
bool unmergeable =
- (nonterminal.rules.size() > 1 || !nonterminal.regex_rules.empty());
+ (nonterminal.from_annotation || nonterminal.rules.size() > 1 ||
+ !nonterminal.regex_rules.empty());
for (const int rule_index : nonterminal.rules) {
const Rule& rule = rules_[rule_index];
@@ -331,6 +436,11 @@
}
}
+ // Define annotations.
+ for (const auto& [annotation, nonterminal] : annotation_nonterminals_) {
+ rules.AddAnnotation(nonterminal_ids[nonterminal], annotation);
+ }
+
// Now, keep adding eligible rules (rules whose rhs is completely assigned)
// until we can't make any more progress.
// Note: The following code is quadratic in the worst case.
diff --git a/native/utils/grammar/utils/rules.h b/native/utils/grammar/utils/rules.h
index dc7d424..5a2cbc2 100644
--- a/native/utils/grammar/utils/rules.h
+++ b/native/utils/grammar/utils/rules.h
@@ -14,6 +14,9 @@
* limitations under the License.
*/
+// Utility functions for pre-processing, creating and testing context free
+// grammars.
+
#ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
#define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
@@ -22,13 +25,12 @@
#include "utils/grammar/types.h"
#include "utils/grammar/utils/ir.h"
-#include "utils/strings/stringpiece.h"
namespace libtextclassifier3::grammar {
-// Utility functions for pre-processing, creating and testing context free
-// grammars.
-//
+// Special nonterminals.
+constexpr const char* kFiller = "<filler>";
+
// All rules for a grammar will be collected in a rules object.
//
// Rules r;
@@ -89,6 +91,9 @@
// The name of the non-terminal, if defined.
std::string name;
+ // Whether the nonterminal is provided via an annotation.
+ bool from_annotation = false;
+
// Rules that have this non-terminal as the lhs.
std::vector<int> rules;
@@ -103,7 +108,7 @@
// * A terminal
// optionally followed by a `?` which indicates that the component is
// optional. The `rhs` must contain at least one non-optional component.
- void Add(StringPiece lhs, const std::vector<std::string>& rhs,
+ void Add(const std::string& lhs, const std::vector<std::string>& rhs,
const CallbackId callback = kNoCallback,
const int64 callback_param = 0, int8 max_whitespace_gap = -1,
bool case_sensitive = false, int shard = 0);
@@ -118,31 +123,44 @@
// Adds a rule `lhs ::= rhs` with exclusion.
// The rule only matches, if `excluded_nonterminal` doesn't match the same
// span.
- void AddWithExclusion(StringPiece lhs, const std::vector<std::string>& rhs,
- StringPiece excluded_nonterminal,
+ void AddWithExclusion(const std::string& lhs,
+ const std::vector<std::string>& rhs,
+ const std::string& excluded_nonterminal,
int8 max_whitespace_gap = -1,
bool case_sensitive = false, int shard = 0);
// Adds an assertion callback.
- void AddAssertion(StringPiece lhs, const std::vector<std::string>& rhs,
+ void AddAssertion(const std::string& lhs, const std::vector<std::string>& rhs,
bool negative = true, int8 max_whitespace_gap = -1,
bool case_sensitive = false, int shard = 0);
// Adds a mapping callback.
- void AddValueMapping(StringPiece lhs, const std::vector<std::string>& rhs,
- int64 value, int8 max_whitespace_gap = -1,
+ void AddValueMapping(const std::string& lhs,
+ const std::vector<std::string>& rhs, int64 value,
+ int8 max_whitespace_gap = -1,
bool case_sensitive = false, int shard = 0);
// Adds a regex rule.
- void AddRegex(StringPiece lhs, const std::string& regex_pattern);
+ void AddRegex(const std::string& lhs, const std::string& regex_pattern);
void AddRegex(int lhs, const std::string& regex_pattern);
// Creates a nonterminal with the given name, if one doesn't already exist.
- int AddNonterminal(StringPiece nonterminal_name);
+ int AddNonterminal(const std::string& nonterminal_name);
// Creates a new nonterminal.
int AddNewNonterminal();
+ // Defines a nonterminal for an externally provided annotation.
+ int AddAnnotation(const std::string& annotation_name);
+
+ // Binds an existing named nonterminal to an externally provided annotation.
+ void BindAnnotation(const std::string& nonterminal_name,
+ const std::string& annotation_name);
+
+ // Adds an alias for a nonterminal. This is a separate name for the same
+ // nonterminal.
+ void AddAlias(const std::string& nonterminal_name, const std::string& alias);
+
// Defines a new filter id.
void DefineFilter(const CallbackId filter_id) { filters_.insert(filter_id); }
@@ -161,11 +179,38 @@
std::vector<int>::const_iterator optional_element_indices_end,
std::vector<bool>* omit_these);
+ // Applies optimizations to the right hand side of a rule.
+ std::vector<RhsElement> OptimizeRhs(const std::vector<RhsElement>& rhs);
+
+ // Removes start and end anchors in case they are followed (respectively
+ // preceded) by unbounded filler.
+ std::vector<RhsElement> ResolveAnchors(
+ const std::vector<RhsElement>& rhs) const;
+
+ // Rewrites fillers in a rule.
+ // Fillers in a rule such as `lhs ::= <a> <filler> <b>` could be lowered as
+ // <tokens> ::= <token>
+ // <tokens> ::= <tokens> <token>
+ // This has the disadvantage that it will produce a match for each possible
+ // span in the text, which is quadratic in the number of tokens.
+ // It can be more efficiently written as:
+ // `lhs ::= <a_with_tokens> <b>` with
+ // `<a_with_tokens> ::= <a>`
+ // `<a_with_tokens> ::= <a_with_tokens> <token>`
+ // In this way, each occurrence of `<a>` can start a sequence of tokens.
+ std::vector<RhsElement> ResolveFillers(const std::vector<RhsElement>& rhs);
+
+ // Checks whether an element denotes a specific nonterminal.
+ bool IsNonterminalOfName(const RhsElement& element,
+ const std::string& nonterminal) const;
+
const int num_shards_;
// Non-terminal to id map.
std::unordered_map<std::string, int> nonterminal_names_;
std::vector<NontermInfo> nonterminals_;
+ std::unordered_map<std::string, std::string> nonterminal_alias_;
+ std::unordered_map<std::string, int> annotation_nonterminals_;
// Rules.
std::vector<Rule> rules_;
diff --git a/native/utils/grammar/utils/rules_test.cc b/native/utils/grammar/utils/rules_test.cc
new file mode 100644
index 0000000..6761118
--- /dev/null
+++ b/native/utils/grammar/utils/rules_test.cc
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/grammar/utils/rules.h"
+
+#include "utils/grammar/rules_generated.h"
+#include "utils/grammar/utils/ir.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3::grammar {
+namespace {
+
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+TEST(SerializeRulesTest, HandlesSimpleRuleSet) {
+ Rules rules;
+
+ rules.Add("<verb>", {"buy"});
+ rules.Add("<verb>", {"bring"});
+ rules.Add("<verb>", {"remind"});
+ rules.Add("<reminder>", {"remind", "me", "to", "<verb>"});
+ rules.Add("<action>", {"<reminder>"});
+
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(1));
+ EXPECT_THAT(frozen_rules.lhs, IsEmpty());
+ EXPECT_EQ(frozen_rules.terminals,
+ std::string("bring\0buy\0me\0remind\0to\0", 23));
+ EXPECT_THAT(frozen_rules.rules.front()->binary_rules, SizeIs(3));
+ EXPECT_THAT(frozen_rules.rules.front()->unary_rules, SizeIs(1));
+}
+
+TEST(SerializeRulesTest, HandlesRulesSetWithCallbacks) {
+ Rules rules;
+ const CallbackId output = 1;
+ const CallbackId filter = 2;
+ rules.DefineFilter(filter);
+
+ rules.Add("<verb>", {"buy"});
+ rules.Add("<verb>", {"bring"}, output, 0);
+ rules.Add("<verb>", {"remind"}, output, 0);
+ rules.Add("<reminder>", {"remind", "me", "to", "<verb>"});
+ rules.Add("<action>", {"<reminder>"}, filter, 0);
+
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(1));
+ EXPECT_EQ(frozen_rules.terminals,
+ std::string("bring\0buy\0me\0remind\0to\0", 23));
+
+ // We have two identical output calls and one filter call in the rule set
+ // definition above.
+ EXPECT_THAT(frozen_rules.lhs, SizeIs(2));
+
+ EXPECT_THAT(frozen_rules.rules.front()->binary_rules, SizeIs(3));
+ EXPECT_THAT(frozen_rules.rules.front()->unary_rules, SizeIs(1));
+}
+
+TEST(SerializeRulesTest, HandlesRulesWithWhitespaceGapLimits) {
+ Rules rules;
+ rules.Add("<iata>", {"lx"});
+ rules.Add("<iata>", {"aa"});
+ rules.Add("<flight>", {"<iata>", "<4_digits>"}, kNoCallback, 0,
+ /*max_whitespace_gap=*/0);
+
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(1));
+ EXPECT_EQ(frozen_rules.terminals, std::string("aa\0lx\0", 6));
+ EXPECT_THAT(frozen_rules.lhs, SizeIs(1));
+}
+
+TEST(SerializeRulesTest, HandlesCaseSensitiveTerminals) {
+ Rules rules;
+ rules.Add("<iata>", {"LX"}, kNoCallback, 0, /*max_whitespace_gap=*/-1,
+ /*case_sensitive=*/true);
+ rules.Add("<iata>", {"AA"}, kNoCallback, 0, /*max_whitespace_gap=*/-1,
+ /*case_sensitive=*/true);
+ rules.Add("<iata>", {"dl"}, kNoCallback, 0, /*max_whitespace_gap=*/-1,
+ /*case_sensitive=*/false);
+ rules.Add("<flight>", {"<iata>", "<4_digits>"}, kNoCallback, 0,
+ /*max_whitespace_gap=*/0);
+
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(1));
+ EXPECT_EQ(frozen_rules.terminals, std::string("AA\0LX\0dl\0", 9));
+ EXPECT_THAT(frozen_rules.lhs, SizeIs(1));
+}
+
+TEST(SerializeRulesTest, HandlesMultipleShards) {
+ Rules rules(/*num_shards=*/2);
+ rules.Add("<iata>", {"LX"}, kNoCallback, 0, /*max_whitespace_gap=*/-1,
+ /*case_sensitive=*/true, /*shard=*/0);
+ rules.Add("<iata>", {"aa"}, kNoCallback, 0, /*max_whitespace_gap=*/-1,
+ /*case_sensitive=*/false, /*shard=*/1);
+
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(2));
+ EXPECT_EQ(frozen_rules.terminals, std::string("LX\0aa\0", 6));
+}
+
+TEST(SerializeRulesTest, HandlesRegexRules) {
+ Rules rules;
+ rules.AddRegex("<code>", "[A-Z]+");
+ rules.AddRegex("<numbers>", "\\d+");
+ RulesSetT frozen_rules;
+ rules.Finalize().Serialize(/*include_debug_information=*/false,
+ &frozen_rules);
+ EXPECT_THAT(frozen_rules.regex_annotator, SizeIs(2));
+}
+
+TEST(SerializeRulesTest, HandlesAlias) {
+ Rules rules;
+ rules.Add("<iata>", {"lx"});
+ rules.Add("<iata>", {"aa"});
+ rules.Add("<flight>", {"<iata>", "<4_digits>"});
+ rules.AddAlias("<flight_number>", "<flight>");
+
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(1));
+ EXPECT_EQ(frozen_rules.terminals, std::string("aa\0lx\0", 6));
+ EXPECT_THAT(frozen_rules.rules.front()->binary_rules, SizeIs(1));
+
+ // Only alias, no rule.
+ EXPECT_THAT(frozen_rules.rules.front()->unary_rules, IsEmpty());
+
+ EXPECT_THAT(frozen_rules.lhs, IsEmpty());
+}
+
+TEST(SerializeRulesTest, ResolvesAnchorsAndFillers) {
+ Rules rules;
+ rules.Add("<code>",
+ {"<^>", "<filler>", "this", "is", "a", "test", "<filler>", "<$>"});
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(1));
+ EXPECT_EQ(frozen_rules.terminals, std::string("a\0test\0this\0", 12));
+
+ // Expect removal of anchors and fillers in this case.
+ // The rule above is equivalent to: <code> ::= this is a test, binarized into
+ // <tmp_0> ::= this is
+ // <tmp_1> ::= <tmp_0> a
+ // <code> ::= <tmp_1> test
+ EXPECT_THAT(frozen_rules.rules.front()->binary_rules, SizeIs(3));
+
+ EXPECT_THAT(frozen_rules.rules.front()->unary_rules, IsEmpty());
+ EXPECT_THAT(frozen_rules.lhs, IsEmpty());
+}
+
+TEST(SerializeRulesTest, HandlesAnnotations) {
+ Rules rules;
+ rules.AddAnnotation("phone");
+ rules.AddAnnotation("url");
+ rules.AddAnnotation("tracking_number");
+ const Ir ir = rules.Finalize();
+ RulesSetT frozen_rules;
+ ir.Serialize(/*include_debug_information=*/false, &frozen_rules);
+
+ EXPECT_THAT(frozen_rules.rules, SizeIs(1));
+ EXPECT_THAT(frozen_rules.nonterminals->annotation_nt, SizeIs(3));
+ EXPECT_EQ(frozen_rules.nonterminals->annotation_nt[0]->key, "phone");
+ EXPECT_EQ(frozen_rules.nonterminals->annotation_nt[1]->key,
+ "tracking_number");
+ EXPECT_EQ(frozen_rules.nonterminals->annotation_nt[2]->key, "url");
+}
+
+} // namespace
+} // namespace libtextclassifier3::grammar
diff --git a/native/utils/i18n/locale_test.cc b/native/utils/i18n/locale_test.cc
new file mode 100644
index 0000000..faea4f6
--- /dev/null
+++ b/native/utils/i18n/locale_test.cc
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/i18n/locale.h"
+
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(LocaleTest, ParseUnknown) {
+ Locale locale = Locale::Invalid();
+ EXPECT_FALSE(locale.IsValid());
+}
+
+TEST(LocaleTest, ParseSwissEnglish) {
+ Locale locale = Locale::FromBCP47("en-CH");
+ EXPECT_TRUE(locale.IsValid());
+ EXPECT_EQ(locale.Language(), "en");
+ EXPECT_EQ(locale.Script(), "");
+ EXPECT_EQ(locale.Region(), "CH");
+}
+
+TEST(LocaleTest, ParseChineseChina) {
+ Locale locale = Locale::FromBCP47("zh-CN");
+ EXPECT_TRUE(locale.IsValid());
+ EXPECT_EQ(locale.Language(), "zh");
+ EXPECT_EQ(locale.Script(), "");
+ EXPECT_EQ(locale.Region(), "CN");
+}
+
+TEST(LocaleTest, ParseChineseTaiwan) {
+ Locale locale = Locale::FromBCP47("zh-Hant-TW");
+ EXPECT_TRUE(locale.IsValid());
+ EXPECT_EQ(locale.Language(), "zh");
+ EXPECT_EQ(locale.Script(), "Hant");
+ EXPECT_EQ(locale.Region(), "TW");
+}
+
+TEST(LocaleTest, ParseEnglish) {
+ Locale locale = Locale::FromBCP47("en");
+ EXPECT_TRUE(locale.IsValid());
+ EXPECT_EQ(locale.Language(), "en");
+ EXPECT_EQ(locale.Script(), "");
+ EXPECT_EQ(locale.Region(), "");
+}
+
+TEST(LocaleTest, ParseChineseTraditional) {
+ Locale locale = Locale::FromBCP47("zh-Hant");
+ EXPECT_TRUE(locale.IsValid());
+ EXPECT_EQ(locale.Language(), "zh");
+ EXPECT_EQ(locale.Script(), "Hant");
+ EXPECT_EQ(locale.Region(), "");
+}
+
+TEST(LocaleTest, IsAnyLocaleSupportedMatch) {
+ std::vector<Locale> locales = {Locale::FromBCP47("zh-HK"),
+ Locale::FromBCP47("en-UK")};
+ std::vector<Locale> supported_locales = {Locale::FromBCP47("en")};
+
+ EXPECT_TRUE(Locale::IsAnyLocaleSupported(locales, supported_locales,
+ /*default_value=*/false));
+}
+
+TEST(LocaleTest, IsAnyLocaleSupportedNotMatch) {
+ std::vector<Locale> locales = {Locale::FromBCP47("zh-tw")};
+ std::vector<Locale> supported_locales = {Locale::FromBCP47("en"),
+ Locale::FromBCP47("fr")};
+
+ EXPECT_FALSE(Locale::IsAnyLocaleSupported(locales, supported_locales,
+ /*default_value=*/false));
+}
+
+TEST(LocaleTest, IsAnyLocaleSupportedAnyLocale) {
+ std::vector<Locale> locales = {Locale::FromBCP47("zh-tw")};
+ std::vector<Locale> supported_locales = {Locale::FromBCP47("*")};
+
+ EXPECT_TRUE(Locale::IsAnyLocaleSupported(locales, supported_locales,
+ /*default_value=*/false));
+}
+
+TEST(LocaleTest, IsAnyLocaleSupportedEmptyLocales) {
+ std::vector<Locale> supported_locales = {Locale::FromBCP47("en")};
+
+ EXPECT_TRUE(Locale::IsAnyLocaleSupported({}, supported_locales,
+ /*default_value=*/true));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/intents/intent-config.fbs b/native/utils/intents/intent-config.fbs
index 76a0ddc..672eb9d 100755
--- a/native/utils/intents/intent-config.fbs
+++ b/native/utils/intents/intent-config.fbs
@@ -132,7 +132,6 @@
type:AndroidSimpleIntentGeneratorExtraType;
string_:string (shared);
-
bool_:bool;
int32_:int;
}
@@ -141,9 +140,7 @@
namespace libtextclassifier3;
table AndroidSimpleIntentGeneratorCondition {
type:AndroidSimpleIntentGeneratorConditionType;
-
string_:string (shared);
-
int32_:int;
int64_:long;
}
diff --git a/native/utils/intents/jni.cc b/native/utils/intents/jni.cc
index 1c6c283..051d078 100644
--- a/native/utils/intents/jni.cc
+++ b/native/utils/intents/jni.cc
@@ -175,40 +175,41 @@
case Variant::TYPE_INT_VALUE:
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_int_, name.get(),
- value.IntValue());
+ value.Value<int>());
case Variant::TYPE_INT64_VALUE:
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_long_, name.get(),
- value.Int64Value());
+ value.Value<int64>());
case Variant::TYPE_FLOAT_VALUE:
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_float_, name.get(),
- value.FloatValue());
+ value.Value<float>());
case Variant::TYPE_DOUBLE_VALUE:
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_double_, name.get(),
- value.DoubleValue());
+ value.Value<double>());
case Variant::TYPE_BOOL_VALUE:
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_bool_, name.get(),
- value.BoolValue());
+ value.Value<bool>());
case Variant::TYPE_STRING_VALUE: {
TC3_ASSIGN_OR_RETURN(
ScopedLocalRef<jstring> value_jstring,
- jni_cache_->ConvertToJavaString(value.StringValue()));
+ jni_cache_->ConvertToJavaString(value.ConstRefValue<std::string>()));
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_string_, name.get(),
value_jstring.get());
}
case Variant::TYPE_STRING_VECTOR_VALUE: {
- TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jobjectArray> value_jstring_array,
- AsStringArray(value.StringVectorValue()));
+ TC3_ASSIGN_OR_RETURN(
+ ScopedLocalRef<jobjectArray> value_jstring_array,
+ AsStringArray(value.ConstRefValue<std::vector<std::string>>()));
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_string_array_, name.get(),
@@ -216,8 +217,9 @@
}
case Variant::TYPE_FLOAT_VECTOR_VALUE: {
- TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jfloatArray> value_jfloat_array,
- AsFloatArray(value.FloatVectorValue()));
+ TC3_ASSIGN_OR_RETURN(
+ ScopedLocalRef<jfloatArray> value_jfloat_array,
+ AsFloatArray(value.ConstRefValue<std::vector<float>>()));
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_float_array_, name.get(),
@@ -226,7 +228,7 @@
case Variant::TYPE_INT_VECTOR_VALUE: {
TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jintArray> value_jint_array,
- AsIntArray(value.IntVectorValue()));
+ AsIntArray(value.ConstRefValue<std::vector<int>>()));
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_int_array_, name.get(),
@@ -234,8 +236,10 @@
}
case Variant::TYPE_STRING_VARIANT_MAP_VALUE: {
- TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jobjectArray> value_jobect_array,
- AsNamedVariantArray(value.StringVariantMapValue()));
+ TC3_ASSIGN_OR_RETURN(
+ ScopedLocalRef<jobjectArray> value_jobect_array,
+ AsNamedVariantArray(
+ value.ConstRefValue<std::map<std::string, Variant>>()));
return JniHelper::NewObject(env, named_variant_class_.get(),
named_variant_from_named_variant_array_,
name.get(), value_jobect_array.get());
diff --git a/native/utils/lua-utils.cc b/native/utils/lua-utils.cc
index fa19923..d6fe2c4 100644
--- a/native/utils/lua-utils.cc
+++ b/native/utils/lua-utils.cc
@@ -223,6 +223,11 @@
int LuaEnvironment::ReadFlatbuffer(const int index,
ReflectiveFlatbuffer* buffer) const {
+ if (buffer == nullptr) {
+ TC3_LOG(ERROR) << "Called ReadFlatbuffer with null buffer: " << index;
+ lua_error(state_);
+ return LUA_ERRRUN;
+ }
if (lua_type(state_, /*idx=*/index) != LUA_TTABLE) {
TC3_LOG(ERROR) << "Expected table, got: "
<< lua_type(state_, /*idx=*/kIndexStackTop);
@@ -278,48 +283,48 @@
// Read repeated field.
switch (field->type()->element()) {
case reflection::Bool:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<bool>(field));
+ ReadRepeatedField<bool>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::Byte:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<int8>(field));
+ ReadRepeatedField<int8>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::UByte:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<uint8>(field));
+ ReadRepeatedField<uint8>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::Int:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<int32>(field));
+ ReadRepeatedField<int32>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::UInt:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<uint32>(field));
+ ReadRepeatedField<uint32>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::Long:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<int64>(field));
+ ReadRepeatedField<int64>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::ULong:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<uint64>(field));
+ ReadRepeatedField<uint64>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::Float:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<float>(field));
+ ReadRepeatedField<float>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::Double:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<double>(field));
+ ReadRepeatedField<double>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::String:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<std::string>(field));
+ ReadRepeatedField<std::string>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
case reflection::Obj:
- ReadRepeatedField(/*index=*/kIndexStackTop,
- buffer->Repeated<ReflectiveFlatbuffer>(field));
+ ReadRepeatedField<ReflectiveFlatbuffer>(/*index=*/kIndexStackTop,
+ buffer->Repeated(field));
break;
default:
TC3_LOG(ERROR) << "Unsupported repeated field type: "
diff --git a/native/utils/lua-utils.h b/native/utils/lua-utils.h
index f602aa0..b01471a 100644
--- a/native/utils/lua-utils.h
+++ b/native/utils/lua-utils.h
@@ -506,15 +506,15 @@
// Reads a repeated field from lua.
template <typename T>
- void ReadRepeatedField(const int index, TypedRepeatedField<T>* result) const {
+ void ReadRepeatedField(const int index, RepeatedField* result) const {
for (const auto& element : ReadVector<T>(index)) {
result->Add(element);
}
}
template <>
- void ReadRepeatedField<ReflectiveFlatbuffer>(
- const int index, TypedRepeatedField<ReflectiveFlatbuffer>* result) const {
+ void ReadRepeatedField<ReflectiveFlatbuffer>(const int index,
+ RepeatedField* result) const {
lua_pushnil(state_);
while (Next(index - 1)) {
ReadFlatbuffer(index, result->Add());
diff --git a/native/utils/lua-utils_test.cc b/native/utils/lua-utils_test.cc
new file mode 100644
index 0000000..8c9f8de
--- /dev/null
+++ b/native/utils/lua-utils_test.cc
@@ -0,0 +1,333 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/lua-utils.h"
+
+#include <string>
+
+#include "utils/flatbuffers.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using testing::ElementsAre;
+using testing::Eq;
+using testing::FloatEq;
+
+std::string TestFlatbufferSchema() {
+ // Creates a test schema for flatbuffer passing tests.
+ // Cannot use the object oriented API here as that is not available for the
+ // reflection schema.
+ flatbuffers::FlatBufferBuilder schema_builder;
+ std::vector<flatbuffers::Offset<reflection::Field>> fields = {
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("float_field"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::Float),
+ /*id=*/0,
+ /*offset=*/4),
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("nested_field"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::Obj,
+ /*element=*/reflection::None,
+ /*index=*/0 /* self */),
+ /*id=*/1,
+ /*offset=*/6),
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("repeated_nested_field"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::Vector,
+ /*element=*/reflection::Obj,
+ /*index=*/0 /* self */),
+ /*id=*/2,
+ /*offset=*/8),
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("repeated_string_field"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::Vector,
+ /*element=*/reflection::String),
+ /*id=*/3,
+ /*offset=*/10),
+ reflection::CreateField(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("string_field"),
+ /*type=*/
+ reflection::CreateType(schema_builder,
+ /*base_type=*/reflection::String),
+ /*id=*/4,
+ /*offset=*/12)};
+
+ std::vector<flatbuffers::Offset<reflection::Enum>> enums;
+ std::vector<flatbuffers::Offset<reflection::Object>> objects = {
+ reflection::CreateObject(
+ schema_builder,
+ /*name=*/schema_builder.CreateString("TestData"),
+ /*fields=*/
+ schema_builder.CreateVectorOfSortedTables(&fields))};
+ schema_builder.Finish(reflection::CreateSchema(
+ schema_builder, schema_builder.CreateVectorOfSortedTables(&objects),
+ schema_builder.CreateVectorOfSortedTables(&enums),
+ /*(unused) file_ident=*/0,
+ /*(unused) file_ext=*/0,
+ /*root_table*/ objects[0]));
+ return std::string(
+ reinterpret_cast<const char*>(schema_builder.GetBufferPointer()),
+ schema_builder.GetSize());
+}
+
+class LuaUtilsTest : public testing::Test, protected LuaEnvironment {
+ protected:
+ LuaUtilsTest()
+ : serialized_flatbuffer_schema_(TestFlatbufferSchema()),
+ schema_(flatbuffers::GetRoot<reflection::Schema>(
+ serialized_flatbuffer_schema_.data())),
+ flatbuffer_builder_(schema_) {
+ EXPECT_THAT(RunProtected([this] {
+ LoadDefaultLibraries();
+ return LUA_OK;
+ }),
+ Eq(LUA_OK));
+ }
+
+ void RunScript(StringPiece script) {
+ EXPECT_THAT(luaL_loadbuffer(state_, script.data(), script.size(),
+ /*name=*/nullptr),
+ Eq(LUA_OK));
+ EXPECT_THAT(
+ lua_pcall(state_, /*nargs=*/0, /*num_results=*/1, /*errfunc=*/0),
+ Eq(LUA_OK));
+ }
+
+ const std::string serialized_flatbuffer_schema_;
+ const reflection::Schema* schema_;
+ ReflectiveFlatbufferBuilder flatbuffer_builder_;
+};
+
+TEST_F(LuaUtilsTest, HandlesVectors) {
+ {
+ PushVector(std::vector<int64>{1, 2, 3, 4, 5});
+ EXPECT_THAT(ReadVector<int64>(), ElementsAre(1, 2, 3, 4, 5));
+ }
+ {
+ PushVector(std::vector<std::string>{"hello", "there"});
+ EXPECT_THAT(ReadVector<std::string>(), ElementsAre("hello", "there"));
+ }
+ {
+ PushVector(std::vector<bool>{true, true, false});
+ EXPECT_THAT(ReadVector<bool>(), ElementsAre(true, true, false));
+ }
+}
+
+TEST_F(LuaUtilsTest, HandlesVectorIterators) {
+ {
+ const std::vector<int64> elements = {1, 2, 3, 4, 5};
+ PushVectorIterator(&elements);
+ EXPECT_THAT(ReadVector<int64>(), ElementsAre(1, 2, 3, 4, 5));
+ }
+ {
+ const std::vector<std::string> elements = {"hello", "there"};
+ PushVectorIterator(&elements);
+ EXPECT_THAT(ReadVector<std::string>(), ElementsAre("hello", "there"));
+ }
+ {
+ const std::vector<bool> elements = {true, true, false};
+ PushVectorIterator(&elements);
+ EXPECT_THAT(ReadVector<bool>(), ElementsAre(true, true, false));
+ }
+}
+
+TEST_F(LuaUtilsTest, ReadsFlatbufferResults) {
+ // Setup.
+ RunScript(R"lua(
+ return {
+ float_field = 42.1,
+ string_field = "hello there",
+
+ -- Nested field.
+ nested_field = {
+ float_field = 64,
+ string_field = "hello nested",
+ },
+
+ -- Repeated fields.
+ repeated_string_field = { "a", "bold", "one" },
+ repeated_nested_field = {
+ { string_field = "a" },
+ { string_field = "b" },
+ { repeated_string_field = { "nested", "nested2" } },
+ },
+ }
+ )lua");
+
+ // Read the flatbuffer.
+ std::unique_ptr<ReflectiveFlatbuffer> buffer = flatbuffer_builder_.NewRoot();
+ ReadFlatbuffer(/*index=*/-1, buffer.get());
+ const std::string serialized_buffer = buffer->Serialize();
+
+ // Check fields. As we do not have flatbuffer compiled generated code for the
+ // ad hoc generated test schema, we have to read by manually using field
+ // offsets.
+ const flatbuffers::Table* flatbuffer_data =
+ flatbuffers::GetRoot<flatbuffers::Table>(serialized_buffer.data());
+ EXPECT_THAT(flatbuffer_data->GetField<float>(/*field=*/4, /*defaultval=*/0),
+ FloatEq(42.1));
+ EXPECT_THAT(
+ flatbuffer_data->GetPointer<const flatbuffers::String*>(/*field=*/12)
+ ->str(),
+ "hello there");
+
+ // Read the nested field.
+ const flatbuffers::Table* nested_field =
+ flatbuffer_data->GetPointer<const flatbuffers::Table*>(/*field=*/6);
+ EXPECT_THAT(nested_field->GetField<float>(/*field=*/4, /*defaultval=*/0),
+ FloatEq(64));
+ EXPECT_THAT(
+ nested_field->GetPointer<const flatbuffers::String*>(/*field=*/12)->str(),
+ "hello nested");
+
+ // Read the repeated string field.
+ auto repeated_strings = flatbuffer_data->GetPointer<
+ flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*>(
+ /*field=*/10);
+ EXPECT_THAT(repeated_strings->size(), Eq(3));
+ EXPECT_THAT(repeated_strings->GetAsString(0)->str(), Eq("a"));
+ EXPECT_THAT(repeated_strings->GetAsString(1)->str(), Eq("bold"));
+ EXPECT_THAT(repeated_strings->GetAsString(2)->str(), Eq("one"));
+
+ // Read the repeated nested field.
+ auto repeated_nested_fields = flatbuffer_data->GetPointer<
+ flatbuffers::Vector<flatbuffers::Offset<flatbuffers::Table>>*>(
+ /*field=*/8);
+ EXPECT_THAT(repeated_nested_fields->size(), Eq(3));
+ EXPECT_THAT(repeated_nested_fields->Get(0)
+ ->GetPointer<const flatbuffers::String*>(/*field=*/12)
+ ->str(),
+ "a");
+ EXPECT_THAT(repeated_nested_fields->Get(1)
+ ->GetPointer<const flatbuffers::String*>(/*field=*/12)
+ ->str(),
+ "b");
+}
+
+TEST_F(LuaUtilsTest, HandlesSimpleFlatbufferFields) {
+ // Create test flatbuffer.
+ std::unique_ptr<ReflectiveFlatbuffer> buffer = flatbuffer_builder_.NewRoot();
+ buffer->Set("float_field", 42.f);
+ const std::string serialized_buffer = buffer->Serialize();
+ PushFlatbuffer(schema_, flatbuffers::GetRoot<flatbuffers::Table>(
+ serialized_buffer.data()));
+ lua_setglobal(state_, "arg");
+
+ // Setup.
+ RunScript(R"lua(
+ return arg.float_field
+ )lua");
+
+ EXPECT_THAT(Read<float>(), FloatEq(42));
+}
+
+TEST_F(LuaUtilsTest, HandlesRepeatedFlatbufferFields) {
+ // Create test flatbuffer.
+ std::unique_ptr<ReflectiveFlatbuffer> buffer = flatbuffer_builder_.NewRoot();
+ RepeatedField* repeated_field = buffer->Repeated("repeated_string_field");
+ repeated_field->Add("this");
+ repeated_field->Add("is");
+ repeated_field->Add("a");
+ repeated_field->Add("test");
+ const std::string serialized_buffer = buffer->Serialize();
+ PushFlatbuffer(schema_, flatbuffers::GetRoot<flatbuffers::Table>(
+ serialized_buffer.data()));
+ lua_setglobal(state_, "arg");
+
+ // Return flatbuffer repeated field as vector.
+ RunScript(R"lua(
+ return arg.repeated_string_field
+ )lua");
+
+ EXPECT_THAT(ReadVector<std::string>(),
+ ElementsAre("this", "is", "a", "test"));
+}
+
+TEST_F(LuaUtilsTest, HandlesRepeatedNestedFlatbufferFields) {
+ // Create test flatbuffer.
+ std::unique_ptr<ReflectiveFlatbuffer> buffer = flatbuffer_builder_.NewRoot();
+ RepeatedField* repeated_field = buffer->Repeated("repeated_nested_field");
+ repeated_field->Add()->Set("string_field", "hello");
+ repeated_field->Add()->Set("string_field", "my");
+ ReflectiveFlatbuffer* nested = repeated_field->Add();
+ nested->Set("string_field", "old");
+ RepeatedField* nested_repeated = nested->Repeated("repeated_string_field");
+ nested_repeated->Add("friend");
+ nested_repeated->Add("how");
+ nested_repeated->Add("are");
+ repeated_field->Add()->Set("string_field", "you?");
+ const std::string serialized_buffer = buffer->Serialize();
+ PushFlatbuffer(schema_, flatbuffers::GetRoot<flatbuffers::Table>(
+ serialized_buffer.data()));
+ lua_setglobal(state_, "arg");
+
+ RunScript(R"lua(
+ result = {}
+ for _, nested in pairs(arg.repeated_nested_field) do
+ result[#result + 1] = nested.string_field
+ for _, nested_string in pairs(nested.repeated_string_field) do
+ result[#result + 1] = nested_string
+ end
+ end
+ return result
+ )lua");
+
+ EXPECT_THAT(
+ ReadVector<std::string>(),
+ ElementsAre("hello", "my", "old", "friend", "how", "are", "you?"));
+}
+
+TEST_F(LuaUtilsTest, CorrectlyReadsTwoFlatbuffersSimultaneously) {
+ // The first flatbuffer.
+ std::unique_ptr<ReflectiveFlatbuffer> buffer = flatbuffer_builder_.NewRoot();
+ buffer->Set("string_field", "first");
+ const std::string serialized_buffer = buffer->Serialize();
+ PushFlatbuffer(schema_, flatbuffers::GetRoot<flatbuffers::Table>(
+ serialized_buffer.data()));
+ lua_setglobal(state_, "arg");
+ // The second flatbuffer.
+ std::unique_ptr<ReflectiveFlatbuffer> buffer2 = flatbuffer_builder_.NewRoot();
+ buffer2->Set("string_field", "second");
+ const std::string serialized_buffer2 = buffer2->Serialize();
+ PushFlatbuffer(schema_, flatbuffers::GetRoot<flatbuffers::Table>(
+ serialized_buffer2.data()));
+ lua_setglobal(state_, "arg2");
+
+ RunScript(R"lua(
+ return {arg.string_field, arg2.string_field}
+ )lua");
+
+ EXPECT_THAT(ReadVector<std::string>(), ElementsAre("first", "second"));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/normalization_test.cc b/native/utils/normalization_test.cc
new file mode 100644
index 0000000..1f731c7
--- /dev/null
+++ b/native/utils/normalization_test.cc
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/normalization.h"
+
+#include <string>
+
+#include "utils/base/integral_types.h"
+#include "utils/utf8/unicodetext.h"
+#include "utils/utf8/unilib.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using testing::Eq;
+
+class NormalizationTest : public testing::Test {
+ protected:
+ NormalizationTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
+
+ std::string NormalizeTextCodepointWise(const std::string& text,
+ const int32 codepointwise_ops) {
+ return libtextclassifier3::NormalizeTextCodepointWise(
+ unilib_, codepointwise_ops,
+ UTF8ToUnicodeText(text, /*do_copy=*/false))
+ .ToUTF8String();
+ }
+
+ UniLib unilib_;
+};
+
+TEST_F(NormalizationTest, ReturnsIdenticalStringWhenNoNormalization) {
+ EXPECT_THAT(NormalizeTextCodepointWise(
+ "Never gonna let you down.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_NONE),
+ Eq("Never gonna let you down."));
+}
+
+#if !defined(TC3_UNILIB_DUMMY)
+TEST_F(NormalizationTest, DropsWhitespace) {
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "Never gonna let you down.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE),
+ Eq("Nevergonnaletyoudown."));
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "Never\tgonna\t\tlet\tyou\tdown.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE),
+ Eq("Nevergonnaletyoudown."));
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "Never\u2003gonna\u2003let\u2003you\u2003down.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE),
+ Eq("Nevergonnaletyoudown."));
+}
+
+TEST_F(NormalizationTest, DropsPunctuation) {
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "Never gonna let you down.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION),
+ Eq("Never gonna let you down"));
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION),
+ Eq("αʹ Σημεῖόν ἐστιν οὗ μέρος οὐθέν"));
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "978—3—16—148410—0",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION),
+ Eq("9783161484100"));
+}
+
+TEST_F(NormalizationTest, LowercasesUnicodeText) {
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE),
+ Eq("αʹ. σημεῖόν ἐστιν, οὗ μέρος οὐθέν."));
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
+ NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE),
+ Eq("αʹ.σημεῖόνἐστιν,οὗμέροςοὐθέν."));
+}
+
+TEST_F(NormalizationTest, UppercasesUnicodeText) {
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "Κανένας άνθρωπος δεν ξέρει",
+ NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE),
+ Eq("ΚΑΝΈΝΑΣ ΆΝΘΡΩΠΟΣ ΔΕΝ ΞΈΡΕΙ"));
+ EXPECT_THAT(
+ NormalizeTextCodepointWise(
+ "Κανένας άνθρωπος δεν ξέρει",
+ NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
+ NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE),
+ Eq("ΚΑΝΈΝΑΣΆΝΘΡΩΠΟΣΔΕΝΞΈΡΕΙ"));
+}
+#endif
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/regex-match_test.cc b/native/utils/regex-match_test.cc
new file mode 100644
index 0000000..c45fb29
--- /dev/null
+++ b/native/utils/regex-match_test.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/regex-match.h"
+
+#include <memory>
+
+#include "utils/utf8/unicodetext.h"
+#include "utils/utf8/unilib.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+class RegexMatchTest : public testing::Test {
+ protected:
+ RegexMatchTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
+ UniLib unilib_;
+};
+
+#ifdef TC3_UNILIB_ICU
+#ifndef TC3_DISABLE_LUA
+TEST_F(RegexMatchTest, HandlesSimpleVerification) {
+ EXPECT_TRUE(VerifyMatch(/*context=*/"", /*matcher=*/nullptr, "return true;"));
+}
+#endif  // !TC3_DISABLE_LUA
+
+#ifndef TC3_DISABLE_LUA
+TEST_F(RegexMatchTest, HandlesCustomVerification) {
+ UnicodeText pattern = UTF8ToUnicodeText("(\\d{16})",
+ /*do_copy=*/true);
+ UnicodeText message = UTF8ToUnicodeText("cc: 4012888888881881",
+ /*do_copy=*/true);
+ const std::string verifier = R"(
+function luhn(candidate)
+ local sum = 0
+ local num_digits = string.len(candidate)
+ local parity = num_digits % 2
+ for pos = 1,num_digits do
+ d = tonumber(string.sub(candidate, pos, pos))
+ if pos % 2 ~= parity then
+ d = d * 2
+ end
+ if d > 9 then
+ d = d - 9
+ end
+ sum = sum + d
+ end
+ return (sum % 10) == 0
+end
+return luhn(match[1].text);
+ )";
+ const std::unique_ptr<UniLib::RegexPattern> regex_pattern =
+ unilib_.CreateRegexPattern(pattern);
+ ASSERT_TRUE(regex_pattern != nullptr);
+ const std::unique_ptr<UniLib::RegexMatcher> matcher =
+ regex_pattern->Matcher(message);
+ ASSERT_TRUE(matcher != nullptr);
+ int status = UniLib::RegexMatcher::kNoError;
+ ASSERT_TRUE(matcher->Find(&status) &&
+ status == UniLib::RegexMatcher::kNoError);
+
+ EXPECT_TRUE(VerifyMatch(message.ToUTF8String(), matcher.get(), verifier));
+}
+#endif  // !TC3_DISABLE_LUA
+
+TEST_F(RegexMatchTest, RetrievesMatchGroupTest) {
+ UnicodeText pattern =
+ UTF8ToUnicodeText("never gonna (?:give (you) up|let (you) down)",
+ /*do_copy=*/true);
+ const std::unique_ptr<UniLib::RegexPattern> regex_pattern =
+ unilib_.CreateRegexPattern(pattern);
+ ASSERT_TRUE(regex_pattern != nullptr);
+ UnicodeText message =
+ UTF8ToUnicodeText("never gonna give you up - never gonna let you down");
+ const std::unique_ptr<UniLib::RegexMatcher> matcher =
+ regex_pattern->Matcher(message);
+ ASSERT_TRUE(matcher != nullptr);
+ int status = UniLib::RegexMatcher::kNoError;
+
+ ASSERT_TRUE(matcher->Find(&status) &&
+ status == UniLib::RegexMatcher::kNoError);
+ EXPECT_THAT(GetCapturingGroupText(matcher.get(), 0).value(),
+ testing::Eq("never gonna give you up"));
+ EXPECT_THAT(GetCapturingGroupText(matcher.get(), 1).value(),
+ testing::Eq("you"));
+ EXPECT_FALSE(GetCapturingGroupText(matcher.get(), 2).has_value());
+
+ ASSERT_TRUE(matcher->Find(&status) &&
+ status == UniLib::RegexMatcher::kNoError);
+ EXPECT_THAT(GetCapturingGroupText(matcher.get(), 0).value(),
+ testing::Eq("never gonna let you down"));
+ EXPECT_FALSE(GetCapturingGroupText(matcher.get(), 1).has_value());
+ EXPECT_THAT(GetCapturingGroupText(matcher.get(), 2).value(),
+ testing::Eq("you"));
+}
+#endif
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/resources_test.cc b/native/utils/resources_test.cc
new file mode 100644
index 0000000..c385f39
--- /dev/null
+++ b/native/utils/resources_test.cc
@@ -0,0 +1,287 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/resources.h"
+#include "utils/i18n/locale.h"
+#include "utils/resources_generated.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+class ResourcesTest
+ : public testing::TestWithParam<testing::tuple<bool, bool>> {
+ protected:
+ ResourcesTest() {}
+
+ std::string BuildTestResources(bool add_default_language = true) const {
+ ResourcePoolT test_resources;
+
+ // Test locales.
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "en";
+ test_resources.locale.back()->region = "US";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "en";
+ test_resources.locale.back()->region = "GB";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "de";
+ test_resources.locale.back()->region = "DE";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "fr";
+ test_resources.locale.back()->region = "FR";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "pt";
+ test_resources.locale.back()->region = "PT";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "pt";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "zh";
+ test_resources.locale.back()->script = "Hans";
+ test_resources.locale.back()->region = "CN";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "zh";
+ test_resources.locale.emplace_back(new LanguageTagT);
+ test_resources.locale.back()->language = "fr";
+    test_resources.locale.back()->region = "CA";
+ if (add_default_language) {
+ test_resources.locale.emplace_back(new LanguageTagT); // default
+ }
+
+ // Test entries.
+ test_resources.resource_entry.emplace_back(new ResourceEntryT);
+ test_resources.resource_entry.back()->name = /*resource_name=*/"A";
+
+ // en-US, default
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content = "localize";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(0);
+ if (add_default_language) {
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(
+ 9);
+ }
+
+ // en-GB
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content = "localise";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(1);
+
+ // de-DE
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content =
+ "lokalisieren";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(2);
+
+ // fr-FR, fr-CA
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content =
+ "localiser";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(3);
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(8);
+
+ // pt-PT
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content =
+ "localizar";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(4);
+
+ // pt
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content =
+ "concentrar";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(5);
+
+ // zh-Hans-CN
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content = "龙";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(6);
+
+ // zh
+ test_resources.resource_entry.back()->resource.emplace_back(new ResourceT);
+ test_resources.resource_entry.back()->resource.back()->content = "龍";
+ test_resources.resource_entry.back()->resource.back()->locale.push_back(7);
+
+ if (compress()) {
+ EXPECT_TRUE(CompressResources(
+ &test_resources,
+ /*build_compression_dictionary=*/build_dictionary()));
+ }
+
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(ResourcePool::Pack(builder, &test_resources));
+
+ return std::string(
+ reinterpret_cast<const char*>(builder.GetBufferPointer()),
+ builder.GetSize());
+ }
+
+ bool compress() const { return testing::get<0>(GetParam()); }
+
+ bool build_dictionary() const { return testing::get<1>(GetParam()); }
+};
+
+INSTANTIATE_TEST_SUITE_P(Compression, ResourcesTest,
+ testing::Combine(testing::Bool(), testing::Bool()));
+
+TEST_P(ResourcesTest, CorrectlyHandlesExactMatch) {
+ std::string test_resources = BuildTestResources();
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ std::string content;
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("en-US")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localize", content);
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("en-GB")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localise", content);
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("pt-PT")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localizar", content);
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("zh-Hans-CN")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("龙", content);
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("zh")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("龍", content);
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("fr-CA")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localiser", content);
+}
+
+TEST_P(ResourcesTest, CorrectlyHandlesTie) {
+ std::string test_resources = BuildTestResources();
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ // Uses first best match in case of a tie.
+ std::string content;
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("en-CA")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localize", content);
+}
+
+TEST_P(ResourcesTest, RequiresLanguageMatch) {
+ {
+ std::string test_resources =
+ BuildTestResources(/*add_default_language=*/false);
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ EXPECT_FALSE(resources.GetResourceContent({Locale::FromBCP47("es-US")},
+ /*resource_name=*/"A",
+ /*result=*/nullptr));
+ }
+ {
+ std::string test_resources =
+ BuildTestResources(/*add_default_language=*/true);
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ std::string content;
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("es-US")},
+ /*resource_name=*/"A",
+ /*result=*/&content));
+ EXPECT_EQ("localize", content);
+ }
+}
+
+TEST_P(ResourcesTest, HandlesFallback) {
+ std::string test_resources = BuildTestResources();
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ std::string content;
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("fr-CH")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localiser", content);
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("zh-Hans")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("龙", content);
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("zh-Hans-ZZ")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("龙", content);
+
+ // Fallback to default, en-US.
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("ru")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localize", content);
+}
+
+TEST_P(ResourcesTest, HandlesFallbackMultipleLocales) {
+ std::string test_resources = BuildTestResources();
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ std::string content;
+
+ // Still use inexact match with primary locale if language matches,
+ // even though secondary locale would match exactly.
+ EXPECT_TRUE(resources.GetResourceContent(
+ {Locale::FromBCP47("fr-CH"), Locale::FromBCP47("en-US")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localiser", content);
+
+ // Use secondary language instead of default fallback if that is an exact
+ // language match.
+ EXPECT_TRUE(resources.GetResourceContent(
+ {Locale::FromBCP47("ru"), Locale::FromBCP47("de")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("lokalisieren", content);
+
+ // Use tertiary language.
+ EXPECT_TRUE(resources.GetResourceContent(
+ {Locale::FromBCP47("ru"), Locale::FromBCP47("it-IT"),
+ Locale::FromBCP47("de")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("lokalisieren", content);
+
+ // Default fallback if no locale matches.
+ EXPECT_TRUE(resources.GetResourceContent(
+ {Locale::FromBCP47("ru"), Locale::FromBCP47("it-IT"),
+ Locale::FromBCP47("es")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("localize", content);
+}
+
+TEST_P(ResourcesTest, PreferGenericCallback) {
+ std::string test_resources = BuildTestResources();
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ std::string content;
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("pt-BR")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("concentrar", content); // Falls back to pt, not pt-PT.
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("zh-Hant")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("龍", content); // Falls back to zh, not zh-Hans-CN.
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("zh-Hant-CN")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("龍", content); // Falls back to zh, not zh-Hans-CN.
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("zh-CN")},
+ /*resource_name=*/"A", &content));
+ EXPECT_EQ("龍", content); // Falls back to zh, not zh-Hans-CN.
+}
+
+TEST_P(ResourcesTest, PreferGenericWhenGeneric) {
+ std::string test_resources = BuildTestResources();
+ Resources resources(
+ flatbuffers::GetRoot<ResourcePool>(test_resources.data()));
+ std::string content;
+ EXPECT_TRUE(resources.GetResourceContent({Locale::FromBCP47("pt")},
+ /*resource_name=*/"A", &content));
+
+ // Uses pt, not pt-PT.
+ EXPECT_EQ("concentrar", content);
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/sentencepiece/encoder_test.cc b/native/utils/sentencepiece/encoder_test.cc
new file mode 100644
index 0000000..740db35
--- /dev/null
+++ b/native/utils/sentencepiece/encoder_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/sentencepiece/encoder.h"
+
+#include <memory>
+#include <vector>
+
+#include "utils/base/integral_types.h"
+#include "utils/container/sorted-strings-table.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using testing::ElementsAre;
+
+TEST(EncoderTest, SimpleTokenization) {
+ const char pieces_table[] = "hell\0hello\0o\0there\0";
+ const uint32 offsets[] = {0, 5, 11, 13};
+ float scores[] = {-0.5, -1.0, -10.0, -1.0};
+ std::unique_ptr<StringSet> pieces(new SortedStringsTable(
+ /*num_pieces=*/4, offsets, StringPiece(pieces_table, 18)));
+ const Encoder encoder(pieces.get(),
+ /*num_pieces=*/4, scores);
+
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellothere", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 3, 5, 1));
+ }
+
+ // Make probability of hello very low:
+ // hello gets now tokenized as hell + o.
+ scores[1] = -100.0;
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellothere", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 2, 4, 5, 1));
+ }
+}
+
+TEST(EncoderTest, HandlesEdgeCases) {
+ const char pieces_table[] = "hell\0hello\0o\0there\0";
+ const uint32 offsets[] = {0, 5, 11, 13};
+ float scores[] = {-0.5, -1.0, -10.0, -1.0};
+ std::unique_ptr<StringSet> pieces(new SortedStringsTable(
+ /*num_pieces=*/4, offsets, StringPiece(pieces_table, 18)));
+ const Encoder encoder(pieces.get(),
+ /*num_pieces=*/4, scores);
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellhello", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 2, 3, 1));
+ }
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellohell", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 3, 2, 1));
+ }
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 1));
+ }
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellathere", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 1));
+ }
+}
+
+TEST(EncoderTest, HandlesOutOfDictionary) {
+ const char pieces_table[] = "hell\0hello\0o\0there\0";
+ const uint32 offsets[] = {0, 5, 11, 13};
+ float scores[] = {-0.5, -1.0, -10.0, -1.0};
+ std::unique_ptr<StringSet> pieces(new SortedStringsTable(
+ /*num_pieces=*/4, offsets, StringPiece(pieces_table, 18)));
+ const Encoder encoder(pieces.get(),
+ /*num_pieces=*/4, scores,
+ /*start_code=*/0, /*end_code=*/1,
+ /*encoding_offset=*/3, /*unknown_code=*/2,
+ /*unknown_score=*/-100.0);
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellhello", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 3, 4, 1));
+ }
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellohell", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 4, 3, 1));
+ }
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("", &encoded_text));
+ EXPECT_THAT(encoded_text, ElementsAre(0, 1));
+ }
+ {
+ std::vector<int> encoded_text;
+ EXPECT_TRUE(encoder.Encode("hellathere", &encoded_text));
+ EXPECT_THAT(encoded_text,
+ ElementsAre(0, /*hell*/ 3, /*unknown*/ 2, /*there*/ 6, 1));
+ }
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/sentencepiece/test_utils.cc b/native/utils/sentencepiece/test_utils.cc
deleted file mode 100644
index f277a14..0000000
--- a/native/utils/sentencepiece/test_utils.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "utils/sentencepiece/test_utils.h"
-
-#include <memory>
-
-#include "utils/base/integral_types.h"
-#include "utils/container/double-array-trie.h"
-#include "utils/strings/stringpiece.h"
-
-namespace libtextclassifier3 {
-
-SentencePieceNormalizer NormalizerFromSpec(StringPiece spec,
- bool add_dummy_prefix,
- bool remove_extra_whitespaces,
- bool escape_whitespaces) {
- const uint32 trie_blob_size = reinterpret_cast<const uint32*>(spec.data())[0];
- spec.RemovePrefix(sizeof(trie_blob_size));
- const TrieNode* trie_blob = reinterpret_cast<const TrieNode*>(spec.data());
- spec.RemovePrefix(trie_blob_size);
- const int num_nodes = trie_blob_size / sizeof(TrieNode);
- return SentencePieceNormalizer(
- DoubleArrayTrie(trie_blob, num_nodes),
- /*charsmap_normalized=*/StringPiece(spec.data(), spec.size()),
- add_dummy_prefix, remove_extra_whitespaces, escape_whitespaces);
-}
-
-} // namespace libtextclassifier3
diff --git a/native/utils/sentencepiece/test_utils.h b/native/utils/sentencepiece/test_utils.h
deleted file mode 100644
index 0c833da..0000000
--- a/native/utils/sentencepiece/test_utils.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_TEST_UTILS_H_
-#define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_TEST_UTILS_H_
-
-#include <string>
-#include <vector>
-
-#include "utils/sentencepiece/normalizer.h"
-#include "utils/strings/stringpiece.h"
-
-namespace libtextclassifier3 {
-
-SentencePieceNormalizer NormalizerFromSpec(StringPiece spec,
- bool add_dummy_prefix,
- bool remove_extra_whitespaces,
- bool escape_whitespaces);
-
-} // namespace libtextclassifier3
-
-#endif // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_TEST_UTILS_H_
diff --git a/native/utils/strings/append_test.cc b/native/utils/strings/append_test.cc
new file mode 100644
index 0000000..8950761
--- /dev/null
+++ b/native/utils/strings/append_test.cc
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/strings/append.h"
+
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace strings {
+
+TEST(StringUtilTest, SStringAppendF) {
+ std::string str;
+ SStringAppendF(&str, 5, "%d %d", 0, 1);
+ EXPECT_EQ(str, "0 1");
+
+ SStringAppendF(&str, 1, "%d", 9);
+ EXPECT_EQ(str, "0 19");
+
+ SStringAppendF(&str, 1, "%d", 10);
+ EXPECT_EQ(str, "0 191");
+
+ str.clear();
+
+ SStringAppendF(&str, 5, "%d", 100);
+ EXPECT_EQ(str, "100");
+}
+
+TEST(StringUtilTest, SStringAppendFBufCalc) {
+ std::string str;
+ SStringAppendF(&str, 0, "%d %s %d", 1, "hello", 2);
+ EXPECT_EQ(str, "1 hello 2");
+}
+
+TEST(StringUtilTest, JoinStrings) {
+ std::vector<std::string> vec;
+ vec.push_back("1");
+ vec.push_back("2");
+ vec.push_back("3");
+
+ EXPECT_EQ("1,2,3", JoinStrings(",", vec));
+ EXPECT_EQ("123", JoinStrings("", vec));
+ EXPECT_EQ("1, 2, 3", JoinStrings(", ", vec));
+ EXPECT_EQ("", JoinStrings(",", std::vector<std::string>()));
+}
+
+} // namespace strings
+} // namespace libtextclassifier3
diff --git a/native/utils/strings/numbers_test.cc b/native/utils/strings/numbers_test.cc
new file mode 100644
index 0000000..bf2f84a
--- /dev/null
+++ b/native/utils/strings/numbers_test.cc
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/strings/numbers.h"
+
+#include "utils/base/integral_types.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+void TestParseInt32(const char *c_str, bool expected_parsing_success,
+ int32 expected_parsed_value = 0) {
+ int32 parsed_value = 0;
+ EXPECT_EQ(expected_parsing_success, ParseInt32(c_str, &parsed_value));
+ if (expected_parsing_success) {
+ EXPECT_EQ(expected_parsed_value, parsed_value);
+ }
+}
+
+TEST(ParseInt32Test, Normal) {
+ TestParseInt32("2", true, 2);
+ TestParseInt32("-357", true, -357);
+ TestParseInt32("7", true, 7);
+ TestParseInt32("+7", true, 7);
+ TestParseInt32(" +7", true, 7);
+ TestParseInt32("-23", true, -23);
+ TestParseInt32(" -23", true, -23);
+ TestParseInt32("04", true, 4);
+ TestParseInt32("07", true, 7);
+ TestParseInt32("08", true, 8);
+ TestParseInt32("09", true, 9);
+}
+
+TEST(ParseInt32Test, ErrorCases) {
+ TestParseInt32("", false);
+ TestParseInt32(" ", false);
+ TestParseInt32("not-a-number", false);
+ TestParseInt32("123a", false);
+}
+
+void TestParseInt64(const char *c_str, bool expected_parsing_success,
+ int64 expected_parsed_value = 0) {
+ int64 parsed_value = 0;
+ EXPECT_EQ(expected_parsing_success, ParseInt64(c_str, &parsed_value));
+ if (expected_parsing_success) {
+ EXPECT_EQ(expected_parsed_value, parsed_value);
+ }
+}
+
+TEST(ParseInt64Test, Normal) {
+ TestParseInt64("2", true, 2);
+ TestParseInt64("-357", true, -357);
+ TestParseInt64("7", true, 7);
+ TestParseInt64("+7", true, 7);
+ TestParseInt64(" +7", true, 7);
+ TestParseInt64("-23", true, -23);
+ TestParseInt64(" -23", true, -23);
+ TestParseInt64("07", true, 7);
+ TestParseInt64("08", true, 8);
+}
+
+TEST(ParseInt64Test, ErrorCases) {
+ TestParseInt64("", false);
+ TestParseInt64(" ", false);
+ TestParseInt64("not-a-number", false);
+ TestParseInt64("23z", false);
+}
+
+void TestParseDouble(const char *c_str, bool expected_parsing_success,
+ double expected_parsed_value = 0.0) {
+ double parsed_value = 0.0;
+ EXPECT_EQ(expected_parsing_success, ParseDouble(c_str, &parsed_value));
+ if (expected_parsing_success) {
+ EXPECT_NEAR(expected_parsed_value, parsed_value, 0.00001);
+ }
+}
+
+TEST(ParseDoubleTest, Normal) {
+ TestParseDouble("2", true, 2.0);
+ TestParseDouble("-357.023", true, -357.023);
+ TestParseDouble("7.04", true, 7.04);
+ TestParseDouble("+7.2", true, 7.2);
+ TestParseDouble(" +7.236", true, 7.236);
+ TestParseDouble("-23.4", true, -23.4);
+ TestParseDouble(" -23.4", true, -23.4);
+}
+
+TEST(ParseDoubleTest, ErrorCases) {
+ TestParseDouble("", false);
+ TestParseDouble(" ", false);
+ TestParseDouble("not-a-number", false);
+ TestParseDouble("23.5a", false);
+}
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/strings/stringpiece_test.cc b/native/utils/strings/stringpiece_test.cc
new file mode 100644
index 0000000..64808d3
--- /dev/null
+++ b/native/utils/strings/stringpiece_test.cc
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "utils/strings/stringpiece.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(StringPieceTest, EndsWith) {
+ EXPECT_TRUE(EndsWith("hello there!", "there!"));
+ EXPECT_TRUE(EndsWith("hello there!", "!"));
+ EXPECT_FALSE(EndsWith("hello there!", "there"));
+ EXPECT_FALSE(EndsWith("hello there!", " hello there!"));
+ EXPECT_TRUE(EndsWith("hello there!", ""));
+ EXPECT_FALSE(EndsWith("", "hello there!"));
+}
+
+TEST(StringPieceTest, StartsWith) {
+ EXPECT_TRUE(StartsWith("hello there!", "hello"));
+ EXPECT_TRUE(StartsWith("hello there!", "hello "));
+ EXPECT_FALSE(StartsWith("hello there!", "there!"));
+ EXPECT_FALSE(StartsWith("hello there!", " hello there! "));
+ EXPECT_TRUE(StartsWith("hello there!", ""));
+ EXPECT_FALSE(StartsWith("", "hello there!"));
+}
+
+TEST(StringPieceTest, ConsumePrefix) {
+ StringPiece str("hello there!");
+ EXPECT_TRUE(ConsumePrefix(&str, "hello "));
+ EXPECT_EQ(str.ToString(), "there!");
+ EXPECT_TRUE(ConsumePrefix(&str, "there"));
+ EXPECT_EQ(str.ToString(), "!");
+ EXPECT_FALSE(ConsumePrefix(&str, "!!"));
+ EXPECT_TRUE(ConsumePrefix(&str, ""));
+ EXPECT_TRUE(ConsumePrefix(&str, "!"));
+ EXPECT_EQ(str.ToString(), "");
+ EXPECT_TRUE(ConsumePrefix(&str, ""));
+ EXPECT_FALSE(ConsumePrefix(&str, "!"));
+}
+
+TEST(StringPieceTest, ConsumeSuffix) {
+ StringPiece str("hello there!");
+ EXPECT_TRUE(ConsumeSuffix(&str, "!"));
+ EXPECT_EQ(str.ToString(), "hello there");
+ EXPECT_TRUE(ConsumeSuffix(&str, " there"));
+ EXPECT_EQ(str.ToString(), "hello");
+ EXPECT_FALSE(ConsumeSuffix(&str, "!!"));
+ EXPECT_TRUE(ConsumeSuffix(&str, ""));
+ EXPECT_TRUE(ConsumeSuffix(&str, "hello"));
+ EXPECT_EQ(str.ToString(), "");
+ EXPECT_TRUE(ConsumeSuffix(&str, ""));
+ EXPECT_FALSE(ConsumeSuffix(&str, "!"));
+}
+
+TEST(StringPieceTest, Find) {
+ StringPiece str("<hello there!>");
+ EXPECT_EQ(str.find('<'), 0);
+ EXPECT_EQ(str.find('>'), str.length() - 1);
+ EXPECT_EQ(str.find('?'), StringPiece::npos);
+ EXPECT_EQ(str.find('<', str.length() - 1), StringPiece::npos);
+ EXPECT_EQ(str.find('<', 0), 0);
+ EXPECT_EQ(str.find('>', str.length() - 1), str.length() - 1);
+}
+
+TEST(StringPieceTest, FindStringPiece) {
+ StringPiece str("<foo bar baz!>");
+ EXPECT_EQ(str.find("foo"), 1);
+ EXPECT_EQ(str.find("bar"), 5);
+ EXPECT_EQ(str.find("baz"), 9);
+ EXPECT_EQ(str.find("qux"), StringPiece::npos);
+ EXPECT_EQ(str.find("?"), StringPiece::npos);
+ EXPECT_EQ(str.find(">"), str.length() - 1);
+ EXPECT_EQ(str.find("<", str.length() - 1), StringPiece::npos);
+ EXPECT_EQ(str.find("<", 0), 0);
+ EXPECT_EQ(str.find(">", str.length() - 1), str.length() - 1);
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/strings/substitute_test.cc b/native/utils/strings/substitute_test.cc
new file mode 100644
index 0000000..94b37ab
--- /dev/null
+++ b/native/utils/strings/substitute_test.cc
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/strings/substitute.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "utils/strings/stringpiece.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(SubstituteTest, Substitute) {
+ EXPECT_EQ("Hello, world!",
+ strings::Substitute("$0, $1!", {"Hello", "world"}));
+
+ // Out of order.
+ EXPECT_EQ("world, Hello!",
+ strings::Substitute("$1, $0!", {"Hello", "world"}));
+ EXPECT_EQ("b, a, c, b",
+ strings::Substitute("$1, $0, $2, $1", {"a", "b", "c"}));
+
+ // Literal $
+ EXPECT_EQ("$", strings::Substitute("$$", {}));
+ EXPECT_EQ("$1", strings::Substitute("$$1", {}));
+
+ const char* null_cstring = nullptr;
+ EXPECT_EQ("Text: ''", strings::Substitute("Text: '$0'", {null_cstring}));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/strings/utf8_test.cc b/native/utils/strings/utf8_test.cc
new file mode 100644
index 0000000..28d971b
--- /dev/null
+++ b/native/utils/strings/utf8_test.cc
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/strings/utf8.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(Utf8Test, ComputesUtf8LengthOfUnicodeCharacters) {
+ EXPECT_EQ(GetNumBytesForUTF8Char("\x00"), 1);
+ EXPECT_EQ(GetNumBytesForUTF8Char("h"), 1);
+ EXPECT_EQ(GetNumBytesForUTF8Char("😋"), 4);
+ EXPECT_EQ(GetNumBytesForUTF8Char("㍿"), 3);
+}
+
+TEST(Utf8Test, IsValidUTF8) {
+ EXPECT_TRUE(IsValidUTF8("1234😋hello", 13));
+ EXPECT_TRUE(IsValidUTF8("\u304A\u00B0\u106B", 8));
+ EXPECT_TRUE(IsValidUTF8("this is a test😋😋😋", 26));
+ EXPECT_TRUE(IsValidUTF8("\xf0\x9f\x98\x8b", 4));
+ // Too short (string is too short).
+ EXPECT_FALSE(IsValidUTF8("\xf0\x9f", 2));
+ // Too long (too many trailing bytes).
+ EXPECT_FALSE(IsValidUTF8("\xf0\x9f\x98\x8b\x8b", 5));
+ // Too short (too few trailing bytes).
+ EXPECT_FALSE(IsValidUTF8("\xf0\x9f\x98\x61\x61", 5));
+}
+
+TEST(Utf8Test, ValidUTF8CharLength) {
+ EXPECT_EQ(ValidUTF8CharLength("1234😋hello", 13), 1);
+ EXPECT_EQ(ValidUTF8CharLength("\u304A\u00B0\u106B", 8), 3);
+ EXPECT_EQ(ValidUTF8CharLength("this is a test😋😋😋", 26), 1);
+ EXPECT_EQ(ValidUTF8CharLength("\xf0\x9f\x98\x8b", 4), 4);
+ // Too short (string is too short).
+ EXPECT_EQ(ValidUTF8CharLength("\xf0\x9f", 2), -1);
+ // Too long (too many trailing bytes). First character is valid.
+ EXPECT_EQ(ValidUTF8CharLength("\xf0\x9f\x98\x8b\x8b", 5), 4);
+ // Too short (too few trailing bytes).
+ EXPECT_EQ(ValidUTF8CharLength("\xf0\x9f\x98\x61\x61", 5), -1);
+}
+
+TEST(Utf8Test, CorrectlyTruncatesStrings) {
+ EXPECT_EQ(SafeTruncateLength("FooBar", 3), 3);
+ EXPECT_EQ(SafeTruncateLength("früh", 3), 2);
+ EXPECT_EQ(SafeTruncateLength("مَمِمّمَّمِّ", 5), 4);
+}
+
+TEST(Utf8Test, CorrectlyConvertsFromUtf8) {
+ EXPECT_EQ(ValidCharToRune("a"), 97);
+ EXPECT_EQ(ValidCharToRune("\0"), 0);
+ EXPECT_EQ(ValidCharToRune("\u304A"), 0x304a);
+ EXPECT_EQ(ValidCharToRune("\xe3\x81\x8a"), 0x304a);
+}
+
+TEST(Utf8Test, CorrectlyConvertsToUtf8) {
+ char utf8_encoding[4];
+ EXPECT_EQ(ValidRuneToChar(97, utf8_encoding), 1);
+ EXPECT_EQ(ValidRuneToChar(0, utf8_encoding), 1);
+ EXPECT_EQ(ValidRuneToChar(0x304a, utf8_encoding), 3);
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/tensor-view_test.cc b/native/utils/tensor-view_test.cc
new file mode 100644
index 0000000..9467264
--- /dev/null
+++ b/native/utils/tensor-view_test.cc
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/tensor-view.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(TensorViewTest, TestSize) {
+ std::vector<float> data{0.1, 0.2, 0.3, 0.4, 0.5, 0.6};
+ const TensorView<float> tensor(data.data(), {3, 1, 2});
+ EXPECT_TRUE(tensor.is_valid());
+ EXPECT_EQ(tensor.shape(), (std::vector<int>{3, 1, 2}));
+ EXPECT_EQ(tensor.data(), data.data());
+ EXPECT_EQ(tensor.size(), 6);
+ EXPECT_EQ(tensor.dims(), 3);
+ EXPECT_EQ(tensor.dim(0), 3);
+ EXPECT_EQ(tensor.dim(1), 1);
+ EXPECT_EQ(tensor.dim(2), 2);
+ std::vector<float> output_data(6);
+ EXPECT_TRUE(tensor.copy_to(output_data.data(), output_data.size()));
+ EXPECT_EQ(data, output_data);
+
+ // Should not copy when the output is small.
+ std::vector<float> small_output_data{-1, -1, -1};
+ EXPECT_FALSE(
+ tensor.copy_to(small_output_data.data(), small_output_data.size()));
+ // The output buffer should not be changed.
+ EXPECT_EQ(small_output_data, (std::vector<float>{-1, -1, -1}));
+
+ const TensorView<float> invalid_tensor = TensorView<float>::Invalid();
+ EXPECT_FALSE(invalid_tensor.is_valid());
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/test-utils.cc b/native/utils/test-utils.cc
new file mode 100644
index 0000000..8996a4a
--- /dev/null
+++ b/native/utils/test-utils.cc
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/test-utils.h"
+
+#include <iterator>
+
+#include "utils/codepoint-range.h"
+#include "utils/strings/utf8.h"
+#include "utils/utf8/unicodetext.h"
+
+namespace libtextclassifier3 {
+
+using libtextclassifier3::Token;
+
+std::vector<Token> TokenizeOnSpace(const std::string& text) {
+ return TokenizeOnDelimiters(text, {' '});
+}
+
+std::vector<Token> TokenizeOnDelimiters(
+ const std::string& text, const std::unordered_set<char32>& delimiters) {
+ const UnicodeText unicode_text = UTF8ToUnicodeText(text, /*do_copy=*/false);
+
+ std::vector<Token> result;
+
+ int token_start_codepoint = 0;
+ auto token_start_it = unicode_text.begin();
+ int codepoint_idx = 0;
+
+ UnicodeText::const_iterator it;
+ for (it = unicode_text.begin(); it < unicode_text.end(); it++) {
+ if (delimiters.find(*it) != delimiters.end()) {
+ // Only add a token when the string is non-empty.
+ if (token_start_it != it) {
+ result.push_back(Token{UnicodeText::UTF8Substring(token_start_it, it),
+ token_start_codepoint, codepoint_idx});
+ }
+
+ token_start_codepoint = codepoint_idx + 1;
+ token_start_it = it;
+ token_start_it++;
+ }
+
+ codepoint_idx++;
+ }
+ // Only add a token when the string is non-empty.
+ if (token_start_it != it) {
+ result.push_back(Token{UnicodeText::UTF8Substring(token_start_it, it),
+ token_start_codepoint, codepoint_idx});
+ }
+
+ return result;
+}
+
+} // namespace libtextclassifier3
diff --git a/native/utils/test-utils.h b/native/utils/test-utils.h
new file mode 100644
index 0000000..0e75190
--- /dev/null
+++ b/native/utils/test-utils.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Utilities for tests.
+
+#ifndef LIBTEXTCLASSIFIER_UTILS_TEST_UTILS_H_
+#define LIBTEXTCLASSIFIER_UTILS_TEST_UTILS_H_
+
+#include <string>
+
+#include "annotator/types.h"
+
+namespace libtextclassifier3 {
+
+// Returns a list of Tokens for a given input string, by tokenizing on space.
+std::vector<Token> TokenizeOnSpace(const std::string& text);
+
+// Returns a list of Tokens for a given input string, by tokenizing on the
+// given set of delimiter codepoints.
+std::vector<Token> TokenizeOnDelimiters(
+ const std::string& text, const std::unordered_set<char32>& delimiters);
+
+} // namespace libtextclassifier3
+
+#endif // LIBTEXTCLASSIFIER_UTILS_TEST_UTILS_H_
diff --git a/native/utils/test-utils_test.cc b/native/utils/test-utils_test.cc
new file mode 100644
index 0000000..bdaa285
--- /dev/null
+++ b/native/utils/test-utils_test.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/test-utils.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(TestUtilTest, TokenizeOnSpace) {
+ std::vector<Token> tokens =
+ TokenizeOnSpace("Where is Jörg Borg located? Maybe in Zürich ...");
+
+ EXPECT_EQ(tokens.size(), 9);
+
+ EXPECT_EQ(tokens[0].value, "Where");
+ EXPECT_EQ(tokens[0].start, 0);
+ EXPECT_EQ(tokens[0].end, 5);
+
+ EXPECT_EQ(tokens[1].value, "is");
+ EXPECT_EQ(tokens[1].start, 6);
+ EXPECT_EQ(tokens[1].end, 8);
+
+ EXPECT_EQ(tokens[2].value, "Jörg");
+ EXPECT_EQ(tokens[2].start, 9);
+ EXPECT_EQ(tokens[2].end, 13);
+
+ EXPECT_EQ(tokens[3].value, "Borg");
+ EXPECT_EQ(tokens[3].start, 14);
+ EXPECT_EQ(tokens[3].end, 18);
+
+ EXPECT_EQ(tokens[4].value, "located?");
+ EXPECT_EQ(tokens[4].start, 19);
+ EXPECT_EQ(tokens[4].end, 27);
+
+ EXPECT_EQ(tokens[5].value, "Maybe");
+ EXPECT_EQ(tokens[5].start, 28);
+ EXPECT_EQ(tokens[5].end, 33);
+
+ EXPECT_EQ(tokens[6].value, "in");
+ EXPECT_EQ(tokens[6].start, 34);
+ EXPECT_EQ(tokens[6].end, 36);
+
+ EXPECT_EQ(tokens[7].value, "Zürich");
+ EXPECT_EQ(tokens[7].start, 37);
+ EXPECT_EQ(tokens[7].end, 43);
+
+ EXPECT_EQ(tokens[8].value, "...");
+ EXPECT_EQ(tokens[8].start, 44);
+ EXPECT_EQ(tokens[8].end, 47);
+}
+
+TEST(TestUtilTest, TokenizeOnDelimiters) {
+ std::vector<Token> tokens = TokenizeOnDelimiters(
+ "This might be čomplíčateď?!: Oder?", {' ', '?', '!'});
+
+ EXPECT_EQ(tokens.size(), 6);
+
+ EXPECT_EQ(tokens[0].value, "This");
+ EXPECT_EQ(tokens[0].start, 0);
+ EXPECT_EQ(tokens[0].end, 4);
+
+ EXPECT_EQ(tokens[1].value, "might");
+ EXPECT_EQ(tokens[1].start, 7);
+ EXPECT_EQ(tokens[1].end, 12);
+
+ EXPECT_EQ(tokens[2].value, "be");
+ EXPECT_EQ(tokens[2].start, 13);
+ EXPECT_EQ(tokens[2].end, 15);
+
+ EXPECT_EQ(tokens[3].value, "čomplíčateď");
+ EXPECT_EQ(tokens[3].start, 16);
+ EXPECT_EQ(tokens[3].end, 27);
+
+ EXPECT_EQ(tokens[4].value, ":");
+ EXPECT_EQ(tokens[4].start, 29);
+ EXPECT_EQ(tokens[4].end, 30);
+
+ EXPECT_EQ(tokens[5].value, "Oder");
+ EXPECT_EQ(tokens[5].start, 31);
+ EXPECT_EQ(tokens[5].end, 35);
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/token-feature-extractor_test.cc b/native/utils/token-feature-extractor_test.cc
new file mode 100644
index 0000000..15a434c
--- /dev/null
+++ b/native/utils/token-feature-extractor_test.cc
@@ -0,0 +1,579 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/token-feature-extractor.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+class TokenFeatureExtractorTest : public ::testing::Test {
+ protected:
+ explicit TokenFeatureExtractorTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
+ UniLib unilib_;
+};
+
+class TestingTokenFeatureExtractor : public TokenFeatureExtractor {
+ public:
+ using TokenFeatureExtractor::HashToken;
+ using TokenFeatureExtractor::TokenFeatureExtractor;
+};
+
+TEST_F(TokenFeatureExtractorTest, ExtractAscii) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2, 3};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = false;
+ options.extract_selection_mask_feature = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+
+ extractor.Extract(Token{"Hello", 0, 5}, true, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({
+ // clang-format off
+ extractor.HashToken("H"),
+ extractor.HashToken("e"),
+ extractor.HashToken("l"),
+ extractor.HashToken("l"),
+ extractor.HashToken("o"),
+ extractor.HashToken("^H"),
+ extractor.HashToken("He"),
+ extractor.HashToken("el"),
+ extractor.HashToken("ll"),
+ extractor.HashToken("lo"),
+ extractor.HashToken("o$"),
+ extractor.HashToken("^He"),
+ extractor.HashToken("Hel"),
+ extractor.HashToken("ell"),
+ extractor.HashToken("llo"),
+ extractor.HashToken("lo$")
+ // clang-format on
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({
+ // clang-format off
+ extractor.HashToken("w"),
+ extractor.HashToken("o"),
+ extractor.HashToken("r"),
+ extractor.HashToken("l"),
+ extractor.HashToken("d"),
+ extractor.HashToken("!"),
+ extractor.HashToken("^w"),
+ extractor.HashToken("wo"),
+ extractor.HashToken("or"),
+ extractor.HashToken("rl"),
+ extractor.HashToken("ld"),
+ extractor.HashToken("d!"),
+ extractor.HashToken("!$"),
+ extractor.HashToken("^wo"),
+ extractor.HashToken("wor"),
+ extractor.HashToken("orl"),
+ extractor.HashToken("rld"),
+ extractor.HashToken("ld!"),
+ extractor.HashToken("d!$"),
+ // clang-format on
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+}
+
+TEST_F(TokenFeatureExtractorTest, ExtractAsciiNoChargrams) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = false;
+ options.extract_selection_mask_feature = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+
+ extractor.Extract(Token{"Hello", 0, 5}, true, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({extractor.HashToken("^Hello$")}));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({extractor.HashToken("^world!$")}));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+}
+
+TEST_F(TokenFeatureExtractorTest, ExtractUnicode) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2, 3};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = true;
+ options.extract_selection_mask_feature = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+
+ extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({
+ // clang-format off
+ extractor.HashToken("H"),
+ extractor.HashToken("ě"),
+ extractor.HashToken("l"),
+ extractor.HashToken("l"),
+ extractor.HashToken("ó"),
+ extractor.HashToken("^H"),
+ extractor.HashToken("Hě"),
+ extractor.HashToken("ěl"),
+ extractor.HashToken("ll"),
+ extractor.HashToken("ló"),
+ extractor.HashToken("ó$"),
+ extractor.HashToken("^Hě"),
+ extractor.HashToken("Hěl"),
+ extractor.HashToken("ěll"),
+ extractor.HashToken("lló"),
+ extractor.HashToken("ló$")
+ // clang-format on
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({
+ // clang-format off
+ extractor.HashToken("w"),
+ extractor.HashToken("o"),
+ extractor.HashToken("r"),
+ extractor.HashToken("l"),
+ extractor.HashToken("d"),
+ extractor.HashToken("!"),
+ extractor.HashToken("^w"),
+ extractor.HashToken("wo"),
+ extractor.HashToken("or"),
+ extractor.HashToken("rl"),
+ extractor.HashToken("ld"),
+ extractor.HashToken("d!"),
+ extractor.HashToken("!$"),
+ extractor.HashToken("^wo"),
+ extractor.HashToken("wor"),
+ extractor.HashToken("orl"),
+ extractor.HashToken("rld"),
+ extractor.HashToken("ld!"),
+ extractor.HashToken("d!$"),
+ // clang-format on
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+}
+
+TEST_F(TokenFeatureExtractorTest, ExtractUnicodeNoChargrams) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = true;
+ options.extract_selection_mask_feature = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+
+ extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({extractor.HashToken("^Hělló$")}));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray({
+ extractor.HashToken("^world!$"),
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+}
+
+#ifdef TC3_TEST_ICU
+TEST_F(TokenFeatureExtractorTest, ICUCaseFeature) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = true;
+ options.extract_selection_mask_feature = false;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"Ř", 23, 29}, false, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"ř", 23, 29}, false, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0}));
+}
+#endif
+
+TEST_F(TokenFeatureExtractorTest, DigitRemapping) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.remap_digits = true;
+ options.unicode_aware_features = false;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"9:30am", 0, 6}, true, &sparse_features,
+ &dense_features);
+
+ std::vector<int> sparse_features2;
+ extractor.Extract(Token{"5:32am", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+ extractor.Extract(Token{"10:32am", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features,
+ testing::Not(testing::ElementsAreArray(sparse_features2)));
+}
+
+TEST_F(TokenFeatureExtractorTest, DigitRemappingUnicode) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.remap_digits = true;
+ options.unicode_aware_features = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"9:30am", 0, 6}, true, &sparse_features,
+ &dense_features);
+
+ std::vector<int> sparse_features2;
+ extractor.Extract(Token{"5:32am", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+ extractor.Extract(Token{"10:32am", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features,
+ testing::Not(testing::ElementsAreArray(sparse_features2)));
+}
+
+TEST_F(TokenFeatureExtractorTest, LowercaseAscii) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.lowercase_tokens = true;
+ options.unicode_aware_features = false;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"AABB", 0, 6}, true, &sparse_features,
+ &dense_features);
+
+ std::vector<int> sparse_features2;
+ extractor.Extract(Token{"aaBB", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+ extractor.Extract(Token{"aAbB", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+
+#ifdef TC3_TEST_ICU
+TEST_F(TokenFeatureExtractorTest, LowercaseUnicode) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.lowercase_tokens = true;
+ options.unicode_aware_features = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"ŘŘ", 0, 6}, true, &sparse_features, &dense_features);
+
+ std::vector<int> sparse_features2;
+ extractor.Extract(Token{"řř", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+#endif
+
+#ifdef TC3_TEST_ICU
+TEST_F(TokenFeatureExtractorTest, RegexFeatures) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.remap_digits = false;
+ options.unicode_aware_features = false;
+ options.regexp_features.push_back("^[a-z]+$"); // all lower case.
+ options.regexp_features.push_back("^[0-9]+$"); // all digits.
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"abCde", 0, 6}, true, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+
+ dense_features.clear();
+ extractor.Extract(Token{"abcde", 0, 6}, true, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, -1.0}));
+
+ dense_features.clear();
+ extractor.Extract(Token{"12c45", 0, 6}, true, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+
+ dense_features.clear();
+ extractor.Extract(Token{"12345", 0, 6}, true, &sparse_features,
+ &dense_features);
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 1.0}));
+}
+#endif
+
+TEST_F(TokenFeatureExtractorTest, ExtractTooLongWord) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{22};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = true;
+ options.extract_selection_mask_feature = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ // Test that this runs. ASAN should catch problems.
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"abcdefghijklmnopqřstuvwxyz", 0, 0}, true,
+ &sparse_features, &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({
+ // clang-format off
+ extractor.HashToken("^abcdefghij\1qřstuvwxyz"),
+ extractor.HashToken("abcdefghij\1qřstuvwxyz$"),
+ // clang-format on
+ }));
+}
+
+TEST_F(TokenFeatureExtractorTest, ExtractAsciiUnicodeMatches) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2, 3, 4, 5};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = true;
+ options.extract_selection_mask_feature = true;
+
+ TestingTokenFeatureExtractor extractor_unicode(options, &unilib_);
+
+ options.unicode_aware_features = false;
+ TestingTokenFeatureExtractor extractor_ascii(options, &unilib_);
+
+ for (const std::string& input :
+ {"https://www.abcdefgh.com/in/xxxkkkvayio",
+ "https://www.fjsidofj.om/xx/abadfy/xxxx/?xfjiis=ffffiijiihil",
+ "asdfhasdofjiasdofj#%()*%#*(aisdojfaosdifjiaofjdsiofjdi_fdis3w", "abcd",
+ "x", "Hello", "Hey,", "Hi", ""}) {
+ std::vector<int> sparse_features_unicode;
+ std::vector<float> dense_features_unicode;
+ extractor_unicode.Extract(Token{input, 0, 0}, true,
+ &sparse_features_unicode,
+ &dense_features_unicode);
+
+ std::vector<int> sparse_features_ascii;
+ std::vector<float> dense_features_ascii;
+ extractor_ascii.Extract(Token{input, 0, 0}, true, &sparse_features_ascii,
+ &dense_features_ascii);
+
+ EXPECT_THAT(sparse_features_unicode, sparse_features_ascii) << input;
+ EXPECT_THAT(dense_features_unicode, dense_features_ascii) << input;
+ }
+}
+
+TEST_F(TokenFeatureExtractorTest, ExtractForPadToken) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = false;
+ options.extract_selection_mask_feature = true;
+
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+
+ extractor.Extract(Token(), false, &sparse_features, &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({extractor.HashToken("<PAD>")}));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+}
+
+TEST_F(TokenFeatureExtractorTest, ExtractFiltered) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2, 3};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = false;
+ options.extract_selection_mask_feature = true;
+ options.allowed_chargrams.insert("^H");
+ options.allowed_chargrams.insert("ll");
+ options.allowed_chargrams.insert("llo");
+ options.allowed_chargrams.insert("w");
+ options.allowed_chargrams.insert("!");
+ options.allowed_chargrams.insert("\xc4"); // UTF8 control character.
+
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+
+ extractor.Extract(Token{"Hěllo", 0, 5}, true, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features,
+ testing::ElementsAreArray({
+ // clang-format off
+ 0,
+ extractor.HashToken("\xc4"),
+ 0,
+ 0,
+ 0,
+ 0,
+ extractor.HashToken("^H"),
+ 0,
+ 0,
+ 0,
+ extractor.HashToken("ll"),
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ extractor.HashToken("llo"),
+ 0
+ // clang-format on
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+ sparse_features.clear();
+ dense_features.clear();
+ extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+ &dense_features);
+
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray({
+ // clang-format off
+ extractor.HashToken("w"),
+ 0,
+ 0,
+ 0,
+ 0,
+ extractor.HashToken("!"),
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ // clang-format on
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+ EXPECT_EQ(extractor.HashToken("<PAD>"), 1);
+}
+
+TEST_F(TokenFeatureExtractorTest, ExtractEmptyToken) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2, 3};
+ options.extract_case_feature = true;
+ options.unicode_aware_features = false;
+ options.extract_selection_mask_feature = true;
+ TestingTokenFeatureExtractor extractor(options, &unilib_);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+
+ // Should not crash.
+ extractor.Extract(Token(), true, &sparse_features, &dense_features);
+
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray({
+ // clang-format off
+ extractor.HashToken("<PAD>"),
+ // clang-format on
+ }));
+ EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 1.0}));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/tokenizer_test.cc b/native/utils/tokenizer_test.cc
new file mode 100644
index 0000000..f73f8f8
--- /dev/null
+++ b/native/utils/tokenizer_test.cc
@@ -0,0 +1,626 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/tokenizer.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+using testing::ElementsAreArray;
+
+class TestingTokenizer : public Tokenizer {
+ public:
+ TestingTokenizer(
+ const TokenizationType type, const UniLib* unilib,
+ const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
+ const std::vector<const CodepointRange*>&
+ internal_tokenizer_codepoint_ranges,
+ const bool split_on_script_change,
+ const bool icu_preserve_whitespace_tokens,
+ const bool preserve_floating_numbers)
+ : Tokenizer(type, unilib, codepoint_ranges,
+ internal_tokenizer_codepoint_ranges, split_on_script_change,
+ icu_preserve_whitespace_tokens, preserve_floating_numbers) {}
+
+ using Tokenizer::FindTokenizationRange;
+};
+
+class TestingTokenizerProxy {
+ public:
+ TestingTokenizerProxy(
+ TokenizationType type,
+ const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
+ const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
+ const bool split_on_script_change,
+ const bool icu_preserve_whitespace_tokens,
+ const bool preserve_floating_numbers)
+ : INIT_UNILIB_FOR_TESTING(unilib_) {
+ const int num_configs = codepoint_range_configs.size();
+ std::vector<const TokenizationCodepointRange*> configs_fb;
+ configs_fb.reserve(num_configs);
+ const int num_internal_configs = internal_codepoint_range_configs.size();
+ std::vector<const CodepointRange*> internal_configs_fb;
+ internal_configs_fb.reserve(num_internal_configs);
+ buffers_.reserve(num_configs + num_internal_configs);
+ for (int i = 0; i < num_configs; i++) {
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(CreateTokenizationCodepointRange(
+ builder, &codepoint_range_configs[i]));
+ buffers_.push_back(builder.Release());
+ configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
+ buffers_.back().data()));
+ }
+ for (int i = 0; i < num_internal_configs; i++) {
+ flatbuffers::FlatBufferBuilder builder;
+ builder.Finish(
+ CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
+ buffers_.push_back(builder.Release());
+ internal_configs_fb.push_back(
+ flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
+ }
+ tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
+ type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
+ icu_preserve_whitespace_tokens, preserve_floating_numbers));
+ }
+
+ TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
+ const TokenizationCodepointRangeT* range =
+ tokenizer_->FindTokenizationRange(c);
+ if (range != nullptr) {
+ return range->role;
+ } else {
+ return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
+ }
+ }
+
+ std::vector<Token> Tokenize(const std::string& utf8_text) const {
+ return tokenizer_->Tokenize(utf8_text);
+ }
+
+ private:
+ UniLib unilib_;
+ std::vector<flatbuffers::DetachedBuffer> buffers_;
+ std::unique_ptr<TestingTokenizer> tokenizer_;
+};
+
+TEST(TokenizerTest, FindTokenizationRange) {
+ std::vector<TokenizationCodepointRangeT> configs;
+ TokenizationCodepointRangeT* config;
+
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0;
+ config->end = 10;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 32;
+ config->end = 33;
+ config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
+
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 1234;
+ config->end = 12345;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+
+ TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
+ {}, /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+
+ // Test hits to the first group.
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
+ TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
+ TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
+ TokenizationCodepointRange_::Role_DEFAULT_ROLE);
+
+ // Test a hit to the second group.
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
+ TokenizationCodepointRange_::Role_DEFAULT_ROLE);
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
+ TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
+ TokenizationCodepointRange_::Role_DEFAULT_ROLE);
+
+ // Test hits to the third group.
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
+ TokenizationCodepointRange_::Role_DEFAULT_ROLE);
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
+ TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
+ TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
+ TokenizationCodepointRange_::Role_DEFAULT_ROLE);
+
+ // Test a hit outside.
+ EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
+ TokenizationCodepointRange_::Role_DEFAULT_ROLE);
+}
+
+TEST(TokenizerTest, TokenizeOnSpace) {
+ std::vector<TokenizationCodepointRangeT> configs;
+ TokenizationCodepointRangeT* config;
+
+ configs.emplace_back();
+ config = &configs.back();
+ // Space character.
+ config->start = 32;
+ config->end = 33;
+ config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
+
+ TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
+ {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");
+
+ EXPECT_THAT(tokens,
+ ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
+}
+
+TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
+ std::vector<TokenizationCodepointRangeT> configs;
+ TokenizationCodepointRangeT* config;
+
+ // Latin.
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0;
+ config->end = 32;
+ config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
+ config->script_id = 1;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 32;
+ config->end = 33;
+ config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
+ config->script_id = 1;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 33;
+ config->end = 0x77F + 1;
+ config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
+ config->script_id = 1;
+
+ TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
+ {},
+ /*split_on_script_change=*/true,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
+ std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
+ Token("전화", 7, 10), Token("(123)", 10, 15),
+ Token("456-789", 16, 23),
+ Token("웹사이트", 23, 28)}));
+}
+
+TEST(TokenizerTest, TokenizeComplex) {
+ std::vector<TokenizationCodepointRangeT> configs;
+ TokenizationCodepointRangeT* config;
+
+ // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
+ // Latin - Cyrillic.
+ // 0000..007F; Basic Latin
+ // 0080..00FF; Latin-1 Supplement
+ // 0100..017F; Latin Extended-A
+ // 0180..024F; Latin Extended-B
+ // 0250..02AF; IPA Extensions
+ // 02B0..02FF; Spacing Modifier Letters
+ // 0300..036F; Combining Diacritical Marks
+ // 0370..03FF; Greek and Coptic
+ // 0400..04FF; Cyrillic
+ // 0500..052F; Cyrillic Supplement
+ // 0530..058F; Armenian
+ // 0590..05FF; Hebrew
+ // 0600..06FF; Arabic
+ // 0700..074F; Syriac
+ // 0750..077F; Arabic Supplement
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0;
+ config->end = 32;
+ config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 32;
+ config->end = 33;
+ config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 33;
+ config->end = 0x77F + 1;
+ config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
+
+ // CJK
+ // 2E80..2EFF; CJK Radicals Supplement
+ // 3000..303F; CJK Symbols and Punctuation
+ // 3040..309F; Hiragana
+ // 30A0..30FF; Katakana
+ // 3100..312F; Bopomofo
+ // 3130..318F; Hangul Compatibility Jamo
+ // 3190..319F; Kanbun
+ // 31A0..31BF; Bopomofo Extended
+ // 31C0..31EF; CJK Strokes
+ // 31F0..31FF; Katakana Phonetic Extensions
+ // 3200..32FF; Enclosed CJK Letters and Months
+ // 3300..33FF; CJK Compatibility
+ // 3400..4DBF; CJK Unified Ideographs Extension A
+ // 4DC0..4DFF; Yijing Hexagram Symbols
+ // 4E00..9FFF; CJK Unified Ideographs
+ // A000..A48F; Yi Syllables
+ // A490..A4CF; Yi Radicals
+ // A4D0..A4FF; Lisu
+ // A500..A63F; Vai
+ // F900..FAFF; CJK Compatibility Ideographs
+ // FE30..FE4F; CJK Compatibility Forms
+ // 20000..2A6DF; CJK Unified Ideographs Extension B
+ // 2A700..2B73F; CJK Unified Ideographs Extension C
+ // 2B740..2B81F; CJK Unified Ideographs Extension D
+ // 2B820..2CEAF; CJK Unified Ideographs Extension E
+ // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
+ // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x2E80;
+ config->end = 0x2EFF + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x3000;
+ config->end = 0xA63F + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0xF900;
+ config->end = 0xFAFF + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0xFE30;
+ config->end = 0xFE4F + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x20000;
+ config->end = 0x2A6DF + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x2A700;
+ config->end = 0x2B73F + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x2B740;
+ config->end = 0x2B81F + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x2B820;
+ config->end = 0x2CEAF + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x2CEB0;
+ config->end = 0x2EBEF + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x2F800;
+ config->end = 0x2FA1F + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+
+ // Thai.
+ // 0E00..0E7F; Thai
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0x0E00;
+ config->end = 0x0E7F + 1;
+ config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
+
+ TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
+ {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens;
+
+ tokens = tokenizer.Tokenize(
+ "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
+ EXPECT_EQ(tokens.size(), 30);
+
+ tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
+ // clang-format off
+ EXPECT_THAT(
+ tokens,
+ ElementsAreArray({Token("問", 0, 1),
+ Token("少", 1, 2),
+ Token("目", 2, 3),
+ Token("hello", 4, 9),
+ Token("木", 10, 11),
+ Token("輸", 11, 12),
+ Token("ย", 12, 13),
+ Token("า", 13, 14),
+ Token("ม", 14, 15),
+ Token("き", 15, 16),
+ Token("ゃ", 16, 17)}));
+ // clang-format on
+}
+
+#if defined(TC3_TEST_ICU) || defined(__APPLE__)
+TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
+ TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/true,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens = tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
+ // clang-format off
+ ASSERT_EQ(tokens,
+ std::vector<Token>({Token("พระบาท", 0, 6),
+ Token(" ", 6, 7),
+ Token("สมเด็จ", 7, 13),
+ Token(" ", 13, 14),
+ Token("พระ", 14, 17),
+ Token(" ", 17, 18),
+ Token("ปร", 18, 20),
+ Token(" ", 20, 21),
+ Token("มิ", 21, 23)}));
+ // clang-format on
+}
+
+TEST(TokenizerTest, ICUTokenizePunctuation) {
+ TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/true,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens =
+ tokenizer.Tokenize("The interval is: -(12, 138*)");
+ // clang-format off
+ ASSERT_EQ(
+ tokens,
+ std::vector<Token>({Token("The", 0, 3),
+ Token(" ", 3, 4),
+ Token("interval", 4, 12),
+ Token(" ", 12, 13),
+ Token("is", 13, 15),
+ Token(":", 15, 16),
+ Token(" ", 16, 17),
+ Token("-", 17, 18),
+ Token("(", 18, 19),
+ Token("12", 19, 21),
+ Token(",", 21, 22),
+ Token(" ", 22, 23),
+ Token("138", 23, 26),
+ Token("*", 26, 27),
+ Token(")", 27, 28)}));
+ // clang-format on
+}
+
+TEST(TokenizerTest, ICUTokenizeWithNumbers) {
+ TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/true,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens = tokenizer.Tokenize("3.1 3﹒2 3.3");
+ // clang-format off
+ ASSERT_EQ(tokens,
+ std::vector<Token>({Token("3.1", 0, 3),
+ Token(" ", 3, 4),
+ Token("3﹒2", 4, 7),
+ Token(" ", 7, 8),
+ Token("3.3", 8, 11)}));
+ // clang-format on
+}
+#endif
+
+#if defined(TC3_TEST_ICU)
+TEST(TokenizerTest, ICUTokenize) {
+ TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens = tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ");
+ // clang-format off
+ ASSERT_EQ(tokens,
+ std::vector<Token>({Token("พระบาท", 0, 6),
+ Token("สมเด็จ", 6, 12),
+ Token("พระ", 12, 15),
+ Token("ปร", 15, 17),
+ Token("มิ", 17, 19)}));
+ // clang-format on
+}
+
+TEST(TokenizerTest, MixedTokenize) {
+ std::vector<TokenizationCodepointRangeT> configs;
+ TokenizationCodepointRangeT* config;
+
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 32;
+ config->end = 33;
+ config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
+
+ std::vector<CodepointRangeT> internal_configs;
+ CodepointRangeT* internal_config;
+
+ internal_configs.emplace_back();
+ internal_config = &internal_configs.back();
+ internal_config->start = 0;
+ internal_config->end = 128;
+
+ internal_configs.emplace_back();
+ internal_config = &internal_configs.back();
+ internal_config->start = 128;
+ internal_config->end = 256;
+
+ internal_configs.emplace_back();
+ internal_config = &internal_configs.back();
+ internal_config->start = 256;
+ internal_config->end = 384;
+
+ internal_configs.emplace_back();
+ internal_config = &internal_configs.back();
+ internal_config->start = 384;
+ internal_config->end = 592;
+
+ TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
+ internal_configs,
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+
+ std::vector<Token> tokens = tokenizer.Tokenize(
+ "こんにちはJapanese-ląnguagę text 你好世界 http://www.google.com/");
+ ASSERT_EQ(
+ tokens,
+ // clang-format off
+ std::vector<Token>({Token("こんにちは", 0, 5),
+ Token("Japanese-ląnguagę", 5, 22),
+ Token("text", 23, 27),
+ Token("你好", 28, 30),
+ Token("世界", 30, 32),
+ Token("http://www.google.com/", 33, 55)}));
+ // clang-format on
+}
+
+TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
+ std::vector<TokenizationCodepointRangeT> configs;
+ TokenizationCodepointRangeT* config;
+
+ configs.emplace_back();
+ config = &configs.back();
+ config->start = 0;
+ config->end = 256;
+ config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
+
+ {
+ TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
+ configs, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+
+ EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
+ std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
+ }
+
+ {
+ TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
+ configs, {},
+ /*split_on_script_change=*/true,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
+ std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
+ Token("웹사이트", 7, 11)}));
+ }
+}
+#endif
+
+TEST(TokenizerTest, LetterDigitTokenize) {
+ TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/true);
+ std::vector<Token> tokens = tokenizer.Tokenize("7% -3.14 68.9#? 7% $99 .18.");
+ ASSERT_EQ(tokens,
+ std::vector<Token>(
+ {Token("7", 0, 1), Token("%", 1, 2), Token(" ", 2, 3),
+ Token("-", 3, 4), Token("3.14", 4, 8), Token(" ", 8, 9),
+ Token("68.9", 9, 13), Token("#", 13, 14), Token("?", 14, 15),
+ Token(" ", 15, 16), Token("7", 16, 17), Token("%", 17, 18),
+ Token(" ", 18, 19), Token("$", 19, 20), Token("99", 20, 22),
+ Token(" ", 22, 23), Token(".", 23, 24), Token("18", 24, 26),
+ Token(".", 26, 27)}));
+}
+
+TEST(TokenizerTest, LetterDigitTokenizeUnicode) {
+ TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/true);
+ std::vector<Token> tokens = tokenizer.Tokenize("2 pércént 3パーセント");
+ ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
+ Token("pércént", 2, 9),
+ Token(" ", 9, 10), Token("3", 10, 11),
+ Token("パーセント", 11, 16)}));
+}
+
+TEST(TokenizerTest, LetterDigitTokenizeWithDots) {
+ TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/true);
+ std::vector<Token> tokens = tokenizer.Tokenize("3 3﹒2 3.3%");
+ ASSERT_EQ(tokens,
+ std::vector<Token>({Token("3", 0, 1), Token(" ", 1, 2),
+ Token("3﹒2", 2, 5), Token(" ", 5, 6),
+ Token("3.3", 6, 9), Token("%", 9, 10)}));
+}
+
+TEST(TokenizerTest, LetterDigitTokenizeDoNotPreserveFloatingNumbers) {
+ TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens = tokenizer.Tokenize("15.12.2019 january's 3.2");
+ ASSERT_EQ(tokens,
+ std::vector<Token>(
+ {Token("15", 0, 2), Token(".", 2, 3), Token("12", 3, 5),
+ Token(".", 5, 6), Token("2019", 6, 10), Token(" ", 10, 11),
+ Token("january", 11, 18), Token("'", 18, 19),
+ Token("s", 19, 20), Token(" ", 20, 21), Token("3", 21, 22),
+ Token(".", 22, 23), Token("2", 23, 24)}));
+}
+
+TEST(TokenizerTest, LetterDigitTokenizeStrangeStringFloatingNumbers) {
+ TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens = tokenizer.Tokenize("The+2345++the +íí+");
+ ASSERT_EQ(tokens,
+ std::vector<Token>({Token("The", 0, 3), Token("+", 3, 4),
+ Token("2345", 4, 8), Token("+", 8, 9),
+ Token("+", 9, 10), Token("the", 10, 13),
+ Token(" ", 13, 14), Token("+", 14, 15),
+ Token("íí", 15, 17), Token("+", 17, 18)}));
+}
+
+// Whitespace runs between tokens must be emitted as single multi-character
+// tokens: " " (1,2), "  " (3,5) and "   " (6,9) per the asserted offsets.
+TEST(TokenizerTest, LetterDigitTokenizeWhitespacesInSameToken) {
+ TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
+ /*split_on_script_change=*/false,
+ /*icu_preserve_whitespace_tokens=*/false,
+ /*preserve_floating_numbers=*/false);
+ std::vector<Token> tokens = tokenizer.Tokenize("2 3  4   5");
+ ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
+ Token("3", 2, 3), Token("  ", 3, 5),
+ Token("4", 5, 6), Token("   ", 6, 9),
+ Token("5", 9, 10)}));
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/utf8/NSString+Unicode.h b/native/utils/utf8/NSString+Unicode.h
deleted file mode 100644
index 734d58f..0000000
--- a/native/utils/utf8/NSString+Unicode.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#import <Foundation/Foundation.h>
-
-/// Defines utility methods for operating with Unicode in @c NSString.
-/// @discussion Unicode has 1,114,112 code points ( http://en.wikipedia.org/wiki/Code_point ),
-/// and multiple encodings that map these code points into code units.
-/// @c NSString API exposes the string as if it were encoded in UTF-16, which makes use
-/// of surrogate pairs ( http://en.wikipedia.org/wiki/UTF-16 ).
-/// The methods in this category translate indices between Unicode codepoints and
-/// UTF-16 unichars.
-@interface NSString (Unicode)
-
-/// Returns the number of Unicode codepoints for a string slice.
-/// @param start The NSString start index.
-/// @param length The number of unichar units.
-/// @return The number of Unicode code points in the specified unichar range.
-- (NSUInteger)tc_countChar32:(NSUInteger)start withLength:(NSUInteger)length;
-
-/// Returns the length of the string in terms of Unicode codepoints.
-/// @return The number of Unicode codepoints in this string.
-- (NSUInteger)tc_codepointLength;
-
-@end
diff --git a/native/utils/utf8/unicodetext_test.cc b/native/utils/utf8/unicodetext_test.cc
new file mode 100644
index 0000000..4e8883b
--- /dev/null
+++ b/native/utils/utf8/unicodetext_test.cc
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/utf8/unicodetext.h"
+
+#include "utils/strings/stringpiece.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+class UnicodeTextTest : public testing::Test {
+ protected:
+ UnicodeTextTest() : empty_text_() {
+ text_.push_back(0x1C0);
+ text_.push_back(0x4E8C);
+ text_.push_back(0xD7DB);
+ text_.push_back(0x34);
+ text_.push_back(0x1D11E);
+ }
+
+ UnicodeText empty_text_;
+ UnicodeText text_;
+};
+
+TEST(UnicodeTextTest, ConstructionFromUnicodeText) {
+ UnicodeText text = UTF8ToUnicodeText("1234😋hello", /*do_copy=*/false);
+ EXPECT_EQ(UnicodeText(text).ToUTF8String(), "1234😋hello");
+ EXPECT_EQ(UnicodeText(text, /*do_copy=*/false).ToUTF8String(), "1234😋hello");
+}
+
+// Tests for our modifications of UnicodeText.
+TEST(UnicodeTextTest, Custom) {
+ UnicodeText text = UTF8ToUnicodeText("1234😋hello", /*do_copy=*/false);
+ EXPECT_EQ(text.ToUTF8String(), "1234😋hello");
+ EXPECT_EQ(text.size_codepoints(), 10);
+ EXPECT_EQ(text.size_bytes(), 13);
+
+ auto it_begin = text.begin();
+ std::advance(it_begin, 4);
+ auto it_end = text.begin();
+ std::advance(it_end, 6);
+ EXPECT_EQ(text.UTF8Substring(it_begin, it_end), "😋h");
+}
+
+TEST(UnicodeTextTest, StringPieceView) {
+ std::string raw_text = "1234😋hello";
+ UnicodeText text =
+ UTF8ToUnicodeText(StringPiece(raw_text), /*do_copy=*/false);
+ EXPECT_EQ(text.ToUTF8String(), "1234😋hello");
+ EXPECT_EQ(text.size_codepoints(), 10);
+ EXPECT_EQ(text.size_bytes(), 13);
+
+ auto it_begin = text.begin();
+ std::advance(it_begin, 4);
+ auto it_end = text.begin();
+ std::advance(it_end, 6);
+ EXPECT_EQ(text.UTF8Substring(it_begin, it_end), "😋h");
+}
+
+TEST(UnicodeTextTest, Substring) {
+ UnicodeText text = UTF8ToUnicodeText("1234😋hello", /*do_copy=*/false);
+
+ EXPECT_EQ(
+ UnicodeText::Substring(std::next(text.begin(), 4),
+ std::next(text.begin(), 6), /*do_copy=*/true),
+ UTF8ToUnicodeText("😋h"));
+ EXPECT_EQ(
+ UnicodeText::Substring(std::next(text.begin(), 4),
+ std::next(text.begin(), 6), /*do_copy=*/false),
+ UTF8ToUnicodeText("😋h"));
+ EXPECT_EQ(UnicodeText::Substring(text, 4, 6, /*do_copy=*/true),
+ UTF8ToUnicodeText("😋h"));
+ EXPECT_EQ(UnicodeText::Substring(text, 4, 6, /*do_copy=*/false),
+ UTF8ToUnicodeText("😋h"));
+}
+
+TEST(UnicodeTextTest, Ownership) {
+ const std::string src = "\u304A\u00B0\u106B";
+
+ UnicodeText alias;
+ alias.PointToUTF8(src.data(), src.size());
+ EXPECT_EQ(alias.data(), src.data());
+ UnicodeText::const_iterator it = alias.begin();
+ EXPECT_EQ(*it++, 0x304A);
+ EXPECT_EQ(*it++, 0x00B0);
+ EXPECT_EQ(*it++, 0x106B);
+ EXPECT_EQ(it, alias.end());
+
+ UnicodeText t = alias; // Copy initialization copies the data.
+ EXPECT_NE(t.data(), alias.data());
+}
+
+TEST(UnicodeTextTest, Validation) {
+ EXPECT_TRUE(UTF8ToUnicodeText("1234😋hello", /*do_copy=*/false).is_valid());
+ EXPECT_TRUE(
+ UTF8ToUnicodeText("\u304A\u00B0\u106B", /*do_copy=*/false).is_valid());
+ EXPECT_TRUE(
+ UTF8ToUnicodeText("this is a test😋😋😋", /*do_copy=*/false).is_valid());
+ EXPECT_TRUE(
+ UTF8ToUnicodeText("\xf0\x9f\x98\x8b", /*do_copy=*/false).is_valid());
+ // Too short (string is too short).
+ EXPECT_FALSE(UTF8ToUnicodeText("\xf0\x9f", /*do_copy=*/false).is_valid());
+ // Too long (too many trailing bytes).
+ EXPECT_FALSE(
+ UTF8ToUnicodeText("\xf0\x9f\x98\x8b\x8b", /*do_copy=*/false).is_valid());
+ // Too short (too few trailing bytes).
+ EXPECT_FALSE(
+ UTF8ToUnicodeText("\xf0\x9f\x98\x61\x61", /*do_copy=*/false).is_valid());
+ // Invalid with context.
+ EXPECT_FALSE(
+ UTF8ToUnicodeText("hello \xf0\x9f\x98\x61\x61 world1", /*do_copy=*/false)
+ .is_valid());
+}
+
+class IteratorTest : public UnicodeTextTest {};
+
+TEST_F(IteratorTest, Iterates) {
+ UnicodeText::const_iterator iter = text_.begin();
+ EXPECT_EQ(0x1C0, *iter);
+ EXPECT_EQ(&iter, &++iter); // operator++ returns *this.
+ EXPECT_EQ(0x4E8C, *iter++);
+ EXPECT_EQ(0xD7DB, *iter);
+ // Make sure you can dereference more than once.
+ EXPECT_EQ(0xD7DB, *iter);
+ EXPECT_EQ(0x34, *++iter);
+ EXPECT_EQ(0x1D11E, *++iter);
+ ASSERT_TRUE(iter != text_.end());
+ iter++;
+ EXPECT_TRUE(iter == text_.end());
+}
+
+TEST_F(IteratorTest, MultiPass) {
+ // Also tests Default Constructible and Assignable.
+ UnicodeText::const_iterator i1, i2;
+ i1 = text_.begin();
+ i2 = i1;
+ EXPECT_EQ(0x4E8C, *++i1);
+ EXPECT_TRUE(i1 != i2);
+ EXPECT_EQ(0x1C0, *i2);
+ ++i2;
+ EXPECT_TRUE(i1 == i2);
+ EXPECT_EQ(0x4E8C, *i2);
+}
+
+TEST_F(IteratorTest, ReverseIterates) {
+ UnicodeText::const_iterator iter = text_.end();
+ EXPECT_TRUE(iter == text_.end());
+ iter--;
+ ASSERT_TRUE(iter != text_.end());
+ EXPECT_EQ(0x1D11E, *iter--);
+ EXPECT_EQ(0x34, *iter);
+ EXPECT_EQ(0xD7DB, *--iter);
+ // Make sure you can dereference more than once.
+ EXPECT_EQ(0xD7DB, *iter);
+ --iter;
+ EXPECT_EQ(0x4E8C, *iter--);
+ EXPECT_EQ(0x1C0, *iter);
+ EXPECT_TRUE(iter == text_.begin());
+}
+
+TEST_F(IteratorTest, Comparable) {
+ UnicodeText::const_iterator i1, i2;
+ i1 = text_.begin();
+ i2 = i1;
+ ++i2;
+
+ EXPECT_TRUE(i1 < i2);
+ EXPECT_TRUE(text_.begin() <= i1);
+ EXPECT_FALSE(i1 >= i2);
+ EXPECT_FALSE(i1 > text_.end());
+}
+
+TEST_F(IteratorTest, Advance) {
+ UnicodeText::const_iterator iter = text_.begin();
+ EXPECT_EQ(0x1C0, *iter);
+ std::advance(iter, 4);
+ EXPECT_EQ(0x1D11E, *iter);
+ ++iter;
+ EXPECT_TRUE(iter == text_.end());
+}
+
+TEST_F(IteratorTest, Distance) {
+ UnicodeText::const_iterator iter = text_.begin();
+ EXPECT_EQ(0, std::distance(text_.begin(), iter));
+ EXPECT_EQ(5, std::distance(iter, text_.end()));
+ ++iter;
+ ++iter;
+ EXPECT_EQ(2, std::distance(text_.begin(), iter));
+ EXPECT_EQ(3, std::distance(iter, text_.end()));
+ ++iter;
+ ++iter;
+ EXPECT_EQ(4, std::distance(text_.begin(), iter));
+ ++iter;
+ EXPECT_EQ(0, std::distance(iter, text_.end()));
+}
+
+class OperatorTest : public UnicodeTextTest {};
+
+TEST_F(OperatorTest, Clear) {
+ UnicodeText empty_text(UTF8ToUnicodeText("", /*do_copy=*/false));
+ EXPECT_FALSE(text_ == empty_text);
+ text_.clear();
+ EXPECT_TRUE(text_ == empty_text);
+}
+
+TEST_F(OperatorTest, Empty) {
+ EXPECT_TRUE(empty_text_.empty());
+ EXPECT_FALSE(text_.empty());
+ text_.clear();
+ EXPECT_TRUE(text_.empty());
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/native/utils/variant.cc b/native/utils/variant.cc
index 9cdc0b6..0513440 100644
--- a/native/utils/variant.cc
+++ b/native/utils/variant.cc
@@ -21,26 +21,26 @@
std::string Variant::ToString() const {
switch (GetType()) {
case Variant::TYPE_BOOL_VALUE:
- if (BoolValue()) {
+ if (Value<bool>()) {
return "true";
} else {
return "false";
}
break;
case Variant::TYPE_INT_VALUE:
- return std::to_string(IntValue());
+ return std::to_string(Value<int>());
break;
case Variant::TYPE_INT64_VALUE:
- return std::to_string(Int64Value());
+ return std::to_string(Value<int64>());
break;
case Variant::TYPE_FLOAT_VALUE:
- return std::to_string(FloatValue());
+ return std::to_string(Value<float>());
break;
case Variant::TYPE_DOUBLE_VALUE:
- return std::to_string(DoubleValue());
+ return std::to_string(Value<double>());
break;
case Variant::TYPE_STRING_VALUE:
- return StringValue();
+ return ConstRefValue<std::string>();
break;
default:
TC3_LOG(FATAL) << "Unsupported variant type: " << GetType();
diff --git a/native/utils/variant.h b/native/utils/variant.h
index 11c361c..551a822 100644
--- a/native/utils/variant.h
+++ b/native/utils/variant.h
@@ -85,110 +85,178 @@
Variant& operator=(const Variant&) = default;
- int Int8Value() const {
- TC3_CHECK(HasInt8());
+ template <class T>
+ struct dependent_false : std::false_type {};
+
+ template <typename T>
+ T Value() const {
+ static_assert(dependent_false<T>::value, "Not supported.");
+ }
+
+ template <>
+ int8 Value() const {
+ TC3_CHECK(Has<int8>());
return int8_value_;
}
- int UInt8Value() const {
- TC3_CHECK(HasUInt8());
+ template <>
+ uint8 Value() const {
+ TC3_CHECK(Has<uint8>());
return uint8_value_;
}
- int IntValue() const {
- TC3_CHECK(HasInt());
+ template <>
+ int Value() const {
+ TC3_CHECK(Has<int>());
return int_value_;
}
- uint UIntValue() const {
- TC3_CHECK(HasUInt());
+ template <>
+ uint Value() const {
+ TC3_CHECK(Has<uint>());
return uint_value_;
}
- int64 Int64Value() const {
- TC3_CHECK(HasInt64());
+ template <>
+ int64 Value() const {
+ TC3_CHECK(Has<int64>());
return long_value_;
}
- uint64 UInt64Value() const {
- TC3_CHECK(HasUInt64());
+ template <>
+ uint64 Value() const {
+ TC3_CHECK(Has<uint64>());
return ulong_value_;
}
- float FloatValue() const {
- TC3_CHECK(HasFloat());
+ template <>
+ float Value() const {
+ TC3_CHECK(Has<float>());
return float_value_;
}
- double DoubleValue() const {
- TC3_CHECK(HasDouble());
+ template <>
+ double Value() const {
+ TC3_CHECK(Has<double>());
return double_value_;
}
- bool BoolValue() const {
- TC3_CHECK(HasBool());
+ template <>
+ bool Value() const {
+ TC3_CHECK(Has<bool>());
return bool_value_;
}
- const std::string& StringValue() const {
- TC3_CHECK(HasString());
+ template <typename T>
+ const T& ConstRefValue() const;
+
+ template <>
+ const std::string& ConstRefValue() const {
+ TC3_CHECK(Has<std::string>());
return string_value_;
}
- const std::vector<std::string>& StringVectorValue() const {
- TC3_CHECK(HasStringVector());
+ template <>
+ const std::vector<std::string>& ConstRefValue() const {
+ TC3_CHECK(Has<std::vector<std::string>>());
return string_vector_value_;
}
- const std::vector<float>& FloatVectorValue() const {
- TC3_CHECK(HasFloatVector());
+ template <>
+ const std::vector<float>& ConstRefValue() const {
+ TC3_CHECK(Has<std::vector<float>>());
return float_vector_value_;
}
- const std::vector<int>& IntVectorValue() const {
- TC3_CHECK(HasIntVector());
+ template <>
+ const std::vector<int>& ConstRefValue() const {
+ TC3_CHECK(Has<std::vector<int>>());
return int_vector_value_;
}
- const std::map<std::string, Variant>& StringVariantMapValue() const {
- TC3_CHECK(HasStringVariantMap());
+ template <>
+ const std::map<std::string, Variant>& ConstRefValue() const {
+ TC3_CHECK((Has<std::map<std::string, Variant>>()));
return string_variant_map_value_;
}
+ template <typename T>
+ bool Has() const;
+
+ template <>
+ bool Has<int8>() const {
+ return type_ == TYPE_INT8_VALUE;
+ }
+
+ template <>
+ bool Has<uint8>() const {
+ return type_ == TYPE_UINT8_VALUE;
+ }
+
+ template <>
+ bool Has<int>() const {
+ return type_ == TYPE_INT_VALUE;
+ }
+
+ template <>
+ bool Has<uint>() const {
+ return type_ == TYPE_UINT_VALUE;
+ }
+
+ template <>
+ bool Has<int64>() const {
+ return type_ == TYPE_INT64_VALUE;
+ }
+
+ template <>
+ bool Has<uint64>() const {
+ return type_ == TYPE_UINT64_VALUE;
+ }
+
+ template <>
+ bool Has<float>() const {
+ return type_ == TYPE_FLOAT_VALUE;
+ }
+
+ template <>
+ bool Has<double>() const {
+ return type_ == TYPE_DOUBLE_VALUE;
+ }
+
+ template <>
+ bool Has<bool>() const {
+ return type_ == TYPE_BOOL_VALUE;
+ }
+
+ template <>
+ bool Has<std::string>() const {
+ return type_ == TYPE_STRING_VALUE;
+ }
+
+ template <>
+ bool Has<std::vector<std::string>>() const {
+ return type_ == TYPE_STRING_VECTOR_VALUE;
+ }
+
+ template <>
+ bool Has<std::vector<float>>() const {
+ return type_ == TYPE_FLOAT_VECTOR_VALUE;
+ }
+
+ template <>
+ bool Has<std::vector<int>>() const {
+ return type_ == TYPE_INT_VECTOR_VALUE;
+ }
+
+ template <>
+ bool Has<std::map<std::string, Variant>>() const {
+ return type_ == TYPE_STRING_VARIANT_MAP_VALUE;
+ }
+
// Converts the value of this variant to its string representation, regardless
// of the type of the actual value.
std::string ToString() const;
- bool HasInt8() const { return type_ == TYPE_INT8_VALUE; }
-
- bool HasUInt8() const { return type_ == TYPE_UINT8_VALUE; }
-
- bool HasInt() const { return type_ == TYPE_INT_VALUE; }
-
- bool HasUInt() const { return type_ == TYPE_UINT_VALUE; }
-
- bool HasInt64() const { return type_ == TYPE_INT64_VALUE; }
-
- bool HasUInt64() const { return type_ == TYPE_UINT64_VALUE; }
-
- bool HasFloat() const { return type_ == TYPE_FLOAT_VALUE; }
-
- bool HasDouble() const { return type_ == TYPE_DOUBLE_VALUE; }
-
- bool HasBool() const { return type_ == TYPE_BOOL_VALUE; }
-
- bool HasString() const { return type_ == TYPE_STRING_VALUE; }
-
- bool HasStringVector() const { return type_ == TYPE_STRING_VECTOR_VALUE; }
-
- bool HasFloatVector() const { return type_ == TYPE_FLOAT_VECTOR_VALUE; }
-
- bool HasIntVector() const { return type_ == TYPE_INT_VECTOR_VALUE; }
-
- bool HasStringVariantMap() const {
- return type_ == TYPE_STRING_VARIANT_MAP_VALUE;
- }
-
Type GetType() const { return type_; }
bool HasValue() const { return type_ != TYPE_EMPTY; }
diff --git a/native/utils/variant_test.cc b/native/utils/variant_test.cc
new file mode 100644
index 0000000..cf0acfb
--- /dev/null
+++ b/native/utils/variant_test.cc
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/variant.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier3 {
+namespace {
+
+TEST(VariantTest, GetType) {
+ EXPECT_EQ(Variant().GetType(), Variant::TYPE_EMPTY);
+ EXPECT_EQ(Variant(static_cast<int8_t>(9)).GetType(),
+ Variant::TYPE_INT8_VALUE);
+ EXPECT_EQ(Variant(static_cast<uint8_t>(9)).GetType(),
+ Variant::TYPE_UINT8_VALUE);
+ EXPECT_EQ(Variant(static_cast<int>(9)).GetType(), Variant::TYPE_INT_VALUE);
+ EXPECT_EQ(Variant(static_cast<uint>(9)).GetType(), Variant::TYPE_UINT_VALUE);
+ EXPECT_EQ(Variant(static_cast<int64>(9)).GetType(),
+ Variant::TYPE_INT64_VALUE);
+ EXPECT_EQ(Variant(static_cast<uint64>(9)).GetType(),
+ Variant::TYPE_UINT64_VALUE);
+ EXPECT_EQ(Variant(static_cast<float>(9)).GetType(),
+ Variant::TYPE_FLOAT_VALUE);
+ EXPECT_EQ(Variant(static_cast<double>(9)).GetType(),
+ Variant::TYPE_DOUBLE_VALUE);
+ EXPECT_EQ(Variant(true).GetType(), Variant::TYPE_BOOL_VALUE);
+ EXPECT_EQ(Variant("hello").GetType(), Variant::TYPE_STRING_VALUE);
+}
+
+TEST(VariantTest, HasValue) {
+ EXPECT_FALSE(Variant().HasValue());
+ EXPECT_TRUE(Variant(static_cast<int8_t>(9)).HasValue());
+ EXPECT_TRUE(Variant(static_cast<uint8_t>(9)).HasValue());
+ EXPECT_TRUE(Variant(static_cast<int>(9)).HasValue());
+ EXPECT_TRUE(Variant(static_cast<uint>(9)).HasValue());
+ EXPECT_TRUE(Variant(static_cast<int64>(9)).HasValue());
+ EXPECT_TRUE(Variant(static_cast<uint64>(9)).HasValue());
+ EXPECT_TRUE(Variant(static_cast<float>(9)).HasValue());
+ EXPECT_TRUE(Variant(static_cast<double>(9)).HasValue());
+ EXPECT_TRUE(Variant(true).HasValue());
+ EXPECT_TRUE(Variant("hello").HasValue());
+}
+
+TEST(VariantTest, Value) {
+ EXPECT_EQ(Variant(static_cast<int8_t>(9)).Value<int8>(), 9);
+ EXPECT_EQ(Variant(static_cast<uint8_t>(9)).Value<uint8>(), 9);
+ EXPECT_EQ(Variant(static_cast<int>(9)).Value<int>(), 9);
+ EXPECT_EQ(Variant(static_cast<uint>(9)).Value<uint>(), 9);
+ EXPECT_EQ(Variant(static_cast<int64>(9)).Value<int64>(), 9);
+ EXPECT_EQ(Variant(static_cast<uint64>(9)).Value<uint64>(), 9);
+ EXPECT_EQ(Variant(static_cast<float>(9)).Value<float>(), 9);
+ EXPECT_EQ(Variant(static_cast<double>(9)).Value<double>(), 9);
+ EXPECT_EQ(Variant(true).Value<bool>(), true);
+ EXPECT_EQ(Variant("hello").ConstRefValue<std::string>(), "hello");
+}
+
+} // namespace
+} // namespace libtextclassifier3
diff --git a/notification/res/values-es-rUS/strings.xml b/notification/res/values-b+es+419/strings.xml
similarity index 100%
rename from notification/res/values-es-rUS/strings.xml
rename to notification/res/values-b+es+419/strings.xml
diff --git a/notification/res/values-b+sr+Latn/strings.xml b/notification/res/values-b+sr+Latn/strings.xml
new file mode 100755
index 0000000..480ef86
--- /dev/null
+++ b/notification/res/values-b+sr+Latn/strings.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources xmlns:xliff="urn:oasis:names:tc:xliff:document:1.2">
+ <string name="tc_notif_copy_code_desc">Kopiraj „%1$s“</string>
+ <string name="tc_notif_code_copied_to_clipboard">Kôd je kopiran</string>
+</resources>
diff --git a/notification/res/values-in/strings.xml b/notification/res/values-id/strings.xml
similarity index 100%
rename from notification/res/values-in/strings.xml
rename to notification/res/values-id/strings.xml
diff --git a/notification/res/values-nb/strings.xml b/notification/res/values-no/strings.xml
similarity index 100%
rename from notification/res/values-nb/strings.xml
rename to notification/res/values-no/strings.xml
diff --git a/notification/res/values-zh-rCN/strings.xml b/notification/res/values-zh/strings.xml
similarity index 100%
rename from notification/res/values-zh-rCN/strings.xml
rename to notification/res/values-zh/strings.xml
diff --git a/notification/src/com/android/textclassifier/notification/SmartSuggestionsHelper.java b/notification/src/com/android/textclassifier/notification/SmartSuggestionsHelper.java
index fab0dd1..0a2cce7 100644
--- a/notification/src/com/android/textclassifier/notification/SmartSuggestionsHelper.java
+++ b/notification/src/com/android/textclassifier/notification/SmartSuggestionsHelper.java
@@ -35,9 +35,12 @@
import android.util.Pair;
import android.view.textclassifier.ConversationAction;
import android.view.textclassifier.ConversationActions;
+import android.view.textclassifier.TextClassification;
import android.view.textclassifier.TextClassificationContext;
import android.view.textclassifier.TextClassificationManager;
import android.view.textclassifier.TextClassifier;
+
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import java.time.Instant;
@@ -48,6 +51,7 @@
import java.util.List;
import java.util.Map;
import java.util.Objects;
+import java.util.Optional;
import javax.annotation.Nullable;
/**
@@ -76,14 +80,24 @@
private static final int MAX_RESULT_ID_TO_CACHE = 20;
private static final ImmutableList<String> HINTS =
ImmutableList.of(ConversationActions.Request.HINT_FOR_NOTIFICATION);
- private static final ConversationActions EMPTY_CONVERSATION_ACTIONS =
- new ConversationActions(ImmutableList.of(), null);
+ private static final SuggestConversationActionsResult EMPTY_SUGGEST_CONVERSATION_ACTION_RESULT =
+ new SuggestConversationActionsResult(
+ Optional.empty(), new ConversationActions(ImmutableList.of(), /* id= */ null));
private final Context context;
private final TextClassificationManager textClassificationManager;
private final SmartSuggestionsConfig config;
private final LruCache<String, SmartSuggestionsLogSession> sessionCache =
- new LruCache<>(MAX_RESULT_ID_TO_CACHE);
+ new LruCache<String, SmartSuggestionsLogSession>(MAX_RESULT_ID_TO_CACHE) {
+ @Override
+ protected void entryRemoved(
+ boolean evicted,
+ String key,
+ SmartSuggestionsLogSession oldSession,
+ SmartSuggestionsLogSession newSession) {
+ oldSession.destroy();
+ }
+ };
private final TextClassificationContext textClassificationContext;
public SmartSuggestionsHelper(Context context, SmartSuggestionsConfig config) {
@@ -103,26 +117,20 @@
public SmartSuggestions onNotificationEnqueued(StatusBarNotification statusBarNotification) {
// Whenever onNotificationEnqueued() is called again on the same notification key, its
// previous session is ended.
- removeAndDestroySession(statusBarNotification.getKey());
+ sessionCache.remove(statusBarNotification.getKey());
boolean eligibleForReplyAdjustment =
config.shouldGenerateReplies() && isEligibleForReplyAdjustment(statusBarNotification);
boolean eligibleForActionAdjustment =
config.shouldGenerateActions() && isEligibleForActionAdjustment(statusBarNotification);
- TextClassifier textClassifier =
- textClassificationManager.createTextClassificationSession(textClassificationContext);
-
- ConversationActions conversationActionsResult =
+ SuggestConversationActionsResult suggestConversationActionsResult =
suggestConversationActions(
- textClassifier,
- statusBarNotification,
- eligibleForReplyAdjustment,
- eligibleForActionAdjustment);
+ statusBarNotification, eligibleForReplyAdjustment, eligibleForActionAdjustment);
- String resultId = conversationActionsResult.getId();
+ String resultId = suggestConversationActionsResult.conversationActions.getId();
List<ConversationAction> conversationActions =
- conversationActionsResult.getConversationActions();
+ suggestConversationActionsResult.conversationActions.getConversationActions();
ArrayList<CharSequence> replies = new ArrayList<>();
Map<CharSequence, Float> repliesScore = new ArrayMap<>();
@@ -156,33 +164,34 @@
}
}
- if (!TextUtils.isEmpty(resultId)) {
- SmartSuggestionsLogSession session =
- new SmartSuggestionsLogSession(
- resultId, repliesScore, textClassifier, textClassificationContext);
- session.onSuggestionsGenerated(conversationActions);
+ suggestConversationActionsResult.textClassifier.ifPresent(
+ textClassifier -> {
+ if (TextUtils.isEmpty(resultId)) {
+ // Missing the result id, skip logging.
+ textClassifier.destroy();
+ } else {
+ SmartSuggestionsLogSession session =
+ new SmartSuggestionsLogSession(
+ resultId,
+ repliesScore,
+ textClassifier,
+ textClassificationContext);
+ session.onSuggestionsGenerated(conversationActions);
- // Store the session if we expect more logging from it, destroy it otherwise.
- if (!conversationActions.isEmpty()
- && suggestionsMightBeUsedInNotification(
- statusBarNotification, !actions.isEmpty(), !replies.isEmpty())) {
- sessionCache.put(statusBarNotification.getKey(), session);
- } else {
- session.destroy();
- }
- }
+ // Store the session if we expect more logging from it, destroy it otherwise.
+ if (!conversationActions.isEmpty()
+ && suggestionsMightBeUsedInNotification(
+ statusBarNotification, !actions.isEmpty(), !replies.isEmpty())) {
+ sessionCache.put(statusBarNotification.getKey(), session);
+ } else {
+ session.destroy();
+ }
+ }
+ });
return new SmartSuggestions(replies, actions);
}
- private void removeAndDestroySession(String notificationKey) {
- SmartSuggestionsLogSession session = sessionCache.get(notificationKey);
- if (session != null) {
- session.destroy();
- }
- sessionCache.remove(notificationKey);
- }
-
/**
* Creates notification action from ConversationAction that does not come up a RemoteAction. It
* could happen because we don't have common intents for some actions, like copying text.
@@ -258,23 +267,20 @@
}
/** Adds action adjustments based on the notification contents. */
- private ConversationActions suggestConversationActions(
- TextClassifier textClassifier,
- StatusBarNotification statusBarNotification,
- boolean includeReplies,
- boolean includeActions) {
+ private SuggestConversationActionsResult suggestConversationActions(
+ StatusBarNotification statusBarNotification, boolean includeReplies, boolean includeActions) {
if (!includeReplies && !includeActions) {
- return EMPTY_CONVERSATION_ACTIONS;
+ return EMPTY_SUGGEST_CONVERSATION_ACTION_RESULT;
}
ImmutableList<ConversationActions.Message> messages =
extractMessages(statusBarNotification.getNotification());
if (messages.isEmpty()) {
- return EMPTY_CONVERSATION_ACTIONS;
+ return EMPTY_SUGGEST_CONVERSATION_ACTION_RESULT;
}
// Do not generate smart actions if the last message is from the local user.
ConversationActions.Message lastMessage = Iterables.getLast(messages);
if (arePersonsEqual(ConversationActions.Message.PERSON_USER_SELF, lastMessage.getAuthor())) {
- return EMPTY_CONVERSATION_ACTIONS;
+ return EMPTY_SUGGEST_CONVERSATION_ACTION_RESULT;
}
TextClassifier.EntityConfig.Builder typeConfigBuilder =
@@ -298,7 +304,9 @@
.setTypeConfig(typeConfigBuilder.build())
.build();
- return textClassifier.suggestConversationActions(request);
+ TextClassifier textClassifier = createTextClassificationSession();
+ return new SuggestConversationActionsResult(
+ Optional.of(textClassifier), textClassifier.suggestConversationActions(request));
}
/**
@@ -462,9 +470,30 @@
return ImmutableList.copyOf(new ArrayList<>(extractMessages));
}
+ @VisibleForTesting
+ TextClassifier createTextClassificationSession() {
+ return textClassificationManager.createTextClassificationSession(textClassificationContext);
+ }
+
private static boolean arePersonsEqual(Person left, Person right) {
return Objects.equals(left.getKey(), right.getKey())
&& TextUtils.equals(left.getName(), right.getName())
&& Objects.equals(left.getUri(), right.getUri());
}
+
+ /**
+ * Result object of {@link #suggestConversationActions(StatusBarNotification, boolean, boolean)}.
+ */
+ private static class SuggestConversationActionsResult {
+ /** The text classifier session that was involved to make suggestions, if any. */
+ final Optional<TextClassifier> textClassifier;
+ /** The resultant suggestions. */
+ final ConversationActions conversationActions;
+
+ SuggestConversationActionsResult(
+ Optional<TextClassifier> textClassifier, ConversationActions conversationActions) {
+ this.textClassifier = textClassifier;
+ this.conversationActions = conversationActions;
+ }
+ }
}
diff --git a/notification/tests/AndroidTest.xml b/notification/tests/AndroidTest.xml
new file mode 100644
index 0000000..1890e75
--- /dev/null
+++ b/notification/tests/AndroidTest.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Copyright (C) 2020 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- This test config file is auto-generated. -->
+<configuration description="Runs TextClassifierNotificationTests.">
+ <option name="test-suite-tag" value="apct" />
+ <option name="test-suite-tag" value="apct-instrumentation" />
+ <target_preparer class="com.android.tradefed.targetprep.suite.SuiteApkInstaller">
+ <option name="cleanup-apks" value="true" />
+ <option name="test-file-name" value="TextClassifierNotificationTests.apk" />
+ </target_preparer>
+
+ <test class="com.android.tradefed.testtype.AndroidJUnitTest" >
+ <option name="package" value="com.android.textclassifier.notification" />
+ <option name="runner" value="androidx.test.runner.AndroidJUnitRunner" />
+ </test>
+
+ <object type="module_controller" class="com.android.tradefed.testtype.suite.module.MainlineTestModuleController">
+ <option name="mainline-module-package-name" value="com.google.android.extservices" />
+ </object>
+</configuration>
diff --git a/notification/tests/src/com/android/textclassifier/notification/CopyCodeActivityTest.java b/notification/tests/src/com/android/textclassifier/notification/CopyCodeActivityTest.java
index 0682ff0..966fbe0 100644
--- a/notification/tests/src/com/android/textclassifier/notification/CopyCodeActivityTest.java
+++ b/notification/tests/src/com/android/textclassifier/notification/CopyCodeActivityTest.java
@@ -47,12 +47,14 @@
@Test
public void onCreate_emptyCode() throws Exception {
- activityRule.launchActivity(EMPTY_INTENT);
-
ClipboardManager clipboardManager =
ApplicationProvider.getApplicationContext().getSystemService(ClipboardManager.class);
// Use shell's permissions to ensure we can access the clipboard
InstrumentationRegistry.getInstrumentation().getUiAutomation().adoptShellPermissionIdentity();
+ clipboardManager.clearPrimaryClip();
+
+ activityRule.launchActivity(EMPTY_INTENT);
+
try {
assertThat(clipboardManager.hasPrimaryClip()).isFalse();
} finally {
@@ -62,12 +64,14 @@
@Test
public void onCreate_codeCopied() throws Exception {
- activityRule.launchActivity(CODE_INTENT);
-
ClipboardManager clipboardManager =
ApplicationProvider.getApplicationContext().getSystemService(ClipboardManager.class);
// Use shell's permissions to ensure we can access the clipboard
InstrumentationRegistry.getInstrumentation().getUiAutomation().adoptShellPermissionIdentity();
+ clipboardManager.clearPrimaryClip();
+
+ activityRule.launchActivity(CODE_INTENT);
+
ClipData clipFromClipboard;
try {
assertThat(clipboardManager.hasPrimaryClip()).isTrue();
diff --git a/notification/tests/src/com/android/textclassifier/notification/SmartSuggestionsHelperTest.java b/notification/tests/src/com/android/textclassifier/notification/SmartSuggestionsHelperTest.java
index bc10cc0..9d0a720 100644
--- a/notification/tests/src/com/android/textclassifier/notification/SmartSuggestionsHelperTest.java
+++ b/notification/tests/src/com/android/textclassifier/notification/SmartSuggestionsHelperTest.java
@@ -47,6 +47,7 @@
import java.util.Collection;
import java.util.List;
import org.junit.Before;
+import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -64,22 +65,36 @@
private final Context context = ApplicationProvider.getApplicationContext();
private final FakeTextClassifier fakeTextClassifier = new FakeTextClassifier();
private final TestConfig config = new TestConfig();
- private SmartSuggestionsHelper smartActions;
+ private TestableSmartSuggestionsHelper smartActions;
private Notification.Builder notificationBuilder;
@Before
public void setup() {
TextClassificationManager textClassificationManager =
context.getSystemService(TextClassificationManager.class);
- // Workaround b/144163980.
- // TODO(tonymak) Remove this workaround once the latest emulator image is dropped.
- textClassificationManager.setTextClassificationSessionFactory(
- classificationContext -> fakeTextClassifier);
textClassificationManager.setTextClassifier(fakeTextClassifier);
- smartActions = new SmartSuggestionsHelper(context, config);
+ smartActions = new TestableSmartSuggestionsHelper(context, config);
notificationBuilder = new Notification.Builder(context, "id");
}
+ static class TestableSmartSuggestionsHelper extends SmartSuggestionsHelper {
+ private int numOfSessionsCreated = 0;
+
+ TestableSmartSuggestionsHelper(Context context, SmartSuggestionsConfig config) {
+ super(context, config);
+ }
+
+ @Override
+ TextClassifier createTextClassificationSession() {
+ numOfSessionsCreated += 1;
+ return super.createTextClassificationSession();
+ }
+
+ int getNumOfSessionsCreated() {
+ return numOfSessionsCreated;
+ }
+ }
+
@Test
public void onNotificationEnqueued_notMessageCategory() {
Notification notification = notificationBuilder.setContentText(MESSAGE).build();
@@ -90,6 +105,8 @@
assertThat(smartSuggestions.getReplies()).isEmpty();
assertThat(smartSuggestions.getActions()).isEmpty();
+ // Ideally, we should verify that createTextClassificationSession is never invoked here.
+ assertThat(smartActions.getNumOfSessionsCreated()).isEqualTo(0);
}
@Test
@@ -107,6 +124,7 @@
assertThat(smartSuggestions.getReplies()).isEmpty();
assertThat(smartSuggestions.getActions()).isEmpty();
+ assertThat(smartActions.getNumOfSessionsCreated()).isEqualTo(0);
}
@Test
@@ -123,6 +141,7 @@
assertThat(smartSuggestions.getReplies()).isEmpty();
assertAdjustmentWithSmartAction(smartSuggestions);
+ assertThat(smartActions.getNumOfSessionsCreated()).isEqualTo(1);
}
@Test
@@ -139,6 +158,7 @@
List<Message> messages = request.getConversation();
assertThat(messages).hasSize(1);
assertThat(messages.get(0).getText().toString()).isEqualTo(MESSAGE);
+ assertThat(smartActions.getNumOfSessionsCreated()).isEqualTo(1);
}
@Test
@@ -172,6 +192,7 @@
assertMessage(messages.get(1), "secondMessage", PERSON_USER_SELF, 2000);
assertMessage(messages.get(2), "thirdMessage", userA, 3000);
assertMessage(messages.get(3), "fourthMessage", userB, 4000);
+ assertThat(smartActions.getNumOfSessionsCreated()).isEqualTo(1);
}
@Test
@@ -195,6 +216,7 @@
assertThat(smartSuggestions.getReplies()).isEmpty();
assertThat(smartSuggestions.getActions()).isEmpty();
+ assertThat(smartActions.getNumOfSessionsCreated()).isEqualTo(0);
}
@Test
@@ -215,6 +237,7 @@
assertThat(smartSuggestions.getReplies()).isEmpty();
assertThat(smartSuggestions.getActions()).isEmpty();
+ assertThat(smartActions.getNumOfSessionsCreated()).isEqualTo(0);
}
@Test
@@ -345,6 +368,25 @@
assertThat(smartSuggestions.getActions().get(0).title.toString()).isEqualTo("12345");
}
+ @Ignore // Disabled because it is way too slow to run on an emulator.
+ @Test
+ public void noBinderLeakage() {
+ // Use the real text classifier from system.
+ TextClassificationManager textClassificationManager =
+ context.getSystemService(TextClassificationManager.class);
+ textClassificationManager.setTextClassifier(null);
+
+ // System server crashes when there are more than 20,000 leaked binder proxy.
+ // See
+ // http://cs/android/frameworks/base/core/java/android/os/BinderProxy.java?l=73&rcl=ae52315c8c7d0391bd3c7bca0525a98eeb4cd840.
+ for (int i = 0; i < 20000; i++) {
+ Notification notification = createMessageCategoryNotification();
+ StatusBarNotification statusBarNotification =
+ createStatusBarNotification(notification, PACKAGE_NAME);
+ smartActions.onNotificationEnqueued(statusBarNotification);
+ }
+ }
+
private Notification createMessageCategoryNotification() {
return notificationBuilder
.setContentText(MESSAGE)