Blame - tests/feature-processor_test.cc - platform/external/libtextclassifier

blob: 88a93f3806d1fa98f48390148aaac076c1bd373c [file] [log] [blame]

Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include "smartselect/feature-processor.h"
				18
				19	#include "gmock/gmock.h"
				20	#include "gtest/gtest.h"
				21
				22	namespace libtextclassifier {
				23	namespace {
				24
				25	using testing::ElementsAreArray;
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame^]	26	using testing::FloatEq;
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	27
				28	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
				29	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				30	Token("fěěbař@google.com", 6, 23, false),
				31	Token("heře!", 24, 29, false)};
				32
				33	internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
				34
				35	// clang-format off
				36	EXPECT_THAT(tokens, ElementsAreArray(
				37	{Token("Hělló", 0, 5, false),
				38	Token("fěě", 6, 9, false),
				39	Token("bař", 9, 12, false),
				40	Token("@google.com", 12, 23, false),
				41	Token("heře!", 24, 29, false)}));
				42	// clang-format on
				43	}
				44
				45	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
				46	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				47	Token("fěěbař@google.com", 6, 23, false),
				48	Token("heře!", 24, 29, false)};
				49
				50	internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
				51
				52	// clang-format off
				53	EXPECT_THAT(tokens, ElementsAreArray(
				54	{Token("Hělló", 0, 5, false),
				55	Token("fěěbař", 6, 12, false),
				56	Token("@google.com", 12, 23, false),
				57	Token("heře!", 24, 29, false)}));
				58	// clang-format on
				59	}
				60
				61	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
				62	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				63	Token("fěěbař@google.com", 6, 23, false),
				64	Token("heře!", 24, 29, false)};
				65
				66	internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
				67
				68	// clang-format off
				69	EXPECT_THAT(tokens, ElementsAreArray(
				70	{Token("Hělló", 0, 5, false),
				71	Token("fěě", 6, 9, false),
				72	Token("bař@google.com", 9, 23, false),
				73	Token("heře!", 24, 29, false)}));
				74	// clang-format on
				75	}
				76
				77	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
				78	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				79	Token("fěěbař@google.com", 6, 23, false),
				80	Token("heře!", 24, 29, false)};
				81
				82	internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
				83
				84	// clang-format off
				85	EXPECT_THAT(tokens, ElementsAreArray(
				86	{Token("Hělló", 0, 5, false),
				87	Token("fěěbař@google.com", 6, 23, false),
				88	Token("heře!", 24, 29, false)}));
				89	// clang-format on
				90	}
				91
				92	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
				93	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				94	Token("fěěbař@google.com", 6, 23, false),
				95	Token("heře!", 24, 29, false)};
				96
				97	internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
				98
				99	// clang-format off
				100	EXPECT_THAT(tokens, ElementsAreArray(
				101	{Token("Hě", 0, 2, false),
				102	Token("lló", 2, 5, false),
				103	Token("fěě", 6, 9, false),
				104	Token("bař@google.com", 9, 23, false),
				105	Token("heře!", 24, 29, false)}));
				106	// clang-format on
				107	}
				108
				109	TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	110	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				111	const CodepointSpan span = {0, 5};
				112	// clang-format off
				113	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				114	Token("Lině", 6, 10),
				115	Token("Sěcond", 11, 17),
				116	Token("Lině", 18, 22),
				117	Token("Thiřd", 23, 28),
				118	Token("Lině", 29, 33)};
				119	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	120
				121	// Keeps the first line.
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	122	internal::StripTokensFromOtherLines(context, span, &tokens);
				123	EXPECT_THAT(tokens,
				124	ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	125	}
				126
				127	TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	128	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				129	const CodepointSpan span = {18, 22};
				130	// clang-format off
				131	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				132	Token("Lině", 6, 10),
				133	Token("Sěcond", 11, 17),
				134	Token("Lině", 18, 22),
				135	Token("Thiřd", 23, 28),
				136	Token("Lině", 29, 33)};
				137	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	138
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	139	// Keeps the second line.
				140	internal::StripTokensFromOtherLines(context, span, &tokens);
				141	EXPECT_THAT(tokens, ElementsAreArray(
				142	{Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	143	}
				144
				145	TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	146	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				147	const CodepointSpan span = {24, 33};
				148	// clang-format off
				149	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				150	Token("Lině", 6, 10),
				151	Token("Sěcond", 11, 17),
				152	Token("Lině", 18, 22),
				153	Token("Thiřd", 23, 28),
				154	Token("Lině", 29, 33)};
				155	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	156
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	157	// Keeps the third line.
				158	internal::StripTokensFromOtherLines(context, span, &tokens);
				159	EXPECT_THAT(tokens, ElementsAreArray(
				160	{Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	161	}
				162
				163	TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	164	const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
				165	const CodepointSpan span = {18, 22};
				166	// clang-format off
				167	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				168	Token("Lině", 6, 10),
				169	Token("Sěcond", 11, 17),
				170	Token("Lině", 18, 22),
				171	Token("Thiřd", 23, 28),
				172	Token("Lině", 29, 33)};
				173	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	174
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	175	// Keeps the second line; the pipe separates it from the first.
				176	internal::StripTokensFromOtherLines(context, span, &tokens);
				177	EXPECT_THAT(tokens, ElementsAreArray(
				178	{Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	179	}
				180
				181	TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	182	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				183	const CodepointSpan span = {5, 23};
				184	// clang-format off
				185	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				186	Token("Lině", 6, 10),
				187	Token("Sěcond", 18, 23),
				188	Token("Lině", 19, 23),
				189	Token("Thiřd", 23, 28),
				190	Token("Lině", 29, 33)};
				191	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	192
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	193	// Keeps all tokens because the click crosses line boundaries.
				194	internal::StripTokensFromOtherLines(context, span, &tokens);
				195	EXPECT_THAT(tokens, ElementsAreArray(
				196	{Token("Fiřst", 0, 5), Token("Lině", 6, 10),
				197	Token("Sěcond", 18, 23), Token("Lině", 19, 23),
				198	Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	199	}
				200
Matt Sharifi	0d68ef9	2017-03-27 14:20:21 +0200	[diff] [blame]	201	class TestingFeatureProcessor : public FeatureProcessor {
				202	public:
				203	using FeatureProcessor::FeatureProcessor;
				204	using FeatureProcessor::SpanToLabel;
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame^]	205	using FeatureProcessor::SupportedCodepointsRatio;
				206	using FeatureProcessor::IsCodepointSupported;
Matt Sharifi	0d68ef9	2017-03-27 14:20:21 +0200	[diff] [blame]	207	};
				208
				209	TEST(FeatureProcessorTest, SpanToLabel) {
				210	FeatureProcessorOptions options;
				211	options.set_context_size(1);
				212	options.set_max_selection_span(1);
				213	options.set_tokenize_on_space(true);
				214	options.set_snap_label_span_boundaries_to_containing_tokens(false);
				215
				216	TokenizationCodepointRange* config =
				217	options.add_tokenization_codepoint_config();
				218	config->set_start(32);
				219	config->set_end(33);
				220	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				221
				222	TestingFeatureProcessor feature_processor(options);
				223	std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
				224	ASSERT_EQ(3, tokens.size());
				225	int label;
				226	ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
				227	EXPECT_EQ(kInvalidLabel, label);
				228	ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
				229	EXPECT_NE(kInvalidLabel, label);
				230	TokenSpan token_span;
				231	feature_processor.LabelToTokenSpan(label, &token_span);
				232	EXPECT_EQ(0, token_span.first);
				233	EXPECT_EQ(0, token_span.second);
				234
				235	// Reconfigure with snapping enabled.
				236	options.set_snap_label_span_boundaries_to_containing_tokens(true);
				237	TestingFeatureProcessor feature_processor2(options);
				238	int label2;
				239	ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
				240	EXPECT_EQ(label, label2);
				241	ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
				242	EXPECT_EQ(label, label2);
				243	ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
				244	EXPECT_EQ(label, label2);
				245
				246	// Cross a token boundary.
				247	ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
				248	EXPECT_EQ(kInvalidLabel, label2);
				249	ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
				250	EXPECT_EQ(kInvalidLabel, label2);
				251
				252	// Multiple tokens.
				253	options.set_context_size(2);
				254	options.set_max_selection_span(2);
				255	TestingFeatureProcessor feature_processor3(options);
				256	tokens = feature_processor3.Tokenize("zero, one, two, three, four");
				257	ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
				258	EXPECT_NE(kInvalidLabel, label2);
				259	feature_processor3.LabelToTokenSpan(label2, &token_span);
				260	EXPECT_EQ(1, token_span.first);
				261	EXPECT_EQ(0, token_span.second);
				262
				263	int label3;
				264	ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
				265	EXPECT_EQ(label2, label3);
				266	ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
				267	EXPECT_EQ(label2, label3);
				268	ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
				269	EXPECT_EQ(label2, label3);
				270	}
				271
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	272	TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
				273	FeatureProcessorOptions options;
				274	options.set_num_buckets(10);
				275	options.set_context_size(7);
				276	options.set_max_selection_span(7);
				277	options.add_chargram_orders(1);
				278	options.set_tokenize_on_space(true);
				279	options.set_context_dropout_probability(0.5);
				280	options.set_use_variable_context_dropout(true);
				281	TokenizationCodepointRange* config =
				282	options.add_tokenization_codepoint_config();
				283	config->set_start(32);
				284	config->set_end(33);
				285	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				286	FeatureProcessor feature_processor(options);
				287
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	288	// Test that two subsequent runs with random context dropout produce
				289	// different features.
				290	feature_processor.SetRandom(new std::mt19937);
				291
				292	std::vector<std::vector<std::pair<int, float>>> features;
				293	std::vector<std::vector<std::pair<int, float>>> features2;
				294	std::vector<float> extra_features;
				295	std::vector<CodepointSpan> selection_label_spans;
				296	int selection_label;
				297	CodepointSpan selection_codepoint_label;
				298	int classification_label;
				299	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	300	"1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
				301	&features, &extra_features, &selection_label_spans, &selection_label,
				302	&selection_codepoint_label, &classification_label));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	303	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	304	"1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
				305	&features2, &extra_features, &selection_label_spans, &selection_label,
				306	&selection_codepoint_label, &classification_label));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	307
				308	EXPECT_NE(features, features2);
				309	}
				310
				311	TEST(FeatureProcessorTest, GetFeaturesWithLongerContext) {
				312	FeatureProcessorOptions options;
				313	options.set_num_buckets(10);
				314	options.set_context_size(9);
				315	options.set_max_selection_span(7);
				316	options.add_chargram_orders(1);
				317	options.set_tokenize_on_space(true);
				318	TokenizationCodepointRange* config =
				319	options.add_tokenization_codepoint_config();
				320	config->set_start(32);
				321	config->set_end(33);
				322	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				323	FeatureProcessor feature_processor(options);
				324
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	325	std::vector<std::vector<std::pair<int, float>>> features;
				326	std::vector<float> extra_features;
				327	std::vector<CodepointSpan> selection_label_spans;
				328	int selection_label;
				329	CodepointSpan selection_codepoint_label;
				330	int classification_label;
				331	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	332	"1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
				333	&features, &extra_features, &selection_label_spans, &selection_label,
				334	&selection_codepoint_label, &classification_label));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	335	EXPECT_EQ(19, features.size());
				336
				337	// Should pad the string.
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	338	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	339	"X", {0, 1}, {0, 1}, "", &features, &extra_features,
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	340	&selection_label_spans, &selection_label, &selection_codepoint_label,
				341	&classification_label));
				342	EXPECT_EQ(19, features.size());
				343	}
				344
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	345	TEST(FeatureProcessorTest, CenterTokenFromClick) {
				346	int token_index;
				347
				348	// Exactly aligned indices.
				349	token_index = internal::CenterTokenFromClick(
				350	{6, 11}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
				351	Token("heře!", 12, 17, false)});
				352	EXPECT_EQ(token_index, 1);
				353
				354	// Click is contained in a token.
				355	token_index = internal::CenterTokenFromClick(
				356	{13, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
				357	Token("heře!", 12, 17, false)});
				358	EXPECT_EQ(token_index, 2);
				359
				360	// Click spans two tokens.
				361	token_index = internal::CenterTokenFromClick(
				362	{6, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
				363	Token("heře!", 12, 17, false)});
				364	EXPECT_EQ(token_index, kInvalidIndex);
				365	}
				366
				367	TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	368	int token_index;
				369
				370	// Selection of length 3. Exactly aligned indices.
				371	token_index = internal::CenterTokenFromMiddleOfSelection(
				372	{7, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				373	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				374	Token("Token5", 28, 34, false)});
				375	EXPECT_EQ(token_index, 2);
				376
				377	// Selection of length 1 token. Exactly aligned indices.
				378	token_index = internal::CenterTokenFromMiddleOfSelection(
				379	{21, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				380	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				381	Token("Token5", 28, 34, false)});
				382	EXPECT_EQ(token_index, 3);
				383
				384	// Selection marks sub-token range, with no tokens in it.
				385	token_index = internal::CenterTokenFromMiddleOfSelection(
				386	{29, 33}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				387	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				388	Token("Token5", 28, 34, false)});
				389	EXPECT_EQ(token_index, kInvalidIndex);
				390
				391	// Selection of length 2. Sub-token indices.
				392	token_index = internal::CenterTokenFromMiddleOfSelection(
				393	{3, 25}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				394	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				395	Token("Token5", 28, 34, false)});
				396	EXPECT_EQ(token_index, 1);
				397
				398	// Selection of length 1. Sub-token indices.
				399	token_index = internal::CenterTokenFromMiddleOfSelection(
				400	{22, 34}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				401	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				402	Token("Token5", 28, 34, false)});
				403	EXPECT_EQ(token_index, 4);
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame]	404
				405	// Some invalid ones.
				406	token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
				407	EXPECT_EQ(token_index, -1);
				408	}
				409
				410	TEST(FeatureProcessorTest, GetFeaturesForSharing) {
				411	FeatureProcessorOptions options;
				412	options.set_num_buckets(10);
				413	options.set_context_size(9);
				414	options.set_max_selection_span(7);
				415	options.add_chargram_orders(1);
				416	options.set_tokenize_on_space(true);
				417	options.set_center_token_selection_method(
				418	FeatureProcessorOptions::CENTER_TOKEN_MIDDLE_OF_SELECTION);
				419	options.set_only_use_line_with_click(true);
				420	options.set_split_tokens_on_selection_boundaries(true);
				421	options.set_extract_selection_mask_feature(true);
				422	TokenizationCodepointRange* config =
				423	options.add_tokenization_codepoint_config();
				424	config->set_start(32);
				425	config->set_end(33);
				426	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				427	config = options.add_tokenization_codepoint_config();
				428	config->set_start(10);
				429	config->set_end(11);
				430	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				431	FeatureProcessor feature_processor(options);
				432
				433	std::vector<std::vector<std::pair<int, float>>> features;
				434	std::vector<float> extra_features;
				435	std::vector<CodepointSpan> selection_label_spans;
				436	int selection_label;
				437	CodepointSpan selection_codepoint_label;
				438	int classification_label;
				439	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
				440	"line 1\nline2\nsome entity\n line 4", {13, 24}, {13, 24}, "", &features,
				441	&extra_features, &selection_label_spans, &selection_label,
				442	&selection_codepoint_label, &classification_label));
				443	EXPECT_EQ(19, features.size());
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	444	}
				445
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame^]	446	TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
				447	FeatureProcessorOptions options;
				448	options.set_context_size(2);
				449	options.set_max_selection_span(2);
				450	options.set_snap_label_span_boundaries_to_containing_tokens(false);
				451
				452	TokenizationCodepointRange* config =
				453	options.add_tokenization_codepoint_config();
				454	config->set_start(32);
				455	config->set_end(33);
				456	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				457
				458	FeatureProcessorOptions::CodepointRange* range;
				459	range = options.add_supported_codepoint_ranges();
				460	range->set_start(0);
				461	range->set_end(128);
				462
				463	range = options.add_supported_codepoint_ranges();
				464	range->set_start(10000);
				465	range->set_end(10001);
				466
				467	range = options.add_supported_codepoint_ranges();
				468	range->set_start(20000);
				469	range->set_end(30000);
				470
				471	TestingFeatureProcessor feature_processor(options);
				472	EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
				473	1, feature_processor.Tokenize("aaa bbb ccc")),
				474	FloatEq(1.0));
				475	EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
				476	1, feature_processor.Tokenize("aaa bbb ěěě")),
				477	FloatEq(2.0 / 3));
				478	EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
				479	1, feature_processor.Tokenize("ěěě řřř ěěě")),
				480	FloatEq(0.0));
				481	EXPECT_FALSE(feature_processor.IsCodepointSupported(-1));
				482	EXPECT_TRUE(feature_processor.IsCodepointSupported(0));
				483	EXPECT_TRUE(feature_processor.IsCodepointSupported(10));
				484	EXPECT_TRUE(feature_processor.IsCodepointSupported(127));
				485	EXPECT_FALSE(feature_processor.IsCodepointSupported(128));
				486	EXPECT_FALSE(feature_processor.IsCodepointSupported(9999));
				487	EXPECT_TRUE(feature_processor.IsCodepointSupported(10000));
				488	EXPECT_FALSE(feature_processor.IsCodepointSupported(10001));
				489	EXPECT_TRUE(feature_processor.IsCodepointSupported(25000));
				490
				491	std::vector<nlp_core::FeatureVector> features;
				492	std::vector<float> extra_features;
				493
				494	options.set_min_supported_codepoint_ratio(0.0);
				495	feature_processor = TestingFeatureProcessor(options);
				496	EXPECT_TRUE(feature_processor.GetFeatures("ěěě řřř eee", {4, 7}, &features,
				497	&extra_features,
				498	/*selection_label_spans=*/nullptr));
				499
				500	options.set_min_supported_codepoint_ratio(0.2);
				501	feature_processor = TestingFeatureProcessor(options);
				502	EXPECT_TRUE(feature_processor.GetFeatures("ěěě řřř eee", {4, 7}, &features,
				503	&extra_features,
				504	/*selection_label_spans=*/nullptr));
				505
				506	options.set_min_supported_codepoint_ratio(0.5);
				507	feature_processor = TestingFeatureProcessor(options);
				508	EXPECT_FALSE(feature_processor.GetFeatures(
				509	"ěěě řřř eee", {4, 7}, &features, &extra_features,
				510	/*selection_label_spans=*/nullptr));
				511	}
				512
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	513	} // namespace
				514	} // namespace libtextclassifier