Blame - tests/feature-processor_test.cc - platform/external/libtextclassifier

blob: 27cac6ac916555607003dc2e54755a13726cf155 [file] [log] [blame]

Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include "smartselect/feature-processor.h"
				18
				19	#include "gmock/gmock.h"
				20	#include "gtest/gtest.h"
				21
				22	namespace libtextclassifier {
				23	namespace {
				24
				25	using testing::ElementsAreArray;
				26
				27	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
				28	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				29	Token("fěěbař@google.com", 6, 23, false),
				30	Token("heře!", 24, 29, false)};
				31
				32	internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
				33
				34	// clang-format off
				35	EXPECT_THAT(tokens, ElementsAreArray(
				36	{Token("Hělló", 0, 5, false),
				37	Token("fěě", 6, 9, false),
				38	Token("bař", 9, 12, false),
				39	Token("@google.com", 12, 23, false),
				40	Token("heře!", 24, 29, false)}));
				41	// clang-format on
				42	}
				43
				44	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
				45	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				46	Token("fěěbař@google.com", 6, 23, false),
				47	Token("heře!", 24, 29, false)};
				48
				49	internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
				50
				51	// clang-format off
				52	EXPECT_THAT(tokens, ElementsAreArray(
				53	{Token("Hělló", 0, 5, false),
				54	Token("fěěbař", 6, 12, false),
				55	Token("@google.com", 12, 23, false),
				56	Token("heře!", 24, 29, false)}));
				57	// clang-format on
				58	}
				59
				60	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
				61	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				62	Token("fěěbař@google.com", 6, 23, false),
				63	Token("heře!", 24, 29, false)};
				64
				65	internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
				66
				67	// clang-format off
				68	EXPECT_THAT(tokens, ElementsAreArray(
				69	{Token("Hělló", 0, 5, false),
				70	Token("fěě", 6, 9, false),
				71	Token("bař@google.com", 9, 23, false),
				72	Token("heře!", 24, 29, false)}));
				73	// clang-format on
				74	}
				75
				76	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
				77	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				78	Token("fěěbař@google.com", 6, 23, false),
				79	Token("heře!", 24, 29, false)};
				80
				81	internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
				82
				83	// clang-format off
				84	EXPECT_THAT(tokens, ElementsAreArray(
				85	{Token("Hělló", 0, 5, false),
				86	Token("fěěbař@google.com", 6, 23, false),
				87	Token("heře!", 24, 29, false)}));
				88	// clang-format on
				89	}
				90
				91	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
				92	std::vector<Token> tokens{Token("Hělló", 0, 5, false),
				93	Token("fěěbař@google.com", 6, 23, false),
				94	Token("heře!", 24, 29, false)};
				95
				96	internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
				97
				98	// clang-format off
				99	EXPECT_THAT(tokens, ElementsAreArray(
				100	{Token("Hě", 0, 2, false),
				101	Token("lló", 2, 5, false),
				102	Token("fěě", 6, 9, false),
				103	Token("bař@google.com", 9, 23, false),
				104	Token("heře!", 24, 29, false)}));
				105	// clang-format on
				106	}
				107
				108	TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	109	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				110	const CodepointSpan span = {0, 5};
				111	// clang-format off
				112	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				113	Token("Lině", 6, 10),
				114	Token("Sěcond", 11, 17),
				115	Token("Lině", 18, 22),
				116	Token("Thiřd", 23, 28),
				117	Token("Lině", 29, 33)};
				118	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	119
				120	// Keeps the first line.
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	121	internal::StripTokensFromOtherLines(context, span, &tokens);
				122	EXPECT_THAT(tokens,
				123	ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	124	}
				125
				126	TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	127	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				128	const CodepointSpan span = {18, 22};
				129	// clang-format off
				130	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				131	Token("Lině", 6, 10),
				132	Token("Sěcond", 11, 17),
				133	Token("Lině", 18, 22),
				134	Token("Thiřd", 23, 28),
				135	Token("Lině", 29, 33)};
				136	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	137
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	138	// Keeps the first line.
				139	internal::StripTokensFromOtherLines(context, span, &tokens);
				140	EXPECT_THAT(tokens, ElementsAreArray(
				141	{Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	142	}
				143
				144	TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	145	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				146	const CodepointSpan span = {24, 33};
				147	// clang-format off
				148	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				149	Token("Lině", 6, 10),
				150	Token("Sěcond", 11, 17),
				151	Token("Lině", 18, 22),
				152	Token("Thiřd", 23, 28),
				153	Token("Lině", 29, 33)};
				154	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	155
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	156	// Keeps the first line.
				157	internal::StripTokensFromOtherLines(context, span, &tokens);
				158	EXPECT_THAT(tokens, ElementsAreArray(
				159	{Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	160	}
				161
				162	TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	163	const std::string context = "Fiřst Lině\|Sěcond Lině\nThiřd Lině";
				164	const CodepointSpan span = {18, 22};
				165	// clang-format off
				166	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				167	Token("Lině", 6, 10),
				168	Token("Sěcond", 11, 17),
				169	Token("Lině", 18, 22),
				170	Token("Thiřd", 23, 28),
				171	Token("Lině", 29, 33)};
				172	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	173
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	174	// Keeps the first line.
				175	internal::StripTokensFromOtherLines(context, span, &tokens);
				176	EXPECT_THAT(tokens, ElementsAreArray(
				177	{Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	178	}
				179
				180	TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	181	const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
				182	const CodepointSpan span = {5, 23};
				183	// clang-format off
				184	std::vector<Token> tokens = {Token("Fiřst", 0, 5),
				185	Token("Lině", 6, 10),
				186	Token("Sěcond", 18, 23),
				187	Token("Lině", 19, 23),
				188	Token("Thiřd", 23, 28),
				189	Token("Lině", 29, 33)};
				190	// clang-format on
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	191
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	192	// Keeps the first line.
				193	internal::StripTokensFromOtherLines(context, span, &tokens);
				194	EXPECT_THAT(tokens, ElementsAreArray(
				195	{Token("Fiřst", 0, 5), Token("Lině", 6, 10),
				196	Token("Sěcond", 18, 23), Token("Lině", 19, 23),
				197	Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	198	}
				199
				200	TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
				201	FeatureProcessorOptions options;
				202	options.set_num_buckets(10);
				203	options.set_context_size(7);
				204	options.set_max_selection_span(7);
				205	options.add_chargram_orders(1);
				206	options.set_tokenize_on_space(true);
				207	options.set_context_dropout_probability(0.5);
				208	options.set_use_variable_context_dropout(true);
				209	TokenizationCodepointRange* config =
				210	options.add_tokenization_codepoint_config();
				211	config->set_start(32);
				212	config->set_end(33);
				213	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				214	FeatureProcessor feature_processor(options);
				215
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	216	// Test that two subsequent runs with random context dropout produce
				217	// different features.
				218	feature_processor.SetRandom(new std::mt19937);
				219
				220	std::vector<std::vector<std::pair<int, float>>> features;
				221	std::vector<std::vector<std::pair<int, float>>> features2;
				222	std::vector<float> extra_features;
				223	std::vector<CodepointSpan> selection_label_spans;
				224	int selection_label;
				225	CodepointSpan selection_codepoint_label;
				226	int classification_label;
				227	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	228	"1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
				229	&features, &extra_features, &selection_label_spans, &selection_label,
				230	&selection_codepoint_label, &classification_label));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	231	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	232	"1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
				233	&features2, &extra_features, &selection_label_spans, &selection_label,
				234	&selection_codepoint_label, &classification_label));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	235
				236	EXPECT_NE(features, features2);
				237	}
				238
				239	TEST(FeatureProcessorTest, GetFeaturesWithLongerContext) {
				240	FeatureProcessorOptions options;
				241	options.set_num_buckets(10);
				242	options.set_context_size(9);
				243	options.set_max_selection_span(7);
				244	options.add_chargram_orders(1);
				245	options.set_tokenize_on_space(true);
				246	TokenizationCodepointRange* config =
				247	options.add_tokenization_codepoint_config();
				248	config->set_start(32);
				249	config->set_end(33);
				250	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				251	FeatureProcessor feature_processor(options);
				252
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	253	std::vector<std::vector<std::pair<int, float>>> features;
				254	std::vector<float> extra_features;
				255	std::vector<CodepointSpan> selection_label_spans;
				256	int selection_label;
				257	CodepointSpan selection_codepoint_label;
				258	int classification_label;
				259	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	260	"1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
				261	&features, &extra_features, &selection_label_spans, &selection_label,
				262	&selection_codepoint_label, &classification_label));
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	263	EXPECT_EQ(19, features.size());
				264
				265	// Should pad the string.
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	266	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	267	"X", {0, 1}, {0, 1}, "", &features, &extra_features,
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	268	&selection_label_spans, &selection_label, &selection_codepoint_label,
				269	&classification_label));
				270	EXPECT_EQ(19, features.size());
				271	}
				272
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	273	TEST(FeatureProcessorTest, FindTokensInSelectionSingleCharacter) {
				274	FeatureProcessorOptions options;
				275	options.set_num_buckets(10);
				276	options.set_context_size(9);
				277	options.set_max_selection_span(7);
				278	options.add_chargram_orders(1);
				279	options.set_tokenize_on_space(true);
				280	TokenizationCodepointRange* config =
				281	options.add_tokenization_codepoint_config();
				282	config->set_start(32);
				283	config->set_end(33);
				284	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	285	FeatureProcessor feature_processor(options);
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	286
				287	SelectionWithContext selection_with_context;
				288	selection_with_context.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
				289
				290	// Selection and click indices of the X in the middle:
				291	selection_with_context.selection_start = 20;
				292	selection_with_context.selection_end = 21;
				293	// clang-format off
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	294	EXPECT_THAT(internal::FindTokensInSelection(
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	295	feature_processor.Tokenize(selection_with_context.context),
				296	selection_with_context),
				297	ElementsAreArray({Token("X", 20, 21, false)}));
				298	// clang-format on
				299	}
				300
				301	TEST(FeatureProcessorTest, FindTokensInSelectionInsideTokenBoundary) {
				302	FeatureProcessorOptions options;
				303	options.set_num_buckets(10);
				304	options.set_context_size(9);
				305	options.set_max_selection_span(7);
				306	options.add_chargram_orders(1);
				307	options.set_tokenize_on_space(true);
				308	TokenizationCodepointRange* config =
				309	options.add_tokenization_codepoint_config();
				310	config->set_start(32);
				311	config->set_end(33);
				312	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	313	FeatureProcessor feature_processor(options);
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	314
				315	SelectionWithContext selection_with_context;
				316	selection_with_context.context = "I live at 350 Third Street, today.";
				317
				318	const std::vector<Token> expected_selection = {
				319	// clang-format off
				320	Token("350", 10, 13, false),
				321	Token("Third", 14, 19, false),
				322	Token("Street,", 20, 27, false),
				323	// clang-format on
				324	};
				325
				326	// Selection: I live at {350 Third Str}eet, today.
				327	selection_with_context.selection_start = 10;
				328	selection_with_context.selection_end = 23;
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	329	EXPECT_THAT(internal::FindTokensInSelection(
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	330	feature_processor.Tokenize(selection_with_context.context),
				331	selection_with_context),
				332	ElementsAreArray(expected_selection));
				333
				334	// Selection: I live at {350 Third Street,} today.
				335	selection_with_context.selection_start = 10;
				336	selection_with_context.selection_end = 27;
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	337	EXPECT_THAT(internal::FindTokensInSelection(
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	338	feature_processor.Tokenize(selection_with_context.context),
				339	selection_with_context),
				340	ElementsAreArray(expected_selection));
				341
				342	// Selection: I live at {350 Third Street, }today.
				343	selection_with_context.selection_start = 10;
				344	selection_with_context.selection_end = 28;
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	345	EXPECT_THAT(internal::FindTokensInSelection(
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	346	feature_processor.Tokenize(selection_with_context.context),
				347	selection_with_context),
				348	ElementsAreArray(expected_selection));
				349
				350	// Selection: I live at {350 Third S}treet, today.
				351	selection_with_context.selection_start = 10;
				352	selection_with_context.selection_end = 21;
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	353	EXPECT_THAT(internal::FindTokensInSelection(
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	354	feature_processor.Tokenize(selection_with_context.context),
				355	selection_with_context),
				356	ElementsAreArray(expected_selection));
				357
				358	// Test that when crossing the boundary, we select less/more.
				359
				360	// Selection: I live at {350 Third} Street, today.
				361	selection_with_context.selection_start = 10;
				362	selection_with_context.selection_end = 19;
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	363	EXPECT_THAT(internal::FindTokensInSelection(
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	364	feature_processor.Tokenize(selection_with_context.context),
				365	selection_with_context),
				366	ElementsAreArray({
				367	// clang-format off
				368	Token("350", 10, 13, false),
				369	Token("Third", 14, 19, false),
				370	// clang-format on
				371	}));
				372
				373	// Selection: I live at {350 Third Street, t}oday.
				374	selection_with_context.selection_start = 10;
				375	selection_with_context.selection_end = 29;
				376	EXPECT_THAT(
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	377	internal::FindTokensInSelection(
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	378	feature_processor.Tokenize(selection_with_context.context),
				379	selection_with_context),
				380	ElementsAreArray({
				381	// clang-format off
				382	Token("350", 10, 13, false),
				383	Token("Third", 14, 19, false),
				384	Token("Street,", 20, 27, false),
				385	Token("today.", 28, 34, false),
				386	// clang-format on
				387	}));
				388	}
				389
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	390	TEST(FeatureProcessorTest, CenterTokenFromClick) {
				391	int token_index;
				392
				393	// Exactly aligned indices.
				394	token_index = internal::CenterTokenFromClick(
				395	{6, 11}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
				396	Token("heře!", 12, 17, false)});
				397	EXPECT_EQ(token_index, 1);
				398
				399	// Click is contained in a token.
				400	token_index = internal::CenterTokenFromClick(
				401	{13, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
				402	Token("heře!", 12, 17, false)});
				403	EXPECT_EQ(token_index, 2);
				404
				405	// Click spans two tokens.
				406	token_index = internal::CenterTokenFromClick(
				407	{6, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
				408	Token("heře!", 12, 17, false)});
				409	EXPECT_EQ(token_index, kInvalidIndex);
				410	}
				411
				412	TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
				413	SelectionWithContext selection;
				414	int token_index;
				415
				416	// Selection of length 3. Exactly aligned indices.
				417	token_index = internal::CenterTokenFromMiddleOfSelection(
				418	{7, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				419	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				420	Token("Token5", 28, 34, false)});
				421	EXPECT_EQ(token_index, 2);
				422
				423	// Selection of length 1 token. Exactly aligned indices.
				424	token_index = internal::CenterTokenFromMiddleOfSelection(
				425	{21, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				426	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				427	Token("Token5", 28, 34, false)});
				428	EXPECT_EQ(token_index, 3);
				429
				430	// Selection marks sub-token range, with no tokens in it.
				431	token_index = internal::CenterTokenFromMiddleOfSelection(
				432	{29, 33}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				433	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				434	Token("Token5", 28, 34, false)});
				435	EXPECT_EQ(token_index, kInvalidIndex);
				436
				437	// Selection of length 2. Sub-token indices.
				438	token_index = internal::CenterTokenFromMiddleOfSelection(
				439	{3, 25}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				440	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				441	Token("Token5", 28, 34, false)});
				442	EXPECT_EQ(token_index, 1);
				443
				444	// Selection of length 1. Sub-token indices.
				445	token_index = internal::CenterTokenFromMiddleOfSelection(
				446	{22, 34}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
				447	Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
				448	Token("Token5", 28, 34, false)});
				449	EXPECT_EQ(token_index, 4);
Alex Salcianu	9087f1f	2017-03-22 21:22:39 -0400	[diff] [blame^]	450
				451	// Some invalid ones.
				452	token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
				453	EXPECT_EQ(token_index, -1);
				454	}
				455
				456	TEST(FeatureProcessorTest, GetFeaturesForSharing) {
				457	FeatureProcessorOptions options;
				458	options.set_num_buckets(10);
				459	options.set_context_size(9);
				460	options.set_max_selection_span(7);
				461	options.add_chargram_orders(1);
				462	options.set_tokenize_on_space(true);
				463	options.set_center_token_selection_method(
				464	FeatureProcessorOptions::CENTER_TOKEN_MIDDLE_OF_SELECTION);
				465	options.set_only_use_line_with_click(true);
				466	options.set_split_tokens_on_selection_boundaries(true);
				467	options.set_extract_selection_mask_feature(true);
				468	TokenizationCodepointRange* config =
				469	options.add_tokenization_codepoint_config();
				470	config->set_start(32);
				471	config->set_end(33);
				472	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				473	config = options.add_tokenization_codepoint_config();
				474	config->set_start(10);
				475	config->set_end(11);
				476	config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
				477	FeatureProcessor feature_processor(options);
				478
				479	std::vector<std::vector<std::pair<int, float>>> features;
				480	std::vector<float> extra_features;
				481	std::vector<CodepointSpan> selection_label_spans;
				482	int selection_label;
				483	CodepointSpan selection_codepoint_label;
				484	int classification_label;
				485	EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
				486	"line 1\nline2\nsome entity\n line 4", {13, 24}, {13, 24}, "", &features,
				487	&extra_features, &selection_label_spans, &selection_label,
				488	&selection_codepoint_label, &classification_label));
				489	EXPECT_EQ(19, features.size());
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	490	}
				491
Matt Sharifi	d40f976	2017-03-14 21:24:23 +0100	[diff] [blame]	492	} // namespace
				493	} // namespace libtextclassifier