Blame - feature-processor_test.cc - platform/external/libtextclassifier

2017-03-14 21:24:23 +0100

[diff] [blame]

/*
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

16

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

17

#include "feature-processor.h"

18

19

#include "model-executor.h"

20

#include "tensor-view.h"

Matt Sharifi

2017-03-14 21:24:23 +0100

[diff] [blame]

21

22

#include "gmock/gmock.h"

23

#include "gtest/gtest.h"

24

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

25

namespace libtextclassifier2 {

Matt Sharifi

2017-03-14 21:24:23 +0100

[diff] [blame]

26

namespace {

27

28

using testing::ElementsAreArray;

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

29

using testing::FloatEq;

Matt Sharifi

2017-03-14 21:24:23 +0100

[diff] [blame]

30

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

31

// Serializes the object-API `options` into a finished flatbuffer and returns
// the released buffer to the caller.
flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
    const FeatureProcessorOptionsT& options) {
  flatbuffers::FlatBufferBuilder fbb;
  const auto root_offset = CreateFeatureProcessorOptions(fbb, &options);
  fbb.Finish(root_offset);
  return fbb.Release();
}

37

Lukas Zilka

2017-12-13 16:37:03 +0100

[diff] [blame]

38

class TestingFeatureProcessor : public FeatureProcessor {

39

public:

40

using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;

41

using FeatureProcessor::FeatureProcessor;

42

using FeatureProcessor::ICUTokenize;

43

using FeatureProcessor::IsCodepointInRanges;

44

using FeatureProcessor::SpanToLabel;

45

using FeatureProcessor::StripTokensFromOtherLines;

46

using FeatureProcessor::supported_codepoint_ranges_;

47

using FeatureProcessor::SupportedCodepointsRatio;

48

};

49

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

50

// EmbeddingExecutor that always returns features based on

51

class FakeEmbeddingExecutor : public EmbeddingExecutor {

52

public:

53

bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,

54

int dest_size) override {

55

TC_CHECK_GE(dest_size, 4);

56

EXPECT_EQ(sparse_features.size(), 1);

57

dest[0] = sparse_features.data()[0];

58

dest[1] = sparse_features.data()[0];

59

dest[2] = -sparse_features.data()[0];

60

dest[3] = -sparse_features.data()[0];

return true;

}

private:

std::vector<float> storage_;

66

};

67

Matt Sharifi

2017-03-14 21:24:23 +0100

[diff] [blame]

68

// A selection lying strictly inside one token splits that token into three.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);

  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař", 9, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29),
  };
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

// A selection starting at a token's first codepoint only splits at its end.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);

  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař", 6, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29),
  };
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

// A selection ending at a token's last codepoint only splits at its start.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);

  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29),
  };
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

// A selection that covers exactly one token leaves the tokens untouched.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);

  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

// A selection whose two ends fall inside different tokens splits both tokens.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
  std::vector<Token> tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);

  const std::vector<Token> expected_tokens = {
      Token("Hě", 0, 2),
      Token("lló", 2, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29),
  };
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

// With only_use_line_with_click set, a click in the first line keeps only the
// first line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click_span = {0, 5};
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),     Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),  Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),   Token("Lině", 29, 33),
  };

  // Keeps the first line.
  processor.StripTokensFromOtherLines(text, click_span, &tokens);

  const std::vector<Token> kept_tokens = {Token("Fiřst", 0, 5),
                                          Token("Lině", 6, 10)};
  EXPECT_THAT(tokens, ElementsAreArray(kept_tokens));
}

174

175

// With only_use_line_with_click set, a click in the second line keeps only the
// second line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan span = {18, 22};
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),     Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),  Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),   Token("Lině", 29, 33),
  };

  // Keeps the second line.
  feature_processor.StripTokensFromOtherLines(context, span, &tokens);

  const std::vector<Token> expected_tokens = {Token("Sěcond", 11, 17),
                                              Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

200

201

// With only_use_line_with_click set, a click in the third line keeps only the
// third line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan span = {24, 33};
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),     Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),  Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),   Token("Lině", 29, 33),
  };

  // Keeps the third line.
  feature_processor.StripTokensFromOtherLines(context, span, &tokens);

  const std::vector<Token> expected_tokens = {Token("Thiřd", 23, 28),
                                              Token("Lině", 29, 33)};
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

226

227

// Same as the second-line case, but the first and second lines are separated
// by '|' instead of '\n'; the expectation is that '|' also acts as a line
// break.
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan span = {18, 22};
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),     Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),  Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),   Token("Lině", 29, 33),
  };

  // Keeps the second line (the text between the '|' and the '\n').
  feature_processor.StripTokensFromOtherLines(context, span, &tokens);

  const std::vector<Token> expected_tokens = {Token("Sěcond", 11, 17),
                                              Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

252

253

// When the click span reaches across line breaks, no tokens are expected to be
// stripped.
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan span = {5, 23};
  // NOTE(review): the offsets of the two second-line tokens (18-23 / 19-23)
  // differ from the other KeepLineWithClick* tests (11-17 / 18-22). They are
  // kept as in the original; confirm whether this is intentional.
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),     Token("Lině", 6, 10),
      Token("Sěcond", 18, 23),  Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),   Token("Lině", 29, 33),
  };

  // Keeps every line, because the span is not contained in a single line.
  feature_processor.StripTokensFromOtherLines(context, span, &tokens);

  const std::vector<Token> expected_tokens = {
      Token("Fiřst", 0, 5),     Token("Lině", 6, 10),
      Token("Sěcond", 18, 23),  Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),   Token("Lině", 29, 33),
  };
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}

280

Matt Sharifi

2017-03-27 14:20:21 +0200

[diff] [blame]

281

// Exercises SpanToLabel / LabelToTokenSpan in three configurations: without
// boundary snapping, with snapping, and with a selection of several tokens.
TEST(FeatureProcessorTest, SpanToLabel) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.context_size = 1;
  options.max_selection_span = 1;
  options.snap_label_span_boundaries_to_containing_tokens = false;

  // Tokenize on the space character only (codepoint range [32, 33)).
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& whitespace_range = options.tokenization_codepoint_config.back();
  whitespace_range->start = 32;
  whitespace_range->end = 33;
  whitespace_range->role =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // --- Snapping disabled. ---
  flatbuffers::DetachedBuffer exact_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor exact_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(exact_options_fb.data()),
      &unilib);
  std::vector<Token> tokens = exact_processor.Tokenize("one, two, three");
  ASSERT_EQ(3, tokens.size());

  int exact_label;
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 8}, tokens, &exact_label));
  EXPECT_EQ(kInvalidLabel, exact_label);
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 9}, tokens, &exact_label));
  EXPECT_NE(kInvalidLabel, exact_label);

  TokenSpan token_span;
  exact_processor.LabelToTokenSpan(exact_label, &token_span);
  EXPECT_EQ(0, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // --- Reconfigure with snapping enabled. ---
  options.snap_label_span_boundaries_to_containing_tokens = true;
  flatbuffers::DetachedBuffer snapping_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor snapping_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(
          snapping_options_fb.data()),
      &unilib);

  int snapped_label;
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 8}, tokens, &snapped_label));
  EXPECT_EQ(exact_label, snapped_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({6, 9}, tokens, &snapped_label));
  EXPECT_EQ(exact_label, snapped_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 9}, tokens, &snapped_label));
  EXPECT_EQ(exact_label, snapped_label);

  // Cross a token boundary.
  ASSERT_TRUE(snapping_processor.SpanToLabel({4, 9}, tokens, &snapped_label));
  EXPECT_EQ(kInvalidLabel, snapped_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 10}, tokens, &snapped_label));
  EXPECT_EQ(kInvalidLabel, snapped_label);

  // --- Multiple tokens. ---
  options.context_size = 2;
  options.max_selection_span = 2;
  flatbuffers::DetachedBuffer multi_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor multi_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(multi_options_fb.data()),
      &unilib);
  tokens = multi_processor.Tokenize("zero, one, two, three, four");
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 15}, tokens, &snapped_label));
  EXPECT_NE(kInvalidLabel, snapped_label);
  multi_processor.LabelToTokenSpan(snapped_label, &token_span);
  EXPECT_EQ(1, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Slightly shrunk spans must map to the same label as {6, 15}.
  int shrunk_label;
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 14}, tokens, &shrunk_label));
  EXPECT_EQ(snapped_label, shrunk_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 13}, tokens, &shrunk_label));
  EXPECT_EQ(snapped_label, shrunk_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({7, 13}, tokens, &shrunk_label));
  EXPECT_EQ(snapped_label, shrunk_label);
}

355

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

356

// Checks span-to-label conversion on text whose tokens carry trailing commas.
// NOTE(review): the body is currently identical to the SpanToLabel test;
// confirm whether punctuation-specific options were meant to be set here.
TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.context_size = 1;
  options.max_selection_span = 1;
  options.snap_label_span_boundaries_to_containing_tokens = false;

  // Tokenize on the space character only (codepoint range [32, 33)).
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& space_config = options.tokenization_codepoint_config.back();
  space_config->start = 32;
  space_config->end = 33;
  space_config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Phase 1: no snapping of span boundaries.
  flatbuffers::DetachedBuffer no_snap_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor no_snap_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(no_snap_fb.data()),
      &unilib);
  std::vector<Token> tokens = no_snap_processor.Tokenize("one, two, three");
  ASSERT_EQ(3, tokens.size());

  int reference_label;
  ASSERT_TRUE(no_snap_processor.SpanToLabel({5, 8}, tokens, &reference_label));
  EXPECT_EQ(kInvalidLabel, reference_label);
  ASSERT_TRUE(no_snap_processor.SpanToLabel({5, 9}, tokens, &reference_label));
  EXPECT_NE(kInvalidLabel, reference_label);

  TokenSpan token_span;
  no_snap_processor.LabelToTokenSpan(reference_label, &token_span);
  EXPECT_EQ(0, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Phase 2: reconfigure with snapping enabled.
  options.snap_label_span_boundaries_to_containing_tokens = true;
  flatbuffers::DetachedBuffer snap_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor snap_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(snap_fb.data()), &unilib);

  int current_label;
  ASSERT_TRUE(snap_processor.SpanToLabel({5, 8}, tokens, &current_label));
  EXPECT_EQ(reference_label, current_label);
  ASSERT_TRUE(snap_processor.SpanToLabel({6, 9}, tokens, &current_label));
  EXPECT_EQ(reference_label, current_label);
  ASSERT_TRUE(snap_processor.SpanToLabel({5, 9}, tokens, &current_label));
  EXPECT_EQ(reference_label, current_label);

  // Cross a token boundary.
  ASSERT_TRUE(snap_processor.SpanToLabel({4, 9}, tokens, &current_label));
  EXPECT_EQ(kInvalidLabel, current_label);
  ASSERT_TRUE(snap_processor.SpanToLabel({5, 10}, tokens, &current_label));
  EXPECT_EQ(kInvalidLabel, current_label);

  // Phase 3: multiple tokens.
  options.context_size = 2;
  options.max_selection_span = 2;
  flatbuffers::DetachedBuffer wide_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor wide_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(wide_fb.data()), &unilib);
  tokens = wide_processor.Tokenize("zero, one, two, three, four");
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 15}, tokens, &current_label));
  EXPECT_NE(kInvalidLabel, current_label);
  wide_processor.LabelToTokenSpan(current_label, &token_span);
  EXPECT_EQ(1, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Spans that drop trailing/leading characters must yield the same label.
  int trimmed_label;
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 14}, tokens, &trimmed_label));
  EXPECT_EQ(current_label, trimmed_label);
  ASSERT_TRUE(wide_processor.SpanToLabel({6, 13}, tokens, &trimmed_label));
  EXPECT_EQ(current_label, trimmed_label);
  ASSERT_TRUE(wide_processor.SpanToLabel({7, 13}, tokens, &trimmed_label));
  EXPECT_EQ(current_label, trimmed_label);
}

430

Matt Sharifi

2017-03-17 17:02:43 +0100

[diff] [blame]

431

// Verifies which token index internal::CenterTokenFromClick reports for
// clicks that match, fall inside, or span several tokens.
TEST(FeatureProcessorTest, CenterTokenFromClick) {
  // The same three tokens are used for every case below.
  const std::vector<Token> tokens = {
      Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)};

  // Exactly aligned indices.
  const int aligned_index = internal::CenterTokenFromClick({6, 11}, tokens);
  EXPECT_EQ(aligned_index, 1);

  // Click is contained in a token.
  const int contained_index = internal::CenterTokenFromClick({13, 17}, tokens);
  EXPECT_EQ(contained_index, 2);

  // Click spans two tokens.
  const int spanning_index = internal::CenterTokenFromClick({6, 17}, tokens);
  EXPECT_EQ(spanning_index, kInvalidIndex);
}

452

453

// Verifies which token index internal::CenterTokenFromMiddleOfSelection
// reports for selections of various sizes and alignments.
TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  // The same five tokens are used for every non-empty case below.
  const std::vector<Token> tokens = {
      Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
      Token("Token4", 21, 27), Token("Token5", 28, 34)};
  int token_index;

  // Selection of length 3. Exactly aligned indices.
  token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens);
  EXPECT_EQ(token_index, 2);

  // Selection of length 1 token. Exactly aligned indices.
  token_index = internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens);
  EXPECT_EQ(token_index, 3);

  // Selection marks sub-token range, with no tokens in it.
  token_index = internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens);
  EXPECT_EQ(token_index, kInvalidIndex);

  // Selection with sub-token boundaries on both ends.
  token_index = internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens);
  EXPECT_EQ(token_index, 1);

  // Selection starting inside a token and ending on a token boundary.
  token_index = internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens);
  EXPECT_EQ(token_index, 4);

  // Some invalid ones: an empty token list has no center token. Use the named
  // constant, as in the sub-token case above, rather than a literal -1.
  token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
  EXPECT_EQ(token_index, kInvalidIndex);
}

495

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

496

TEST(FeatureProcessorTest, SupportedCodepointsRatio) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

497

FeatureProcessorOptionsT options;

498

options.context_size = 2;

499

options.max_selection_span = 2;

500

options.snap_label_span_boundaries_to_containing_tokens = false;

501

options.feature_version = 2;

502

options.embedding_size = 4;

503

options.bounds_sensitive_features.reset(

504

new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());

505

options.bounds_sensitive_features->enabled = true;

506

options.bounds_sensitive_features->num_tokens_before = 5;

507

options.bounds_sensitive_features->num_tokens_inside_left = 3;

508

options.bounds_sensitive_features->num_tokens_inside_right = 3;

509

options.bounds_sensitive_features->num_tokens_after = 5;

510

options.bounds_sensitive_features->include_inside_bag = true;

511

options.bounds_sensitive_features->include_inside_length = true;

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

512

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

513

options.tokenization_codepoint_config.emplace_back(

514

new TokenizationCodepointRangeT());

515

auto& config = options.tokenization_codepoint_config.back();

516

config->start = 32;

517

config->end = 33;

518

config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

519

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

520

{

521

options.supported_codepoint_ranges.emplace_back(

522

new FeatureProcessorOptions_::CodepointRangeT());

523

auto& range = options.supported_codepoint_ranges.back();

524

range->start = 0;

525

range->end = 128;

526

}

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

527

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

528

{

529

options.supported_codepoint_ranges.emplace_back(

530

new FeatureProcessorOptions_::CodepointRangeT());

531

auto& range = options.supported_codepoint_ranges.back();

532

range->start = 10000;

533

range->end = 10001;

534

}

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

535

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

536

{

537

options.supported_codepoint_ranges.emplace_back(

538

new FeatureProcessorOptions_::CodepointRangeT());

539

auto& range = options.supported_codepoint_ranges.back();

540

range->start = 20000;

541

range->end = 30000;

542

}

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

543

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

544

flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

545

CREATE_UNILIB_FOR_TESTING

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

546

TestingFeatureProcessor feature_processor(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

547

flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),

548

&unilib);

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

549

EXPECT_THAT(feature_processor.SupportedCodepointsRatio(

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

550

{0, 3}, feature_processor.Tokenize("aaa bbb ccc")),

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

551

FloatEq(1.0));

552

EXPECT_THAT(feature_processor.SupportedCodepointsRatio(

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

553

{0, 3}, feature_processor.Tokenize("aaa bbb ěěě")),

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

554

FloatEq(2.0 / 3));

555

EXPECT_THAT(feature_processor.SupportedCodepointsRatio(

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

556

{0, 3}, feature_processor.Tokenize("ěěě řřř ěěě")),

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

557

FloatEq(0.0));

Matt Sharifi

2017-04-25 18:41:11 +0200

[diff] [blame]

558

EXPECT_FALSE(feature_processor.IsCodepointInRanges(

559

-1, feature_processor.supported_codepoint_ranges_));

560

EXPECT_TRUE(feature_processor.IsCodepointInRanges(

561

0, feature_processor.supported_codepoint_ranges_));

562

EXPECT_TRUE(feature_processor.IsCodepointInRanges(

563

10, feature_processor.supported_codepoint_ranges_));

564

EXPECT_TRUE(feature_processor.IsCodepointInRanges(

565

127, feature_processor.supported_codepoint_ranges_));

566

EXPECT_FALSE(feature_processor.IsCodepointInRanges(

567

128, feature_processor.supported_codepoint_ranges_));

568

EXPECT_FALSE(feature_processor.IsCodepointInRanges(

569

9999, feature_processor.supported_codepoint_ranges_));

570

EXPECT_TRUE(feature_processor.IsCodepointInRanges(

571

10000, feature_processor.supported_codepoint_ranges_));

572

EXPECT_FALSE(feature_processor.IsCodepointInRanges(

573

10001, feature_processor.supported_codepoint_ranges_));

574

EXPECT_TRUE(feature_processor.IsCodepointInRanges(

575

25000, feature_processor.supported_codepoint_ranges_));

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

576

Lukas Zilka

2017-04-07 19:55:11 +0200

[diff] [blame]

577

std::unique_ptr<CachedFeatures> cached_features;

578

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

579

FakeEmbeddingExecutor embedding_executor;

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

580

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

581

const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),

582

Token("eee", 8, 11)};

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

583

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

584

options.min_supported_codepoint_ratio = 0.0;

585

flatbuffers::DetachedBuffer options2_fb =

586

PackFeatureProcessorOptions(options);

587

TestingFeatureProcessor feature_processor2(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

588

flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),

589

&unilib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

590

EXPECT_TRUE(feature_processor2.ExtractFeatures(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

591

tokens, /*token_span=*/{0, 3},

592

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

593

&embedding_executor,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

594

/*feature_vector_size=*/4, &cached_features));

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

595

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

596

options.min_supported_codepoint_ratio = 0.2;

597

flatbuffers::DetachedBuffer options3_fb =

598

PackFeatureProcessorOptions(options);

599

TestingFeatureProcessor feature_processor3(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

600

flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),

601

&unilib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

602

EXPECT_TRUE(feature_processor3.ExtractFeatures(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

603

tokens, /*token_span=*/{0, 3},

604

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

605

&embedding_executor,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

606

/*feature_vector_size=*/4, &cached_features));

607

608

options.min_supported_codepoint_ratio = 0.5;

609

flatbuffers::DetachedBuffer options4_fb =

610

PackFeatureProcessorOptions(options);

611

TestingFeatureProcessor feature_processor4(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

612

flatbuffers::GetRoot<FeatureProcessorOptions>(options4_fb.data()),

613

&unilib);

Lukas Zilka

2017-04-07 19:55:11 +0200

[diff] [blame]

614

EXPECT_FALSE(feature_processor4.ExtractFeatures(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

615

tokens, /*token_span=*/{0, 3},

616

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

617

&embedding_executor,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

618

/*feature_vector_size=*/4, &cached_features));

Lukas Zilka

2017-04-07 19:55:11 +0200

[diff] [blame]

619

}

620

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame^]

621

// Verifies the extra per-token feature produced when
// extract_selection_mask_feature is enabled.
//
// Features are extracted with feature_vector_size 5 (embedding_size 4 plus one
// extra slot), and the click-context features for token 1 are flattened into
// 25 floats, i.e. five chunks of five. The test inspects the last slot of each
// chunk (indices 4, 9, 14, 19, 24) and expects 0, 0, 1, 1, 0.
// NOTE(review): presumably the five chunks are [padding, "aaa", "bbb", "ccc",
// "ddd"] and the slot is 1 for tokens inside the selection span {4, 11}
// ("bbb" and "ccc") - confirm against CachedFeatures.
TEST(FeatureProcessorTest, InSpanFeature) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;
  options.extract_selection_mask_feature = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  CREATE_UNILIB_FOR_TESTING
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  std::unique_ptr<CachedFeatures> cached_features;

  FakeEmbeddingExecutor embedding_executor;

  const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
                                     Token("ccc", 8, 11), Token("ddd", 12, 15)};

  // Fatal on failure: everything below dereferences cached_features, so
  // continuing after a failed extraction would crash the test binary instead
  // of reporting a clean failure.
  ASSERT_TRUE(feature_processor.ExtractFeatures(
      tokens, /*token_span=*/{0, 4},
      /*selection_span_for_feature=*/{4, 11}, &embedding_executor,
      /*feature_vector_size=*/5, &cached_features));
  ASSERT_TRUE(cached_features != nullptr);

  std::vector<float> features;
  cached_features->AppendClickContextFeaturesForClick(1, &features);
  // Unsigned literal avoids a signed/unsigned comparison against size().
  ASSERT_EQ(features.size(), 25u);
  EXPECT_THAT(features[4], FloatEq(0.0));
  EXPECT_THAT(features[9], FloatEq(0.0));
  EXPECT_THAT(features[14], FloatEq(1.0));
  EXPECT_THAT(features[19], FloatEq(1.0));
  EXPECT_THAT(features[24], FloatEq(0.0));
}

656

Lukas Zilka

2017-04-07 19:55:11 +0200

[diff] [blame]

657

// Exercises internal::StripOrPadTokens with a zero relative click span
// ({0, 0}) and a context size of 2: the result must always be the clicked
// token plus two tokens on each side, padded with default-constructed Tokens
// where the original sequence runs out, and the click index must be rewritten
// to point at the clicked token inside the new window.
TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  const std::vector<Token> tokens_orig{
      Token("0", 0, 0), Token("1", 0, 0),  Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0),  Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Clicking the first token: two padding tokens are inserted on the left.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 0;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {Token(), Token(), Token("0", 0, 0),
                                         Token("1", 0, 0), Token("2", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Clicking the third token (index 2): there is enough context on both
  // sides, so nothing gets padded.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 2;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {Token("0", 0, 0), Token("1", 0, 0),
                                         Token("2", 0, 0), Token("3", 0, 0),
                                         Token("4", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Clicking the last token: two padding tokens are appended on the right.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 12;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {Token("10", 0, 0), Token("11", 0, 0),
                                         Token("12", 0, 0), Token(), Token()};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }
}

706

707

// Exercises internal::StripOrPadTokens with a non-zero relative click span and
// a context size of 2. The expectations below show the kept window reaching
// from (click - span.first - context) to (click + span.second + context), with
// at most context-size padding tokens added on either side.
TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
  const std::vector<Token> tokens_orig{
      Token("0", 0, 0), Token("1", 0, 0),  Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0),  Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Clicking the first token: the left side is padded, but only up to the
  // maximum of context_size tokens.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 0;
    internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {
        Token(),          Token(),          Token("0", 0, 0), Token("1", 0, 0),
        Token("2", 0, 0), Token("3", 0, 0), Token("4", 0, 0), Token("5", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Clicking in the middle with enough context on both sides: no padding.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 6;
    internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {
        Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
        Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),
        Token("7", 0, 0), Token("8", 0, 0), Token("9", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 5);
  }

  // Clicking near the end: the right side is padded, up to the maximum of
  // context_size tokens.
  {
    std::vector<Token> tokens = tokens_orig;
    int click_index = 11;
    internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {
        Token("6", 0, 0),  Token("7", 0, 0),  Token("8", 0, 0),
        Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
        Token("12", 0, 0), Token(),           Token()};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 5);
  }
}

768

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

769

// Checks the effect of the tokenize_on_script_change option on the internal
// tokenizer: with the option off the mixed Korean/digit input stays a single
// token, with the option on it is split at each boundary between the digits
// and the Korean text.
TEST(FeatureProcessorTest, InternalTokenizeOnScriptChange) {
  CREATE_UNILIB_FOR_TESTING

  // A single tokenization range (start 0, end 256) tagged with script id 1.
  FeatureProcessorOptionsT options;
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  TokenizationCodepointRangeT& low_range =
      *options.tokenization_codepoint_config.back();
  low_range.start = 0;
  low_range.end = 256;
  low_range.role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  low_range.script_id = 1;

  const std::string text = "앨라배마123웹사이트";

  // Script changes ignored: the whole input is one token.
  options.tokenize_on_script_change = false;
  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::vector<Token> expected_single = {Token("앨라배마123웹사이트", 0, 11)};
  EXPECT_EQ(feature_processor.Tokenize(text), expected_single);

  // Script changes honoured: the digits are separated from the Korean text.
  options.tokenize_on_script_change = true;
  flatbuffers::DetachedBuffer options_fb2 =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor2(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb2.data()),
      &unilib);

  const std::vector<Token> expected_split = {
      Token("앨라배마", 0, 4), Token("123", 4, 7), Token("웹사이트", 7, 11)};
  EXPECT_EQ(feature_processor2.Tokenize(text), expected_split);
}

802

803

#ifdef LIBTEXTCLASSIFIER_TEST_ICU

804

// Checks that ICU tokenization segments an unspaced Thai string into the
// expected tokens with the expected codepoint offsets.
//
// Only compiled when LIBTEXTCLASSIFIER_TEST_ICU is defined.
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor the same way as the other tests in this file:
  // with a UniLib instance from CREATE_UNILIB_FOR_TESTING passed as the
  // second constructor argument.
  CREATE_UNILIB_FOR_TESTING
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  std::vector<Token> tokens = feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token("สมเด็จ", 6, 12),
                                Token("พระ", 12, 15),
                                Token("ปร", 15, 17),
                                Token("มิ", 17, 19)}));
  // clang-format on
}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

821

#endif

Lukas Zilka

2017-04-10 17:22:22 +0200

[diff] [blame]

822

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

823

#ifdef LIBTEXTCLASSIFIER_TEST_ICU

Lukas Zilka

2017-04-10 17:22:22 +0200

[diff] [blame]

824

// Checks that, with icu_preserve_whitespace_tokens enabled, ICU tokenization
// emits each space in the input as its own token between the Thai words.
//
// Only compiled when LIBTEXTCLASSIFIER_TEST_ICU is defined.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;
  options.icu_preserve_whitespace_tokens = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor the same way as the other tests in this file:
  // with a UniLib instance from CREATE_UNILIB_FOR_TESTING passed as the
  // second constructor argument.
  CREATE_UNILIB_FOR_TESTING
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token(" ", 6, 7),
                                Token("สมเด็จ", 7, 13),
                                Token(" ", 13, 14),
                                Token("พระ", 14, 17),
                                Token(" ", 17, 18),
                                Token("ปร", 18, 20),
                                Token(" ", 20, 21),
                                Token("มิ", 21, 23)}));
  // clang-format on
}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

847

#endif

Lukas Zilka

2017-04-10 17:22:22 +0200

[diff] [blame]

848

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

849

#ifdef LIBTEXTCLASSIFIER_TEST_ICU

Matt Sharifi

2017-04-25 18:41:11 +0200

[diff] [blame]

850

// Checks TokenizationType_MIXED on a string that combines Japanese text,
// Latin text with diacritics, and a URL: the expected result keeps each
// Japanese run and each space-delimited Latin run as one token.
//
// Only compiled when LIBTEXTCLASSIFIER_TEST_ICU is defined.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_MIXED;

  // Codepoint 32 (space) is the only whitespace separator.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& config = options.tokenization_codepoint_config.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Registers one entry in internal_tokenizer_codepoint_ranges.
  // NOTE(review): presumably these are the codepoints routed to the internal
  // tokenizer rather than ICU - confirm against FeatureProcessor.
  const auto add_internal_tokenizer_range = [&options](int start, int end) {
    options.internal_tokenizer_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    auto& range = options.internal_tokenizer_codepoint_ranges.back();
    range->start = start;
    range->end = end;
  };
  add_internal_tokenizer_range(0, 128);
  add_internal_tokenizer_range(128, 256);
  add_internal_tokenizer_range(256, 384);
  add_internal_tokenizer_range(384, 592);

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor the same way as the other tests in this file:
  // with a UniLib instance from CREATE_UNILIB_FOR_TESTING passed as the
  // second constructor argument.
  CREATE_UNILIB_FOR_TESTING
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  std::vector<Token> tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("こんにちは", 0, 5),
                                Token("Japanese-ląnguagę", 5, 22),
                                Token("text", 23, 27),
                                Token("世界", 28, 30),
                                Token("http://www.google.com/", 31, 53)}));
  // clang-format on
}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

907

#endif

Matt Sharifi

2017-04-25 18:41:11 +0200

[diff] [blame]

908

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

909

// Covers the handling of ignored span-boundary codepoints ('.', ',', '[' and
// ']' in this test):
//  * CountIgnoredSpanBoundaryCodepoints, counting from either end of an
//    iterator range, including ranges that start or end inside the text.
//  * StripBoundaryCodepoints, which trims a codepoint span on both ends.
TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
  CREATE_UNILIB_FOR_TESTING
  FeatureProcessorOptionsT options;
  options.ignored_span_boundary_codepoints.push_back('.');
  options.ignored_span_boundary_codepoints.push_back(',');
  options.ignored_span_boundary_codepoints.push_back('[');
  options.ignored_span_boundary_codepoints.push_back(']');

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  // Shorthands for the two counting directions.
  const auto count_leading = [&feature_processor](
                                 const UnicodeText::const_iterator& begin,
                                 const UnicodeText::const_iterator& end) {
    return feature_processor.CountIgnoredSpanBoundaryCodepoints(
        begin, end, /*count_from_beginning=*/true);
  };
  const auto count_trailing = [&feature_processor](
                                  const UnicodeText::const_iterator& begin,
                                  const UnicodeText::const_iterator& end) {
    return feature_processor.CountIgnoredSpanBoundaryCodepoints(
        begin, end, /*count_from_beginning=*/false);
  };

  // The UTF-8 strings are kept in named variables because the UnicodeText
  // objects are created with do_copy=false.

  // No ignored codepoints at all.
  const std::string text1_utf8 = "ěščř";
  const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(text1.begin(), text1.end()), 0);
  EXPECT_EQ(count_trailing(text1.begin(), text1.end()), 0);

  // Ignored codepoints only at the beginning.
  const std::string text2_utf8 = ".,abčd";
  const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(text2.begin(), text2.end()), 2);
  EXPECT_EQ(count_trailing(text2.begin(), text2.end()), 0);

  // Two ignored codepoints on each side.
  const std::string text3_utf8 = ".,abčd[]";
  const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(text3.begin(), text3.end()), 2);
  EXPECT_EQ(count_trailing(text3.begin(), text3.end()), 2);

  // One ignored codepoint on each side.
  const std::string text4_utf8 = "[abčd]";
  const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(text4.begin(), text4.end()), 1);
  EXPECT_EQ(count_trailing(text4.begin(), text4.end()), 1);

  // Empty text.
  const std::string text5_utf8 = "";
  const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(text5.begin(), text5.end()), 0);
  EXPECT_EQ(count_trailing(text5.begin(), text5.end()), 0);

  // Range starting inside the text, nothing to ignore.
  const std::string text6_utf8 = "012345ěščř";
  const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
  UnicodeText::const_iterator text6_begin = text6.begin();
  std::advance(text6_begin, 6);
  EXPECT_EQ(count_leading(text6_begin, text6.end()), 0);
  EXPECT_EQ(count_trailing(text6_begin, text6.end()), 0);

  // Ignored codepoints in the middle of the text: they lead the range that
  // starts at codepoint 6 and trail the range that ends at codepoint 8.
  const std::string text7_utf8 = "012345.,ěščř";
  const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
  UnicodeText::const_iterator text7_begin = text7.begin();
  std::advance(text7_begin, 6);
  EXPECT_EQ(count_leading(text7_begin, text7.end()), 2);
  UnicodeText::const_iterator text7_end = text7.begin();
  std::advance(text7_end, 8);
  EXPECT_EQ(count_trailing(text7.begin(), text7_end), 2);

  // Test not stripping.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
                "Hello [[[Wořld]] or not?", {0, 24}),
            std::make_pair(0, 24));
  // Test basic stripping.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
                "Hello [[[Wořld]] or not?", {6, 16}),
            std::make_pair(9, 14));
  // Test stripping when everything is stripped.
  EXPECT_EQ(
      feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
      std::make_pair(6, 6));
  // Test stripping empty string.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
            std::make_pair(0, 0));
}

1021

Lukas Zilka

2017-12-13 16:37:03 +0100

[diff] [blame]

1022

// Checks the mapping from codepoint spans to token spans, both with the
// default behaviour and with the optional third argument set to true, which
// snaps span boundaries that fall inside a token out to the whole token.
TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
  const std::vector<Token> tokens{Token("Hělló", 0, 5),
                                  Token("fěěbař@google.com", 6, 23),
                                  Token("heře!", 24, 29)};
  // Value passed as the third argument in the snapping variants below.
  constexpr bool kSnapToContainingTokens = true;

  // Spans matching the tokens exactly.
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 5}), TokenSpan(0, 1));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 23}), TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {24, 29}), TokenSpan(2, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 23}), TokenSpan(0, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 29}), TokenSpan(1, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 29}), TokenSpan(0, 3));

  // Snapping to containing tokens has no effect on exact matches.
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 5}, kSnapToContainingTokens),
            TokenSpan(0, 1));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 23}, kSnapToContainingTokens),
            TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {24, 29}, kSnapToContainingTokens),
            TokenSpan(2, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 23}, kSnapToContainingTokens),
            TokenSpan(0, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 29}, kSnapToContainingTokens),
            TokenSpan(1, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 29}, kSnapToContainingTokens),
            TokenSpan(0, 3));

  // Span boundaries inside tokens: without snapping only the fully covered
  // token is returned, with snapping the partially covered ones are included.
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {1, 28}), TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {1, 28}, kSnapToContainingTokens),
            TokenSpan(0, 3));

  // Tokens adjacent to the span, but not overlapping.
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {5, 24}), TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {5, 24}, kSnapToContainingTokens),
            TokenSpan(1, 2));
}

1051

Matt Sharifi

2017-03-14 21:24:23 +0100

[diff] [blame]

1052

} // namespace

Lukas Zilka