Blame - native/annotator/annotator.cc - platform/external/libtextclassifier

2018-01-24 11:11:20 +0100

[diff] [blame]

1

/*

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

3

*

4

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

17

#include "annotator/annotator.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

18

19

#include <algorithm>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

20

#include <cmath>

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

21

#include <cstddef>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

22

#include <iterator>

23

#include <numeric>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

24

#include <string>

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

25

#include <unordered_map>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

26

#include <vector>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

27

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

28

#include "annotator/collections.h"

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

29

#include "annotator/model_generated.h"

30

#include "annotator/types.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

31

#include "utils/base/logging.h"

32

#include "utils/checksum.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

33

#include "utils/i18n/locale.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

34

#include "utils/math/softmax.h"

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

35

#include "utils/normalization.h"

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

36

#include "utils/optional.h"

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

37

#include "utils/regex-match.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

38

#include "utils/strings/numbers.h"

39

#include "utils/strings/split.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

40

#include "utils/utf8/unicodetext.h"

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

41

#include "utils/utf8/unilib-common.h"

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

42

#include "utils/zlib/zlib_regex.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

43

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

44

namespace libtextclassifier3 {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

45

46

using SortedIntSet = std::set<int, std::function<bool(int, int)>>;

47

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

48

const std::string& Annotator::kPhoneCollection =

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

49

*[]() { return new std::string("phone"); }();

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

50

const std::string& Annotator::kAddressCollection =

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

51

*[]() { return new std::string("address"); }();

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

52

const std::string& Annotator::kDateCollection =

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

53

*[]() { return new std::string("date"); }();

Tony Mak

296b7b6

2018-12-04 18:09:15 +0000

[diff] [blame]

54

const std::string& Annotator::kUrlCollection =

55

*[]() { return new std::string("url"); }();

Tony Mak

296b7b6

2018-12-04 18:09:15 +0000

[diff] [blame]

56

const std::string& Annotator::kEmailCollection =

57

*[]() { return new std::string("email"); }();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

58

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

59

namespace {

60

// Verifies that the `size` bytes starting at `addr` hold a well-formed Model
// flatbuffer. Returns the root Model on success and nullptr otherwise.
const Model* LoadAndVerifyModel(const void* addr, int size) {
  // Reject inputs the verifier must never see: a null base pointer, or a
  // negative length, which would wrap around to a huge value when converted
  // to the verifier's unsigned length parameter.
  if (addr == nullptr || size < 0) {
    return nullptr;
  }
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr),
                                 static_cast<size_t>(size));
  return VerifyModelBuffer(verifier) ? GetModel(addr) : nullptr;
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

68

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

69

// Verifies that the `size` bytes starting at `addr` hold a well-formed
// PersonNameModel flatbuffer. Returns the root PersonNameModel on success and
// nullptr otherwise.
const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr,
                                                    int size) {
  // Reject inputs the verifier must never see: a null base pointer, or a
  // negative length, which would wrap around to a huge value when converted
  // to the verifier's unsigned length parameter.
  if (addr == nullptr || size < 0) {
    return nullptr;
  }
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr),
                                 static_cast<size_t>(size));
  return VerifyPersonNameModelBuffer(verifier) ? GetPersonNameModel(addr)
                                               : nullptr;
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

79

// If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will

80

// create a new instance, assign ownership to owned_lib, and return it.

81

const UniLib* MaybeCreateUnilib(const UniLib* lib,

82

std::unique_ptr<UniLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new UniLib);

87

return owned_lib->get();

}

}

// As above, but for CalendarLib.

92

const CalendarLib* MaybeCreateCalendarlib(

93

const CalendarLib* lib, std::unique_ptr<CalendarLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new CalendarLib);

98

return owned_lib->get();

}

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

102

// Returns whether the provided input is valid:

103

// * Valid utf8 text.

104

// * Sane span indices.

105

bool IsValidSpanInput(const UnicodeText& context, const CodepointSpan span) {

106

if (!context.is_valid()) {

107

return false;

108

}

109

return (span.first >= 0 && span.first < span.second &&

110

span.second <= context.size_codepoints());

111

}

112

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

113

std::unordered_set<char32> FlatbuffersIntVectorToChar32UnorderedSet(

114

const flatbuffers::Vector<int32_t>* ints) {

115

if (ints == nullptr) {

116

return {};

117

}

118

std::unordered_set<char32> ints_set;

119

for (auto value : *ints) {

120

ints_set.insert(static_cast<char32>(value));

}

return ints_set;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

125

// Builds a DateAnnotationOptions from the reference time/timezone and, when
// present, the flatbuffer annotation options stored in the model. With a null
// `fb_annotation_options` only the reference time and timezone are filled in
// and every other field keeps its default-constructed value.
DateAnnotationOptions ToDateAnnotationOptions(
    const GrammarDatetimeModel_::AnnotationOptions* fb_annotation_options,
    const std::string& reference_timezone, const int64 reference_time_ms_utc) {
  DateAnnotationOptions options;
  options.base_timestamp_millis = reference_time_ms_utc;
  options.reference_timezone = reference_timezone;
  if (fb_annotation_options == nullptr) {
    return options;
  }

  // Copy the flags over one by one.
  options.enable_special_day_offset =
      fb_annotation_options->enable_special_day_offset();
  options.merge_adjacent_components =
      fb_annotation_options->merge_adjacent_components();
  options.enable_date_range = fb_annotation_options->enable_date_range();
  options.include_preposition = fb_annotation_options->include_preposition();
  options.expand_date_series = fb_annotation_options->expand_date_series();

  // The list of extra requested dates is optional in the flatbuffer.
  const auto* extra_dates = fb_annotation_options->extra_requested_dates();
  if (extra_dates != nullptr) {
    for (const auto& extra_date : *extra_dates) {
      options.extra_requested_dates.push_back(extra_date->str());
    }
  }
  return options;
}

152

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

153

} // namespace

154

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

155

// Returns the selection interpreter, creating it from selection_executor_ the
// first time it is requested. If creation fails an error is logged and nullptr
// is returned; the next call will try again.
tflite::Interpreter* InterpreterManager::SelectionInterpreter() {
  if (selection_interpreter_) {
    return selection_interpreter_.get();
  }

  TC3_CHECK(selection_executor_);
  selection_interpreter_ = selection_executor_->CreateInterpreter();
  if (!selection_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return selection_interpreter_.get();
}

165

166

// Returns the classification interpreter, creating it from
// classification_executor_ the first time it is requested. If creation fails
// an error is logged and nullptr is returned; the next call will try again.
tflite::Interpreter* InterpreterManager::ClassificationInterpreter() {
  if (classification_interpreter_) {
    return classification_interpreter_.get();
  }

  TC3_CHECK(classification_executor_);
  classification_interpreter_ = classification_executor_->CreateInterpreter();
  if (!classification_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return classification_interpreter_.get();
}

176

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

177

std::unique_ptr<Annotator> Annotator::FromUnownedBuffer(

178

const char* buffer, int size, const UniLib* unilib,

179

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

180

const Model* model = LoadAndVerifyModel(buffer, size);

181

if (model == nullptr) {

return nullptr;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

185

auto classifier =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

186

std::unique_ptr<Annotator>(new Annotator(model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

187

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

194

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

195

std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib,

196

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

197

if (!(*mmap)->handle().ok()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

198

TC3_VLOG(1) << "Mmap failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

203

(*mmap)->handle().num_bytes());

204

if (!model) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

205

TC3_LOG(ERROR) << "Model verification failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

209

auto classifier = std::unique_ptr<Annotator>(

210

new Annotator(mmap, model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

211

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

218

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

219

std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,

220

std::unique_ptr<CalendarLib> calendarlib) {

221

if (!(*mmap)->handle().ok()) {

222

TC3_VLOG(1) << "Mmap failed.";

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

227

(*mmap)->handle().num_bytes());

228

if (model == nullptr) {

229

TC3_LOG(ERROR) << "Model verification failed.";

return nullptr;

}

auto classifier = std::unique_ptr<Annotator>(

234

new Annotator(mmap, model, std::move(unilib), std::move(calendarlib)));

235

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

242

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

243

int fd, int offset, int size, const UniLib* unilib,

244

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

245

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

246

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

247

}

248

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

249

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

250

int fd, int offset, int size, std::unique_ptr<UniLib> unilib,

251

std::unique_ptr<CalendarLib> calendarlib) {

252

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

253

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

254

}

255

256

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

257

int fd, const UniLib* unilib, const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

258

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

259

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

260

}

261

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

262

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

263

int fd, std::unique_ptr<UniLib> unilib,

264

std::unique_ptr<CalendarLib> calendarlib) {

265

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

266

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

267

}

268

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

269

std::unique_ptr<Annotator> Annotator::FromPath(const std::string& path,

270

const UniLib* unilib,

271

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

272

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

273

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

274

}

275

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

276

std::unique_ptr<Annotator> Annotator::FromPath(

277

const std::string& path, std::unique_ptr<UniLib> unilib,

278

std::unique_ptr<CalendarLib> calendarlib) {

279

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

280

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

281

}

282

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

283

// Constructs an Annotator that takes over the mapping held in `*mmap` (the
// caller's unique_ptr is moved from). For each of `unilib` and `calendarlib`:
// a non-null pointer is used as-is without taking ownership; a null pointer
// makes the Annotator create and own its own instance.
// NOTE(review): the MaybeCreate* calls write into the owned_* members from
// inside the initializer list, so this presumably relies on each owned_*
// member being declared before its raw-pointer counterpart - confirm against
// the class definition.
Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,
                     const UniLib* unilib, const CalendarLib* calendarlib)
    : model_(model),
      mmap_(std::move(*mmap)),
      owned_unilib_(nullptr),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(nullptr),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  // Sets initialized_; callers check the outcome through IsInitialized().
  ValidateAndInitialize();
}

293

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

294

// Constructs an Annotator that takes over the mapping held in `*mmap` (the
// caller's unique_ptr is moved from) and takes ownership of `unilib` and
// `calendarlib`. The raw unilib_/calendarlib_ pointers are set to whatever the
// owned pointers hold, so they are null if a null unique_ptr was passed in.
Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,
                     std::unique_ptr<UniLib> unilib,
                     std::unique_ptr<CalendarLib> calendarlib)
    : model_(model),
      mmap_(std::move(*mmap)),
      owned_unilib_(std::move(unilib)),
      unilib_(owned_unilib_.get()),
      owned_calendarlib_(std::move(calendarlib)),
      calendarlib_(owned_calendarlib_.get()) {
  // Sets initialized_; callers check the outcome through IsInitialized().
  ValidateAndInitialize();
}

305

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

306

// Constructs an Annotator around an already verified `model`; no mapping is
// stored by this constructor. For each of `unilib` and `calendarlib`: a
// non-null pointer is used as-is without taking ownership; a null pointer
// makes the Annotator create and own its own instance.
// NOTE(review): the MaybeCreate* calls write into the owned_* members from
// inside the initializer list, so this presumably relies on each owned_*
// member being declared before its raw-pointer counterpart - confirm against
// the class definition.
Annotator::Annotator(const Model* model, const UniLib* unilib,
                     const CalendarLib* calendarlib)
    : model_(model),
      owned_unilib_(nullptr),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(nullptr),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  // Sets initialized_; callers check the outcome through IsInitialized().
  ValidateAndInitialize();
}

315

316

void Annotator::ValidateAndInitialize() {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

317

initialized_ = false;

318

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

319

if (model_ == nullptr) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

320

TC3_LOG(ERROR) << "No model specified.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

324

const bool model_enabled_for_annotation =

325

(model_->triggering_options() != nullptr &&

326

(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION));

327

const bool model_enabled_for_classification =

328

(model_->triggering_options() != nullptr &&

329

(model_->triggering_options()->enabled_modes() &

330

ModeFlag_CLASSIFICATION));

331

const bool model_enabled_for_selection =

332

(model_->triggering_options() != nullptr &&

333

(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION));

334

335

// Annotation requires the selection model.

336

if (model_enabled_for_annotation || model_enabled_for_selection) {

337

if (!model_->selection_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

338

TC3_LOG(ERROR) << "No selection options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

339

return;

340

}

341

if (!model_->selection_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

342

TC3_LOG(ERROR) << "No selection feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

343

return;

344

}

345

if (!model_->selection_feature_options()->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

346

TC3_LOG(ERROR) << "No selection bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

347

return;

348

}

349

if (!model_->selection_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

350

TC3_LOG(ERROR) << "No selection model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

351

return;

352

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

353

selection_executor_ = ModelExecutor::FromBuffer(model_->selection_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

354

if (!selection_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

355

TC3_LOG(ERROR) << "Could not initialize selection executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

356

return;

357

}

358

selection_feature_processor_.reset(

359

new FeatureProcessor(model_->selection_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

360

}

361

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

362

// Annotation requires the classification model for conflict resolution and

363

// scoring.

364

// Selection requires the classification model for conflict resolution.

365

if (model_enabled_for_annotation || model_enabled_for_classification ||

366

model_enabled_for_selection) {

367

if (!model_->classification_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

368

TC3_LOG(ERROR) << "No classification options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

373

TC3_LOG(ERROR) << "No classification feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()

378

->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

379

TC3_LOG(ERROR) << "No classification bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

380

return;

381

}

382

if (!model_->classification_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

383

TC3_LOG(ERROR) << "No clf model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

387

classification_executor_ =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

388

ModelExecutor::FromBuffer(model_->classification_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

389

if (!classification_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

390

TC3_LOG(ERROR) << "Could not initialize classification executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

classification_feature_processor_.reset(new FeatureProcessor(

395

model_->classification_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

396

}

397

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

398

// The embeddings need to be specified if the model is to be used for

399

// classification or selection.

400

if (model_enabled_for_annotation || model_enabled_for_classification ||

401

model_enabled_for_selection) {

402

if (!model_->embedding_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

403

TC3_LOG(ERROR) << "No embedding model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

404

return;

405

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

406

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

407

// Check that the embedding size of the selection and classification model

408

// matches, as they are using the same embeddings.

409

if (model_enabled_for_selection &&

410

(model_->selection_feature_options()->embedding_size() !=

411

model_->classification_feature_options()->embedding_size() ||

412

model_->selection_feature_options()->embedding_quantization_bits() !=

413

model_->classification_feature_options()

414

->embedding_quantization_bits())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

415

TC3_LOG(ERROR) << "Mismatching embedding size/quantization.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

416

return;

417

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

418

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

419

embedding_executor_ = TFLiteEmbeddingExecutor::FromBuffer(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

420

model_->embedding_model(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

421

model_->classification_feature_options()->embedding_size(),

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

422

model_->classification_feature_options()->embedding_quantization_bits(),

423

model_->embedding_pruning_mask());

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

424

if (!embedding_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

425

TC3_LOG(ERROR) << "Could not initialize embedding executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

426

return;

427

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

428

}

429

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

430

std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

431

if (model_->regex_model()) {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

432

if (!InitializeRegexModel(decompressor.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

433

TC3_LOG(ERROR) << "Could not initialize regex model.";

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

434

return;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

435

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

436

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

437

if (model_->grammar_datetime_model() &&

438

model_->grammar_datetime_model()->datetime_rules()) {

439

cfg_datetime_parser_.reset(new dates::CfgDatetimeAnnotator(

440

*unilib_,

441

/*tokenizer_options=*/

442

model_->grammar_datetime_model()->grammar_tokenizer_options(),

443

*calendarlib_,

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

444

/*datetime_rules=*/model_->grammar_datetime_model()->datetime_rules(),

445

model_->grammar_datetime_model()->target_classification_score(),

446

model_->grammar_datetime_model()->priority_score()));

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

447

if (!cfg_datetime_parser_) {

448

TC3_LOG(ERROR) << "Could not initialize context free grammar based "

"datetime parser.";

return;

}

} else if (model_->datetime_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

453

datetime_parser_ = DatetimeParser::Instance(

454

model_->datetime_model(), *unilib_, *calendarlib_, decompressor.get());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

455

if (!datetime_parser_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

456

TC3_LOG(ERROR) << "Could not initialize datetime parser.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return;

}

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

461

if (model_->output_options()) {

462

if (model_->output_options()->filtered_collections_annotation()) {

463

for (const auto collection :

464

*model_->output_options()->filtered_collections_annotation()) {

465

filtered_collections_annotation_.insert(collection->str());

466

}

467

}

468

if (model_->output_options()->filtered_collections_classification()) {

469

for (const auto collection :

470

*model_->output_options()->filtered_collections_classification()) {

471

filtered_collections_classification_.insert(collection->str());

472

}

473

}

474

if (model_->output_options()->filtered_collections_selection()) {

475

for (const auto collection :

476

*model_->output_options()->filtered_collections_selection()) {

477

filtered_collections_selection_.insert(collection->str());

}

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

482

if (model_->number_annotator_options() &&

483

model_->number_annotator_options()->enabled()) {

484

number_annotator_.reset(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

485

new NumberAnnotator(model_->number_annotator_options(), unilib_));

486

}

487

488

if (model_->money_parsing_options()) {

489

money_separators_ = FlatbuffersIntVectorToChar32UnorderedSet(

490

model_->money_parsing_options()->separators());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

491

}

492

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

493

if (model_->duration_annotator_options() &&

494

model_->duration_annotator_options()->enabled()) {

495

duration_annotator_.reset(

496

new DurationAnnotator(model_->duration_annotator_options(),

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

497

selection_feature_processor_.get(), unilib_));

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

498

}

499

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

500

if (model_->entity_data_schema()) {

501

entity_data_schema_ = LoadAndVerifyFlatbuffer<reflection::Schema>(

502

model_->entity_data_schema()->Data(),

503

model_->entity_data_schema()->size());

504

if (entity_data_schema_ == nullptr) {

505

TC3_LOG(ERROR) << "Could not load entity data schema data.";

return;

}

entity_data_builder_.reset(

510

new ReflectiveFlatbufferBuilder(entity_data_schema_));

511

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

512

entity_data_schema_ = nullptr;

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

513

entity_data_builder_ = nullptr;

514

}

515

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

516

if (model_->grammar_model()) {

517

grammar_annotator_.reset(new GrammarAnnotator(

518

unilib_, model_->grammar_model(), entity_data_builder_.get()));

519

}

520

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

521

if (model_->triggering_locales() &&

522

!ParseLocales(model_->triggering_locales()->c_str(),

523

&model_triggering_locales_)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

524

TC3_LOG(ERROR) << "Could not parse model supported locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

529

model_->triggering_options()->locales() != nullptr &&

530

!ParseLocales(model_->triggering_options()->locales()->c_str(),

531

&ml_model_triggering_locales_)) {

532

TC3_LOG(ERROR) << "Could not parse supported ML model locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

537

model_->triggering_options()->dictionary_locales() != nullptr &&

538

!ParseLocales(model_->triggering_options()->dictionary_locales()->c_str(),

539

&dictionary_locales_)) {

540

TC3_LOG(ERROR) << "Could not parse dictionary supported locales.";

return;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

initialized_ = true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

547

// Compiles every pattern of the model's regex model and records, per mode
// (annotation / classification / selection), the ids of the patterns enabled
// for it. A pattern's id is its position in the model's pattern list.
//
// Returns true when all patterns were set up, or when the regex model has no
// patterns at all. Returns false as soon as one pattern cannot be created;
// patterns registered before that point stay registered.
// The caller must ensure model_->regex_model() is non-null.
bool Annotator::InitializeRegexModel(ZlibDecompressor* decompressor) {
  const auto* regex_model = model_->regex_model();
  const auto* patterns = regex_model->patterns();
  if (!patterns) {
    return true;
  }

  // Initialize pattern recognizers.
  const auto lazy_compilation = regex_model->lazy_regex_compilation();
  int next_pattern_id = 0;
  for (const auto& pattern_spec : *patterns) {
    std::unique_ptr<UniLib::RegexPattern> compiled =
        UncompressMakeRegexPattern(*unilib_, pattern_spec->pattern(),
                                   pattern_spec->compressed_pattern(),
                                   lazy_compilation, decompressor);
    if (!compiled) {
      TC3_LOG(INFO) << "Failed to load regex pattern";
      return false;
    }

    // A single pattern may be enabled for several modes at once.
    const auto enabled_modes = pattern_spec->enabled_modes();
    if (enabled_modes & ModeFlag_ANNOTATION) {
      annotation_regex_patterns_.push_back(next_pattern_id);
    }
    if (enabled_modes & ModeFlag_CLASSIFICATION) {
      classification_regex_patterns_.push_back(next_pattern_id);
    }
    if (enabled_modes & ModeFlag_SELECTION) {
      selection_regex_patterns_.push_back(next_pattern_id);
    }

    regex_patterns_.push_back({
        pattern_spec,
        std::move(compiled),
    });
    ++next_pattern_id;
  }

  return true;
}

583

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

584

bool Annotator::InitializeKnowledgeEngine(

585

const std::string& serialized_config) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

586

std::unique_ptr<KnowledgeEngine> knowledge_engine(new KnowledgeEngine());

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

587

if (!knowledge_engine->Initialize(serialized_config, unilib_)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

588

TC3_LOG(ERROR) << "Failed to initialize the knowledge engine.";

589

return false;

590

}

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

591

if (model_->triggering_options() != nullptr) {

592

knowledge_engine->SetPriorityScore(

593

model_->triggering_options()->knowledge_priority_score());

594

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

595

knowledge_engine_ = std::move(knowledge_engine);

return true;

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

599

bool Annotator::InitializeContactEngine(const std::string& serialized_config) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

600

std::unique_ptr<ContactEngine> contact_engine(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

601

new ContactEngine(selection_feature_processor_.get(), unilib_,

602

model_->contact_annotator_options()));

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

603

if (!contact_engine->Initialize(serialized_config)) {

604

TC3_LOG(ERROR) << "Failed to initialize the contact engine.";

605

return false;

606

}

607

contact_engine_ = std::move(contact_engine);

return true;

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

611

bool Annotator::InitializeInstalledAppEngine(

612

const std::string& serialized_config) {

613

std::unique_ptr<InstalledAppEngine> installed_app_engine(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

614

new InstalledAppEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

615

if (!installed_app_engine->Initialize(serialized_config)) {

616

TC3_LOG(ERROR) << "Failed to initialize the installed app engine.";

617

return false;

618

}

619

installed_app_engine_ = std::move(installed_app_engine);

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

623

// Stores `lang_id` and (re)creates or drops the translate annotator to match.
//
// A TranslateAnnotator is created only when `lang_id` is non-null and the
// model has translate annotator options that are enabled; in every other case
// any existing translate annotator is released.
void Annotator::SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id) {
  lang_id_ = lang_id;

  // Without a language identifier there is nothing to build.
  if (lang_id_ == nullptr) {
    translate_annotator_.reset(nullptr);
    return;
  }

  const auto* translate_options = model_->translate_annotator_options();
  if (!translate_options || !translate_options->enabled()) {
    translate_annotator_.reset(nullptr);
    return;
  }

  translate_annotator_.reset(
      new TranslateAnnotator(translate_options, lang_id_, unilib_));
}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

634

bool Annotator::InitializePersonNameEngineFromUnownedBuffer(const void* buffer,

635

int size) {

636

const PersonNameModel* person_name_model =

637

LoadAndVerifyPersonNameModel(buffer, size);

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

638

639

if (person_name_model == nullptr) {

640

TC3_LOG(ERROR) << "Person name model verification failed.";

return false;

}

if (!person_name_model->enabled()) {

return true;

}

std::unique_ptr<PersonNameEngine> person_name_engine(

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

649

new PersonNameEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

650

if (!person_name_engine->Initialize(person_name_model)) {

651

TC3_LOG(ERROR) << "Failed to initialize the person name engine.";

652

return false;

653

}

654

person_name_engine_ = std::move(person_name_engine);

return true;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

658

bool Annotator::InitializePersonNameEngineFromScopedMmap(

659

const ScopedMmap& mmap) {

660

if (!mmap.handle().ok()) {

661

TC3_LOG(ERROR) << "Mmap for person name model failed.";

return false;

}

return InitializePersonNameEngineFromUnownedBuffer(mmap.handle().start(),

666

mmap.handle().num_bytes());

667

}

668

669

bool Annotator::InitializePersonNameEngineFromPath(const std::string& path) {

670

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

671

return InitializePersonNameEngineFromScopedMmap(*mmap);

672

}

673

674

bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset,

675

int size) {

676

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

677

return InitializePersonNameEngineFromScopedMmap(*mmap);

678

}

679

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

680

namespace {

681

682

int CountDigits(const std::string& str, CodepointSpan selection_indices) {

683

int count = 0;

684

int i = 0;

685

const UnicodeText unicode_str = UTF8ToUnicodeText(str, /*do_copy=*/false);

686

for (auto it = unicode_str.begin(); it != unicode_str.end(); ++it, ++i) {

687

if (i >= selection_indices.first && i < selection_indices.second &&

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

688

IsDigit(*it)) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

++count;

}

}

return count;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

695

} // namespace

696

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

697

namespace internal {

698

// Helper function, which if the initial 'span' contains only white-spaces,

699

// moves the selection to a single-codepoint selection on a left or right side

700

// of this space.

701

CodepointSpan SnapLeftIfWhitespaceSelection(CodepointSpan span,

702

const UnicodeText& context_unicode,

703

const UniLib& unilib) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

704

TC3_CHECK(ValidNonEmptySpan(span));

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

705

706

UnicodeText::const_iterator it;

707

708

// Check that the current selection is all whitespaces.

709

it = context_unicode.begin();

710

std::advance(it, span.first);

711

for (int i = 0; i < (span.second - span.first); ++i, ++it) {

712

if (!unilib.IsWhitespace(*it)) {

return span;

}

}

CodepointSpan result;

// Try moving left.

result = span;

it = context_unicode.begin();

722

std::advance(it, span.first);

723

while (it != context_unicode.begin() && unilib.IsWhitespace(*it)) {

--result.first;

--it;

}

result.second = result.first + 1;

728

if (!unilib.IsWhitespace(*it)) {

return result;

}

// If moving left didn't find a non-whitespace character, just return the

// original span.

return span;

}

} // namespace internal

737

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

738

bool Annotator::FilteredForAnnotation(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

739

return !span.classification.empty() &&

740

filtered_collections_annotation_.find(

741

span.classification[0].collection) !=

742

filtered_collections_annotation_.end();

743

}

744

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

745

bool Annotator::FilteredForClassification(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

746

const ClassificationResult& classification) const {

747

return filtered_collections_classification_.find(classification.collection) !=

748

filtered_collections_classification_.end();

749

}

750

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

751

bool Annotator::FilteredForSelection(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

752

return !span.classification.empty() &&

753

filtered_collections_selection_.find(

754

span.classification[0].collection) !=

755

filtered_collections_selection_.end();

756

}

757

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

758

namespace {

759

inline bool ClassifiedAsOther(

760

const std::vector<ClassificationResult>& classification) {

761

return !classification.empty() &&

762

classification[0].collection == Collections::Other();

763

}

764

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

765

} // namespace

766

767

float Annotator::GetPriorityScore(

768

const std::vector<ClassificationResult>& classification) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

769

if (!classification.empty() && !ClassifiedAsOther(classification)) {

770

return classification[0].priority_score;

771

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

772

if (model_->triggering_options() != nullptr) {

773

return model_->triggering_options()->other_collection_priority_score();

774

} else {

775

return -1000.0;

776

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

777

}

778

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

779

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

780

bool Annotator::VerifyRegexMatchCandidate(

781

const std::string& context, const VerificationOptions* verification_options,

782

const std::string& match, const UniLib::RegexMatcher* matcher) const {

783

if (verification_options == nullptr) {

784

return true;

785

}

786

if (verification_options->verify_luhn_checksum() &&

787

!VerifyLuhnChecksum(match)) {

788

return false;

789

}

790

const int lua_verifier = verification_options->lua_verifier();

791

if (lua_verifier >= 0) {

792

if (model_->regex_model()->lua_verifier() == nullptr ||

793

lua_verifier >= model_->regex_model()->lua_verifier()->size()) {

794

TC3_LOG(ERROR) << "Invalid lua verifier specified: " << lua_verifier;

return false;

}

return VerifyMatch(

context, matcher,

model_->regex_model()->lua_verifier()->Get(lua_verifier)->str());

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

804

// Suggests a selection span for a click/selection at `click_indices` inside
// `context`.
//
// Candidate spans are collected from every available source (ML model,
// selection regexes, datetime, knowledge, contact, installed app, number,
// duration, person name and grammar annotators), conflicts between
// overlapping candidates are resolved, and the highest-priority surviving
// candidate that overlaps the click is returned.
//
// The caller's original `click_indices` are returned unchanged whenever:
//   - the annotator is not initialized or selection mode is disabled,
//   - the detected languages are not supported by the model,
//   - the input span is invalid,
//   - any candidate source or the conflict resolution reports failure,
//   - the winning candidate's collection is filtered for selection,
//   - no candidate overlaps the click.
CodepointSpan Annotator::SuggestSelection(
    const std::string& context, CodepointSpan click_indices,
    const SelectionOptions& options) const {
  // `click_indices` may be snapped below; keep the caller's value as the
  // fallback result.
  CodepointSpan original_click_indices = click_indices;

  if (!initialized_) {
    TC3_LOG(ERROR) << "Not initialized";
    return original_click_indices;
  }
  const bool selection_enabled = model_->enabled_modes() & ModeFlag_SELECTION;
  if (!selection_enabled) {
    return original_click_indices;
  }

  // A parse failure is only logged; the (possibly partial) list is still
  // used for the locale check below.
  std::vector<Locale> detected_text_language_tags;
  const bool locales_parsed = ParseLocales(options.detected_text_language_tags,
                                           &detected_text_language_tags);
  if (!locales_parsed) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return original_click_indices;
  }

  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  if (!IsValidSpanInput(context_unicode, click_indices)) {
    TC3_VLOG(1)
        << "Trying to run SuggestSelection with invalid input, indices: "
        << click_indices.first << " " << click_indices.second;
    return original_click_indices;
  }

  if (model_->snap_whitespace_selections()) {
    // A purely white-space selection should expand to the multi-token
    // selection it would have been part of. Without snapping, no token is
    // found for such a click and the result is a no-op. Snapping only to the
    // left is enough: if the white-space is part of a multi-selection, the
    // tokens on both its sides must be selected. The overlap check against
    // the original indices at the bottom rejects results that do not contain
    // the initial white-space.
    click_indices = internal::SnapLeftIfWhitespaceSelection(
        click_indices, context_unicode, *unilib_);
  }

  // Collect candidates from every source, in a fixed order. Any failing
  // source aborts the suggestion.
  std::vector<AnnotatedSpan> candidates;
  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());
  std::vector<Token> tokens;

  const bool model_ok = ModelSuggestSelection(
      context_unicode, click_indices, detected_text_language_tags,
      &interpreter_manager, &tokens, &candidates);
  if (!model_ok) {
    TC3_LOG(ERROR) << "Model suggest selection failed.";
    return original_click_indices;
  }

  const bool regex_ok =
      RegexChunk(context_unicode, selection_regex_patterns_, &candidates,
                 /*is_serialized_entity_data_enabled=*/false);
  if (!regex_ok) {
    TC3_LOG(ERROR) << "Regex suggest selection failed.";
    return original_click_indices;
  }

  const bool datetime_ok = DatetimeChunk(
      UTF8ToUnicodeText(context, /*do_copy=*/false),
      /*reference_time_ms_utc=*/0, /*reference_timezone=*/"",
      options.locales, ModeFlag_SELECTION, options.annotation_usecase,
      /*is_serialized_entity_data_enabled=*/false, &candidates);
  if (!datetime_ok) {
    TC3_LOG(ERROR) << "Datetime suggest selection failed.";
    return original_click_indices;
  }

  // The remaining engines are optional; each one is consulted only when it
  // has been set up.
  if (knowledge_engine_ != nullptr) {
    if (!knowledge_engine_->Chunk(context, options.annotation_usecase,
                                  options.location_context, &candidates)) {
      TC3_LOG(ERROR) << "Knowledge suggest selection failed.";
      return original_click_indices;
    }
  }
  if (contact_engine_ != nullptr) {
    if (!contact_engine_->Chunk(context_unicode, tokens, &candidates)) {
      TC3_LOG(ERROR) << "Contact suggest selection failed.";
      return original_click_indices;
    }
  }
  if (installed_app_engine_ != nullptr) {
    if (!installed_app_engine_->Chunk(context_unicode, tokens, &candidates)) {
      TC3_LOG(ERROR) << "Installed app suggest selection failed.";
      return original_click_indices;
    }
  }
  if (number_annotator_ != nullptr) {
    if (!number_annotator_->FindAll(context_unicode,
                                    options.annotation_usecase, &candidates)) {
      TC3_LOG(ERROR) << "Number annotator failed in suggest selection.";
      return original_click_indices;
    }
  }
  if (duration_annotator_ != nullptr) {
    if (!duration_annotator_->FindAll(context_unicode, tokens,
                                      options.annotation_usecase,
                                      &candidates)) {
      TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
      return original_click_indices;
    }
  }
  if (person_name_engine_ != nullptr) {
    if (!person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {
      TC3_LOG(ERROR) << "Person name suggest selection failed.";
      return original_click_indices;
    }
  }

  // The grammar annotator contributes at most one span; a false result here
  // is not treated as an error.
  AnnotatedSpan grammar_suggested_span;
  if (grammar_annotator_ != nullptr &&
      grammar_annotator_->SuggestSelection(detected_text_language_tags,
                                           context_unicode, click_indices,
                                           &grammar_suggested_span)) {
    candidates.push_back(grammar_suggested_span);
  }

  // Order candidates by start position so that every connected component of
  // overlapping spans forms a contiguous block, as the conflict resolution
  // expects.
  std::sort(candidates.begin(), candidates.end(),
            [](const AnnotatedSpan& lhs, const AnnotatedSpan& rhs) {
              return lhs.span.first < rhs.span.first;
            });

  std::vector<int> candidate_indices;
  const bool conflicts_resolved = ResolveConflicts(
      candidates, context, tokens, detected_text_language_tags,
      options.annotation_usecase, &interpreter_manager, &candidate_indices);
  if (!conflicts_resolved) {
    TC3_LOG(ERROR) << "Couldn't resolve conflicts.";
    return original_click_indices;
  }

  // Visit the survivors from highest to lowest priority score.
  std::sort(candidate_indices.begin(), candidate_indices.end(),
            [this, &candidates](int lhs, int rhs) {
              return GetPriorityScore(candidates[lhs].classification) >
                     GetPriorityScore(candidates[rhs].classification);
            });

  for (const int index : candidate_indices) {
    // The candidate has to overlap both the (possibly snapped) click and the
    // caller's original selection.
    const bool overlaps_click =
        SpansOverlap(candidates[index].span, click_indices) &&
        SpansOverlap(candidates[index].span, original_click_indices);
    if (!overlaps_click) {
      continue;
    }

    // Run model classification if not present but requested and there's a
    // classification collection filter specified.
    if (candidates[index].classification.empty() &&
        model_->selection_options()->always_classify_suggested_selection() &&
        !filtered_collections_selection_.empty()) {
      if (!ModelClassifyText(context, detected_text_language_tags,
                             candidates[index].span, &interpreter_manager,
                             /*embedding_cache=*/nullptr,
                             &candidates[index].classification)) {
        return original_click_indices;
      }
    }

    // A filtered collection suppresses the suggestion altogether.
    if (FilteredForSelection(candidates[index])) {
      return original_click_indices;
    }

    return candidates[index].span;
  }

  return original_click_indices;
}

namespace {

// Helper function that returns the index of the first candidate that

973

// transitively does not overlap with the candidate on 'start_index'. If the end

974

// of 'candidates' is reached, it returns the index that points right behind the

975

// array.

976

int FirstNonOverlappingSpanIndex(const std::vector<AnnotatedSpan>& candidates,

977

int start_index) {

978

int first_non_overlapping = start_index + 1;

979

CodepointSpan conflicting_span = candidates[start_index].span;

980

while (

981

first_non_overlapping < candidates.size() &&

982

SpansOverlap(conflicting_span, candidates[first_non_overlapping].span)) {

983

// Grow the span to include the current one.

984

conflicting_span.second = std::max(

985

conflicting_span.second, candidates[first_non_overlapping].span.second);

986

987

++first_non_overlapping;

988

}

989

return first_non_overlapping;

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

993

bool Annotator::ResolveConflicts(

994

const std::vector<AnnotatedSpan>& candidates, const std::string& context,

995

const std::vector<Token>& cached_tokens,

996

const std::vector<Locale>& detected_text_language_tags,

997

AnnotationUsecase annotation_usecase,

998

InterpreterManager* interpreter_manager, std::vector<int>* result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

999

result->clear();

1000

result->reserve(candidates.size());

1001

for (int i = 0; i < candidates.size();) {

1002

int first_non_overlapping =

1003

FirstNonOverlappingSpanIndex(candidates, /*start_index=*/i);

1004

1005

const bool conflict_found = first_non_overlapping != (i + 1);

1006

if (conflict_found) {

1007

std::vector<int> candidate_indices;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1008

if (!ResolveConflict(context, cached_tokens, candidates,

1009

detected_text_language_tags, i,

1010

first_non_overlapping, annotation_usecase,

1011

interpreter_manager, &candidate_indices)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1012

return false;

1013

}

1014

result->insert(result->end(), candidate_indices.begin(),

1015

candidate_indices.end());

1016

} else {

1017

result->push_back(i);

1018

}

1019

1020

// Skip over the whole conflicting group/go to next candidate.

1021

i = first_non_overlapping;

}

return true;

}

namespace {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1027

// Returns true, if the given two sources do conflict in given annotation

1028

// usecase.

1029

// - In SMART usecase, all sources do conflict, because there's only 1 possible

1030

// annotation for a given span.

1031

// - In RAW usecase, certain annotations are allowed to overlap (e.g. datetime

1032

// and duration), while others not (e.g. duration and number).

1033

bool DoSourcesConflict(AnnotationUsecase annotation_usecase,

1034

const AnnotatedSpan::Source source1,

1035

const AnnotatedSpan::Source source2) {

1036

uint32 source_mask =

1037

(1 << static_cast<int>(source1)) | (1 << static_cast<int>(source2));

1038

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1039

switch (annotation_usecase) {

1040

case AnnotationUsecase_ANNOTATION_USECASE_SMART:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1041

// In the SMART mode, all annotations conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1042

return true;

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1043

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1044

case AnnotationUsecase_ANNOTATION_USECASE_RAW:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1045

// DURATION and DATETIME do not conflict. E.g. "let's meet in 3 hours",

1046

// can have two non-conflicting annotations: "in 3 hours" (datetime), "3

1047

// hours" (duration).

1048

if ((source_mask &

1049

(1 << static_cast<int>(AnnotatedSpan::Source::DURATION))) &&

1050

(source_mask &

1051

(1 << static_cast<int>(AnnotatedSpan::Source::DATETIME)))) {

1052

return false;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1053

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1054

1055

// A KNOWLEDGE entity does not conflict with anything.

1056

if ((source_mask &

1057

(1 << static_cast<int>(AnnotatedSpan::Source::KNOWLEDGE)))) {

return false;

}

// Entities from other sources can conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1062

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1067

bool Annotator::ResolveConflict(

1068

const std::string& context, const std::vector<Token>& cached_tokens,

1069

const std::vector<AnnotatedSpan>& candidates,

1070

const std::vector<Locale>& detected_text_language_tags, int start_index,

1071

int end_index, AnnotationUsecase annotation_usecase,

1072

InterpreterManager* interpreter_manager,

1073

std::vector<int>* chosen_indices) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1074

std::vector<int> conflicting_indices;

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1075

std::unordered_map<int, std::pair<float, int>> scores_lengths;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1076

for (int i = start_index; i < end_index; ++i) {

1077

conflicting_indices.push_back(i);

1078

if (!candidates[i].classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1079

scores_lengths[i] = {

1080

GetPriorityScore(candidates[i].classification),

1081

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

continue;

}

// OPTIMIZATION: So that we don't have to classify all the ML model

1086

// spans apriori, we wait until we get here, when they conflict with

1087

// something and we need the actual classification scores. So if the

1088

// candidate conflicts and comes from the model, we need to run a

1089

// classification to determine its priority:

1090

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1091

if (!ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1092

candidates[i].span, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1093

/*embedding_cache=*/nullptr, &classification)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (!classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1098

scores_lengths[i] = {

1099

GetPriorityScore(classification),

1100

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2020-01-09 12:32:17 +0000

[diff] [blame]

1104

const bool prioritize_longest_annotation =

1105

model_->triggering_options() != nullptr &&

1106

model_->triggering_options()->prioritize_longest_annotation();

1107

std::sort(conflicting_indices.begin(), conflicting_indices.end(),

1108

[&scores_lengths, candidates, conflicting_indices,

1109

prioritize_longest_annotation](int i, int j) {

1110

if (scores_lengths[i].first == scores_lengths[j].first &&

1111

prioritize_longest_annotation) {

1112

return scores_lengths[i].second > scores_lengths[j].second;

1113

}

1114

return scores_lengths[i].first > scores_lengths[j].first;

1115

});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1116

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1117

// Here we keep a set of indices that were chosen, per-source, to enable

1118

// effective computation.

1119

std::unordered_map<AnnotatedSpan::Source, SortedIntSet>

1120

chosen_indices_for_source_map;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1121

1122

// Greedily place the candidates if they don't conflict with the already

1123

// placed ones.

1124

for (int i = 0; i < conflicting_indices.size(); ++i) {

1125

const int considered_candidate = conflicting_indices[i];

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1126

1127

// See if there is a conflict between the candidate and all already placed

1128

// candidates.

1129

bool conflict = false;

1130

SortedIntSet* chosen_indices_for_source_ptr = nullptr;

1131

for (auto& source_set_pair : chosen_indices_for_source_map) {

1132

if (source_set_pair.first == candidates[considered_candidate].source) {

1133

chosen_indices_for_source_ptr = &source_set_pair.second;

1134

}

1135

1136

if (DoSourcesConflict(annotation_usecase, source_set_pair.first,

1137

candidates[considered_candidate].source) &&

1138

DoesCandidateConflict(considered_candidate, candidates,

1139

source_set_pair.second)) {

1140

conflict = true;

1141

break;

1142

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1143

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1144

1145

// Skip the candidate if a conflict was found.

if (conflict) {

continue;

}

// If the set of indices for the current source doesn't exist yet,

1151

// initialize it.

1152

if (chosen_indices_for_source_ptr == nullptr) {

1153

SortedIntSet new_set([&candidates](int a, int b) {

1154

return candidates[a].span.first < candidates[b].span.first;

1155

});

1156

chosen_indices_for_source_map[candidates[considered_candidate].source] =

1157

std::move(new_set);

1158

chosen_indices_for_source_ptr =

1159

&chosen_indices_for_source_map[candidates[considered_candidate]

.source];

}

// Place the candidate to the output and to the per-source conflict set.

1164

chosen_indices->push_back(considered_candidate);

1165

chosen_indices_for_source_ptr->insert(considered_candidate);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1166

}

1167

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1168

std::sort(chosen_indices->begin(), chosen_indices->end());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1173

bool Annotator::ModelSuggestSelection(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1174

const UnicodeText& context_unicode, CodepointSpan click_indices,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1175

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1176

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1177

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1178

if (model_->triggering_options() == nullptr ||

1179

!(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1183

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1184

ml_model_triggering_locales_,

1185

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1189

int click_pos;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1190

*tokens = selection_feature_processor_->Tokenize(context_unicode);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1191

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1192

context_unicode, click_indices,

1193

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1194

tokens, &click_pos);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1195

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1196

TC3_VLOG(1) << "Could not calculate the click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1197

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1198

}

1199

1200

const int symmetry_context_size =

1201

model_->selection_options()->symmetry_context_size();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1202

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1203

bounds_sensitive_features = selection_feature_processor_->GetOptions()

1204

->bounds_sensitive_features();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1205

1206

// The symmetry context span is the clicked token with symmetry_context_size

1207

// tokens on either side.

1208

const TokenSpan symmetry_context_span = IntersectTokenSpans(

1209

ExpandTokenSpan(SingleTokenSpan(click_pos),

1210

/*num_tokens_left=*/symmetry_context_size,

1211

/*num_tokens_right=*/symmetry_context_size),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1212

{0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1213

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1214

// Compute the extraction span based on the model type.

1215

TokenSpan extraction_span;

1216

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1217

// The extraction span is the symmetry context span expanded to include

1218

// max_selection_span tokens on either side, which is how far a selection

1219

// can stretch from the click, plus a relevant number of tokens outside of

1220

// the bounds of the selection.

1221

const int max_selection_span =

1222

selection_feature_processor_->GetOptions()->max_selection_span();

1223

extraction_span =

1224

ExpandTokenSpan(symmetry_context_span,

1225

/*num_tokens_left=*/max_selection_span +

1226

bounds_sensitive_features->num_tokens_before(),

1227

/*num_tokens_right=*/max_selection_span +

1228

bounds_sensitive_features->num_tokens_after());

1229

} else {

1230

// The extraction span is the symmetry context span expanded to include

1231

// context_size tokens on either side.

1232

const int context_size =

1233

selection_feature_processor_->GetOptions()->context_size();

1234

extraction_span = ExpandTokenSpan(symmetry_context_span,

1235

/*num_tokens_left=*/context_size,

1236

/*num_tokens_right=*/context_size);

1237

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1238

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1239

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1240

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1241

*tokens, extraction_span)) {

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1245

std::unique_ptr<CachedFeatures> cached_features;

1246

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1247

*tokens, extraction_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1248

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1249

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1250

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1251

selection_feature_processor_->EmbeddingSize() +

1252

selection_feature_processor_->DenseFeaturesCount(),

1253

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1254

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Produce selection model candidates.

1259

std::vector<TokenSpan> chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1260

if (!ModelChunk(tokens->size(), /*span_of_interest=*/symmetry_context_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1261

interpreter_manager->SelectionInterpreter(), *cached_features,

1262

&chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1263

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

for (const TokenSpan& chunk : chunks) {

1268

AnnotatedSpan candidate;

1269

candidate.span = selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1270

context_unicode, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1271

if (model_->selection_options()->strip_unpaired_brackets()) {

1272

candidate.span =

1273

StripUnpairedBrackets(context_unicode, candidate.span, *unilib_);

1274

}

1275

1276

// Only output non-empty spans.

1277

if (candidate.span.first != candidate.span.second) {

1278

result->push_back(candidate);

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1284

bool Annotator::ModelClassifyText(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1285

const std::string& context,

1286

const std::vector<Locale>& detected_text_language_tags,

1287

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1288

FeatureProcessor::EmbeddingCache* embedding_cache,

1289

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1290

return ModelClassifyText(context, {}, detected_text_language_tags,

1291

selection_indices, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1292

embedding_cache, classification_results);

}

namespace internal {

std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,

1297

CodepointSpan selection_indices,

1298

TokenSpan tokens_around_selection_to_copy) {

1299

const auto first_selection_token = std::upper_bound(

1300

cached_tokens.begin(), cached_tokens.end(), selection_indices.first,

1301

[](int selection_start, const Token& token) {

1302

return selection_start < token.end;

1303

});

1304

const auto last_selection_token = std::lower_bound(

1305

cached_tokens.begin(), cached_tokens.end(), selection_indices.second,

1306

[](const Token& token, int selection_end) {

1307

return token.start < selection_end;

1308

});

1309

1310

const int64 first_token = std::max(

1311

static_cast<int64>(0),

1312

static_cast<int64>((first_selection_token - cached_tokens.begin()) -

1313

tokens_around_selection_to_copy.first));

1314

const int64 last_token = std::min(

1315

static_cast<int64>(cached_tokens.size()),

1316

static_cast<int64>((last_selection_token - cached_tokens.begin()) +

1317

tokens_around_selection_to_copy.second));

1318

1319

std::vector<Token> tokens;

1320

tokens.reserve(last_token - first_token);

1321

for (int i = first_token; i < last_token; ++i) {

1322

tokens.push_back(cached_tokens[i]);

}

return tokens;

}

} // namespace internal

1327

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1328

// Returns an upper bound on the number of tokens {before, after} the
// selection that the classification model needs for feature extraction.
//
// All values are read from the classification feature processor, the same
// one ModelClassifyText uses to build its extraction span, so that the two
// stay in agreement.
TokenSpan Annotator::ClassifyTextUpperBoundNeededTokens() const {
  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features =
          classification_feature_processor_->GetOptions()
              ->bounds_sensitive_features();
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    // The extraction span is the selection span expanded to include a relevant
    // number of tokens outside of the bounds of the selection.
    return {bounds_sensitive_features->num_tokens_before(),
            bounds_sensitive_features->num_tokens_after()};
  } else {
    // The extraction span is the clicked token with context_size tokens on
    // either side.
    const int context_size =
        classification_feature_processor_->GetOptions()->context_size();
    return {context_size, context_size};
  }
}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1347

namespace {

1348

// Sorts the classification results from high score to low score.

1349

void SortClassificationResults(

1350

std::vector<ClassificationResult>* classification_results) {

1351

std::sort(classification_results->begin(), classification_results->end(),

1352

[](const ClassificationResult& a, const ClassificationResult& b) {

1353

return a.score > b.score;

});

}

} // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1358

bool Annotator::ModelClassifyText(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1359

const std::string& context, const std::vector<Token>& cached_tokens,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1360

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1361

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1362

FeatureProcessor::EmbeddingCache* embedding_cache,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1363

std::vector<ClassificationResult>* classification_results) const {

1364

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1365

return ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1366

selection_indices, interpreter_manager,

1367

embedding_cache, classification_results, &tokens);

1368

}

1369

1370

bool Annotator::ModelClassifyText(

1371

const std::string& context, const std::vector<Token>& cached_tokens,

1372

const std::vector<Locale>& detected_text_language_tags,

1373

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1374

FeatureProcessor::EmbeddingCache* embedding_cache,

1375

std::vector<ClassificationResult>* classification_results,

1376

std::vector<Token>* tokens) const {

1377

if (model_->triggering_options() == nullptr ||

1378

!(model_->triggering_options()->enabled_modes() &

1379

ModeFlag_CLASSIFICATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1383

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1384

ml_model_triggering_locales_,

1385

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1389

if (cached_tokens.empty()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1390

*tokens = classification_feature_processor_->Tokenize(context);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1391

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1392

*tokens = internal::CopyCachedTokens(cached_tokens, selection_indices,

1393

ClassifyTextUpperBoundNeededTokens());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1394

}

1395

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1396

int click_pos;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1397

classification_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1398

context, selection_indices,

1399

classification_feature_processor_->GetOptions()

1400

->only_use_line_with_click(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1401

tokens, &click_pos);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1402

const TokenSpan selection_token_span =

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1403

CodepointSpanToTokenSpan(*tokens, selection_indices);

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1404

const int selection_num_tokens = TokenSpanSize(selection_token_span);

1405

if (model_->classification_options()->max_num_tokens() > 0 &&

1406

model_->classification_options()->max_num_tokens() <

1407

selection_num_tokens) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1408

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1412

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

1413

bounds_sensitive_features =

1414

classification_feature_processor_->GetOptions()

1415

->bounds_sensitive_features();

1416

if (selection_token_span.first == kInvalidIndex ||

1417

selection_token_span.second == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1418

TC3_LOG(ERROR) << "Could not determine span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Compute the extraction span based on the model type.

1423

TokenSpan extraction_span;

1424

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1425

// The extraction span is the selection span expanded to include a relevant

1426

// number of tokens outside of the bounds of the selection.

1427

extraction_span = ExpandTokenSpan(

1428

selection_token_span,

1429

/*num_tokens_left=*/bounds_sensitive_features->num_tokens_before(),

1430

/*num_tokens_right=*/bounds_sensitive_features->num_tokens_after());

1431

} else {

1432

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1433

TC3_LOG(ERROR) << "Couldn't choose a click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1434

return false;

1435

}

1436

// The extraction span is the clicked token with context_size tokens on

1437

// either side.

1438

const int context_size =

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1439

classification_feature_processor_->GetOptions()->context_size();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1440

extraction_span = ExpandTokenSpan(SingleTokenSpan(click_pos),

1441

/*num_tokens_left=*/context_size,

1442

/*num_tokens_right=*/context_size);

1443

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1444

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1445

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1446

if (!classification_feature_processor_->HasEnoughSupportedCodepoints(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1447

*tokens, extraction_span)) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1448

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1452

std::unique_ptr<CachedFeatures> cached_features;

1453

if (!classification_feature_processor_->ExtractFeatures(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1454

*tokens, extraction_span, selection_indices,

1455

embedding_executor_.get(), embedding_cache,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1456

classification_feature_processor_->EmbeddingSize() +

1457

classification_feature_processor_->DenseFeaturesCount(),

1458

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1459

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1460

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1461

}

1462

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1463

std::vector<float> features;

1464

features.reserve(cached_features->OutputFeaturesSize());

1465

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1466

cached_features->AppendBoundsSensitiveFeaturesForSpan(selection_token_span,

1467

&features);

1468

} else {

1469

cached_features->AppendClickContextFeaturesForClick(click_pos, &features);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1470

}

1471

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1472

TensorView<float> logits = classification_executor_->ComputeLogits(

1473

TensorView<float>(features.data(),

1474

{1, static_cast<int>(features.size())}),

1475

interpreter_manager->ClassificationInterpreter());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1476

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1477

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (logits.dims() != 2 || logits.dim(0) != 1 ||

1482

logits.dim(1) != classification_feature_processor_->NumCollections()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1483

TC3_LOG(ERROR) << "Mismatching output";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

const std::vector<float> scores =

1488

ComputeSoftmax(logits.data(), logits.dim(1));

1489

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1490

if (scores.empty()) {

1491

*classification_results = {{Collections::Other(), 1.0}};

1492

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1493

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1494

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1495

const int best_score_index =

1496

std::max_element(scores.begin(), scores.end()) - scores.begin();

1497

const std::string top_collection =

1498

classification_feature_processor_->LabelToCollection(best_score_index);

1499

1500

// Sanity checks.

1501

if (top_collection == Collections::Phone()) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1502

const int digit_count = CountDigits(context, selection_indices);

1503

if (digit_count <

1504

model_->classification_options()->phone_min_num_digits() ||

1505

digit_count >

1506

model_->classification_options()->phone_max_num_digits()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1507

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1508

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1509

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1510

} else if (top_collection == Collections::Address()) {

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1511

if (selection_num_tokens <

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1512

model_->classification_options()->address_min_num_tokens()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1513

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1514

return true;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1515

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1516

} else if (top_collection == Collections::Dictionary()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1517

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1518

dictionary_locales_,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1519

/*default_value=*/false)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1520

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1521

return true;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1522

}

1523

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1524

1525

*classification_results = {{top_collection, 1.0, scores[best_score_index]}};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1526

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1527

}

1528

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1529

bool Annotator::RegexClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1530

const std::string& context, CodepointSpan selection_indices,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1531

std::vector<ClassificationResult>* classification_result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1532

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1533

UTF8ToUnicodeText(context, /*do_copy=*/false)

1534

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1535

const UnicodeText selection_text_unicode(

1536

UTF8ToUnicodeText(selection_text, /*do_copy=*/false));

1537

1538

// Check whether any of the regular expressions match.

1539

for (const int pattern_id : classification_regex_patterns_) {

1540

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

1541

const std::unique_ptr<UniLib::RegexMatcher> matcher =

1542

regex_pattern.pattern->Matcher(selection_text_unicode);

1543

int status = UniLib::RegexMatcher::kNoError;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1544

bool matches;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1545

if (regex_pattern.config->use_approximate_matching()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1546

matches = matcher->ApproximatelyMatches(&status);

1547

} else {

1548

matches = matcher->Matches(&status);

1549

}

1550

if (status != UniLib::RegexMatcher::kNoError) {

1551

return false;

1552

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1553

if (matches && VerifyRegexMatchCandidate(

1554

context, regex_pattern.config->verification_options(),

1555

selection_text, matcher.get())) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1556

classification_result->push_back(

1557

{regex_pattern.config->collection_name()->str(),

1558

regex_pattern.config->target_classification_score(),

1559

regex_pattern.config->priority_score()});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1560

if (!SerializedEntityDataFromRegexMatch(

1561

regex_pattern.config, matcher.get(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1562

&classification_result->back().serialized_entity_data)) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1563

TC3_LOG(ERROR) << "Could not get entity data.";

1564

return false;

1565

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1569

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1570

}

1571

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1572

namespace {

1573

std::string PickCollectionForDatetime(

1574

const DatetimeParseResult& datetime_parse_result) {

1575

switch (datetime_parse_result.granularity) {

1576

case GRANULARITY_HOUR:

1577

case GRANULARITY_MINUTE:

1578

case GRANULARITY_SECOND:

1579

return Collections::DateTime();

1580

default:

1581

return Collections::Date();

1582

}

1583

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1584

1585

std::string CreateDatetimeSerializedEntityData(

1586

const DatetimeParseResult& parse_result) {

1587

EntityDataT entity_data;

1588

entity_data.datetime.reset(new EntityData_::DatetimeT());

1589

entity_data.datetime->time_ms_utc = parse_result.time_ms_utc;

1590

entity_data.datetime->granularity =

1591

static_cast<EntityData_::Datetime_::Granularity>(

1592

parse_result.granularity);

1593

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1594

for (const auto& c : parse_result.datetime_components) {

1595

EntityData_::Datetime_::DatetimeComponentT datetime_component;

1596

datetime_component.absolute_value = c.value;

1597

datetime_component.relative_count = c.relative_count;

1598

datetime_component.component_type =

1599

static_cast<EntityData_::Datetime_::DatetimeComponent_::ComponentType>(

1600

c.component_type);

1601

datetime_component.relation_type =

1602

EntityData_::Datetime_::DatetimeComponent_::RelationType_ABSOLUTE;

1603

if (c.relative_qualifier !=

1604

DatetimeComponent::RelativeQualifier::UNSPECIFIED) {

1605

datetime_component.relation_type =

1606

EntityData_::Datetime_::DatetimeComponent_::RelationType_RELATIVE;

1607

}

1608

entity_data.datetime->datetime_component.emplace_back(

1609

new EntityData_::Datetime_::DatetimeComponentT(datetime_component));

1610

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1611

flatbuffers::FlatBufferBuilder builder;

1612

FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));

1613

return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),

1614

builder.GetSize());

1615

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1616

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1617

} // namespace

1618

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1619

bool Annotator::DatetimeClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1620

const std::string& context, CodepointSpan selection_indices,

1621

const ClassificationOptions& options,

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1622

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1623

if (!datetime_parser_ && !cfg_datetime_parser_) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return false;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1627

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1628

UTF8ToUnicodeText(context, /*do_copy=*/false)

1629

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1630

1631

std::vector<DatetimeParseResultSpan> datetime_spans;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1632

if (cfg_datetime_parser_) {

1633

if (!(model_->grammar_datetime_model()->enabled_modes() &

1634

ModeFlag_CLASSIFICATION)) {

1635

return true;

1636

}

1637

std::vector<Locale> parsed_locales;

1638

ParseLocales(options.locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

1639

cfg_datetime_parser_->Parse(

1640

selection_text,

1641

ToDateAnnotationOptions(

1642

model_->grammar_datetime_model()->annotation_options(),

1643

options.reference_timezone, options.reference_time_ms_utc),

1644

parsed_locales, &datetime_spans);

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1645

} else if (datetime_parser_) {

1646

if (!datetime_parser_->Parse(selection_text, options.reference_time_ms_utc,

1647

options.reference_timezone, options.locales,

1648

ModeFlag_CLASSIFICATION,

1649

options.annotation_usecase,

1650

/*anchor_start_end=*/true, &datetime_spans)) {

1651

TC3_LOG(ERROR) << "Error during parsing datetime.";

1652

return false;

1653

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1654

}

1655

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

1656

// Only consider the result valid if the selection and extracted datetime

1657

// spans exactly match.

1658

if (std::make_pair(datetime_span.span.first + selection_indices.first,

1659

datetime_span.span.second + selection_indices.first) ==

1660

selection_indices) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1661

for (const DatetimeParseResult& parse_result : datetime_span.data) {

1662

classification_results->emplace_back(

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1663

PickCollectionForDatetime(parse_result),

1664

datetime_span.target_classification_score);

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1665

classification_results->back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1666

classification_results->back().serialized_entity_data =

1667

CreateDatetimeSerializedEntityData(parse_result);

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1668

classification_results->back().priority_score =

1669

datetime_span.priority_score;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1670

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1671

return true;

1672

}

1673

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1674

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1675

}

1676

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1677

std::vector<ClassificationResult> Annotator::ClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1678

const std::string& context, CodepointSpan selection_indices,

1679

const ClassificationOptions& options) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1680

if (!initialized_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1681

TC3_LOG(ERROR) << "Not initialized";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1685

if (!(model_->enabled_modes() & ModeFlag_CLASSIFICATION)) {

return {};

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1689

std::vector<Locale> detected_text_language_tags;

1690

if (!ParseLocales(options.detected_text_language_tags,

1691

&detected_text_language_tags)) {

1692

TC3_LOG(WARNING)

1693

<< "Failed to parse the detected_text_language_tags in options: "

1694

<< options.detected_text_language_tags;

1695

}

1696

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1697

model_triggering_locales_,

1698

/*default_value=*/true)) {

return {};

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

1702

if (!IsValidSpanInput(UTF8ToUnicodeText(context, /*do_copy=*/false),

1703

selection_indices)) {

1704

TC3_VLOG(1) << "Trying to run ClassifyText with invalid input: "

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1705

<< std::get<0>(selection_indices) << " "

1706

<< std::get<1>(selection_indices);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1710

// We'll accumulate a list of candidates, and pick the best candidate in the

1711

// end.

1712

std::vector<AnnotatedSpan> candidates;

1713

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1714

// Try the knowledge engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1715

// TODO(b/126579108): Propagate error status.

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1716

ClassificationResult knowledge_result;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1717

if (knowledge_engine_ &&

1718

knowledge_engine_->ClassifyText(

1719

context, selection_indices, options.annotation_usecase,

1720

options.location_context, &knowledge_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1721

candidates.push_back({selection_indices, {knowledge_result}});

1722

candidates.back().source = AnnotatedSpan::Source::KNOWLEDGE;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1723

}

1724

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1725

AddContactMetadataToKnowledgeClassificationResults(&candidates);

1726

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1727

// Try the contact engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1728

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1729

ClassificationResult contact_result;

1730

if (contact_engine_ && contact_engine_->ClassifyText(

1731

context, selection_indices, &contact_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1732

candidates.push_back({selection_indices, {contact_result}});

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1733

}

1734

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1735

// Try the person name engine.

1736

ClassificationResult person_name_result;

1737

if (person_name_engine_ &&

1738

person_name_engine_->ClassifyText(context, selection_indices,

1739

&person_name_result)) {

1740

candidates.push_back({selection_indices, {person_name_result}});

1741

}

1742

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1743

// Try the installed app engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1744

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1745

ClassificationResult installed_app_result;

1746

if (installed_app_engine_ &&

1747

installed_app_engine_->ClassifyText(context, selection_indices,

1748

&installed_app_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1749

candidates.push_back({selection_indices, {installed_app_result}});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1750

}

1751

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1752

// Try the regular expression models.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1753

std::vector<ClassificationResult> regex_results;

1754

if (!RegexClassifyText(context, selection_indices, &regex_results)) {

1755

return {};

1756

}

1757

for (const ClassificationResult& result : regex_results) {

1758

candidates.push_back({selection_indices, {result}});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1759

}

1760

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1761

// Try the date model.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1762

//

1763

// DatetimeClassifyText only returns the first result, which can however have

1764

// more interpretations. They are inserted in the candidates as a single

1765

// AnnotatedSpan, so that they get treated together by the conflict resolution

1766

// algorithm.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1767

std::vector<ClassificationResult> datetime_results;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1768

if (!DatetimeClassifyText(context, selection_indices, options,

1769

&datetime_results)) {

1770

return {};

1771

}

1772

if (!datetime_results.empty()) {

1773

candidates.push_back({selection_indices, std::move(datetime_results)});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1774

candidates.back().source = AnnotatedSpan::Source::DATETIME;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1775

}

1776

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1777

// Try the number annotator.

1778

// TODO(b/126579108): Propagate error status.

1779

ClassificationResult number_annotator_result;

1780

if (number_annotator_ &&

1781

number_annotator_->ClassifyText(

1782

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1783

options.annotation_usecase, &number_annotator_result)) {

1784

candidates.push_back({selection_indices, {number_annotator_result}});

1785

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1786

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1787

// Try the duration annotator.

1788

ClassificationResult duration_annotator_result;

1789

if (duration_annotator_ &&

1790

duration_annotator_->ClassifyText(

1791

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1792

options.annotation_usecase, &duration_annotator_result)) {

1793

candidates.push_back({selection_indices, {duration_annotator_result}});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1794

candidates.back().source = AnnotatedSpan::Source::DURATION;

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1795

}

1796

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1797

// Try the translate annotator.

1798

ClassificationResult translate_annotator_result;

1799

if (translate_annotator_ &&

1800

translate_annotator_->ClassifyText(

1801

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1802

options.user_familiar_language_tags, &translate_annotator_result)) {

1803

candidates.push_back({selection_indices, {translate_annotator_result}});

1804

}

1805

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

1806

// Try the grammar model.

1807

ClassificationResult grammar_annotator_result;

1808

if (grammar_annotator_ && grammar_annotator_->ClassifyText(

1809

detected_text_language_tags,

1810

UTF8ToUnicodeText(context, /*do_copy=*/false),

1811

selection_indices, &grammar_annotator_result)) {

1812

candidates.push_back({selection_indices, {grammar_annotator_result}});

1813

}

1814

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1815

// Try the ML model.

1816

//

1817

// The output of the model is considered as an exclusive 1-of-N choice. That's

1818

// why it's inserted as only 1 AnnotatedSpan into candidates, as opposed to 1

1819

// span for each candidate, like e.g. the regex model.

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1820

InterpreterManager interpreter_manager(selection_executor_.get(),

1821

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1822

std::vector<ClassificationResult> model_results;

1823

std::vector<Token> tokens;

1824

if (!ModelClassifyText(

1825

context, /*cached_tokens=*/{}, detected_text_language_tags,

1826

selection_indices, &interpreter_manager,

1827

/*embedding_cache=*/nullptr, &model_results, &tokens)) {

1828

return {};

1829

}

1830

if (!model_results.empty()) {

1831

candidates.push_back({selection_indices, std::move(model_results)});

1832

}

1833

1834

std::vector<int> candidate_indices;

1835

if (!ResolveConflicts(candidates, context, tokens,

1836

detected_text_language_tags, options.annotation_usecase,

1837

&interpreter_manager, &candidate_indices)) {

1838

TC3_LOG(ERROR) << "Couldn't resolve conflicts.";

return {};

}

std::vector<ClassificationResult> results;

1843

for (const int i : candidate_indices) {

1844

for (const ClassificationResult& result : candidates[i].classification) {

1845

if (!FilteredForClassification(result)) {

1846

results.push_back(result);

1847

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1848

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1849

}

1850

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1851

// Sort results according to score.

1852

std::sort(results.begin(), results.end(),

1853

[](const ClassificationResult& a, const ClassificationResult& b) {

1854

return a.score > b.score;

1855

});

1856

1857

if (results.empty()) {

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1858

results = {{Collections::Other(), 1.0}};

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1859

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1860

return results;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1861

}

1862

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1863

bool Annotator::ModelAnnotate(

1864

const std::string& context,

1865

const std::vector<Locale>& detected_text_language_tags,

1866

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

1867

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1868

if (model_->triggering_options() == nullptr ||

1869

!(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1873

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1874

ml_model_triggering_locales_,

1875

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1879

const UnicodeText context_unicode = UTF8ToUnicodeText(context,

1880

/*do_copy=*/false);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1881

std::vector<UnicodeTextRange> lines;

1882

if (!selection_feature_processor_->GetOptions()->only_use_line_with_click()) {

1883

lines.push_back({context_unicode.begin(), context_unicode.end()});

1884

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1885

lines = selection_feature_processor_->SplitContext(

1886

context_unicode, selection_feature_processor_->GetOptions()

1887

->use_pipe_character_for_newline());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1888

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1889

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1890

const float min_annotate_confidence =

1891

(model_->triggering_options() != nullptr

1892

? model_->triggering_options()->min_annotate_confidence()

1893

: 0.f);

1894

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1895

for (const UnicodeTextRange& line : lines) {

Tony Mak

408c6b8

2019-03-08 17:57:27 +0000

[diff] [blame]

1896

FeatureProcessor::EmbeddingCache embedding_cache;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1897

const std::string line_str =

1898

UnicodeText::UTF8Substring(line.first, line.second);

1899

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1900

*tokens = selection_feature_processor_->Tokenize(line_str);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1901

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1902

line_str, {0, std::distance(line.first, line.second)},

1903

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1904

tokens,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1905

/*click_pos=*/nullptr);

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1906

const TokenSpan full_line_span = {0, tokens->size()};

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1907

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1908

// TODO(zilka): Add support for greater granularity of this check.

1909

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1910

*tokens, full_line_span)) {

continue;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1914

std::unique_ptr<CachedFeatures> cached_features;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1915

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1916

*tokens, full_line_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1917

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1918

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1919

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1920

selection_feature_processor_->EmbeddingSize() +

1921

selection_feature_processor_->DenseFeaturesCount(),

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1922

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1923

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1924

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1925

}

1926

1927

std::vector<TokenSpan> local_chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1928

if (!ModelChunk(tokens->size(), /*span_of_interest=*/full_line_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1929

interpreter_manager->SelectionInterpreter(),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1930

*cached_features, &local_chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1931

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1932

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1933

}

1934

1935

const int offset = std::distance(context_unicode.begin(), line.first);

1936

for (const TokenSpan& chunk : local_chunks) {

1937

const CodepointSpan codepoint_span =

1938

selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1939

line_str, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1940

1941

// Skip empty spans.

1942

if (codepoint_span.first != codepoint_span.second) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1943

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1944

if (!ModelClassifyText(line_str, *tokens, detected_text_language_tags,

1945

codepoint_span, interpreter_manager,

1946

&embedding_cache, &classification)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1947

TC3_LOG(ERROR) << "Could not classify text: "

1948

<< (codepoint_span.first + offset) << " "

1949

<< (codepoint_span.second + offset);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return false;

}

// Do not include the span if it's classified as "other".

1954

if (!classification.empty() && !ClassifiedAsOther(classification) &&

1955

classification[0].score >= min_annotate_confidence) {

1956

AnnotatedSpan result_span;

1957

result_span.span = {codepoint_span.first + offset,

1958

codepoint_span.second + offset};

1959

result_span.classification = std::move(classification);

1960

result->push_back(std::move(result_span));

1961

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1962

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1963

}

1964

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1968

// Test-only accessor: returns a non-owning pointer to the feature processor
// held in selection_feature_processor_.
const FeatureProcessor* Annotator::SelectionFeatureProcessorForTests() const {
  const FeatureProcessor* const processor = selection_feature_processor_.get();
  return processor;
}

1971

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1972

// Test-only accessor: returns a non-owning pointer to the feature processor
// held in classification_feature_processor_.
const FeatureProcessor* Annotator::ClassificationFeatureProcessorForTests()
    const {
  const FeatureProcessor* const processor =
      classification_feature_processor_.get();
  return processor;
}

1976

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1977

// Test-only accessor: returns a non-owning pointer to the parser held in
// datetime_parser_.
const DatetimeParser* Annotator::DatetimeParserForTests() const {
  const DatetimeParser* const parser = datetime_parser_.get();
  return parser;
}

1980

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1981

void Annotator::RemoveNotEnabledEntityTypes(

1982

const EnabledEntityTypes& is_entity_type_enabled,

1983

std::vector<AnnotatedSpan>* annotated_spans) const {

1984

for (AnnotatedSpan& annotated_span : *annotated_spans) {

1985

std::vector<ClassificationResult>& classifications =

1986

annotated_span.classification;

1987

classifications.erase(

1988

std::remove_if(classifications.begin(), classifications.end(),

1989

[&is_entity_type_enabled](

1990

const ClassificationResult& classification_result) {

1991

return !is_entity_type_enabled(

1992

classification_result.collection);

1993

}),

1994

classifications.end());

1995

}

1996

annotated_spans->erase(

1997

std::remove_if(annotated_spans->begin(), annotated_spans->end(),

1998

[](const AnnotatedSpan& annotated_span) {

1999

return annotated_span.classification.empty();

2000

}),

2001

annotated_spans->end());

2002

}

2003

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2004

void Annotator::AddContactMetadataToKnowledgeClassificationResults(

2005

std::vector<AnnotatedSpan>* candidates) const {

2006

if (candidates == nullptr || contact_engine_ == nullptr) {

2007

return;

2008

}

2009

for (auto& candidate : *candidates) {

2010

for (auto& classification_result : candidate.classification) {

2011

contact_engine_->AddContactMetadataToKnowledgeClassificationResult(

2012

&classification_result);

}

}

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2017

std::vector<AnnotatedSpan> Annotator::Annotate(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2018

const std::string& context, const AnnotationOptions& options) const {

2019

std::vector<AnnotatedSpan> candidates;

2020

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2021

if (!(model_->enabled_modes() & ModeFlag_ANNOTATION)) {

return {};

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2025

const UnicodeText context_unicode =

2026

UTF8ToUnicodeText(context, /*do_copy=*/false);

2027

if (!context_unicode.is_valid()) {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2031

std::vector<Locale> detected_text_language_tags;

2032

if (!ParseLocales(options.detected_text_language_tags,

2033

&detected_text_language_tags)) {

2034

TC3_LOG(WARNING)

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2035

<< "Failed to parse the detected_text_language_tags in options: "

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2036

<< options.detected_text_language_tags;

2037

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2038

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

2039

model_triggering_locales_,

2040

/*default_value=*/true)) {

return {};

}

InterpreterManager interpreter_manager(selection_executor_.get(),

2045

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2046

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2047

// Annotate with the selection model.

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2048

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2049

if (!ModelAnnotate(context, detected_text_language_tags, &interpreter_manager,

2050

&tokens, &candidates)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2051

TC3_LOG(ERROR) << "Couldn't run ModelAnnotate.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

// Annotate with the regular expression models.

2056

if (!RegexChunk(UTF8ToUnicodeText(context, /*do_copy=*/false),

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2057

annotation_regex_patterns_, &candidates,

2058

options.is_serialized_entity_data_enabled)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2059

TC3_LOG(ERROR) << "Couldn't run RegexChunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

// Annotate with the datetime model.

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2064

const EnabledEntityTypes is_entity_type_enabled(options.entity_types);

2065

if ((is_entity_type_enabled(Collections::Date()) ||

2066

is_entity_type_enabled(Collections::DateTime())) &&

2067

!DatetimeChunk(UTF8ToUnicodeText(context, /*do_copy=*/false),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2068

options.reference_time_ms_utc, options.reference_timezone,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2069

options.locales, ModeFlag_ANNOTATION,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2070

options.annotation_usecase,

2071

options.is_serialized_entity_data_enabled, &candidates)) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2072

TC3_LOG(ERROR) << "Couldn't run DatetimeChunk.";

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2076

// Annotate with the knowledge engine into a temporary vector.

2077

std::vector<AnnotatedSpan> knowledge_candidates;

2078

if (knowledge_engine_ &&

2079

!knowledge_engine_->Chunk(context, options.annotation_usecase,

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2080

options.location_context,

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2081

&knowledge_candidates)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2082

TC3_LOG(ERROR) << "Couldn't run knowledge engine Chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2086

AddContactMetadataToKnowledgeClassificationResults(&knowledge_candidates);

2087

2088

// Move the knowledge candidates to the full candidate list, and erase

2089

// knowledge_candidates.

2090

candidates.insert(candidates.end(),

2091

std::make_move_iterator(knowledge_candidates.begin()),

2092

std::make_move_iterator(knowledge_candidates.end()));

2093

knowledge_candidates.clear();

2094

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2095

// Annotate with the contact engine.

2096

if (contact_engine_ &&

2097

!contact_engine_->Chunk(context_unicode, tokens, &candidates)) {

2098

TC3_LOG(ERROR) << "Couldn't run contact engine Chunk.";

return {};

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2102

// Annotate with the installed app engine.

2103

if (installed_app_engine_ &&

2104

!installed_app_engine_->Chunk(context_unicode, tokens, &candidates)) {

2105

TC3_LOG(ERROR) << "Couldn't run installed app engine Chunk.";

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2109

// Annotate with the number annotator.

2110

if (number_annotator_ != nullptr &&

2111

!number_annotator_->FindAll(context_unicode, options.annotation_usecase,

2112

&candidates)) {

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

2113

TC3_LOG(ERROR) << "Couldn't run number annotator FindAll.";

return {};

}

// Annotate with the duration annotator.

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2118

if (is_entity_type_enabled(Collections::Duration()) &&

2119

duration_annotator_ != nullptr &&

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

2120

!duration_annotator_->FindAll(context_unicode, tokens,

2121

options.annotation_usecase, &candidates)) {

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

2122

TC3_LOG(ERROR) << "Couldn't run duration annotator FindAll.";

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

return {};

}

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

2126

// Annotate with the person name engine.

2127

if (is_entity_type_enabled(Collections::PersonName()) &&

2128

person_name_engine_ &&

2129

!person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {

2130

TC3_LOG(ERROR) << "Couldn't run person name engine Chunk.";

return {};

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2134

// Annotate with the grammar annotators.

2135

if (grammar_annotator_ != nullptr &&

2136

!grammar_annotator_->Annotate(detected_text_language_tags,

2137

context_unicode, &candidates)) {

2138

TC3_LOG(ERROR) << "Couldn't run grammar annotators.";

return {};

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2142

// Sort candidates according to their position in the input, so that the next

2143

// code can assume that any connected component of overlapping spans forms a

2144

// contiguous block.

2145

std::sort(candidates.begin(), candidates.end(),

2146

[](const AnnotatedSpan& a, const AnnotatedSpan& b) {

2147

return a.span.first < b.span.first;

2148

});

2149

2150

std::vector<int> candidate_indices;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2151

if (!ResolveConflicts(candidates, context, tokens,

2152

detected_text_language_tags, options.annotation_usecase,

2153

&interpreter_manager, &candidate_indices)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2154

TC3_LOG(ERROR) << "Couldn't resolve conflicts.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2158

std::vector<AnnotatedSpan> result;

2159

result.reserve(candidate_indices.size());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2160

AnnotatedSpan aggregated_span;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2161

for (const int i : candidate_indices) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2162

if (candidates[i].span != aggregated_span.span) {

2163

if (!aggregated_span.classification.empty()) {

2164

result.push_back(std::move(aggregated_span));

2165

}

2166

aggregated_span =

2167

AnnotatedSpan(candidates[i].span, /*arg_classification=*/{});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2168

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2169

if (candidates[i].classification.empty() ||

2170

ClassifiedAsOther(candidates[i].classification) ||

2171

FilteredForAnnotation(candidates[i])) {

2172

continue;

2173

}

2174

for (ClassificationResult& classification : candidates[i].classification) {

2175

aggregated_span.classification.push_back(std::move(classification));

2176

}

2177

}

2178

if (!aggregated_span.classification.empty()) {

2179

result.push_back(std::move(aggregated_span));

2180

}

2181

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2182

// We generate all candidates and remove them later (with the exception of

2183

// date/time/duration entities) because there are complex interdependencies

2184

// between the entity types. E.g., the TLD of an email can be interpreted as a

2185

// URL, but most likely a user of the API does not want such annotations if

2186

// "url" is enabled and "email" is not.

2187

RemoveNotEnabledEntityTypes(is_entity_type_enabled, &result);

2188

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2189

for (AnnotatedSpan& annotated_span : result) {

2190

SortClassificationResults(&annotated_span.classification);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2191

}

2192

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return result;

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2196

// Determines the selection span for a regex match.
//
// If `config` declares no capturing groups, the span of capturing group 1 is
// returned. Otherwise the result is the smallest span covering every matched
// group that is marked with extend_selection; it stays
// {kInvalidIndex, kInvalidIndex} if no such group matched.
// Returns {kInvalidIndex, kInvalidIndex} whenever the matcher reports an
// error status.
CodepointSpan Annotator::ComputeSelectionBoundaries(
    const UniLib::RegexMatcher* match,
    const RegexModel_::Pattern* config) const {
  const CodepointSpan invalid_span = {kInvalidIndex, kInvalidIndex};
  const auto* const groups = config->capturing_group();

  if (groups == nullptr) {
    // Use first capturing group to specify the selection.
    int status = UniLib::RegexMatcher::kNoError;
    const int start = match->Start(1, &status);
    const int end = match->End(1, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    return {start, end};
  }

  CodepointSpan selection = invalid_span;
  const int num_groups = groups->size();
  for (int group_id = 0; group_id < num_groups; ++group_id) {
    if (!groups->Get(group_id)->extend_selection()) {
      continue;
    }

    // Check match and adjust bounds.
    int status = UniLib::RegexMatcher::kNoError;
    const int group_start = match->Start(group_id, &status);
    const int group_end = match->End(group_id, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    if (group_start == kInvalidIndex || group_end == kInvalidIndex) {
      continue;
    }

    if (selection.first == kInvalidIndex) {
      // First group contributing to the selection.
      selection = {group_start, group_end};
      continue;
    }
    selection.first = std::min(selection.first, group_start);
    selection.second = std::max(selection.second, group_end);
  }
  return selection;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2237

// Returns true if `pattern` carries any entity data: either directly on the
// pattern (serialized or structured) or on one of its capturing groups
// (an entity field path, serialized data or structured data).
bool Annotator::HasEntityData(const RegexModel_::Pattern* pattern) const {
  const bool pattern_has_data = pattern->serialized_entity_data() != nullptr ||
                                pattern->entity_data() != nullptr;
  if (pattern_has_data) {
    return true;
  }

  const auto* const groups = pattern->capturing_group();
  if (groups == nullptr) {
    return false;
  }
  return std::any_of(groups->begin(), groups->end(),
                     [](const CapturingGroup* group) {
                       return group->entity_field_path() != nullptr ||
                              group->serialized_entity_data() != nullptr ||
                              group->entity_data() != nullptr;
                     });
}

bool Annotator::SerializedEntityDataFromRegexMatch(

2257

const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,

2258

std::string* serialized_entity_data) const {

2259

if (!HasEntityData(pattern)) {

2260

serialized_entity_data->clear();

2261

return true;

2262

}

2263

TC3_CHECK(entity_data_builder_ != nullptr);

2264

2265

std::unique_ptr<ReflectiveFlatbuffer> entity_data =

2266

entity_data_builder_->NewRoot();

2267

2268

TC3_CHECK(entity_data != nullptr);

2269

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2270

// Set fixed entity data.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2271

if (pattern->serialized_entity_data() != nullptr) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2272

entity_data->MergeFromSerializedFlatbuffer(

2273

StringPiece(pattern->serialized_entity_data()->c_str(),

2274

pattern->serialized_entity_data()->size()));

2275

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2276

if (pattern->entity_data() != nullptr) {

2277

entity_data->MergeFrom(

2278

reinterpret_cast<const flatbuffers::Table*>(pattern->entity_data()));

2279

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2280

2281

// Add entity data from rule capturing groups.

2282

if (pattern->capturing_group() != nullptr) {

2283

const int num_groups = pattern->capturing_group()->size();

2284

for (int i = 0; i < num_groups; i++) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2285

const CapturingGroup* group = pattern->capturing_group()->Get(i);

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2286

2287

// Check whether the group matched.

2288

Optional<std::string> group_match_text =

2289

GetCapturingGroupText(matcher, /*group_id=*/i);

2290

if (!group_match_text.has_value()) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2291

continue;

2292

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2293

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2294

// Set fixed entity data from capturing group match.

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2295

if (group->serialized_entity_data() != nullptr) {

2296

entity_data->MergeFromSerializedFlatbuffer(

2297

StringPiece(group->serialized_entity_data()->c_str(),

2298

group->serialized_entity_data()->size()));

2299

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2300

if (group->entity_data() != nullptr) {

2301

entity_data->MergeFrom(reinterpret_cast<const flatbuffers::Table*>(

2302

pattern->entity_data()));

2303

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2304

2305

// Set entity field from capturing group text.

2306

if (group->entity_field_path() != nullptr) {

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

2307

UnicodeText normalized_group_match_text =

2308

UTF8ToUnicodeText(group_match_text.value(), /*do_copy=*/false);

2309

2310

// Apply normalization if specified.

2311

if (group->normalization_options() != nullptr) {

2312

normalized_group_match_text =

2313

NormalizeText(unilib_, group->normalization_options(),

2314

normalized_group_match_text);

2315

}

2316

2317

if (!entity_data->ParseAndSet(

2318

group->entity_field_path(),

2319

normalized_group_match_text.ToUTF8String())) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2320

TC3_LOG(ERROR)

2321

<< "Could not set entity data from rule capturing group.";

2322

return false;

2323

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

}

}

}

*serialized_entity_data = entity_data->Serialize();

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2332

// Returns a copy of the codepoints of `amount` that precede
// `it_decimal_separator` (or all of them if that iterator is never reached),
// leaving out every codepoint contained in `decimal_separators`.
UnicodeText RemoveMoneySeparators(
    const std::unordered_set<char32>& decimal_separators,
    const UnicodeText& amount,
    UnicodeText::const_iterator it_decimal_separator) {
  UnicodeText whole_amount;
  for (auto it = amount.begin();
       it != amount.end() && it != it_decimal_separator; ++it) {
    // Use the set's own hashed lookup instead of a linear std::find scan.
    if (decimal_separators.count(static_cast<char32>(*it)) == 0) {
      whole_amount.push_back(*it);
    }
  }
  return whole_amount;
}

bool Annotator::ParseAndFillInMoneyAmount(

2348

std::string* serialized_entity_data) const {

2349

std::unique_ptr<EntityDataT> data =

2350

LoadAndVerifyMutableFlatbuffer<libtextclassifier3::EntityData>(

2351

*serialized_entity_data);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2352

if (data == nullptr || data->money->unnormalized_amount.empty()) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

return false;

}

UnicodeText amount =

UTF8ToUnicodeText(data->money->unnormalized_amount, /*do_copy=*/false);

2358

int separator_back_index = 0;

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2359

auto it_decimal_separator = --amount.end();

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2360

for (; it_decimal_separator != amount.begin();

2361

--it_decimal_separator, ++separator_back_index) {

2362

if (std::find(money_separators_.begin(), money_separators_.end(),

2363

static_cast<char32>(*it_decimal_separator)) !=

2364

money_separators_.end()) {

break;

}

}

// If there are 3 digits after the last separator, we consider that a

2370

// thousands separator => the number is an int (e.g. 1.234 is considered int).

2371

// If there is no separator in number, also that number is an int.

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2372

if (separator_back_index == 3 || it_decimal_separator == amount.begin()) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2373

it_decimal_separator = amount.end();

2374

}

2375

2376

if (!unilib_->ParseInt32(RemoveMoneySeparators(money_separators_, amount,

2377

it_decimal_separator),

2378

&data->money->amount_whole_part)) {

2379

TC3_LOG(ERROR) << "Could not parse the money whole part as int32.";

2380

return false;

2381

}

2382

if (it_decimal_separator == amount.end()) {

2383

data->money->amount_decimal_part = 0;

2384

} else {

2385

const int amount_codepoints_size = amount.size_codepoints();

2386

if (!unilib_->ParseInt32(

2387

UnicodeText::Substring(

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2388

amount, amount_codepoints_size - separator_back_index,

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2389

amount_codepoints_size, /*do_copy=*/false),

2390

&data->money->amount_decimal_part)) {

2391

TC3_LOG(ERROR) << "Could not parse the money decimal part as int32.";

return false;

}

}

*serialized_entity_data =

2397

PackFlatbuffer<libtextclassifier3::EntityData>(data.get());

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2401

bool Annotator::RegexChunk(const UnicodeText& context_unicode,

2402

const std::vector<int>& rules,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2403

std::vector<AnnotatedSpan>* result,

2404

bool is_serialized_entity_data_enabled) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2405

for (int pattern_id : rules) {

2406

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

2407

const auto matcher = regex_pattern.pattern->Matcher(context_unicode);

2408

if (!matcher) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2409

TC3_LOG(ERROR) << "Could not get regex matcher for pattern: "

2410

<< pattern_id;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

int status = UniLib::RegexMatcher::kNoError;

2415

while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2416

if (regex_pattern.config->verification_options()) {

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2417

if (!VerifyRegexMatchCandidate(

2418

context_unicode.ToUTF8String(),

2419

regex_pattern.config->verification_options(),

2420

matcher->Group(1, &status).ToUTF8String(), matcher.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2421

continue;

2422

}

2423

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2424

2425

std::string serialized_entity_data;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2426

if (is_serialized_entity_data_enabled) {

2427

if (!SerializedEntityDataFromRegexMatch(

2428

regex_pattern.config, matcher.get(), &serialized_entity_data)) {

2429

TC3_LOG(ERROR) << "Could not get entity data.";

2430

return false;

2431

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2432

2433

// Further parsing unnormalized_amount for money into amount_whole_part

2434

// and amount_decimal_part. Can't do this with regexes because we cannot

2435

// have empty groups (amount_decimal_part might be an empty group).

2436

if (regex_pattern.config->collection_name()->str() ==

2437

Collections::Money()) {

2438

if (!ParseAndFillInMoneyAmount(&serialized_entity_data)) {

2439

TC3_LOG(ERROR) << "Could not parse and fill in money amount.";

2440

}

2441

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2442

}

2443

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2444

result->emplace_back();

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2445

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2446

// Selection/annotation regular expressions need to specify a capturing

2447

// group specifying the selection.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2448

result->back().span =

2449

ComputeSelectionBoundaries(matcher.get(), regex_pattern.config);

2450

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2451

result->back().classification = {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2452

{regex_pattern.config->collection_name()->str(),

2453

regex_pattern.config->target_classification_score(),

2454

regex_pattern.config->priority_score()}};

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2455

2456

result->back().classification[0].serialized_entity_data =

2457

serialized_entity_data;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2463

bool Annotator::ModelChunk(int num_tokens, const TokenSpan& span_of_interest,

2464

tflite::Interpreter* selection_interpreter,

2465

const CachedFeatures& cached_features,

2466

std::vector<TokenSpan>* chunks) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2467

const int max_selection_span =

2468

selection_feature_processor_->GetOptions()->max_selection_span();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2469

// The inference span is the span of interest expanded to include

2470

// max_selection_span tokens on either side, which is how far a selection can

2471

// stretch from the click.

2472

const TokenSpan inference_span = IntersectTokenSpans(

2473

ExpandTokenSpan(span_of_interest,

2474

/*num_tokens_left=*/max_selection_span,

2475

/*num_tokens_right=*/max_selection_span),

2476

{0, num_tokens});

2477

2478

std::vector<ScoredChunk> scored_chunks;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2479

if (selection_feature_processor_->GetOptions()->bounds_sensitive_features() &&

2480

selection_feature_processor_->GetOptions()

2481

->bounds_sensitive_features()

2482

->enabled()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2483

if (!ModelBoundsSensitiveScoreChunks(

2484

num_tokens, span_of_interest, inference_span, cached_features,

2485

selection_interpreter, &scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

} else {

if (!ModelClickContextScoreChunks(num_tokens, span_of_interest,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2490

cached_features, selection_interpreter,

2491

&scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2492

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2493

}

2494

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2495

std::sort(scored_chunks.rbegin(), scored_chunks.rend(),

2496

[](const ScoredChunk& lhs, const ScoredChunk& rhs) {

2497

return lhs.score < rhs.score;

2498

});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2499

2500

// Traverse the candidate chunks from highest-scoring to lowest-scoring. Pick

2501

// them greedily as long as they do not overlap with any previously picked

2502

// chunks.

2503

std::vector<bool> token_used(TokenSpanSize(inference_span));

2504

chunks->clear();

2505

for (const ScoredChunk& scored_chunk : scored_chunks) {

2506

bool feasible = true;

2507

for (int i = scored_chunk.token_span.first;

2508

i < scored_chunk.token_span.second; ++i) {

2509

if (token_used[i - inference_span.first]) {

feasible = false;

break;

}

}

if (!feasible) {

continue;

}

for (int i = scored_chunk.token_span.first;

2520

i < scored_chunk.token_span.second; ++i) {

2521

token_used[i - inference_span.first] = true;

2522

}

2523

2524

chunks->push_back(scored_chunk.token_span);

2525

}

2526

2527

std::sort(chunks->begin(), chunks->end());

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2532

namespace {
// Stores `value` under `key` in `map` unless the map already holds a larger
// value for that key: an existing entry is replaced by the maximum of its
// current value and `value`, a missing entry is inserted with `value`.
template <typename Map>
void UpdateMax(Map* map, typename Map::key_type key,
               typename Map::mapped_type value) {
  const auto existing = map->find(key);
  if (existing == map->end()) {
    (*map)[key] = value;
    return;
  }
  existing->second = std::max(existing->second, value);
}
}  // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2547

bool Annotator::ModelClickContextScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2548

int num_tokens, const TokenSpan& span_of_interest,

2549

const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2550

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2551

std::vector<ScoredChunk>* scored_chunks) const {

2552

const int max_batch_size = model_->selection_options()->batch_size();

2553

2554

std::vector<float> all_features;

2555

std::map<TokenSpan, float> chunk_scores;

2556

for (int batch_start = span_of_interest.first;

2557

batch_start < span_of_interest.second; batch_start += max_batch_size) {

2558

const int batch_end =

2559

std::min(batch_start + max_batch_size, span_of_interest.second);

2560

2561

// Prepare features for the whole batch.

2562

all_features.clear();

2563

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2564

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2565

cached_features.AppendClickContextFeaturesForClick(click_pos,

&all_features);

}

// Run batched inference.

2570

const int batch_size = batch_end - batch_start;

2571

const int features_size = cached_features.OutputFeaturesSize();

2572

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2573

TensorView<float>(all_features.data(), {batch_size, features_size}),

2574

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2575

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2576

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2577

return false;

2578

}

2579

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2580

logits.dim(1) !=

2581

selection_feature_processor_->GetSelectionLabelCount()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2582

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2588

const std::vector<float> scores = ComputeSoftmax(

2589

logits.data() + logits.dim(1) * (click_pos - batch_start),

2590

logits.dim(1));

2591

for (int j = 0;

2592

j < selection_feature_processor_->GetSelectionLabelCount(); ++j) {

2593

TokenSpan relative_token_span;

2594

if (!selection_feature_processor_->LabelToTokenSpan(

2595

j, &relative_token_span)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2596

TC3_LOG(ERROR) << "Couldn't map the label to a token span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2597

return false;

2598

}

2599

const TokenSpan candidate_span = ExpandTokenSpan(

2600

SingleTokenSpan(click_pos), relative_token_span.first,

2601

relative_token_span.second);

2602

if (candidate_span.first >= 0 && candidate_span.second <= num_tokens) {

2603

UpdateMax(&chunk_scores, candidate_span, scores[j]);

}

}

}

}

scored_chunks->clear();

2610

scored_chunks->reserve(chunk_scores.size());

2611

for (const auto& entry : chunk_scores) {

2612

scored_chunks->push_back(ScoredChunk{entry.first, entry.second});

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2618

bool Annotator::ModelBoundsSensitiveScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2619

int num_tokens, const TokenSpan& span_of_interest,

2620

const TokenSpan& inference_span, const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2621

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2622

std::vector<ScoredChunk>* scored_chunks) const {

2623

const int max_selection_span =

2624

selection_feature_processor_->GetOptions()->max_selection_span();

2625

const int max_chunk_length = selection_feature_processor_->GetOptions()

2626

->selection_reduced_output_space()

2627

? max_selection_span + 1

2628

: 2 * max_selection_span + 1;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2629

const bool score_single_token_spans_as_zero =

2630

selection_feature_processor_->GetOptions()

2631

->bounds_sensitive_features()

2632

->score_single_token_spans_as_zero();

2633

2634

scored_chunks->clear();

2635

if (score_single_token_spans_as_zero) {

2636

scored_chunks->reserve(TokenSpanSize(span_of_interest));

2637

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2638

2639

// Prepare all chunk candidates into one batch:

2640

// - Are contained in the inference span

2641

// - Have a non-empty intersection with the span of interest

2642

// - Are at least one token long

2643

// - Are not longer than the maximum chunk length

2644

std::vector<TokenSpan> candidate_spans;

2645

for (int start = inference_span.first; start < span_of_interest.second;

2646

++start) {

2647

const int leftmost_end_index = std::max(start, span_of_interest.first) + 1;

2648

for (int end = leftmost_end_index;

2649

end <= inference_span.second && end - start <= max_chunk_length;

2650

++end) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2651

const TokenSpan candidate_span = {start, end};

2652

if (score_single_token_spans_as_zero &&

2653

TokenSpanSize(candidate_span) == 1) {

2654

// Do not include the single token span in the batch, add a zero score

2655

// for it directly to the output.

2656

scored_chunks->push_back(ScoredChunk{candidate_span, 0.0f});

2657

} else {

2658

candidate_spans.push_back(candidate_span);

2659

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

const int max_batch_size = model_->selection_options()->batch_size();

2664

2665

std::vector<float> all_features;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2666

scored_chunks->reserve(scored_chunks->size() + candidate_spans.size());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2667

for (int batch_start = 0; batch_start < candidate_spans.size();

2668

batch_start += max_batch_size) {

2669

const int batch_end = std::min(batch_start + max_batch_size,

2670

static_cast<int>(candidate_spans.size()));

2671

2672

// Prepare features for the whole batch.

2673

all_features.clear();

2674

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2675

for (int i = batch_start; i < batch_end; ++i) {

2676

cached_features.AppendBoundsSensitiveFeaturesForSpan(candidate_spans[i],

&all_features);

}

// Run batched inference.

2681

const int batch_size = batch_end - batch_start;

2682

const int features_size = cached_features.OutputFeaturesSize();

2683

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2684

TensorView<float>(all_features.data(), {batch_size, features_size}),

2685

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2686

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2687

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2688

return false;

2689

}

2690

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2691

logits.dim(1) != 1) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2692

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int i = batch_start; i < batch_end; ++i) {

2698

scored_chunks->push_back(

2699

ScoredChunk{candidate_spans[i], logits.data()[i - batch_start]});

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2706

bool Annotator::DatetimeChunk(const UnicodeText& context_unicode,

2707

int64 reference_time_ms_utc,

2708

const std::string& reference_timezone,

2709

const std::string& locales, ModeFlag mode,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2710

AnnotationUsecase annotation_usecase,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2711

bool is_serialized_entity_data_enabled,

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2712

std::vector<AnnotatedSpan>* result) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2713

std::vector<DatetimeParseResultSpan> datetime_spans;

2714

if (cfg_datetime_parser_) {

2715

if (!(model_->grammar_datetime_model()->enabled_modes() & mode)) {

2716

return true;

2717

}

2718

std::vector<Locale> parsed_locales;

2719

ParseLocales(locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame^]

2720

cfg_datetime_parser_->Parse(

2721

context_unicode.ToUTF8String(),

2722

ToDateAnnotationOptions(

2723

model_->grammar_datetime_model()->annotation_options(),

2724

reference_timezone, reference_time_ms_utc),

2725

parsed_locales, &datetime_spans);

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2726

} else if (datetime_parser_) {

2727

if (!datetime_parser_->Parse(context_unicode, reference_time_ms_utc,

2728

reference_timezone, locales, mode,

2729

annotation_usecase,

2730

/*anchor_start_end=*/false, &datetime_spans)) {

2731

return false;

2732

}

2733

} else {

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

2734

return true;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2735

}

2736

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2737

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2738

AnnotatedSpan annotated_span;

2739

annotated_span.span = datetime_span.span;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2740

for (const DatetimeParseResult& parse_result : datetime_span.data) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2741

annotated_span.classification.emplace_back(

2742

PickCollectionForDatetime(parse_result),

2743

datetime_span.target_classification_score,

2744

datetime_span.priority_score);

2745

annotated_span.classification.back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2746

if (is_serialized_entity_data_enabled) {

2747

annotated_span.classification.back().serialized_entity_data =

2748

CreateDatetimeSerializedEntityData(parse_result);

2749

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2750

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

2751

annotated_span.source = AnnotatedSpan::Source::DATETIME;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2752

result->push_back(std::move(annotated_span));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

return true;

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2757

// Returns the model pointer held by this annotator (`model_`).
const Model* Annotator::model() const { return model_; }

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2758

// Returns the entity data reflection schema pointer held by this annotator
// (`entity_data_schema_`).
const reflection::Schema* Annotator::entity_data_schema() const {
  return entity_data_schema_;
}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2761

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2762

// Loads and verifies a model from the `size` bytes at `buffer`.
// Returns nullptr when `buffer` is null; otherwise returns whatever
// LoadAndVerifyModel yields for the buffer.
const Model* ViewModel(const void* buffer, int size) {
  return buffer != nullptr ? LoadAndVerifyModel(buffer, size) : nullptr;
}

2769

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2770

bool Annotator::LookUpKnowledgeEntity(

2771

const std::string& id, std::string* serialized_knowledge_result) const {

2772

return knowledge_engine_ &&

2773

knowledge_engine_->LookUpEntity(id, serialized_knowledge_result);

2774

}

2775

Tony Mak