Blame - native/annotator/annotator.cc - platform/external/libtextclassifier

2018-01-24 11:11:20 +0100

[diff] [blame]

1

/*

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

3

*

4

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

17

#include "annotator/annotator.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

18

19

#include <algorithm>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

20

#include <cmath>

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

21

#include <cstddef>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

22

#include <iterator>

23

#include <numeric>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

24

#include <string>

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

25

#include <unordered_map>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

26

#include <vector>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

27

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

28

#include "annotator/collections.h"

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

29

#include "annotator/flatbuffer-utils.h"

30

#include "annotator/knowledge/knowledge-engine-types.h"

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

31

#include "annotator/model_generated.h"

32

#include "annotator/types.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

33

#include "utils/base/logging.h"

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

34

#include "utils/base/status.h"

35

#include "utils/base/statusor.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

36

#include "utils/checksum.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

37

#include "utils/i18n/locale.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

38

#include "utils/math/softmax.h"

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

39

#include "utils/normalization.h"

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

40

#include "utils/optional.h"

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

41

#include "utils/regex-match.h"

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

42

#include "utils/strings/append.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

43

#include "utils/strings/numbers.h"

44

#include "utils/strings/split.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

45

#include "utils/utf8/unicodetext.h"

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

46

#include "utils/utf8/unilib-common.h"

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

47

#include "utils/zlib/zlib_regex.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

48

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

49

namespace libtextclassifier3 {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

50

51

// Set of ints whose ordering is decided by a std::function comparator that is
// supplied when the set is constructed.
using SortedIntSet = std::set<int, std::function<bool(int, int)>>;

52

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

53

// Names of the built-in collections. Each constant is a reference bound to a
// heap-allocated string that is intentionally never deleted.
const std::string& Annotator::kPhoneCollection = *new std::string("phone");
const std::string& Annotator::kAddressCollection = *new std::string("address");
const std::string& Annotator::kDateCollection = *new std::string("date");
const std::string& Annotator::kUrlCollection = *new std::string("url");
const std::string& Annotator::kEmailCollection = *new std::string("email");

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

63

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

64

namespace {

65

// Checks that the `size` bytes starting at `addr` hold a well-formed Model
// flatbuffer.
//
// Returns the root Model of the buffer when verification succeeds, and nullptr
// when the buffer is null, `size` is negative, or verification fails.
const Model* LoadAndVerifyModel(const void* addr, int size) {
  // Reject obviously invalid input up front: a negative `size` would otherwise
  // be converted to a very large unsigned length when handed to the verifier.
  if (addr == nullptr || size < 0) {
    return nullptr;
  }
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr),
                                 static_cast<size_t>(size));
  return VerifyModelBuffer(verifier) ? GetModel(addr) : nullptr;
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

73

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

74

// Checks that the `size` bytes starting at `addr` hold a well-formed
// PersonNameModel flatbuffer.
//
// Returns the root PersonNameModel of the buffer when verification succeeds,
// and nullptr when the buffer is null, `size` is negative, or verification
// fails.
const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr,
                                                    int size) {
  // Reject obviously invalid input up front: a negative `size` would otherwise
  // be converted to a very large unsigned length when handed to the verifier.
  if (addr == nullptr || size < 0) {
    return nullptr;
  }
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr),
                                 static_cast<size_t>(size));
  return VerifyPersonNameModelBuffer(verifier) ? GetPersonNameModel(addr)
                                               : nullptr;
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

84

// If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will

85

// create a new instance, assign ownership to owned_lib, and return it.

86

const UniLib* MaybeCreateUnilib(const UniLib* lib,

87

std::unique_ptr<UniLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new UniLib);

92

return owned_lib->get();

}

}

// As above, but for CalendarLib.

97

const CalendarLib* MaybeCreateCalendarlib(

98

const CalendarLib* lib, std::unique_ptr<CalendarLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new CalendarLib);

103

return owned_lib->get();

}

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

107

// Returns whether the provided input is valid:

108

// * Valid utf8 text.

109

// * Sane span indices.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

110

bool IsValidSpanInput(const UnicodeText& context, const CodepointSpan& span) {

Tony Mak

968412a

2019-11-13 15:39:57 +0000

[diff] [blame]

111

if (!context.is_valid()) {

112

return false;

113

}

114

return (span.first >= 0 && span.first < span.second &&

115

span.second <= context.size_codepoints());

116

}

117

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

118

std::unordered_set<char32> FlatbuffersIntVectorToChar32UnorderedSet(

119

const flatbuffers::Vector<int32_t>* ints) {

120

if (ints == nullptr) {

121

return {};

122

}

123

std::unordered_set<char32> ints_set;

124

for (auto value : *ints) {

125

ints_set.insert(static_cast<char32>(value));

}

return ints_set;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

130

// Builds a DateAnnotationOptions from the flatbuffer annotation options plus
// the caller's reference time and timezone.
//
// The reference timestamp and timezone are always copied. The remaining fields
// are copied from `fb_annotation_options` only when it is non-null; otherwise
// they keep whatever values a default-constructed DateAnnotationOptions has.
DateAnnotationOptions ToDateAnnotationOptions(
    const GrammarDatetimeModel_::AnnotationOptions* fb_annotation_options,
    const std::string& reference_timezone, const int64 reference_time_ms_utc) {
  DateAnnotationOptions options;
  options.base_timestamp_millis = reference_time_ms_utc;
  options.reference_timezone = reference_timezone;
  if (fb_annotation_options == nullptr) {
    return options;
  }

  options.enable_special_day_offset =
      fb_annotation_options->enable_special_day_offset();
  options.merge_adjacent_components =
      fb_annotation_options->merge_adjacent_components();
  options.enable_date_range = fb_annotation_options->enable_date_range();
  options.include_preposition = fb_annotation_options->include_preposition();

  const auto* extra_dates = fb_annotation_options->extra_requested_dates();
  if (extra_dates != nullptr) {
    for (const auto& extra_date : *extra_dates) {
      options.extra_requested_dates.push_back(extra_date->str());
    }
  }

  const auto* spans_to_ignore = fb_annotation_options->ignored_spans();
  if (spans_to_ignore != nullptr) {
    for (const auto& span_to_ignore : *spans_to_ignore) {
      options.ignored_spans.push_back(span_to_ignore->str());
    }
  }
  return options;
}

160

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

161

} // namespace

162

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

163

// Returns the selection interpreter, creating it from the selection executor
// on the first call. If creation fails an error is logged and nullptr is
// returned; the next call will try to create it again.
tflite::Interpreter* InterpreterManager::SelectionInterpreter() {
  if (selection_interpreter_) {
    return selection_interpreter_.get();
  }
  TC3_CHECK(selection_executor_);
  selection_interpreter_ = selection_executor_->CreateInterpreter();
  if (selection_interpreter_ == nullptr) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return selection_interpreter_.get();
}

173

174

// Returns the classification interpreter, creating it from the classification
// executor on the first call. If creation fails an error is logged and nullptr
// is returned; the next call will try to create it again.
tflite::Interpreter* InterpreterManager::ClassificationInterpreter() {
  if (classification_interpreter_) {
    return classification_interpreter_.get();
  }
  TC3_CHECK(classification_executor_);
  classification_interpreter_ = classification_executor_->CreateInterpreter();
  if (classification_interpreter_ == nullptr) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return classification_interpreter_.get();
}

184

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

185

std::unique_ptr<Annotator> Annotator::FromUnownedBuffer(

186

const char* buffer, int size, const UniLib* unilib,

187

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

188

const Model* model = LoadAndVerifyModel(buffer, size);

189

if (model == nullptr) {

return nullptr;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

193

auto classifier =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

194

std::unique_ptr<Annotator>(new Annotator(model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

195

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

202

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

203

std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib,

204

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

205

if (!(*mmap)->handle().ok()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

206

TC3_VLOG(1) << "Mmap failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

211

(*mmap)->handle().num_bytes());

212

if (!model) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

213

TC3_LOG(ERROR) << "Model verification failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

217

auto classifier = std::unique_ptr<Annotator>(

218

new Annotator(mmap, model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

219

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

226

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

227

std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,

228

std::unique_ptr<CalendarLib> calendarlib) {

229

if (!(*mmap)->handle().ok()) {

230

TC3_VLOG(1) << "Mmap failed.";

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

235

(*mmap)->handle().num_bytes());

236

if (model == nullptr) {

237

TC3_LOG(ERROR) << "Model verification failed.";

return nullptr;

}

auto classifier = std::unique_ptr<Annotator>(

242

new Annotator(mmap, model, std::move(unilib), std::move(calendarlib)));

243

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

250

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

251

int fd, int offset, int size, const UniLib* unilib,

252

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

253

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

254

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

255

}

256

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

257

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

258

int fd, int offset, int size, std::unique_ptr<UniLib> unilib,

259

std::unique_ptr<CalendarLib> calendarlib) {

260

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

261

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

262

}

263

264

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

265

int fd, const UniLib* unilib, const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

266

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

267

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

268

}

269

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

270

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

271

int fd, std::unique_ptr<UniLib> unilib,

272

std::unique_ptr<CalendarLib> calendarlib) {

273

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

274

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

275

}

276

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

277

std::unique_ptr<Annotator> Annotator::FromPath(const std::string& path,

278

const UniLib* unilib,

279

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

280

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

281

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

282

}

283

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

284

std::unique_ptr<Annotator> Annotator::FromPath(

285

const std::string& path, std::unique_ptr<UniLib> unilib,

286

std::unique_ptr<CalendarLib> calendarlib) {

287

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

288

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

289

}

290

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

291

// Constructs an Annotator over a memory-mapped model with borrowed libraries.
//
// Takes over `*mmap` by moving it into `mmap_`. When `unilib` or `calendarlib`
// is nullptr, a new instance is created and stored in the matching owned_*
// member; otherwise the supplied pointer is used as is. Finishes by running
// ValidateAndInitialize().
//
// NOTE(review): MaybeCreateUnilib/MaybeCreateCalendarlib write into the
// owned_* members, so those members presumably must be declared before
// unilib_/calendarlib_ in the class - confirm against the header.
Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,
                     const UniLib* unilib, const CalendarLib* calendarlib)
    : model_(model),
      mmap_(std::move(*mmap)),
      owned_unilib_(),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  ValidateAndInitialize();
}

301

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

302

Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,

303

std::unique_ptr<UniLib> unilib,

304

std::unique_ptr<CalendarLib> calendarlib)

305

: model_(model),

306

mmap_(std::move(*mmap)),

307

owned_unilib_(std::move(unilib)),

308

unilib_(owned_unilib_.get()),

309

owned_calendarlib_(std::move(calendarlib)),

310

calendarlib_(owned_calendarlib_.get()) {

311

ValidateAndInitialize();

312

}

313

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

314

// Constructs an Annotator over an already-loaded model with borrowed
// libraries; no memory mapping is taken over.
//
// When `unilib` or `calendarlib` is nullptr, a new instance is created and
// stored in the matching owned_* member; otherwise the supplied pointer is
// used as is. Finishes by running ValidateAndInitialize().
//
// NOTE(review): MaybeCreateUnilib/MaybeCreateCalendarlib write into the
// owned_* members, so those members presumably must be declared before
// unilib_/calendarlib_ in the class - confirm against the header.
Annotator::Annotator(const Model* model, const UniLib* unilib,
                     const CalendarLib* calendarlib)
    : model_(model),
      owned_unilib_(),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  ValidateAndInitialize();
}

323

324

void Annotator::ValidateAndInitialize() {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

325

initialized_ = false;

326

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

327

if (model_ == nullptr) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

328

TC3_LOG(ERROR) << "No model specified.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

332

const bool model_enabled_for_annotation =

333

(model_->triggering_options() != nullptr &&

334

(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION));

335

const bool model_enabled_for_classification =

336

(model_->triggering_options() != nullptr &&

337

(model_->triggering_options()->enabled_modes() &

338

ModeFlag_CLASSIFICATION));

339

const bool model_enabled_for_selection =

340

(model_->triggering_options() != nullptr &&

341

(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION));

342

343

// Annotation requires the selection model.

344

if (model_enabled_for_annotation || model_enabled_for_selection) {

345

if (!model_->selection_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

346

TC3_LOG(ERROR) << "No selection options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

347

return;

348

}

349

if (!model_->selection_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

350

TC3_LOG(ERROR) << "No selection feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

351

return;

352

}

353

if (!model_->selection_feature_options()->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

354

TC3_LOG(ERROR) << "No selection bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

355

return;

356

}

357

if (!model_->selection_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

358

TC3_LOG(ERROR) << "No selection model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

359

return;

360

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

361

selection_executor_ = ModelExecutor::FromBuffer(model_->selection_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

362

if (!selection_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

363

TC3_LOG(ERROR) << "Could not initialize selection executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

364

return;

365

}

366

selection_feature_processor_.reset(

367

new FeatureProcessor(model_->selection_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

368

}

369

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

370

// Annotation requires the classification model for conflict resolution and

371

// scoring.

372

// Selection requires the classification model for conflict resolution.

373

if (model_enabled_for_annotation || model_enabled_for_classification ||

374

model_enabled_for_selection) {

375

if (!model_->classification_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

376

TC3_LOG(ERROR) << "No classification options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

381

TC3_LOG(ERROR) << "No classification feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()

386

->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

387

TC3_LOG(ERROR) << "No classification bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

388

return;

389

}

390

if (!model_->classification_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

391

TC3_LOG(ERROR) << "No clf model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

395

classification_executor_ =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

396

ModelExecutor::FromBuffer(model_->classification_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

397

if (!classification_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

398

TC3_LOG(ERROR) << "Could not initialize classification executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

classification_feature_processor_.reset(new FeatureProcessor(

403

model_->classification_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

404

}

405

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

406

// The embeddings need to be specified if the model is to be used for

407

// classification or selection.

408

if (model_enabled_for_annotation || model_enabled_for_classification ||

409

model_enabled_for_selection) {

410

if (!model_->embedding_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

411

TC3_LOG(ERROR) << "No embedding model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

412

return;

413

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

414

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

415

// Check that the embedding size of the selection and classification model

416

// matches, as they are using the same embeddings.

417

if (model_enabled_for_selection &&

418

(model_->selection_feature_options()->embedding_size() !=

419

model_->classification_feature_options()->embedding_size() ||

420

model_->selection_feature_options()->embedding_quantization_bits() !=

421

model_->classification_feature_options()

422

->embedding_quantization_bits())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

423

TC3_LOG(ERROR) << "Mismatching embedding size/quantization.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

424

return;

425

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

426

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

427

embedding_executor_ = TFLiteEmbeddingExecutor::FromBuffer(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

428

model_->embedding_model(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

429

model_->classification_feature_options()->embedding_size(),

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

430

model_->classification_feature_options()->embedding_quantization_bits(),

431

model_->embedding_pruning_mask());

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

432

if (!embedding_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

433

TC3_LOG(ERROR) << "Could not initialize embedding executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

434

return;

435

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

436

}

437

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

438

std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

439

if (model_->regex_model()) {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

440

if (!InitializeRegexModel(decompressor.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

441

TC3_LOG(ERROR) << "Could not initialize regex model.";

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

442

return;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

443

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

444

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

445

if (model_->grammar_datetime_model() &&

446

model_->grammar_datetime_model()->datetime_rules()) {

447

cfg_datetime_parser_.reset(new dates::CfgDatetimeAnnotator(

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

448

unilib_,

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

449

/*tokenizer_options=*/

450

model_->grammar_datetime_model()->grammar_tokenizer_options(),

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

451

calendarlib_,

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

452

/*datetime_rules=*/model_->grammar_datetime_model()->datetime_rules(),

453

model_->grammar_datetime_model()->target_classification_score(),

454

model_->grammar_datetime_model()->priority_score()));

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

455

if (!cfg_datetime_parser_) {

456

TC3_LOG(ERROR) << "Could not initialize context free grammar based "

457

"datetime parser.";

458

return;

459

}

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

460

}

461

462

if (model_->datetime_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

463

datetime_parser_ = DatetimeParser::Instance(

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

464

model_->datetime_model(), unilib_, calendarlib_, decompressor.get());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

465

if (!datetime_parser_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

466

TC3_LOG(ERROR) << "Could not initialize datetime parser.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return;

}

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

471

if (model_->output_options()) {

472

if (model_->output_options()->filtered_collections_annotation()) {

473

for (const auto collection :

474

*model_->output_options()->filtered_collections_annotation()) {

475

filtered_collections_annotation_.insert(collection->str());

476

}

477

}

478

if (model_->output_options()->filtered_collections_classification()) {

479

for (const auto collection :

480

*model_->output_options()->filtered_collections_classification()) {

481

filtered_collections_classification_.insert(collection->str());

482

}

483

}

484

if (model_->output_options()->filtered_collections_selection()) {

485

for (const auto collection :

486

*model_->output_options()->filtered_collections_selection()) {

487

filtered_collections_selection_.insert(collection->str());

}

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

492

if (model_->number_annotator_options() &&

493

model_->number_annotator_options()->enabled()) {

494

number_annotator_.reset(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

495

new NumberAnnotator(model_->number_annotator_options(), unilib_));

496

}

497

498

if (model_->money_parsing_options()) {

499

money_separators_ = FlatbuffersIntVectorToChar32UnorderedSet(

500

model_->money_parsing_options()->separators());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

501

}

502

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

503

if (model_->duration_annotator_options() &&

504

model_->duration_annotator_options()->enabled()) {

505

duration_annotator_.reset(

506

new DurationAnnotator(model_->duration_annotator_options(),

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

507

selection_feature_processor_.get(), unilib_));

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

508

}

509

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

510

if (model_->grammar_model()) {

511

grammar_annotator_.reset(new GrammarAnnotator(

512

unilib_, model_->grammar_model(), entity_data_builder_.get()));

513

}

514

Tony Mak

2020-09-18 16:41:23 +0100

[diff] [blame]

515

// The following #ifdef is here to aid quality evaluation of a situation, when

516

// a POD NER kill switch in AiAi is invoked, when a model that has POD NER in

517

// it.

518

#if !defined(TC3_DISABLE_POD_NER)

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

519

if (model_->pod_ner_model()) {

520

pod_ner_annotator_ =

521

PodNerAnnotator::Create(model_->pod_ner_model(), *unilib_);

522

}

Tony Mak

2020-09-18 16:41:23 +0100

[diff] [blame]

523

#endif

524

525

if (model_->vocab_model()) {

526

vocab_annotator_ = VocabAnnotator::Create(

527

model_->vocab_model(), *selection_feature_processor_, *unilib_);

528

}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

529

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

530

if (model_->entity_data_schema()) {

531

entity_data_schema_ = LoadAndVerifyFlatbuffer<reflection::Schema>(

532

model_->entity_data_schema()->Data(),

533

model_->entity_data_schema()->size());

534

if (entity_data_schema_ == nullptr) {

535

TC3_LOG(ERROR) << "Could not load entity data schema data.";

return;

}

entity_data_builder_.reset(

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

540

new MutableFlatbufferBuilder(entity_data_schema_));

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

541

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

542

entity_data_schema_ = nullptr;

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

543

entity_data_builder_ = nullptr;

544

}

545

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

546

if (model_->triggering_locales() &&

547

!ParseLocales(model_->triggering_locales()->c_str(),

548

&model_triggering_locales_)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

549

TC3_LOG(ERROR) << "Could not parse model supported locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

554

model_->triggering_options()->locales() != nullptr &&

555

!ParseLocales(model_->triggering_options()->locales()->c_str(),

556

&ml_model_triggering_locales_)) {

557

TC3_LOG(ERROR) << "Could not parse supported ML model locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

562

model_->triggering_options()->dictionary_locales() != nullptr &&

563

!ParseLocales(model_->triggering_options()->dictionary_locales()->c_str(),

564

&dictionary_locales_)) {

565

TC3_LOG(ERROR) << "Could not parse dictionary supported locales.";

return;

}

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

569

if (model_->conflict_resolution_options() != nullptr) {

570

prioritize_longest_annotation_ =

571

model_->conflict_resolution_options()->prioritize_longest_annotation();

572

do_conflict_resolution_in_raw_mode_ =

573

model_->conflict_resolution_options()

574

->do_conflict_resolution_in_raw_mode();

575

}

576

Chang Li

cac0b44

2020-05-21 15:09:37 +0100

[diff] [blame]

577

#ifdef TC3_EXPERIMENTAL

578

TC3_LOG(WARNING) << "Enabling experimental annotators.";

579

InitializeExperimentalAnnotators();

580

#endif

581

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

initialized_ = true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

585

// Compiles every regex pattern of the model and registers it with the
// annotator.
//
// Each successfully compiled pattern is appended to `regex_patterns_`; its
// position in that vector is additionally recorded in the per-mode index lists
// (annotation / classification / selection) selected by the pattern's
// enabled-modes mask.
//
// Returns true when the model has no patterns or all of them compiled, and
// false as soon as one pattern fails to compile (patterns registered before the
// failure stay registered).
//
// NOTE(review): model_->regex_model() is dereferenced without a null check --
// presumably the caller guarantees it is set; confirm.
bool Annotator::InitializeRegexModel(ZlibDecompressor* decompressor) {
  const auto* regex_model = model_->regex_model();
  const auto* patterns = regex_model->patterns();
  if (!patterns) {
    // Nothing to compile.
    return true;
  }

  int next_pattern_id = 0;
  for (const auto pattern_spec : *patterns) {
    std::unique_ptr<UniLib::RegexPattern> compiled =
        UncompressMakeRegexPattern(*unilib_, pattern_spec->pattern(),
                                   pattern_spec->compressed_pattern(),
                                   regex_model->lazy_regex_compilation(),
                                   decompressor);
    if (!compiled) {
      TC3_LOG(INFO) << "Failed to load regex pattern";
      return false;
    }

    // Record the pattern under every mode it is enabled for.
    const auto modes = pattern_spec->enabled_modes();
    if (modes & ModeFlag_ANNOTATION) {
      annotation_regex_patterns_.push_back(next_pattern_id);
    }
    if (modes & ModeFlag_CLASSIFICATION) {
      classification_regex_patterns_.push_back(next_pattern_id);
    }
    if (modes & ModeFlag_SELECTION) {
      selection_regex_patterns_.push_back(next_pattern_id);
    }

    regex_patterns_.push_back({pattern_spec, std::move(compiled)});
    ++next_pattern_id;
  }

  return true;
}

621

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

622

bool Annotator::InitializeKnowledgeEngine(

623

const std::string& serialized_config) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

624

std::unique_ptr<KnowledgeEngine> knowledge_engine(new KnowledgeEngine());

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

625

if (!knowledge_engine->Initialize(serialized_config, unilib_)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

626

TC3_LOG(ERROR) << "Failed to initialize the knowledge engine.";

627

return false;

628

}

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

629

if (model_->triggering_options() != nullptr) {

630

knowledge_engine->SetPriorityScore(

631

model_->triggering_options()->knowledge_priority_score());

632

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

633

knowledge_engine_ = std::move(knowledge_engine);

return true;

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

637

bool Annotator::InitializeContactEngine(const std::string& serialized_config) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

638

std::unique_ptr<ContactEngine> contact_engine(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

639

new ContactEngine(selection_feature_processor_.get(), unilib_,

640

model_->contact_annotator_options()));

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

641

if (!contact_engine->Initialize(serialized_config)) {

642

TC3_LOG(ERROR) << "Failed to initialize the contact engine.";

643

return false;

644

}

645

contact_engine_ = std::move(contact_engine);

return true;

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

649

bool Annotator::InitializeInstalledAppEngine(

650

const std::string& serialized_config) {

651

std::unique_ptr<InstalledAppEngine> installed_app_engine(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

652

new InstalledAppEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

653

if (!installed_app_engine->Initialize(serialized_config)) {

654

TC3_LOG(ERROR) << "Failed to initialize the installed app engine.";

655

return false;

656

}

657

installed_app_engine_ = std::move(installed_app_engine);

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

661

// Stores the language-identification model and (re)creates or drops the
// translate annotator accordingly.
//
// A translate annotator is created only when `lang_id` is non-null and the
// model has translate annotator options that are enabled; in every other case
// any existing translate annotator is released.
void Annotator::SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id) {
  lang_id_ = lang_id;

  if (lang_id_ == nullptr) {
    translate_annotator_.reset(nullptr);
    return;
  }

  const auto* translate_options = model_->translate_annotator_options();
  if (!translate_options || !translate_options->enabled()) {
    translate_annotator_.reset(nullptr);
    return;
  }

  translate_annotator_.reset(
      new TranslateAnnotator(translate_options, lang_id_, unilib_));
}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

672

bool Annotator::InitializePersonNameEngineFromUnownedBuffer(const void* buffer,

673

int size) {

674

const PersonNameModel* person_name_model =

675

LoadAndVerifyPersonNameModel(buffer, size);

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

676

677

if (person_name_model == nullptr) {

678

TC3_LOG(ERROR) << "Person name model verification failed.";

return false;

}

if (!person_name_model->enabled()) {

return true;

}

std::unique_ptr<PersonNameEngine> person_name_engine(

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

687

new PersonNameEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

688

if (!person_name_engine->Initialize(person_name_model)) {

689

TC3_LOG(ERROR) << "Failed to initialize the person name engine.";

690

return false;

691

}

692

person_name_engine_ = std::move(person_name_engine);

return true;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

696

bool Annotator::InitializePersonNameEngineFromScopedMmap(

697

const ScopedMmap& mmap) {

698

if (!mmap.handle().ok()) {

699

TC3_LOG(ERROR) << "Mmap for person name model failed.";

return false;

}

return InitializePersonNameEngineFromUnownedBuffer(mmap.handle().start(),

704

mmap.handle().num_bytes());

705

}

706

707

bool Annotator::InitializePersonNameEngineFromPath(const std::string& path) {

708

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

709

return InitializePersonNameEngineFromScopedMmap(*mmap);

710

}

711

712

bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset,

713

int size) {

714

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

715

return InitializePersonNameEngineFromScopedMmap(*mmap);

716

}

717

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

718

bool Annotator::InitializeExperimentalAnnotators() {

719

if (ExperimentalAnnotator::IsEnabled()) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame]

720

experimental_annotator_.reset(new ExperimentalAnnotator(

721

model_->experimental_model(), *selection_feature_processor_, *unilib_));

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

return true;

}

return false;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

727

namespace {

728

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

729

int CountDigits(const std::string& str,

730

const CodepointSpan& selection_indices) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

731

int count = 0;

732

int i = 0;

733

const UnicodeText unicode_str = UTF8ToUnicodeText(str, /*do_copy=*/false);

734

for (auto it = unicode_str.begin(); it != unicode_str.end(); ++it, ++i) {

735

if (i >= selection_indices.first && i < selection_indices.second &&

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

736

IsDigit(*it)) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

++count;

}

}

return count;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

743

} // namespace

744

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

745

namespace internal {

746

// Helper function, which if the initial 'span' contains only white-spaces,

747

// moves the selection to a single-codepoint selection on a left or right side

748

// of this space.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

749

CodepointSpan SnapLeftIfWhitespaceSelection(const CodepointSpan& span,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

750

const UnicodeText& context_unicode,

751

const UniLib& unilib) {

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

752

TC3_CHECK(span.IsValid() && !span.IsEmpty());

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

753

754

UnicodeText::const_iterator it;

755

756

// Check that the current selection is all whitespaces.

757

it = context_unicode.begin();

758

std::advance(it, span.first);

759

for (int i = 0; i < (span.second - span.first); ++i, ++it) {

760

if (!unilib.IsWhitespace(*it)) {

return span;

}

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

765

// Try moving left.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

766

CodepointSpan result = span;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

767

it = context_unicode.begin();

768

std::advance(it, span.first);

769

while (it != context_unicode.begin() && unilib.IsWhitespace(*it)) {

--result.first;

--it;

}

result.second = result.first + 1;

774

if (!unilib.IsWhitespace(*it)) {

return result;

}

// If moving left didn't find a non-whitespace character, just return the

// original span.

return span;

}

} // namespace internal

783

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

784

bool Annotator::FilteredForAnnotation(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

785

return !span.classification.empty() &&

786

filtered_collections_annotation_.find(

787

span.classification[0].collection) !=

788

filtered_collections_annotation_.end();

789

}

790

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

791

bool Annotator::FilteredForClassification(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

792

const ClassificationResult& classification) const {

793

return filtered_collections_classification_.find(classification.collection) !=

794

filtered_collections_classification_.end();

795

}

796

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

797

bool Annotator::FilteredForSelection(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

798

return !span.classification.empty() &&

799

filtered_collections_selection_.find(

800

span.classification[0].collection) !=

801

filtered_collections_selection_.end();

802

}

803

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

804

namespace {

805

inline bool ClassifiedAsOther(

806

const std::vector<ClassificationResult>& classification) {

807

return !classification.empty() &&

808

classification[0].collection == Collections::Other();

809

}

810

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

811

} // namespace

812

813

float Annotator::GetPriorityScore(

814

const std::vector<ClassificationResult>& classification) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

815

if (!classification.empty() && !ClassifiedAsOther(classification)) {

816

return classification[0].priority_score;

817

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

818

if (model_->triggering_options() != nullptr) {

819

return model_->triggering_options()->other_collection_priority_score();

820

} else {

821

return -1000.0;

822

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

823

}

824

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

825

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

826

bool Annotator::VerifyRegexMatchCandidate(

827

const std::string& context, const VerificationOptions* verification_options,

828

const std::string& match, const UniLib::RegexMatcher* matcher) const {

829

if (verification_options == nullptr) {

830

return true;

831

}

832

if (verification_options->verify_luhn_checksum() &&

833

!VerifyLuhnChecksum(match)) {

834

return false;

835

}

836

const int lua_verifier = verification_options->lua_verifier();

837

if (lua_verifier >= 0) {

838

if (model_->regex_model()->lua_verifier() == nullptr ||

839

lua_verifier >= model_->regex_model()->lua_verifier()->size()) {

840

TC3_LOG(ERROR) << "Invalid lua verifier specified: " << lua_verifier;

return false;

}

return VerifyMatch(

context, matcher,

model_->regex_model()->lua_verifier()->Get(lua_verifier)->str());

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

850

// Suggests a selection span for a click/selection at `click_indices` inside
// `context`.
//
// Candidates are gathered from the ML selection model and from every configured
// sub-annotator (regex, datetime, knowledge, contact, installed app, number,
// duration, person name, grammar, POD NER, experimental). Conflicting
// candidates are resolved, and the highest-priority remaining candidate that
// overlaps both the (possibly whitespace-snapped) click and the original click
// is returned.
//
// The original `click_indices` are returned unchanged when: the annotator is
// not initialized, the usecase is not SMART, selection mode is disabled, the
// detected locales are unsupported, the input span is invalid, any candidate
// source or conflict resolution fails, the chosen candidate's collection is
// filtered for selection, or no candidate overlaps the click.
CodepointSpan Annotator::SuggestSelection(
    const std::string& context, CodepointSpan click_indices,
    const SelectionOptions& options) const {
  CodepointSpan original_click_indices = click_indices;
  if (!initialized_) {
    TC3_LOG(ERROR) << "Not initialized";
    return original_click_indices;
  }
  if (options.annotation_usecase !=
      AnnotationUsecase_ANNOTATION_USECASE_SMART) {
    TC3_LOG(WARNING)
        << "Invoking SuggestSelection, which is not supported in RAW mode.";
    return original_click_indices;
  }
  if (!(model_->enabled_modes() & ModeFlag_SELECTION)) {
    return original_click_indices;
  }

  // A failure to parse the language tags is only logged; the (possibly empty)
  // tag list is still checked against the model's triggering locales.
  std::vector<Locale> detected_text_language_tags;
  if (!ParseLocales(options.detected_text_language_tags,
                    &detected_text_language_tags)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return original_click_indices;
  }

  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  if (!IsValidSpanInput(context_unicode, click_indices)) {
    TC3_VLOG(1)
        << "Trying to run SuggestSelection with invalid input, indices: "
        << click_indices.first << " " << click_indices.second;
    return original_click_indices;
  }

  if (model_->snap_whitespace_selections()) {
    // We want to expand a purely white-space selection to a multi-selection it
    // would've been part of. But with this feature disabled we would do a no-
    // op, because no token is found. Therefore, we need to modify the
    // 'click_indices' a bit to include a part of the token, so that the click-
    // finding logic finds the clicked token correctly. This modification is
    // done by the following function. Note, that it's enough to check the left
    // side of the current selection, because if the white-space is a part of a
    // multi-selection, necessarily both tokens - on the left and the right
    // sides need to be selected. Thus snapping only to the left is sufficient
    // (there's a check at the bottom that makes sure that if we snap to the
    // left token but the result does not contain the initial white-space,
    // returns the original indices).
    click_indices = internal::SnapLeftIfWhitespaceSelection(
        click_indices, context_unicode, *unilib_);
  }

  Annotations candidates;
  // As we process a single string of context, the candidates will only
  // contain one vector of AnnotatedSpan.
  candidates.annotated_spans.resize(1);
  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());
  std::vector<Token> tokens;
  if (!ModelSuggestSelection(context_unicode, click_indices,
                             detected_text_language_tags, &interpreter_manager,
                             &tokens, &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Model suggest selection failed.";
    return original_click_indices;
  }
  const std::unordered_set<std::string> set;
  const EnabledEntityTypes is_entity_type_enabled(set);
  if (!RegexChunk(context_unicode, selection_regex_patterns_,
                  /*is_serialized_entity_data_enabled=*/false,
                  is_entity_type_enabled, options.annotation_usecase,
                  &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Regex suggest selection failed.";
    return original_click_indices;
  }
  // Reuse the already converted context instead of converting it again.
  if (!DatetimeChunk(context_unicode,
                     /*reference_time_ms_utc=*/0, /*reference_timezone=*/"",
                     options.locales, ModeFlag_SELECTION,
                     options.annotation_usecase,
                     /*is_serialized_entity_data_enabled=*/false,
                     &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Datetime suggest selection failed.";
    return original_click_indices;
  }
  if (knowledge_engine_ != nullptr &&
      !knowledge_engine_->Chunk(context, options.annotation_usecase,
                                options.location_context, Permissions(),
                                AnnotateMode::kEntityAnnotation, &candidates)) {
    TC3_LOG(ERROR) << "Knowledge suggest selection failed.";
    return original_click_indices;
  }
  if (contact_engine_ != nullptr &&
      !contact_engine_->Chunk(context_unicode, tokens,
                              &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Contact suggest selection failed.";
    return original_click_indices;
  }
  if (installed_app_engine_ != nullptr &&
      !installed_app_engine_->Chunk(context_unicode, tokens,
                                    &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Installed app suggest selection failed.";
    return original_click_indices;
  }
  if (number_annotator_ != nullptr &&
      !number_annotator_->FindAll(context_unicode, options.annotation_usecase,
                                  &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Number annotator failed in suggest selection.";
    return original_click_indices;
  }
  if (duration_annotator_ != nullptr &&
      !duration_annotator_->FindAll(context_unicode, tokens,
                                    options.annotation_usecase,
                                    &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
    return original_click_indices;
  }
  if (person_name_engine_ != nullptr &&
      !person_name_engine_->Chunk(context_unicode, tokens,
                                  &candidates.annotated_spans[0])) {
    TC3_LOG(ERROR) << "Person name suggest selection failed.";
    return original_click_indices;
  }

  // The grammar annotator contributes a candidate only when it reports
  // success.
  AnnotatedSpan grammar_suggested_span;
  if (grammar_annotator_ != nullptr &&
      grammar_annotator_->SuggestSelection(detected_text_language_tags,
                                           context_unicode, click_indices,
                                           &grammar_suggested_span)) {
    candidates.annotated_spans[0].push_back(grammar_suggested_span);
  }

  if (pod_ner_annotator_ != nullptr && options.use_pod_ner) {
    candidates.annotated_spans[0].push_back(
        pod_ner_annotator_->SuggestSelection(context_unicode, click_indices));
  }

  if (experimental_annotator_ != nullptr) {
    candidates.annotated_spans[0].push_back(
        experimental_annotator_->SuggestSelection(context_unicode,
                                                  click_indices));
  }

  // Sort candidates according to their position in the input, so that the next
  // code can assume that any connected component of overlapping spans forms a
  // contiguous block.
  std::sort(candidates.annotated_spans[0].begin(),
            candidates.annotated_spans[0].end(),
            [](const AnnotatedSpan& a, const AnnotatedSpan& b) {
              return a.span.first < b.span.first;
            });

  std::vector<int> candidate_indices;
  if (!ResolveConflicts(candidates.annotated_spans[0], context, tokens,
                        detected_text_language_tags, options.annotation_usecase,
                        &interpreter_manager, &candidate_indices)) {
    TC3_LOG(ERROR) << "Couldn't resolve conflicts.";
    return original_click_indices;
  }

  // Visit the surviving candidates from highest to lowest priority score.
  std::sort(candidate_indices.begin(), candidate_indices.end(),
            [this, &candidates](int a, int b) {
              return GetPriorityScore(
                         candidates.annotated_spans[0][a].classification) >
                     GetPriorityScore(
                         candidates.annotated_spans[0][b].classification);
            });

  for (const int i : candidate_indices) {
    AnnotatedSpan& candidate = candidates.annotated_spans[0][i];
    // The candidate must cover both the snapped and the original click.
    if (!SpansOverlap(candidate.span, click_indices) ||
        !SpansOverlap(candidate.span, original_click_indices)) {
      continue;
    }

    // Run model classification if not present but requested and there's a
    // classification collection filter specified.
    if (candidate.classification.empty() &&
        model_->selection_options()->always_classify_suggested_selection() &&
        !filtered_collections_selection_.empty()) {
      if (!ModelClassifyText(context, detected_text_language_tags,
                             candidate.span, &interpreter_manager,
                             /*embedding_cache=*/nullptr,
                             &candidate.classification)) {
        return original_click_indices;
      }
    }

    // Ignore if span classification is filtered.
    if (FilteredForSelection(candidate)) {
      return original_click_indices;
    }

    return candidate.span;
  }

  return original_click_indices;
}

namespace {

// Helper function that returns the index of the first candidate that

1054

// transitively does not overlap with the candidate on 'start_index'. If the end

1055

// of 'candidates' is reached, it returns the index that points right behind the

1056

// array.

1057

int FirstNonOverlappingSpanIndex(const std::vector<AnnotatedSpan>& candidates,

1058

int start_index) {

1059

int first_non_overlapping = start_index + 1;

1060

CodepointSpan conflicting_span = candidates[start_index].span;

1061

while (

1062

first_non_overlapping < candidates.size() &&

1063

SpansOverlap(conflicting_span, candidates[first_non_overlapping].span)) {

1064

// Grow the span to include the current one.

1065

conflicting_span.second = std::max(

1066

conflicting_span.second, candidates[first_non_overlapping].span.second);

1067

1068

++first_non_overlapping;

1069

}

1070

return first_non_overlapping;

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1074

bool Annotator::ResolveConflicts(

1075

const std::vector<AnnotatedSpan>& candidates, const std::string& context,

1076

const std::vector<Token>& cached_tokens,

1077

const std::vector<Locale>& detected_text_language_tags,

1078

AnnotationUsecase annotation_usecase,

1079

InterpreterManager* interpreter_manager, std::vector<int>* result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1080

result->clear();

1081

result->reserve(candidates.size());

1082

for (int i = 0; i < candidates.size();) {

1083

int first_non_overlapping =

1084

FirstNonOverlappingSpanIndex(candidates, /*start_index=*/i);

1085

1086

const bool conflict_found = first_non_overlapping != (i + 1);

1087

if (conflict_found) {

1088

std::vector<int> candidate_indices;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1089

if (!ResolveConflict(context, cached_tokens, candidates,

1090

detected_text_language_tags, i,

1091

first_non_overlapping, annotation_usecase,

1092

interpreter_manager, &candidate_indices)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1093

return false;

1094

}

1095

result->insert(result->end(), candidate_indices.begin(),

1096

candidate_indices.end());

1097

} else {

1098

result->push_back(i);

1099

}

1100

1101

// Skip over the whole conflicting group/go to next candidate.

1102

i = first_non_overlapping;

}

return true;

}

namespace {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1108

// Returns true, if the given two sources do conflict in given annotation

1109

// usecase.

1110

// - In SMART usecase, all sources do conflict, because there's only 1 possible

1111

// annotation for a given span.

1112

// - In RAW usecase, certain annotations are allowed to overlap (e.g. datetime

1113

// and duration), while others not (e.g. duration and number).

1114

bool DoSourcesConflict(AnnotationUsecase annotation_usecase,

1115

const AnnotatedSpan::Source source1,

1116

const AnnotatedSpan::Source source2) {

1117

uint32 source_mask =

1118

(1 << static_cast<int>(source1)) | (1 << static_cast<int>(source2));

1119

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1120

switch (annotation_usecase) {

1121

case AnnotationUsecase_ANNOTATION_USECASE_SMART:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1122

// In the SMART mode, all annotations conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1123

return true;

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1124

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1125

case AnnotationUsecase_ANNOTATION_USECASE_RAW:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1126

// DURATION and DATETIME do not conflict. E.g. "let's meet in 3 hours",

1127

// can have two non-conflicting annotations: "in 3 hours" (datetime), "3

1128

// hours" (duration).

1129

if ((source_mask &

1130

(1 << static_cast<int>(AnnotatedSpan::Source::DURATION))) &&

1131

(source_mask &

1132

(1 << static_cast<int>(AnnotatedSpan::Source::DATETIME)))) {

1133

return false;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1134

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1135

1136

// A KNOWLEDGE entity does not conflict with anything.

1137

if ((source_mask &

1138

(1 << static_cast<int>(AnnotatedSpan::Source::KNOWLEDGE)))) {

return false;

}

Tony Mak

2020-03-27 13:58:00 +0000

[diff] [blame]

1142

// A PERSONNAME entity does not conflict with anything.

1143

if ((source_mask &

1144

(1 << static_cast<int>(AnnotatedSpan::Source::PERSON_NAME)))) {

return false;

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1148

// Entities from other sources can conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1149

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1154

bool Annotator::ResolveConflict(

1155

const std::string& context, const std::vector<Token>& cached_tokens,

1156

const std::vector<AnnotatedSpan>& candidates,

1157

const std::vector<Locale>& detected_text_language_tags, int start_index,

1158

int end_index, AnnotationUsecase annotation_usecase,

1159

InterpreterManager* interpreter_manager,

1160

std::vector<int>* chosen_indices) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1161

std::vector<int> conflicting_indices;

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1162

std::unordered_map<int, std::pair<float, int>> scores_lengths;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1163

for (int i = start_index; i < end_index; ++i) {

1164

conflicting_indices.push_back(i);

1165

if (!candidates[i].classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1166

scores_lengths[i] = {

1167

GetPriorityScore(candidates[i].classification),

1168

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

continue;

}

// OPTIMIZATION: So that we don't have to classify all the ML model

1173

// spans apriori, we wait until we get here, when they conflict with

1174

// something and we need the actual classification scores. So if the

1175

// candidate conflicts and comes from the model, we need to run a

1176

// classification to determine its priority:

1177

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1178

if (!ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1179

candidates[i].span, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1180

/*embedding_cache=*/nullptr, &classification)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (!classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1185

scores_lengths[i] = {

1186

GetPriorityScore(classification),

1187

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1191

std::sort(

1192

conflicting_indices.begin(), conflicting_indices.end(),

1193

[this, &scores_lengths, candidates, conflicting_indices](int i, int j) {

1194

if (scores_lengths[i].first == scores_lengths[j].first &&

1195

prioritize_longest_annotation_) {

1196

return scores_lengths[i].second > scores_lengths[j].second;

1197

}

1198

return scores_lengths[i].first > scores_lengths[j].first;

1199

});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1200

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1201

// Here we keep a set of indices that were chosen, per-source, to enable

1202

// effective computation.

1203

std::unordered_map<AnnotatedSpan::Source, SortedIntSet>

1204

chosen_indices_for_source_map;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1205

1206

// Greedily place the candidates if they don't conflict with the already

1207

// placed ones.

1208

for (int i = 0; i < conflicting_indices.size(); ++i) {

1209

const int considered_candidate = conflicting_indices[i];

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1210

1211

// See if there is a conflict between the candidate and all already placed

1212

// candidates.

1213

bool conflict = false;

1214

SortedIntSet* chosen_indices_for_source_ptr = nullptr;

1215

for (auto& source_set_pair : chosen_indices_for_source_map) {

1216

if (source_set_pair.first == candidates[considered_candidate].source) {

1217

chosen_indices_for_source_ptr = &source_set_pair.second;

1218

}

1219

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1220

const bool needs_conflict_resolution =

1221

annotation_usecase == AnnotationUsecase_ANNOTATION_USECASE_SMART ||

1222

(annotation_usecase == AnnotationUsecase_ANNOTATION_USECASE_RAW &&

1223

do_conflict_resolution_in_raw_mode_);

1224

if (needs_conflict_resolution &&

1225

DoSourcesConflict(annotation_usecase, source_set_pair.first,

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1226

candidates[considered_candidate].source) &&

1227

DoesCandidateConflict(considered_candidate, candidates,

1228

source_set_pair.second)) {

1229

conflict = true;

1230

break;

1231

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1232

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1233

1234

// Skip the candidate if a conflict was found.

if (conflict) {

continue;

}

// If the set of indices for the current source doesn't exist yet,

1240

// initialize it.

1241

if (chosen_indices_for_source_ptr == nullptr) {

1242

SortedIntSet new_set([&candidates](int a, int b) {

1243

return candidates[a].span.first < candidates[b].span.first;

1244

});

1245

chosen_indices_for_source_map[candidates[considered_candidate].source] =

1246

std::move(new_set);

1247

chosen_indices_for_source_ptr =

1248

&chosen_indices_for_source_map[candidates[considered_candidate]

.source];

}

// Place the candidate to the output and to the per-source conflict set.

1253

chosen_indices->push_back(considered_candidate);

1254

chosen_indices_for_source_ptr->insert(considered_candidate);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1255

}

1256

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1257

std::sort(chosen_indices->begin(), chosen_indices->end());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1262

bool Annotator::ModelSuggestSelection(

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1263

const UnicodeText& context_unicode, const CodepointSpan& click_indices,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1264

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1265

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1266

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1267

if (model_->triggering_options() == nullptr ||

1268

!(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1272

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1273

ml_model_triggering_locales_,

1274

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1278

int click_pos;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1279

*tokens = selection_feature_processor_->Tokenize(context_unicode);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1280

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1281

context_unicode, click_indices,

1282

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1283

tokens, &click_pos);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1284

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1285

TC3_VLOG(1) << "Could not calculate the click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1286

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1287

}

1288

1289

const int symmetry_context_size =

1290

model_->selection_options()->symmetry_context_size();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1291

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1292

bounds_sensitive_features = selection_feature_processor_->GetOptions()

1293

->bounds_sensitive_features();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1294

1295

// The symmetry context span is the clicked token with symmetry_context_size

1296

// tokens on either side.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1297

const TokenSpan symmetry_context_span =

1298

IntersectTokenSpans(TokenSpan(click_pos).Expand(

1299

/*num_tokens_left=*/symmetry_context_size,

1300

/*num_tokens_right=*/symmetry_context_size),

1301

AllOf(*tokens));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1302

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1303

// Compute the extraction span based on the model type.

1304

TokenSpan extraction_span;

1305

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1306

// The extraction span is the symmetry context span expanded to include

1307

// max_selection_span tokens on either side, which is how far a selection

1308

// can stretch from the click, plus a relevant number of tokens outside of

1309

// the bounds of the selection.

1310

const int max_selection_span =

1311

selection_feature_processor_->GetOptions()->max_selection_span();

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1312

extraction_span = symmetry_context_span.Expand(

1313

/*num_tokens_left=*/max_selection_span +

1314

bounds_sensitive_features->num_tokens_before(),

1315

/*num_tokens_right=*/max_selection_span +

1316

bounds_sensitive_features->num_tokens_after());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1317

} else {

1318

// The extraction span is the symmetry context span expanded to include

1319

// context_size tokens on either side.

1320

const int context_size =

1321

selection_feature_processor_->GetOptions()->context_size();

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1322

extraction_span = symmetry_context_span.Expand(

1323

/*num_tokens_left=*/context_size,

1324

/*num_tokens_right=*/context_size);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1325

}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1326

extraction_span = IntersectTokenSpans(extraction_span, AllOf(*tokens));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1327

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1328

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1329

*tokens, extraction_span)) {

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1333

std::unique_ptr<CachedFeatures> cached_features;

1334

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1335

*tokens, extraction_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1336

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1337

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1338

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1339

selection_feature_processor_->EmbeddingSize() +

1340

selection_feature_processor_->DenseFeaturesCount(),

1341

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1342

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Produce selection model candidates.

1347

std::vector<TokenSpan> chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1348

if (!ModelChunk(tokens->size(), /*span_of_interest=*/symmetry_context_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1349

interpreter_manager->SelectionInterpreter(), *cached_features,

1350

&chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1351

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

for (const TokenSpan& chunk : chunks) {

1356

AnnotatedSpan candidate;

1357

candidate.span = selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1358

context_unicode, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1359

if (model_->selection_options()->strip_unpaired_brackets()) {

1360

candidate.span =

1361

StripUnpairedBrackets(context_unicode, candidate.span, *unilib_);

1362

}

1363

1364

// Only output non-empty spans.

1365

if (candidate.span.first != candidate.span.second) {

1366

result->push_back(candidate);

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1372

bool Annotator::ModelClassifyText(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1373

const std::string& context,

1374

const std::vector<Locale>& detected_text_language_tags,

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1375

const CodepointSpan& selection_indices,

1376

InterpreterManager* interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1377

FeatureProcessor::EmbeddingCache* embedding_cache,

1378

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1379

return ModelClassifyText(context, {}, detected_text_language_tags,

1380

selection_indices, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1381

embedding_cache, classification_results);

}

namespace internal {

std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1386

const CodepointSpan& selection_indices,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1387

TokenSpan tokens_around_selection_to_copy) {

1388

const auto first_selection_token = std::upper_bound(

1389

cached_tokens.begin(), cached_tokens.end(), selection_indices.first,

1390

[](int selection_start, const Token& token) {

1391

return selection_start < token.end;

1392

});

1393

const auto last_selection_token = std::lower_bound(

1394

cached_tokens.begin(), cached_tokens.end(), selection_indices.second,

1395

[](const Token& token, int selection_end) {

1396

return token.start < selection_end;

1397

});

1398

1399

const int64 first_token = std::max(

1400

static_cast<int64>(0),

1401

static_cast<int64>((first_selection_token - cached_tokens.begin()) -

1402

tokens_around_selection_to_copy.first));

1403

const int64 last_token = std::min(

1404

static_cast<int64>(cached_tokens.size()),

1405

static_cast<int64>((last_selection_token - cached_tokens.begin()) +

1406

tokens_around_selection_to_copy.second));

1407

1408

std::vector<Token> tokens;

1409

tokens.reserve(last_token - first_token);

1410

for (int i = first_token; i < last_token; ++i) {

1411

tokens.push_back(cached_tokens[i]);

}

return tokens;

}

} // namespace internal

1416

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1417

// Returns an upper bound on the number of tokens needed to the left (`first`)
// and to the right (`second`) of a selection in order to classify it.
//
// Both branches read the options of the classification feature processor, the
// same options ModelClassifyText uses to compute its extraction span.
TokenSpan Annotator::ClassifyTextUpperBoundNeededTokens() const {
  const auto* classification_options =
      classification_feature_processor_->GetOptions();
  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features =
          classification_options->bounds_sensitive_features();
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    // The extraction span is the selection span expanded to include a relevant
    // number of tokens outside of the bounds of the selection.
    return {bounds_sensitive_features->num_tokens_before(),
            bounds_sensitive_features->num_tokens_after()};
  } else {
    // The extraction span is the clicked token with context_size tokens on
    // either side.
    const int context_size = classification_options->context_size();
    return {context_size, context_size};
  }
}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1436

namespace {

1437

// Sorts the classification results from high score to low score.

1438

void SortClassificationResults(

1439

std::vector<ClassificationResult>* classification_results) {

1440

std::sort(classification_results->begin(), classification_results->end(),

1441

[](const ClassificationResult& a, const ClassificationResult& b) {

1442

return a.score > b.score;

});

}

} // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1447

bool Annotator::ModelClassifyText(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1448

const std::string& context, const std::vector<Token>& cached_tokens,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1449

const std::vector<Locale>& detected_text_language_tags,

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1450

const CodepointSpan& selection_indices,

1451

InterpreterManager* interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1452

FeatureProcessor::EmbeddingCache* embedding_cache,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1453

std::vector<ClassificationResult>* classification_results) const {

1454

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1455

return ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1456

selection_indices, interpreter_manager,

1457

embedding_cache, classification_results, &tokens);

1458

}

1459

1460

bool Annotator::ModelClassifyText(

1461

const std::string& context, const std::vector<Token>& cached_tokens,

1462

const std::vector<Locale>& detected_text_language_tags,

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1463

const CodepointSpan& selection_indices,

1464

InterpreterManager* interpreter_manager,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1465

FeatureProcessor::EmbeddingCache* embedding_cache,

1466

std::vector<ClassificationResult>* classification_results,

1467

std::vector<Token>* tokens) const {

1468

if (model_->triggering_options() == nullptr ||

1469

!(model_->triggering_options()->enabled_modes() &

1470

ModeFlag_CLASSIFICATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1474

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1475

ml_model_triggering_locales_,

1476

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1480

if (cached_tokens.empty()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1481

*tokens = classification_feature_processor_->Tokenize(context);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1482

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1483

*tokens = internal::CopyCachedTokens(cached_tokens, selection_indices,

1484

ClassifyTextUpperBoundNeededTokens());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1485

}

1486

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1487

int click_pos;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1488

classification_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1489

context, selection_indices,

1490

classification_feature_processor_->GetOptions()

1491

->only_use_line_with_click(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1492

tokens, &click_pos);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1493

const TokenSpan selection_token_span =

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1494

CodepointSpanToTokenSpan(*tokens, selection_indices);

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1495

const int selection_num_tokens = selection_token_span.Size();

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1496

if (model_->classification_options()->max_num_tokens() > 0 &&

1497

model_->classification_options()->max_num_tokens() <

1498

selection_num_tokens) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1499

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1503

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

1504

bounds_sensitive_features =

1505

classification_feature_processor_->GetOptions()

1506

->bounds_sensitive_features();

1507

if (selection_token_span.first == kInvalidIndex ||

1508

selection_token_span.second == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1509

TC3_LOG(ERROR) << "Could not determine span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Compute the extraction span based on the model type.

1514

TokenSpan extraction_span;

1515

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1516

// The extraction span is the selection span expanded to include a relevant

1517

// number of tokens outside of the bounds of the selection.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1518

extraction_span = selection_token_span.Expand(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1519

/*num_tokens_left=*/bounds_sensitive_features->num_tokens_before(),

1520

/*num_tokens_right=*/bounds_sensitive_features->num_tokens_after());

1521

} else {

1522

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1523

TC3_LOG(ERROR) << "Couldn't choose a click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1524

return false;

1525

}

1526

// The extraction span is the clicked token with context_size tokens on

1527

// either side.

1528

const int context_size =

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1529

classification_feature_processor_->GetOptions()->context_size();

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1530

extraction_span = TokenSpan(click_pos).Expand(

1531

/*num_tokens_left=*/context_size,

1532

/*num_tokens_right=*/context_size);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1533

}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1534

extraction_span = IntersectTokenSpans(extraction_span, AllOf(*tokens));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1535

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1536

if (!classification_feature_processor_->HasEnoughSupportedCodepoints(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1537

*tokens, extraction_span)) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1538

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1542

std::unique_ptr<CachedFeatures> cached_features;

1543

if (!classification_feature_processor_->ExtractFeatures(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1544

*tokens, extraction_span, selection_indices,

1545

embedding_executor_.get(), embedding_cache,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1546

classification_feature_processor_->EmbeddingSize() +

1547

classification_feature_processor_->DenseFeaturesCount(),

1548

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1549

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1550

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1551

}

1552

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1553

std::vector<float> features;

1554

features.reserve(cached_features->OutputFeaturesSize());

1555

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1556

cached_features->AppendBoundsSensitiveFeaturesForSpan(selection_token_span,

1557

&features);

1558

} else {

1559

cached_features->AppendClickContextFeaturesForClick(click_pos, &features);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1560

}

1561

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1562

TensorView<float> logits = classification_executor_->ComputeLogits(

1563

TensorView<float>(features.data(),

1564

{1, static_cast<int>(features.size())}),

1565

interpreter_manager->ClassificationInterpreter());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1566

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1567

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (logits.dims() != 2 || logits.dim(0) != 1 ||

1572

logits.dim(1) != classification_feature_processor_->NumCollections()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1573

TC3_LOG(ERROR) << "Mismatching output";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

const std::vector<float> scores =

1578

ComputeSoftmax(logits.data(), logits.dim(1));

1579

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1580

if (scores.empty()) {

1581

*classification_results = {{Collections::Other(), 1.0}};

1582

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1583

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1584

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1585

const int best_score_index =

1586

std::max_element(scores.begin(), scores.end()) - scores.begin();

1587

const std::string top_collection =

1588

classification_feature_processor_->LabelToCollection(best_score_index);

1589

1590

// Sanity checks.

1591

if (top_collection == Collections::Phone()) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1592

const int digit_count = CountDigits(context, selection_indices);

1593

if (digit_count <

1594

model_->classification_options()->phone_min_num_digits() ||

1595

digit_count >

1596

model_->classification_options()->phone_max_num_digits()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1597

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1598

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1599

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1600

} else if (top_collection == Collections::Address()) {

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1601

if (selection_num_tokens <

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1602

model_->classification_options()->address_min_num_tokens()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1603

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1604

return true;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1605

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1606

} else if (top_collection == Collections::Dictionary()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1607

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1608

dictionary_locales_,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1609

/*default_value=*/false)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1610

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1611

return true;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1612

}

1613

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1614

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1615

*classification_results = {{top_collection, /*arg_score=*/1.0,

1616

/*arg_priority_score=*/scores[best_score_index]}};

1617

1618

// For some entities, we might want to clamp the priority score, for better

1619

// conflict resolution between entities.

1620

if (model_->triggering_options() != nullptr &&

1621

model_->triggering_options()->collection_to_priority() != nullptr) {

1622

if (auto entry =

1623

model_->triggering_options()->collection_to_priority()->LookupByKey(

1624

top_collection.c_str())) {

1625

(*classification_results)[0].priority_score *= entry->value();

1626

}

1627

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1628

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1629

}

1630

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1631

bool Annotator::RegexClassifyText(

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1632

const std::string& context, const CodepointSpan& selection_indices,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1633

std::vector<ClassificationResult>* classification_result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1634

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1635

UTF8ToUnicodeText(context, /*do_copy=*/false)

1636

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1637

const UnicodeText selection_text_unicode(

1638

UTF8ToUnicodeText(selection_text, /*do_copy=*/false));

1639

1640

// Check whether any of the regular expressions match.

1641

for (const int pattern_id : classification_regex_patterns_) {

1642

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

1643

const std::unique_ptr<UniLib::RegexMatcher> matcher =

1644

regex_pattern.pattern->Matcher(selection_text_unicode);

1645

int status = UniLib::RegexMatcher::kNoError;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1646

bool matches;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1647

if (regex_pattern.config->use_approximate_matching()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1648

matches = matcher->ApproximatelyMatches(&status);

1649

} else {

1650

matches = matcher->Matches(&status);

1651

}

1652

if (status != UniLib::RegexMatcher::kNoError) {

1653

return false;

1654

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1655

if (matches && VerifyRegexMatchCandidate(

1656

context, regex_pattern.config->verification_options(),

1657

selection_text, matcher.get())) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1658

classification_result->push_back(

1659

{regex_pattern.config->collection_name()->str(),

1660

regex_pattern.config->target_classification_score(),

1661

regex_pattern.config->priority_score()});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1662

if (!SerializedEntityDataFromRegexMatch(

1663

regex_pattern.config, matcher.get(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1664

&classification_result->back().serialized_entity_data)) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1665

TC3_LOG(ERROR) << "Could not get entity data.";

1666

return false;

1667

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1671

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1672

}

1673

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1674

namespace {

1675

std::string PickCollectionForDatetime(

1676

const DatetimeParseResult& datetime_parse_result) {

1677

switch (datetime_parse_result.granularity) {

1678

case GRANULARITY_HOUR:

1679

case GRANULARITY_MINUTE:

1680

case GRANULARITY_SECOND:

1681

return Collections::DateTime();

1682

default:

1683

return Collections::Date();

1684

}

1685

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1686

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1687

} // namespace

1688

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1689

bool Annotator::DatetimeClassifyText(

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1690

const std::string& context, const CodepointSpan& selection_indices,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1691

const ClassificationOptions& options,

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1692

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1693

if (!datetime_parser_ && !cfg_datetime_parser_) {

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1694

return true;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1695

}

1696

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1697

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1698

UTF8ToUnicodeText(context, /*do_copy=*/false)

1699

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1700

1701

std::vector<DatetimeParseResultSpan> datetime_spans;

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1702

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1703

if (cfg_datetime_parser_) {

1704

if (!(model_->grammar_datetime_model()->enabled_modes() &

1705

ModeFlag_CLASSIFICATION)) {

1706

return true;

1707

}

1708

std::vector<Locale> parsed_locales;

1709

ParseLocales(options.locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1710

cfg_datetime_parser_->Parse(

1711

selection_text,

1712

ToDateAnnotationOptions(

1713

model_->grammar_datetime_model()->annotation_options(),

1714

options.reference_timezone, options.reference_time_ms_utc),

1715

parsed_locales, &datetime_spans);

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1716

}

1717

1718

if (datetime_parser_) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1719

if (!datetime_parser_->Parse(selection_text, options.reference_time_ms_utc,

1720

options.reference_timezone, options.locales,

1721

ModeFlag_CLASSIFICATION,

1722

options.annotation_usecase,

1723

/*anchor_start_end=*/true, &datetime_spans)) {

1724

TC3_LOG(ERROR) << "Error during parsing datetime.";

1725

return false;

1726

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1727

}

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1728

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1729

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

1730

// Only consider the result valid if the selection and extracted datetime

1731

// spans exactly match.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1732

if (CodepointSpan(datetime_span.span.first + selection_indices.first,

1733

datetime_span.span.second + selection_indices.first) ==

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1734

selection_indices) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1735

for (const DatetimeParseResult& parse_result : datetime_span.data) {

1736

classification_results->emplace_back(

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1737

PickCollectionForDatetime(parse_result),

1738

datetime_span.target_classification_score);

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1739

classification_results->back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1740

classification_results->back().serialized_entity_data =

1741

CreateDatetimeSerializedEntityData(parse_result);

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1742

classification_results->back().priority_score =

1743

datetime_span.priority_score;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1744

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1745

return true;

1746

}

1747

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1748

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1749

}

1750

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1751

std::vector<ClassificationResult> Annotator::ClassifyText(

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1752

const std::string& context, const CodepointSpan& selection_indices,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1753

const ClassificationOptions& options) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1754

if (!initialized_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1755

TC3_LOG(ERROR) << "Not initialized";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1756

return {};

1757

}

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1758

if (options.annotation_usecase !=

1759

AnnotationUsecase_ANNOTATION_USECASE_SMART) {

1760

TC3_LOG(WARNING)

1761

<< "Invoking ClassifyText, which is not supported in RAW mode.";

1762

return {};

1763

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1764

if (!(model_->enabled_modes() & ModeFlag_CLASSIFICATION)) {

return {};

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1768

std::vector<Locale> detected_text_language_tags;

1769

if (!ParseLocales(options.detected_text_language_tags,

1770

&detected_text_language_tags)) {

1771

TC3_LOG(WARNING)

1772

<< "Failed to parse the detected_text_language_tags in options: "

1773

<< options.detected_text_language_tags;

1774

}

1775

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1776

model_triggering_locales_,

1777

/*default_value=*/true)) {

return {};

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

1781

if (!IsValidSpanInput(UTF8ToUnicodeText(context, /*do_copy=*/false),

1782

selection_indices)) {

1783

TC3_VLOG(1) << "Trying to run ClassifyText with invalid input: "

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1784

<< selection_indices.first << " " << selection_indices.second;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1788

// We'll accumulate a list of candidates, and pick the best candidate in the

1789

// end.

1790

std::vector<AnnotatedSpan> candidates;

1791

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1792

// Try the knowledge engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1793

// TODO(b/126579108): Propagate error status.

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1794

ClassificationResult knowledge_result;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1795

if (knowledge_engine_ &&

1796

knowledge_engine_->ClassifyText(

1797

context, selection_indices, options.annotation_usecase,

Tony Mak

90d5567

2020-04-15 18:20:44 +0100

[diff] [blame]

1798

options.location_context, Permissions(), &knowledge_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1799

candidates.push_back({selection_indices, {knowledge_result}});

1800

candidates.back().source = AnnotatedSpan::Source::KNOWLEDGE;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1801

}

1802

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1803

AddContactMetadataToKnowledgeClassificationResults(&candidates);

1804

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1805

// Try the contact engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1806

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1807

ClassificationResult contact_result;

1808

if (contact_engine_ && contact_engine_->ClassifyText(

1809

context, selection_indices, &contact_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1810

candidates.push_back({selection_indices, {contact_result}});

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1811

}

1812

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1813

// Try the person name engine.

1814

ClassificationResult person_name_result;

1815

if (person_name_engine_ &&

1816

person_name_engine_->ClassifyText(context, selection_indices,

1817

&person_name_result)) {

1818

candidates.push_back({selection_indices, {person_name_result}});

Tony Mak

d0ae7c6

2020-03-27 13:58:00 +0000

[diff] [blame]

1819

candidates.back().source = AnnotatedSpan::Source::PERSON_NAME;

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1820

}

1821

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1822

// Try the installed app engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1823

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1824

ClassificationResult installed_app_result;

1825

if (installed_app_engine_ &&

1826

installed_app_engine_->ClassifyText(context, selection_indices,

1827

&installed_app_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1828

candidates.push_back({selection_indices, {installed_app_result}});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1829

}

1830

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1831

// Try the regular expression models.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1832

std::vector<ClassificationResult> regex_results;

1833

if (!RegexClassifyText(context, selection_indices, &regex_results)) {

1834

return {};

1835

}

1836

for (const ClassificationResult& result : regex_results) {

1837

candidates.push_back({selection_indices, {result}});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1838

}

1839

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1840

// Try the date model.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1841

//

1842

// DatetimeClassifyText only returns the first result, which can however have

1843

// more interpretations. They are inserted in the candidates as a single

1844

// AnnotatedSpan, so that they get treated together by the conflict resolution

1845

// algorithm.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1846

std::vector<ClassificationResult> datetime_results;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1847

if (!DatetimeClassifyText(context, selection_indices, options,

1848

&datetime_results)) {

1849

return {};

1850

}

1851

if (!datetime_results.empty()) {

1852

candidates.push_back({selection_indices, std::move(datetime_results)});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1853

candidates.back().source = AnnotatedSpan::Source::DATETIME;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1854

}

1855

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1856

const UnicodeText context_unicode =

1857

UTF8ToUnicodeText(context, /*do_copy=*/false);

1858

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1859

// Try the number annotator.

1860

// TODO(b/126579108): Propagate error status.

1861

ClassificationResult number_annotator_result;

1862

if (number_annotator_ &&

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1863

number_annotator_->ClassifyText(context_unicode, selection_indices,

1864

options.annotation_usecase,

1865

&number_annotator_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1866

candidates.push_back({selection_indices, {number_annotator_result}});

1867

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1868

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1869

// Try the duration annotator.

1870

ClassificationResult duration_annotator_result;

1871

if (duration_annotator_ &&

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1872

duration_annotator_->ClassifyText(context_unicode, selection_indices,

1873

options.annotation_usecase,

1874

&duration_annotator_result)) {

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1875

candidates.push_back({selection_indices, {duration_annotator_result}});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1876

candidates.back().source = AnnotatedSpan::Source::DURATION;

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1877

}

1878

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1879

// Try the translate annotator.

1880

ClassificationResult translate_annotator_result;

1881

if (translate_annotator_ &&

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1882

translate_annotator_->ClassifyText(context_unicode, selection_indices,

1883

options.user_familiar_language_tags,

1884

&translate_annotator_result)) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1885

candidates.push_back({selection_indices, {translate_annotator_result}});

1886

}

1887

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1888

// Try the grammar model.

1889

ClassificationResult grammar_annotator_result;

1890

if (grammar_annotator_ && grammar_annotator_->ClassifyText(

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1891

detected_text_language_tags, context_unicode,

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1892

selection_indices, &grammar_annotator_result)) {

1893

candidates.push_back({selection_indices, {grammar_annotator_result}});

1894

}

1895

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1896

ClassificationResult pod_ner_annotator_result;

1897

if (pod_ner_annotator_ && options.use_pod_ner &&

1898

pod_ner_annotator_->ClassifyText(context_unicode, selection_indices,

1899

&pod_ner_annotator_result)) {

1900

candidates.push_back({selection_indices, {pod_ner_annotator_result}});

1901

}

1902

Tony Mak

2020-09-18 16:41:23 +0100

[diff] [blame]

1903

ClassificationResult vocab_annotator_result;

1904

if (vocab_annotator_ &&

1905

vocab_annotator_->ClassifyText(

1906

context_unicode, selection_indices, detected_text_language_tags,

1907

options.trigger_dictionary_on_beginner_words,

1908

&vocab_annotator_result)) {

1909

candidates.push_back({selection_indices, {vocab_annotator_result}});

1910

}

1911

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

1912

if (experimental_annotator_) {

1913

experimental_annotator_->ClassifyText(context_unicode, selection_indices,

1914

candidates);

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1915

}

1916

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1917

// Try the ML model.

1918

//

1919

// The output of the model is considered as an exclusive 1-of-N choice. That's

1920

// why it's inserted as only 1 AnnotatedSpan into candidates, as opposed to 1

1921

// span for each candidate, like e.g. the regex model.

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1922

InterpreterManager interpreter_manager(selection_executor_.get(),

1923

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1924

std::vector<ClassificationResult> model_results;

1925

std::vector<Token> tokens;

1926

if (!ModelClassifyText(

1927

context, /*cached_tokens=*/{}, detected_text_language_tags,

1928

selection_indices, &interpreter_manager,

1929

/*embedding_cache=*/nullptr, &model_results, &tokens)) {

1930

return {};

1931

}

1932

if (!model_results.empty()) {

1933

candidates.push_back({selection_indices, std::move(model_results)});

1934

}

1935

1936

std::vector<int> candidate_indices;

1937

if (!ResolveConflicts(candidates, context, tokens,

1938

detected_text_language_tags, options.annotation_usecase,

1939

&interpreter_manager, &candidate_indices)) {

1940

TC3_LOG(ERROR) << "Couldn't resolve conflicts.";

return {};

}

std::vector<ClassificationResult> results;

1945

for (const int i : candidate_indices) {

1946

for (const ClassificationResult& result : candidates[i].classification) {

1947

if (!FilteredForClassification(result)) {

1948

results.push_back(result);

1949

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1950

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1951

}

1952

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1953

// Sort results according to score.

1954

std::sort(results.begin(), results.end(),

1955

[](const ClassificationResult& a, const ClassificationResult& b) {

1956

return a.score > b.score;

1957

});

1958

1959

if (results.empty()) {

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1960

results = {{Collections::Other(), 1.0}};

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1961

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1962

return results;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1963

}

1964

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1965

bool Annotator::ModelAnnotate(

1966

const std::string& context,

1967

const std::vector<Locale>& detected_text_language_tags,

1968

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

1969

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1970

if (model_->triggering_options() == nullptr ||

1971

!(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1975

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1976

ml_model_triggering_locales_,

1977

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1981

const UnicodeText context_unicode = UTF8ToUnicodeText(context,

1982

/*do_copy=*/false);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1983

std::vector<UnicodeTextRange> lines;

1984

if (!selection_feature_processor_->GetOptions()->only_use_line_with_click()) {

1985

lines.push_back({context_unicode.begin(), context_unicode.end()});

1986

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1987

lines = selection_feature_processor_->SplitContext(

1988

context_unicode, selection_feature_processor_->GetOptions()

1989

->use_pipe_character_for_newline());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1990

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1991

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1992

const float min_annotate_confidence =

1993

(model_->triggering_options() != nullptr

1994

? model_->triggering_options()->min_annotate_confidence()

1995

: 0.f);

1996

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1997

for (const UnicodeTextRange& line : lines) {

Tony Mak

408c6b8

2019-03-08 17:57:27 +0000

[diff] [blame]

1998

FeatureProcessor::EmbeddingCache embedding_cache;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1999

const std::string line_str =

2000

UnicodeText::UTF8Substring(line.first, line.second);

2001

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2002

*tokens = selection_feature_processor_->Tokenize(line_str);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2003

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2004

line_str, {0, std::distance(line.first, line.second)},

2005

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2006

tokens,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2007

/*click_pos=*/nullptr);

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2008

const TokenSpan full_line_span = {0,

2009

static_cast<TokenIndex>(tokens->size())};

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2010

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

2011

// TODO(zilka): Add support for greater granularity of this check.

2012

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

2013

*tokens, full_line_span)) {

continue;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2017

std::unique_ptr<CachedFeatures> cached_features;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2018

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2019

*tokens, full_line_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2020

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

2021

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2022

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2023

selection_feature_processor_->EmbeddingSize() +

2024

selection_feature_processor_->DenseFeaturesCount(),

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2025

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2026

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2027

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2028

}

2029

2030

std::vector<TokenSpan> local_chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2031

if (!ModelChunk(tokens->size(), /*span_of_interest=*/full_line_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2032

interpreter_manager->SelectionInterpreter(),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2033

*cached_features, &local_chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2034

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2035

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2036

}

2037

2038

const int offset = std::distance(context_unicode.begin(), line.first);

2039

for (const TokenSpan& chunk : local_chunks) {

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2040

CodepointSpan codepoint_span =

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2041

selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2042

line_str, TokenSpanToCodepointSpan(*tokens, chunk));

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2043

if (model_->selection_options()->strip_unpaired_brackets()) {

2044

codepoint_span =

2045

StripUnpairedBrackets(context_unicode, codepoint_span, *unilib_);

2046

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2047

2048

// Skip empty spans.

2049

if (codepoint_span.first != codepoint_span.second) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2050

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2051

if (!ModelClassifyText(line_str, *tokens, detected_text_language_tags,

2052

codepoint_span, interpreter_manager,

2053

&embedding_cache, &classification)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2054

TC3_LOG(ERROR) << "Could not classify text: "

2055

<< (codepoint_span.first + offset) << " "

2056

<< (codepoint_span.second + offset);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return false;

}

// Do not include the span if it's classified as "other".

2061

if (!classification.empty() && !ClassifiedAsOther(classification) &&

2062

classification[0].score >= min_annotate_confidence) {

2063

AnnotatedSpan result_span;

2064

result_span.span = {codepoint_span.first + offset,

2065

codepoint_span.second + offset};

2066

result_span.classification = std::move(classification);

2067

result->push_back(std::move(result_span));

2068

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2069

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2070

}

2071

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2075

// Test-only accessor: returns the raw pointer held by
// selection_feature_processor_. The annotator keeps ownership.
const FeatureProcessor* Annotator::SelectionFeatureProcessorForTests() const {
  const FeatureProcessor* const processor = selection_feature_processor_.get();
  return processor;
}

2078

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2079

// Test-only accessor: returns the raw pointer held by
// classification_feature_processor_. The annotator keeps ownership.
const FeatureProcessor* Annotator::ClassificationFeatureProcessorForTests()
    const {
  const FeatureProcessor* const processor =
      classification_feature_processor_.get();
  return processor;
}

2083

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2084

// Test-only accessor: returns the raw pointer held by datetime_parser_.
// The annotator keeps ownership.
const DatetimeParser* Annotator::DatetimeParserForTests() const {
  const DatetimeParser* const parser = datetime_parser_.get();
  return parser;
}

2087

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2088

void Annotator::RemoveNotEnabledEntityTypes(

2089

const EnabledEntityTypes& is_entity_type_enabled,

2090

std::vector<AnnotatedSpan>* annotated_spans) const {

2091

for (AnnotatedSpan& annotated_span : *annotated_spans) {

2092

std::vector<ClassificationResult>& classifications =

2093

annotated_span.classification;

2094

classifications.erase(

2095

std::remove_if(classifications.begin(), classifications.end(),

2096

[&is_entity_type_enabled](

2097

const ClassificationResult& classification_result) {

2098

return !is_entity_type_enabled(

2099

classification_result.collection);

2100

}),

2101

classifications.end());

2102

}

2103

annotated_spans->erase(

2104

std::remove_if(annotated_spans->begin(), annotated_spans->end(),

2105

[](const AnnotatedSpan& annotated_span) {

2106

return annotated_span.classification.empty();

2107

}),

2108

annotated_spans->end());

2109

}

2110

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2111

void Annotator::AddContactMetadataToKnowledgeClassificationResults(

2112

std::vector<AnnotatedSpan>* candidates) const {

2113

if (candidates == nullptr || contact_engine_ == nullptr) {

2114

return;

2115

}

2116

for (auto& candidate : *candidates) {

2117

for (auto& classification_result : candidate.classification) {

2118

contact_engine_->AddContactMetadataToKnowledgeClassificationResult(

2119

&classification_result);

}

}

}

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2124

Status Annotator::AnnotateSingleInput(

2125

const std::string& context, const AnnotationOptions& options,

2126

std::vector<AnnotatedSpan>* candidates) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2127

if (!(model_->enabled_modes() & ModeFlag_ANNOTATION)) {

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2128

return Status(StatusCode::UNAVAILABLE, "Model annotation was not enabled.");

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2129

}

2130

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2131

const UnicodeText context_unicode =

2132

UTF8ToUnicodeText(context, /*do_copy=*/false);

2133

if (!context_unicode.is_valid()) {

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2134

return Status(StatusCode::INVALID_ARGUMENT,

2135

"Context string isn't valid UTF8.");

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2136

}

2137

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2138

std::vector<Locale> detected_text_language_tags;

2139

if (!ParseLocales(options.detected_text_language_tags,

2140

&detected_text_language_tags)) {

2141

TC3_LOG(WARNING)

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2142

<< "Failed to parse the detected_text_language_tags in options: "

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2143

<< options.detected_text_language_tags;

2144

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2145

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

2146

model_triggering_locales_,

2147

/*default_value=*/true)) {

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2148

return Status(

2149

StatusCode::UNAVAILABLE,

2150

"The detected language tags are not in the supported locales.");

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2151

}

2152

2153

InterpreterManager interpreter_manager(selection_executor_.get(),

2154

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2155

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2156

// Annotate with the selection model.

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2157

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2158

if (!ModelAnnotate(context, detected_text_language_tags, &interpreter_manager,

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2159

&tokens, candidates)) {

2160

return Status(StatusCode::INTERNAL, "Couldn't run ModelAnnotate.");

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2161

}

2162

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2163

const EnabledEntityTypes is_entity_type_enabled(options.entity_types);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2164

// Annotate with the regular expression models.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2165

if (!RegexChunk(

2166

UTF8ToUnicodeText(context, /*do_copy=*/false),

2167

annotation_regex_patterns_, options.is_serialized_entity_data_enabled,

2168

is_entity_type_enabled, options.annotation_usecase, candidates)) {

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2169

return Status(StatusCode::INTERNAL, "Couldn't run RegexChunk.");

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2170

}

2171

2172

// Annotate with the datetime model.

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2173

if ((is_entity_type_enabled(Collections::Date()) ||

2174

is_entity_type_enabled(Collections::DateTime())) &&

2175

!DatetimeChunk(UTF8ToUnicodeText(context, /*do_copy=*/false),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2176

options.reference_time_ms_utc, options.reference_timezone,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2177

options.locales, ModeFlag_ANNOTATION,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2178

options.annotation_usecase,

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2179

options.is_serialized_entity_data_enabled, candidates)) {

2180

return Status(StatusCode::INTERNAL, "Couldn't run DatetimeChunk.");

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2181

}

2182

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2183

// Annotate with the contact engine.

2184

if (contact_engine_ &&

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2185

!contact_engine_->Chunk(context_unicode, tokens, candidates)) {

2186

return Status(StatusCode::INTERNAL, "Couldn't run contact engine Chunk.");

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2187

}

2188

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2189

// Annotate with the installed app engine.

2190

if (installed_app_engine_ &&

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2191

!installed_app_engine_->Chunk(context_unicode, tokens, candidates)) {

2192

return Status(StatusCode::INTERNAL,

2193

"Couldn't run installed app engine Chunk.");

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2194

}

2195

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2196

// Annotate with the number annotator.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2197

bool number_annotations_enabled = true;

2198

// Disable running the annotator in RAW mode if the number/percentage

2199

// annotations are not explicitly requested.

2200

if (options.annotation_usecase == AnnotationUsecase_ANNOTATION_USECASE_RAW &&

2201

!is_entity_type_enabled(Collections::Number()) &&

2202

!is_entity_type_enabled(Collections::Percentage())) {

2203

number_annotations_enabled = false;

2204

}

2205

if (number_annotations_enabled && number_annotator_ != nullptr &&

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2206

!number_annotator_->FindAll(context_unicode, options.annotation_usecase,

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2207

candidates)) {

2208

return Status(StatusCode::INTERNAL,

2209

"Couldn't run number annotator FindAll.");

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

2210

}

2211

2212

// Annotate with the duration annotator.

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2213

if (is_entity_type_enabled(Collections::Duration()) &&

2214

duration_annotator_ != nullptr &&

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

2215

!duration_annotator_->FindAll(context_unicode, tokens,

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2216

options.annotation_usecase, candidates)) {

2217

return Status(StatusCode::INTERNAL,

2218

"Couldn't run duration annotator FindAll.");

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2219

}

2220

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

2221

// Annotate with the person name engine.

2222

if (is_entity_type_enabled(Collections::PersonName()) &&

2223

person_name_engine_ &&

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2224

!person_name_engine_->Chunk(context_unicode, tokens, candidates)) {

2225

return Status(StatusCode::INTERNAL,

2226

"Couldn't run person name engine Chunk.");

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

2227

}

2228

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2229

// Annotate with the grammar annotators.

2230

if (grammar_annotator_ != nullptr &&

2231

!grammar_annotator_->Annotate(detected_text_language_tags,

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2232

context_unicode, candidates)) {

2233

return Status(StatusCode::INTERNAL, "Couldn't run grammar annotators.");

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2234

}

2235

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2236

// Annotate with the POD NER annotator.

2237

if (pod_ner_annotator_ != nullptr && options.use_pod_ner &&

2238

!pod_ner_annotator_->Annotate(context_unicode, candidates)) {

2239

return Status(StatusCode::INTERNAL, "Couldn't run POD NER annotator.");

2240

}

2241

Tony Mak

2020-09-18 16:41:23 +0100

[diff] [blame]

2242

// Annotate with the vocab annotator.

2243

if (vocab_annotator_ != nullptr &&

2244

!vocab_annotator_->Annotate(context_unicode, detected_text_language_tags,

2245

options.trigger_dictionary_on_beginner_words,

2246

candidates)) {

2247

return Status(StatusCode::INTERNAL, "Couldn't run vocab annotator.");

2248

}

2249

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2250

// Annotate with the experimental annotator.

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

2251

if (experimental_annotator_ != nullptr &&

2252

!experimental_annotator_->Annotate(context_unicode, candidates)) {

2253

return Status(StatusCode::INTERNAL, "Couldn't run experimental annotator.");

2254

}

2255

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2256

// Sort candidates according to their position in the input, so that the next

2257

// code can assume that any connected component of overlapping spans forms a

2258

// contiguous block.

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

2259

// Also sort them according to the end position and collection, so that the

2260

// deduplication code below can assume that same spans and classifications

2261

// form contiguous blocks.

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2262

std::sort(candidates->begin(), candidates->end(),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2263

[](const AnnotatedSpan& a, const AnnotatedSpan& b) {

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

2264

if (a.span.first != b.span.first) {

2265

return a.span.first < b.span.first;

2266

}

2267

2268

if (a.span.second != b.span.second) {

2269

return a.span.second < b.span.second;

2270

}

2271

2272

return a.classification[0].collection <

2273

b.classification[0].collection;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2274

});

2275

2276

std::vector<int> candidate_indices;

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2277

if (!ResolveConflicts(*candidates, context, tokens,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2278

detected_text_language_tags, options.annotation_usecase,

2279

&interpreter_manager, &candidate_indices)) {

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2280

return Status(StatusCode::INTERNAL, "Couldn't resolve conflicts.");

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2281

}

2282

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

2283

// Remove candidates that overlap exactly and have the same collection.

2284

// This can e.g. happen for phone coming from both ML model and regex.

2285

candidate_indices.erase(

2286

std::unique(candidate_indices.begin(), candidate_indices.end(),

2287

[&candidates](const int a_index, const int b_index) {

2288

const AnnotatedSpan& a = (*candidates)[a_index];

2289

const AnnotatedSpan& b = (*candidates)[b_index];

2290

return a.span == b.span &&

2291

a.classification[0].collection ==

2292

b.classification[0].collection;

2293

}),

2294

candidate_indices.end());

2295

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2296

std::vector<AnnotatedSpan> result;

2297

result.reserve(candidate_indices.size());

2298

for (const int i : candidate_indices) {

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2299

if ((*candidates)[i].classification.empty() ||

2300

ClassifiedAsOther((*candidates)[i].classification) ||

2301

FilteredForAnnotation((*candidates)[i])) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2302

continue;

2303

}

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

2304

result.push_back(std::move((*candidates)[i]));

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2305

}

2306

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2307

// We generate all candidates and remove them later (with the exception of

2308

// date/time/duration entities) because there are complex interdependencies

2309

// between the entity types. E.g., the TLD of an email can be interpreted as a

2310

// URL, but most likely a user of the API does not want such annotations if

2311

// "url" is enabled and "email" is not.

2312

RemoveNotEnabledEntityTypes(is_entity_type_enabled, &result);

2313

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2314

for (AnnotatedSpan& annotated_span : result) {

2315

SortClassificationResults(&annotated_span.classification);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2316

}

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2317

*candidates = result;

2318

return Status::OK;

2319

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2320

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2321

// Annotates a batch of input fragments and returns one list of annotated spans
// per fragment, in input order.
//
// The knowledge engine (when present) is run once over the texts of all
// fragments; all remaining work is delegated to AnnotateSingleInput, fragment
// by fragment. A fragment that carries its own datetime options overrides the
// reference time and timezone of `options` for that fragment only.
//
// Returns an INTERNAL status if the knowledge engine fails or if the number of
// result lists no longer matches the number of texts, and forwards the first
// non-OK status reported by AnnotateSingleInput.
StatusOr<Annotations> Annotator::AnnotateStructuredInput(
    const std::vector<InputFragment>& string_fragments,
    const AnnotationOptions& options) const {
  const size_t num_fragments = string_fragments.size();

  Annotations annotation_candidates;
  annotation_candidates.annotated_spans.resize(num_fragments);

  // Collect the raw texts; this is the form the knowledge engine consumes.
  std::vector<std::string> text_to_annotate;
  text_to_annotate.reserve(num_fragments);
  for (const auto& fragment : string_fragments) {
    text_to_annotate.push_back(fragment.text);
  }

  // KnowledgeEngine is special, because it supports annotation of multiple
  // fragments at once.
  if (knowledge_engine_) {
    const bool knowledge_ok =
        knowledge_engine_
            ->ChunkMultipleSpans(text_to_annotate, options.annotation_usecase,
                                 options.location_context, options.permissions,
                                 options.annotate_mode, &annotation_candidates)
            .ok();
    if (!knowledge_ok) {
      return Status(StatusCode::INTERNAL,
                    "Couldn't run knowledge engine Chunk.");
    }
  }

  // The annotator engines shouldn't change the number of annotation vectors.
  const size_t num_result_lists = annotation_candidates.annotated_spans.size();
  if (num_result_lists != text_to_annotate.size()) {
    TC3_LOG(ERROR) << "Received " << text_to_annotate.size()
                   << " texts to annotate but generated a different number of "
                      "lists of annotations:"
                   << annotation_candidates.annotated_spans.size();
    return Status(StatusCode::INTERNAL,
                  "Number of annotation candidates differs from "
                  "number of texts to annotate.");
  }

  // As an optimization, if the only annotated type is Entity, we skip all the
  // other annotators than the KnowledgeEngine. This only happens in the raw
  // mode, to make sure it does not affect the result.
  const bool only_entity_requested =
      options.annotation_usecase == ANNOTATION_USECASE_RAW &&
      options.entity_types.size() == 1 &&
      *options.entity_types.begin() == Collections::Entity();
  if (only_entity_requested) {
    return annotation_candidates;
  }

  // Other annotators run on each fragment independently.
  for (size_t index = 0; index < text_to_annotate.size(); ++index) {
    const auto& fragment = string_fragments[index];

    // Per-fragment copy so that datetime overrides do not leak into the
    // options used for the following fragments.
    AnnotationOptions fragment_options = options;
    if (fragment.datetime_options.has_value()) {
      const DatetimeOptions fragment_datetime =
          fragment.datetime_options.value();
      fragment_options.reference_time_ms_utc =
          fragment_datetime.reference_time_ms_utc;
      fragment_options.reference_timezone =
          fragment_datetime.reference_timezone;
    }

    auto& fragment_spans = annotation_candidates.annotated_spans[index];
    AddContactMetadataToKnowledgeClassificationResults(&fragment_spans);

    Status fragment_status = AnnotateSingleInput(
        text_to_annotate[index], fragment_options, &fragment_spans);
    if (!fragment_status.ok()) {
      return fragment_status;
    }
  }
  return annotation_candidates;
}

2387

2388

std::vector<AnnotatedSpan> Annotator::Annotate(

2389

const std::string& context, const AnnotationOptions& options) const {

2390

std::vector<InputFragment> string_fragments;

2391

string_fragments.push_back({.text = context});

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2392

StatusOr<Annotations> annotations =

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2393

AnnotateStructuredInput(string_fragments, options);

2394

if (!annotations.ok()) {

2395

TC3_LOG(ERROR) << "Returned error when calling AnnotateStructuredInput: "

2396

<< annotations.status().error_message();

2397

return {};

2398

}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2399

return annotations.ValueOrDie().annotated_spans[0];

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2400

}

2401

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2402

// Computes the codepoint span a regex match selects.
//
// Without capturing group configuration the span of capturing group 1 is used.
// Otherwise the result is the smallest span covering every matched group whose
// config has `extend_selection` set. Returns {kInvalidIndex, kInvalidIndex} if
// the matcher reports an error, or if no configured group contributed.
CodepointSpan Annotator::ComputeSelectionBoundaries(
    const UniLib::RegexMatcher* match,
    const RegexModel_::Pattern* config) const {
  const CodepointSpan invalid_span = {kInvalidIndex, kInvalidIndex};
  const auto* groups = config->capturing_group();

  if (groups == nullptr) {
    // Use first capturing group to specify the selection.
    int status = UniLib::RegexMatcher::kNoError;
    const int start = match->Start(1, &status);
    const int end = match->End(1, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    const CodepointSpan first_group_span = {start, end};
    return first_group_span;
  }

  CodepointSpan selection = invalid_span;
  const int group_count = groups->size();
  for (int group_id = 0; group_id < group_count; ++group_id) {
    if (!groups->Get(group_id)->extend_selection()) {
      continue;
    }

    // Check match and adjust bounds.
    int status = UniLib::RegexMatcher::kNoError;
    const int group_start = match->Start(group_id, &status);
    const int group_end = match->End(group_id, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }

    // Groups that did not participate in the match leave the span untouched.
    const bool group_matched =
        group_start != kInvalidIndex && group_end != kInvalidIndex;
    if (!group_matched) {
      continue;
    }

    if (selection.first == kInvalidIndex) {
      // First contributing group seeds the span.
      selection = {group_start, group_end};
    } else {
      selection.first = std::min(selection.first, group_start);
      selection.second = std::max(selection.second, group_end);
    }
  }
  return selection;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2443

// Returns true if the regex pattern can contribute entity data: either the
// pattern itself carries fixed entity data (serialized or table form), or any
// of its capturing groups carries fixed entity data or names an entity field
// path to fill from the matched text.
bool Annotator::HasEntityData(const RegexModel_::Pattern* pattern) const {
  const bool pattern_has_fixed_data =
      pattern->serialized_entity_data() != nullptr ||
      pattern->entity_data() != nullptr;
  if (pattern_has_fixed_data) {
    return true;
  }

  const auto* groups = pattern->capturing_group();
  if (groups == nullptr) {
    return false;
  }

  for (const CapturingGroup* group : *groups) {
    const bool group_has_data = group->entity_field_path() != nullptr ||
                                group->serialized_entity_data() != nullptr ||
                                group->entity_data() != nullptr;
    if (group_has_data) {
      return true;
    }
  }
  return false;
}

bool Annotator::SerializedEntityDataFromRegexMatch(

2463

const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,

2464

std::string* serialized_entity_data) const {

2465

if (!HasEntityData(pattern)) {

2466

serialized_entity_data->clear();

2467

return true;

2468

}

2469

TC3_CHECK(entity_data_builder_ != nullptr);

2470

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2471

std::unique_ptr<MutableFlatbuffer> entity_data =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2472

entity_data_builder_->NewRoot();

2473

2474

TC3_CHECK(entity_data != nullptr);

2475

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2476

// Set fixed entity data.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2477

if (pattern->serialized_entity_data() != nullptr) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2478

entity_data->MergeFromSerializedFlatbuffer(

2479

StringPiece(pattern->serialized_entity_data()->c_str(),

2480

pattern->serialized_entity_data()->size()));

2481

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2482

if (pattern->entity_data() != nullptr) {

2483

entity_data->MergeFrom(

2484

reinterpret_cast<const flatbuffers::Table*>(pattern->entity_data()));

2485

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2486

2487

// Add entity data from rule capturing groups.

2488

if (pattern->capturing_group() != nullptr) {

2489

const int num_groups = pattern->capturing_group()->size();

2490

for (int i = 0; i < num_groups; i++) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2491

const CapturingGroup* group = pattern->capturing_group()->Get(i);

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2492

2493

// Check whether the group matched.

2494

Optional<std::string> group_match_text =

2495

GetCapturingGroupText(matcher, /*group_id=*/i);

2496

if (!group_match_text.has_value()) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2497

continue;

2498

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2499

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2500

// Set fixed entity data from capturing group match.

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2501

if (group->serialized_entity_data() != nullptr) {

2502

entity_data->MergeFromSerializedFlatbuffer(

2503

StringPiece(group->serialized_entity_data()->c_str(),

2504

group->serialized_entity_data()->size()));

2505

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2506

if (group->entity_data() != nullptr) {

2507

entity_data->MergeFrom(reinterpret_cast<const flatbuffers::Table*>(

2508

pattern->entity_data()));

2509

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2510

2511

// Set entity field from capturing group text.

2512

if (group->entity_field_path() != nullptr) {

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

2513

UnicodeText normalized_group_match_text =

2514

UTF8ToUnicodeText(group_match_text.value(), /*do_copy=*/false);

2515

2516

// Apply normalization if specified.

2517

if (group->normalization_options() != nullptr) {

2518

normalized_group_match_text =

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

2519

NormalizeText(*unilib_, group->normalization_options(),

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

2520

normalized_group_match_text);

2521

}

2522

2523

if (!entity_data->ParseAndSet(

2524

group->entity_field_path(),

2525

normalized_group_match_text.ToUTF8String())) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2526

TC3_LOG(ERROR)

2527

<< "Could not set entity data from rule capturing group.";

2528

return false;

2529

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

}

}

}

*serialized_entity_data = entity_data->Serialize();

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2538

// Returns the codepoints of `amount` that precede `it_decimal_separator` (or
// all of `amount` if that iterator is `amount.end()`), with every codepoint
// contained in `decimal_separators` dropped.
UnicodeText RemoveMoneySeparators(
    const std::unordered_set<char32>& decimal_separators,
    const UnicodeText& amount,
    UnicodeText::const_iterator it_decimal_separator) {
  UnicodeText whole_amount;
  for (auto it = amount.begin();
       it != amount.end() && it != it_decimal_separator; ++it) {
    // Use the set's own hash lookup rather than a linear std::find scan.
    const bool is_separator =
        decimal_separators.count(static_cast<char32>(*it)) != 0;
    if (!is_separator) {
      whole_amount.push_back(*it);
    }
  }
  return whole_amount;
}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2553

void Annotator::GetMoneyQuantityFromCapturingGroup(

2554

const UniLib::RegexMatcher* match, const RegexModel_::Pattern* config,

2555

const UnicodeText& context_unicode, std::string* quantity,

2556

int* exponent) const {

2557

if (config->capturing_group() == nullptr) {

*exponent = 0;

return;

}

const int num_groups = config->capturing_group()->size();

2563

for (int i = 0; i < num_groups; i++) {

2564

int status = UniLib::RegexMatcher::kNoError;

2565

const int group_start = match->Start(i, &status);

2566

const int group_end = match->End(i, &status);

2567

if (group_start == kInvalidIndex || group_end == kInvalidIndex) {

continue;

}

*quantity =

unilib_

->ToLowerText(UnicodeText::Substring(context_unicode, group_start,

2574

group_end, /*do_copy=*/false))

2575

.ToUTF8String();

2576

2577

if (auto entry = model_->money_parsing_options()

2578

->quantities_name_to_exponent()

2579

->LookupByKey((*quantity).c_str())) {

2580

*exponent = entry->value();

return;

}

}

*exponent = 0;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2587

bool Annotator::ParseAndFillInMoneyAmount(

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2588

std::string* serialized_entity_data, const UniLib::RegexMatcher* match,

2589

const RegexModel_::Pattern* config,

2590

const UnicodeText& context_unicode) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2591

std::unique_ptr<EntityDataT> data =

2592

LoadAndVerifyMutableFlatbuffer<libtextclassifier3::EntityData>(

2593

*serialized_entity_data);

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2594

if (data == nullptr) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame]

2595

if (model_->version() >= 706) {

2596

// This way of parsing money entity data is enabled for models newer than

2597

// v706, consequently logging errors only for them (b/156634162).

2598

TC3_LOG(ERROR)

2599

<< "Data field is null when trying to parse Money Entity Data";

2600

}

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2601

return false;

2602

}

2603

if (data->money->unnormalized_amount.empty()) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame]

2604

if (model_->version() >= 706) {

2605

// This way of parsing money entity data is enabled for models newer than

2606

// v706, consequently logging errors only for them (b/156634162).

2607

TC3_LOG(ERROR)

2608

<< "Data unnormalized_amount is empty when trying to parse "

2609

"Money Entity Data";

2610

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

return false;

}

UnicodeText amount =

UTF8ToUnicodeText(data->money->unnormalized_amount, /*do_copy=*/false);

2616

int separator_back_index = 0;

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2617

auto it_decimal_separator = --amount.end();

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2618

for (; it_decimal_separator != amount.begin();

2619

--it_decimal_separator, ++separator_back_index) {

2620

if (std::find(money_separators_.begin(), money_separators_.end(),

2621

static_cast<char32>(*it_decimal_separator)) !=

2622

money_separators_.end()) {

break;

}

}

// If there are 3 digits after the last separator, we consider that a

2628

// thousands separator => the number is an int (e.g. 1.234 is considered int).

2629

// If there is no separator in number, also that number is an int.

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2630

if (separator_back_index == 3 || it_decimal_separator == amount.begin()) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2631

it_decimal_separator = amount.end();

2632

}

2633

2634

if (!unilib_->ParseInt32(RemoveMoneySeparators(money_separators_, amount,

2635

it_decimal_separator),

2636

&data->money->amount_whole_part)) {

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2637

TC3_LOG(ERROR) << "Could not parse the money whole part as int32 from the "

2638

"amount: "

2639

<< data->money->unnormalized_amount;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2640

return false;

2641

}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2642

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2643

if (it_decimal_separator == amount.end()) {

2644

data->money->amount_decimal_part = 0;

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2645

data->money->nanos = 0;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2646

} else {

2647

const int amount_codepoints_size = amount.size_codepoints();

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2648

const UnicodeText decimal_part = UnicodeText::Substring(

2649

amount, amount_codepoints_size - separator_back_index,

2650

amount_codepoints_size, /*do_copy=*/false);

2651

if (!unilib_->ParseInt32(decimal_part, &data->money->amount_decimal_part)) {

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2652

TC3_LOG(ERROR) << "Could not parse the money decimal part as int32 from "

2653

"the amount: "

2654

<< data->money->unnormalized_amount;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2655

return false;

2656

}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2657

data->money->nanos = data->money->amount_decimal_part *

2658

pow(10, 9 - decimal_part.size_codepoints());

2659

}

2660

2661

if (model_->money_parsing_options()->quantities_name_to_exponent() !=

2662

nullptr) {

2663

int quantity_exponent;

2664

std::string quantity;

2665

GetMoneyQuantityFromCapturingGroup(match, config, context_unicode,

2666

&quantity, &quantity_exponent);

Tony Mak

074ee38

2020-09-30 19:11:00 +0100

[diff] [blame^]

2667

if ((quantity_exponent > 0 && quantity_exponent < 9) ||

2668

(quantity_exponent == 9 && data->money->amount_whole_part <= 2)) {

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2669

data->money->amount_whole_part =

2670

data->money->amount_whole_part * pow(10, quantity_exponent) +

2671

data->money->nanos / pow(10, 9 - quantity_exponent);

2672

data->money->nanos = data->money->nanos %

2673

static_cast<int>(pow(10, 9 - quantity_exponent)) *

2674

pow(10, quantity_exponent);

Tony Mak

074ee38

2020-09-30 19:11:00 +0100

[diff] [blame^]

2675

}

2676

if (quantity_exponent > 0) {

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2677

data->money->unnormalized_amount = strings::JoinStrings(

2678

" ", {data->money->unnormalized_amount, quantity});

2679

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2680

}

2681

2682

*serialized_entity_data =

2683

PackFlatbuffer<libtextclassifier3::EntityData>(data.get());

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2687

bool Annotator::RegexChunk(const UnicodeText& context_unicode,

2688

const std::vector<int>& rules,

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2689

bool is_serialized_entity_data_enabled,

2690

const EnabledEntityTypes& enabled_entity_types,

2691

const AnnotationUsecase& annotation_usecase,

2692

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2693

for (int pattern_id : rules) {

2694

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2695

if (!enabled_entity_types(regex_pattern.config->collection_name()->str()) &&

2696

annotation_usecase == AnnotationUsecase_ANNOTATION_USECASE_RAW) {

2697

// No regex annotation type has been requested, skip regex annotation.

2698

continue;

2699

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2700

const auto matcher = regex_pattern.pattern->Matcher(context_unicode);

2701

if (!matcher) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2702

TC3_LOG(ERROR) << "Could not get regex matcher for pattern: "

2703

<< pattern_id;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

int status = UniLib::RegexMatcher::kNoError;

2708

while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2709

if (regex_pattern.config->verification_options()) {

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2710

if (!VerifyRegexMatchCandidate(

2711

context_unicode.ToUTF8String(),

2712

regex_pattern.config->verification_options(),

2713

matcher->Group(1, &status).ToUTF8String(), matcher.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2714

continue;

2715

}

2716

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2717

2718

std::string serialized_entity_data;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2719

if (is_serialized_entity_data_enabled) {

2720

if (!SerializedEntityDataFromRegexMatch(

2721

regex_pattern.config, matcher.get(), &serialized_entity_data)) {

2722

TC3_LOG(ERROR) << "Could not get entity data.";

2723

return false;

2724

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2725

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2726

// Further parsing of money amount. Need this since regexes cannot have

2727

// empty groups that fill in entity data (amount_decimal_part and

2728

// quantity might be empty groups).

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2729

if (regex_pattern.config->collection_name()->str() ==

2730

Collections::Money()) {

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2731

if (!ParseAndFillInMoneyAmount(&serialized_entity_data, matcher.get(),

2732

regex_pattern.config,

2733

context_unicode)) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame]

2734

if (model_->version() >= 706) {

2735

// This way of parsing money entity data is enabled for models

2736

// newer than v706 => logging errors only for them (b/156634162).

2737

TC3_LOG(ERROR) << "Could not parse and fill in money amount.";

2738

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2739

}

2740

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2741

}

2742

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2743

result->emplace_back();

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2744

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2745

// Selection/annotation regular expressions need to specify a capturing

2746

// group specifying the selection.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2747

result->back().span =

2748

ComputeSelectionBoundaries(matcher.get(), regex_pattern.config);

2749

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2750

result->back().classification = {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2751

{regex_pattern.config->collection_name()->str(),

2752

regex_pattern.config->target_classification_score(),

2753

regex_pattern.config->priority_score()}};

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2754

2755

result->back().classification[0].serialized_entity_data =

2756

serialized_entity_data;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2762

bool Annotator::ModelChunk(int num_tokens, const TokenSpan& span_of_interest,

2763

tflite::Interpreter* selection_interpreter,

2764

const CachedFeatures& cached_features,

2765

std::vector<TokenSpan>* chunks) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2766

const int max_selection_span =

2767

selection_feature_processor_->GetOptions()->max_selection_span();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2768

// The inference span is the span of interest expanded to include

2769

// max_selection_span tokens on either side, which is how far a selection can

2770

// stretch from the click.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2771

const TokenSpan inference_span =

2772

IntersectTokenSpans(span_of_interest.Expand(

2773

/*num_tokens_left=*/max_selection_span,

2774

/*num_tokens_right=*/max_selection_span),

2775

{0, num_tokens});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2776

2777

std::vector<ScoredChunk> scored_chunks;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2778

if (selection_feature_processor_->GetOptions()->bounds_sensitive_features() &&

2779

selection_feature_processor_->GetOptions()

2780

->bounds_sensitive_features()

2781

->enabled()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2782

if (!ModelBoundsSensitiveScoreChunks(

2783

num_tokens, span_of_interest, inference_span, cached_features,

2784

selection_interpreter, &scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

} else {

if (!ModelClickContextScoreChunks(num_tokens, span_of_interest,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2789

cached_features, selection_interpreter,

2790

&scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2791

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2792

}

2793

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2794

std::sort(scored_chunks.rbegin(), scored_chunks.rend(),

2795

[](const ScoredChunk& lhs, const ScoredChunk& rhs) {

2796

return lhs.score < rhs.score;

2797

});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2798

2799

// Traverse the candidate chunks from highest-scoring to lowest-scoring. Pick

2800

// them greedily as long as they do not overlap with any previously picked

2801

// chunks.

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2802

std::vector<bool> token_used(inference_span.Size());

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2803

chunks->clear();

2804

for (const ScoredChunk& scored_chunk : scored_chunks) {

2805

bool feasible = true;

2806

for (int i = scored_chunk.token_span.first;

2807

i < scored_chunk.token_span.second; ++i) {

2808

if (token_used[i - inference_span.first]) {

feasible = false;

break;

}

}

if (!feasible) {

continue;

}

for (int i = scored_chunk.token_span.first;

2819

i < scored_chunk.token_span.second; ++i) {

2820

token_used[i - inference_span.first] = true;

2821

}

2822

2823

chunks->push_back(scored_chunk.token_span);

2824

}

2825

2826

std::sort(chunks->begin(), chunks->end());

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2831

namespace {
// Updates the value at the given key in the map to maximum of the current value
// and the given value, or simply inserts the value if the key is not yet there.
//
// Implemented with a single insert(): when the key already exists, insert()
// leaves the map untouched and hands back an iterator to the existing entry,
// so no second lookup is needed and the mapped type does not have to be
// default-constructible.
template <typename Map>
void UpdateMax(Map* map, typename Map::key_type key,
               typename Map::mapped_type value) {
  const auto insertion = map->insert(typename Map::value_type(key, value));
  if (!insertion.second) {
    auto& stored = insertion.first->second;
    stored = std::max(stored, value);
  }
}
}  // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2846

bool Annotator::ModelClickContextScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2847

int num_tokens, const TokenSpan& span_of_interest,

2848

const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2849

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2850

std::vector<ScoredChunk>* scored_chunks) const {

2851

const int max_batch_size = model_->selection_options()->batch_size();

2852

2853

std::vector<float> all_features;

2854

std::map<TokenSpan, float> chunk_scores;

2855

for (int batch_start = span_of_interest.first;

2856

batch_start < span_of_interest.second; batch_start += max_batch_size) {

2857

const int batch_end =

2858

std::min(batch_start + max_batch_size, span_of_interest.second);

2859

2860

// Prepare features for the whole batch.

2861

all_features.clear();

2862

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2863

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2864

cached_features.AppendClickContextFeaturesForClick(click_pos,

&all_features);

}

// Run batched inference.

2869

const int batch_size = batch_end - batch_start;

2870

const int features_size = cached_features.OutputFeaturesSize();

2871

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2872

TensorView<float>(all_features.data(), {batch_size, features_size}),

2873

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2874

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2875

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2876

return false;

2877

}

2878

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2879

logits.dim(1) !=

2880

selection_feature_processor_->GetSelectionLabelCount()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2881

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2887

const std::vector<float> scores = ComputeSoftmax(

2888

logits.data() + logits.dim(1) * (click_pos - batch_start),

2889

logits.dim(1));

2890

for (int j = 0;

2891

j < selection_feature_processor_->GetSelectionLabelCount(); ++j) {

2892

TokenSpan relative_token_span;

2893

if (!selection_feature_processor_->LabelToTokenSpan(

2894

j, &relative_token_span)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2895

TC3_LOG(ERROR) << "Couldn't map the label to a token span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2896

return false;

2897

}

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2898

const TokenSpan candidate_span = TokenSpan(click_pos).Expand(

2899

relative_token_span.first, relative_token_span.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2900

if (candidate_span.first >= 0 && candidate_span.second <= num_tokens) {

2901

UpdateMax(&chunk_scores, candidate_span, scores[j]);

}

}

}

}

scored_chunks->clear();

2908

scored_chunks->reserve(chunk_scores.size());

2909

for (const auto& entry : chunk_scores) {

2910

scored_chunks->push_back(ScoredChunk{entry.first, entry.second});

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2916

bool Annotator::ModelBoundsSensitiveScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2917

int num_tokens, const TokenSpan& span_of_interest,

2918

const TokenSpan& inference_span, const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2919

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2920

std::vector<ScoredChunk>* scored_chunks) const {

2921

const int max_selection_span =

2922

selection_feature_processor_->GetOptions()->max_selection_span();

2923

const int max_chunk_length = selection_feature_processor_->GetOptions()

2924

->selection_reduced_output_space()

2925

? max_selection_span + 1

2926

: 2 * max_selection_span + 1;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2927

const bool score_single_token_spans_as_zero =

2928

selection_feature_processor_->GetOptions()

2929

->bounds_sensitive_features()

2930

->score_single_token_spans_as_zero();

2931

2932

scored_chunks->clear();

2933

if (score_single_token_spans_as_zero) {

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2934

scored_chunks->reserve(span_of_interest.Size());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2935

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2936

2937

// Prepare all chunk candidates into one batch:

2938

// - Are contained in the inference span

2939

// - Have a non-empty intersection with the span of interest

2940

// - Are at least one token long

2941

// - Are not longer than the maximum chunk length

2942

std::vector<TokenSpan> candidate_spans;

2943

for (int start = inference_span.first; start < span_of_interest.second;

2944

++start) {

2945

const int leftmost_end_index = std::max(start, span_of_interest.first) + 1;

2946

for (int end = leftmost_end_index;

2947

end <= inference_span.second && end - start <= max_chunk_length;

2948

++end) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2949

const TokenSpan candidate_span = {start, end};

Tony Mak

2020-08-13 18:57:10 +0100

[diff] [blame]

2950

if (score_single_token_spans_as_zero && candidate_span.Size() == 1) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2951

// Do not include the single token span in the batch, add a zero score

2952

// for it directly to the output.

2953

scored_chunks->push_back(ScoredChunk{candidate_span, 0.0f});

2954

} else {

2955

candidate_spans.push_back(candidate_span);

2956

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

const int max_batch_size = model_->selection_options()->batch_size();

2961

2962

std::vector<float> all_features;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2963

scored_chunks->reserve(scored_chunks->size() + candidate_spans.size());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2964

for (int batch_start = 0; batch_start < candidate_spans.size();

2965

batch_start += max_batch_size) {

2966

const int batch_end = std::min(batch_start + max_batch_size,

2967

static_cast<int>(candidate_spans.size()));

2968

2969

// Prepare features for the whole batch.

2970

all_features.clear();

2971

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2972

for (int i = batch_start; i < batch_end; ++i) {

2973

cached_features.AppendBoundsSensitiveFeaturesForSpan(candidate_spans[i],

&all_features);

}

// Run batched inference.

2978

const int batch_size = batch_end - batch_start;

2979

const int features_size = cached_features.OutputFeaturesSize();

2980

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2981

TensorView<float>(all_features.data(), {batch_size, features_size}),

2982

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2983

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2984

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2985

return false;

2986

}

2987

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2988

logits.dim(1) != 1) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2989

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int i = batch_start; i < batch_end; ++i) {

2995

scored_chunks->push_back(

2996

ScoredChunk{candidate_spans[i], logits.data()[i - batch_start]});

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

3003

bool Annotator::DatetimeChunk(const UnicodeText& context_unicode,

3004

int64 reference_time_ms_utc,

3005

const std::string& reference_timezone,

3006

const std::string& locales, ModeFlag mode,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

3007

AnnotationUsecase annotation_usecase,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

3008

bool is_serialized_entity_data_enabled,

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

3009

std::vector<AnnotatedSpan>* result) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

3010

std::vector<DatetimeParseResultSpan> datetime_spans;

3011

if (cfg_datetime_parser_) {

3012

if (!(model_->grammar_datetime_model()->enabled_modes() & mode)) {

3013

return true;

3014

}

3015

std::vector<Locale> parsed_locales;

3016

ParseLocales(locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

3017

cfg_datetime_parser_->Parse(

3018

context_unicode.ToUTF8String(),

3019

ToDateAnnotationOptions(

3020

model_->grammar_datetime_model()->annotation_options(),

3021

reference_timezone, reference_time_ms_utc),

3022

parsed_locales, &datetime_spans);

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

3023

}

3024

3025

if (datetime_parser_) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

3026

if (!datetime_parser_->Parse(context_unicode, reference_time_ms_utc,

3027

reference_timezone, locales, mode,

3028

annotation_usecase,

3029

/*anchor_start_end=*/false, &datetime_spans)) {

3030

return false;

3031

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

3032

}

3033

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

3034

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

3035

AnnotatedSpan annotated_span;

3036

annotated_span.span = datetime_span.span;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

3037

for (const DatetimeParseResult& parse_result : datetime_span.data) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

3038

annotated_span.classification.emplace_back(

3039

PickCollectionForDatetime(parse_result),

3040

datetime_span.target_classification_score,

3041

datetime_span.priority_score);

3042

annotated_span.classification.back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

3043

if (is_serialized_entity_data_enabled) {

3044

annotated_span.classification.back().serialized_entity_data =

3045

CreateDatetimeSerializedEntityData(parse_result);

3046

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

3047

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

3048

annotated_span.source = AnnotatedSpan::Source::DATETIME;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

3049

result->push_back(std::move(annotated_span));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

return true;

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

3054

// Returns the model pointer held by this annotator.
const Model* Annotator::model() const {
  return model_;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

3055

// Returns the entity data reflection schema pointer held by this annotator.
const reflection::Schema* Annotator::entity_data_schema() const {
  const reflection::Schema* const schema = entity_data_schema_;
  return schema;
}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

3058

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

3059

// Returns the result of LoadAndVerifyModel() for the `size`-byte `buffer`, or
// nullptr when `buffer` is null (LoadAndVerifyModel is not called then).
const Model* ViewModel(const void* buffer, int size) {
  if (buffer == nullptr) {
    return nullptr;
  }

  const Model* const model = LoadAndVerifyModel(buffer, size);
  return model;
}

3066

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

3067

bool Annotator::LookUpKnowledgeEntity(

3068

const std::string& id, std::string* serialized_knowledge_result) const {

3069

return knowledge_engine_ &&

3070

knowledge_engine_->LookUpEntity(id, serialized_knowledge_result);

3071

}

3072

Tony Mak