Blame - native/annotator/annotator.cc - platform/external/libtextclassifier

2018-01-24 11:11:20 +0100

[diff] [blame]

1

/*

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

3

*

4

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

17

#include "annotator/annotator.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

18

19

#include <algorithm>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

20

#include <cmath>

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

21

#include <cstddef>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

22

#include <iterator>

23

#include <numeric>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

24

#include <string>

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

25

#include <unordered_map>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

26

#include <vector>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

27

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

28

#include "annotator/collections.h"

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

29

#include "annotator/model_generated.h"

30

#include "annotator/types.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

31

#include "utils/base/logging.h"

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

32

#include "utils/base/status.h"

33

#include "utils/base/statusor.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

34

#include "utils/checksum.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

35

#include "utils/i18n/locale.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

36

#include "utils/math/softmax.h"

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

37

#include "utils/normalization.h"

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

38

#include "utils/optional.h"

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

39

#include "utils/regex-match.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

40

#include "utils/strings/numbers.h"

41

#include "utils/strings/split.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

42

#include "utils/utf8/unicodetext.h"

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

43

#include "utils/utf8/unilib-common.h"

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

44

#include "utils/zlib/zlib_regex.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

45

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

46

namespace libtextclassifier3 {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

47

48

using SortedIntSet = std::set<int, std::function<bool(int, int)>>;

49

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

50

const std::string& Annotator::kPhoneCollection =

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

51

*[]() { return new std::string("phone"); }();

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

52

const std::string& Annotator::kAddressCollection =

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

53

*[]() { return new std::string("address"); }();

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

54

const std::string& Annotator::kDateCollection =

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

55

*[]() { return new std::string("date"); }();

Tony Mak

296b7b6

2018-12-04 18:09:15 +0000

[diff] [blame]

56

const std::string& Annotator::kUrlCollection =

57

*[]() { return new std::string("url"); }();

Tony Mak

296b7b6

2018-12-04 18:09:15 +0000

[diff] [blame]

58

const std::string& Annotator::kEmailCollection =

59

*[]() { return new std::string("email"); }();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

60

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

61

namespace {

62

// Verifies that the `size` bytes starting at `addr` form a valid Model
// flatbuffer and, if so, returns the result of GetModel(addr).
//
// Returns nullptr when `addr` is null, when `size` is not positive, or when
// flatbuffer verification fails.
//
// NOTE(review): the returned pointer presumably points into `addr`, so the
// buffer has to outlive it - confirm against the generated accessor.
const Model* LoadAndVerifyModel(const void* addr, int size) {
  // The verifier takes an unsigned length; a negative `size` would silently
  // turn into a huge one and let verification read past the real buffer.
  if (addr == nullptr || size <= 0) {
    return nullptr;
  }
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr),
                                 static_cast<size_t>(size));
  if (!VerifyModelBuffer(verifier)) {
    return nullptr;
  }
  return GetModel(addr);
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

70

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

71

// Verifies that the `size` bytes starting at `addr` form a valid
// PersonNameModel flatbuffer and, if so, returns the result of
// GetPersonNameModel(addr).
//
// Returns nullptr when `addr` is null, when `size` is not positive, or when
// flatbuffer verification fails.
//
// NOTE(review): the returned pointer presumably points into `addr`, so the
// buffer has to outlive it - confirm against the generated accessor.
const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr,
                                                    int size) {
  // The verifier takes an unsigned length; a negative `size` would silently
  // turn into a huge one and let verification read past the real buffer.
  if (addr == nullptr || size <= 0) {
    return nullptr;
  }
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr),
                                 static_cast<size_t>(size));
  if (!VerifyPersonNameModelBuffer(verifier)) {
    return nullptr;
  }
  return GetPersonNameModel(addr);
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

81

// If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will

82

// create a new instance, assign ownership to owned_lib, and return it.

83

const UniLib* MaybeCreateUnilib(const UniLib* lib,

84

std::unique_ptr<UniLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new UniLib);

89

return owned_lib->get();

}

}

// As above, but for CalendarLib.

94

const CalendarLib* MaybeCreateCalendarlib(

95

const CalendarLib* lib, std::unique_ptr<CalendarLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new CalendarLib);

100

return owned_lib->get();

}

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

104

// Returns whether the provided input is valid:

105

// * Valid utf8 text.

106

// * Sane span indices.

107

bool IsValidSpanInput(const UnicodeText& context, const CodepointSpan span) {

108

if (!context.is_valid()) {

109

return false;

110

}

111

return (span.first >= 0 && span.first < span.second &&

112

span.second <= context.size_codepoints());

113

}

114

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

115

std::unordered_set<char32> FlatbuffersIntVectorToChar32UnorderedSet(

116

const flatbuffers::Vector<int32_t>* ints) {

117

if (ints == nullptr) {

118

return {};

119

}

120

std::unordered_set<char32> ints_set;

121

for (auto value : *ints) {

122

ints_set.insert(static_cast<char32>(value));

}

return ints_set;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

127

// Builds a DateAnnotationOptions from the flatbuffer annotation options
// stored in the grammar datetime model plus the per-request reference time.
//
// `reference_time_ms_utc` and `reference_timezone` are always copied into the
// result. When `fb_annotation_options` is null, every other field keeps
// whatever value DateAnnotationOptions' default construction gives it.
// Otherwise the boolean switches and the two string lists
// (extra_requested_dates, ignored_spans) are copied over; a null list in the
// flatbuffer leaves the corresponding result list as default-constructed.
DateAnnotationOptions ToDateAnnotationOptions(
    const GrammarDatetimeModel_::AnnotationOptions* fb_annotation_options,
    const std::string& reference_timezone, const int64 reference_time_ms_utc) {
  DateAnnotationOptions options;
  options.base_timestamp_millis = reference_time_ms_utc;
  options.reference_timezone = reference_timezone;
  if (fb_annotation_options == nullptr) {
    return options;
  }

  options.enable_special_day_offset =
      fb_annotation_options->enable_special_day_offset();
  options.merge_adjacent_components =
      fb_annotation_options->merge_adjacent_components();
  options.enable_date_range = fb_annotation_options->enable_date_range();
  options.include_preposition = fb_annotation_options->include_preposition();

  const auto* fb_extra_dates = fb_annotation_options->extra_requested_dates();
  if (fb_extra_dates != nullptr) {
    for (const auto& fb_date : *fb_extra_dates) {
      options.extra_requested_dates.push_back(fb_date->str());
    }
  }

  const auto* fb_ignored_spans = fb_annotation_options->ignored_spans();
  if (fb_ignored_spans != nullptr) {
    for (const auto& fb_span : *fb_ignored_spans) {
      options.ignored_spans.push_back(fb_span->str());
    }
  }
  return options;
}

157

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

158

} // namespace

159

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

160

// Returns the TFLite interpreter for the selection model, creating it from
// `selection_executor_` on first use and keeping it in
// `selection_interpreter_` for later calls.
//
// If creation fails an error is logged and the (still empty)
// `selection_interpreter_.get()` is returned; the next call will try again.
// NOTE(review): TC3_CHECK presumably aborts when the executor is missing -
// confirm against utils/base/logging.h.
tflite::Interpreter* InterpreterManager::SelectionInterpreter() {
  if (selection_interpreter_) {
    return selection_interpreter_.get();
  }

  TC3_CHECK(selection_executor_);
  selection_interpreter_ = selection_executor_->CreateInterpreter();
  if (!selection_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return selection_interpreter_.get();
}

170

171

// Returns the TFLite interpreter for the classification model, creating it
// from `classification_executor_` on first use and keeping it in
// `classification_interpreter_` for later calls.
//
// If creation fails an error is logged and the (still empty)
// `classification_interpreter_.get()` is returned; the next call will try
// again.
// NOTE(review): TC3_CHECK presumably aborts when the executor is missing -
// confirm against utils/base/logging.h.
tflite::Interpreter* InterpreterManager::ClassificationInterpreter() {
  if (classification_interpreter_) {
    return classification_interpreter_.get();
  }

  TC3_CHECK(classification_executor_);
  classification_interpreter_ = classification_executor_->CreateInterpreter();
  if (!classification_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return classification_interpreter_.get();
}

181

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

182

std::unique_ptr<Annotator> Annotator::FromUnownedBuffer(

183

const char* buffer, int size, const UniLib* unilib,

184

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

185

const Model* model = LoadAndVerifyModel(buffer, size);

186

if (model == nullptr) {

return nullptr;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

190

auto classifier =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

191

std::unique_ptr<Annotator>(new Annotator(model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

192

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

199

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

200

std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib,

201

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

202

if (!(*mmap)->handle().ok()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

203

TC3_VLOG(1) << "Mmap failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

208

(*mmap)->handle().num_bytes());

209

if (!model) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

210

TC3_LOG(ERROR) << "Model verification failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

214

auto classifier = std::unique_ptr<Annotator>(

215

new Annotator(mmap, model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

216

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

223

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

224

std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,

225

std::unique_ptr<CalendarLib> calendarlib) {

226

if (!(*mmap)->handle().ok()) {

227

TC3_VLOG(1) << "Mmap failed.";

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

232

(*mmap)->handle().num_bytes());

233

if (model == nullptr) {

234

TC3_LOG(ERROR) << "Model verification failed.";

return nullptr;

}

auto classifier = std::unique_ptr<Annotator>(

239

new Annotator(mmap, model, std::move(unilib), std::move(calendarlib)));

240

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

247

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

248

int fd, int offset, int size, const UniLib* unilib,

249

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

250

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

251

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

252

}

253

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

254

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

255

int fd, int offset, int size, std::unique_ptr<UniLib> unilib,

256

std::unique_ptr<CalendarLib> calendarlib) {

257

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

258

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

259

}

260

261

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

262

int fd, const UniLib* unilib, const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

263

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

264

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

265

}

266

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

267

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

268

int fd, std::unique_ptr<UniLib> unilib,

269

std::unique_ptr<CalendarLib> calendarlib) {

270

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

271

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

272

}

273

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

274

std::unique_ptr<Annotator> Annotator::FromPath(const std::string& path,

275

const UniLib* unilib,

276

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

277

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

278

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

279

}

280

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

281

std::unique_ptr<Annotator> Annotator::FromPath(

282

const std::string& path, std::unique_ptr<UniLib> unilib,

283

std::unique_ptr<CalendarLib> calendarlib) {

284

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

285

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

286

}

287

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

288

// Constructs an Annotator over a memory-mapped model with borrowed libraries.
//
// The mapping is moved out of `*mmap` into `mmap_`. `unilib` / `calendarlib`
// are used as given when non-null; when null, a default instance is created
// and stored in `owned_unilib_` / `owned_calendarlib_`. Initialization is
// then run via ValidateAndInitialize(), which records its outcome in
// `initialized_`.
//
// NOTE(review): the MaybeCreate* helpers write through the address of the
// owned_* members, which relies on those members being declared before
// unilib_ / calendarlib_ in the class - confirm in annotator.h.
Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,
                     const UniLib* unilib, const CalendarLib* calendarlib)
    : model_(model),
      mmap_(std::move(*mmap)),  // Takes over the mapping from the caller.
      owned_unilib_(nullptr),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(nullptr),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  ValidateAndInitialize();
}

298

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

299

// Constructs an Annotator over a memory-mapped model with owned libraries.
//
// The mapping is moved out of `*mmap` into `mmap_`, and ownership of `unilib`
// and `calendarlib` moves into `owned_unilib_` / `owned_calendarlib_`. The
// raw `unilib_` / `calendarlib_` pointers are taken from those owners, so
// they are null if the caller passed empty pointers. Initialization is then
// run via ValidateAndInitialize(), which records its outcome in
// `initialized_`.
//
// NOTE(review): reading owned_*.get() in the initializer list relies on the
// owned_* members being declared before unilib_ / calendarlib_ in the class -
// confirm in annotator.h.
Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,
                     std::unique_ptr<UniLib> unilib,
                     std::unique_ptr<CalendarLib> calendarlib)
    : model_(model),
      mmap_(std::move(*mmap)),  // Takes over the mapping from the caller.
      owned_unilib_(std::move(unilib)),
      unilib_(owned_unilib_.get()),
      owned_calendarlib_(std::move(calendarlib)),
      calendarlib_(owned_calendarlib_.get()) {
  ValidateAndInitialize();
}

310

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

311

// Constructs an Annotator over a model that lives in caller-owned memory,
// with borrowed libraries.
//
// `unilib` / `calendarlib` are used as given when non-null; when null, a
// default instance is created and stored in `owned_unilib_` /
// `owned_calendarlib_`. `mmap_` is not set by this constructor.
// Initialization is then run via ValidateAndInitialize(), which records its
// outcome in `initialized_`.
//
// NOTE(review): the MaybeCreate* helpers write through the address of the
// owned_* members, which relies on those members being declared before
// unilib_ / calendarlib_ in the class - confirm in annotator.h.
Annotator::Annotator(const Model* model, const UniLib* unilib,
                     const CalendarLib* calendarlib)
    : model_(model),
      owned_unilib_(nullptr),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(nullptr),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  ValidateAndInitialize();
}

320

321

void Annotator::ValidateAndInitialize() {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

322

initialized_ = false;

323

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

324

if (model_ == nullptr) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

325

TC3_LOG(ERROR) << "No model specified.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

329

const bool model_enabled_for_annotation =

330

(model_->triggering_options() != nullptr &&

331

(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION));

332

const bool model_enabled_for_classification =

333

(model_->triggering_options() != nullptr &&

334

(model_->triggering_options()->enabled_modes() &

335

ModeFlag_CLASSIFICATION));

336

const bool model_enabled_for_selection =

337

(model_->triggering_options() != nullptr &&

338

(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION));

339

340

// Annotation requires the selection model.

341

if (model_enabled_for_annotation || model_enabled_for_selection) {

342

if (!model_->selection_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

343

TC3_LOG(ERROR) << "No selection options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

344

return;

345

}

346

if (!model_->selection_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

347

TC3_LOG(ERROR) << "No selection feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

348

return;

349

}

350

if (!model_->selection_feature_options()->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

351

TC3_LOG(ERROR) << "No selection bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

352

return;

353

}

354

if (!model_->selection_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

355

TC3_LOG(ERROR) << "No selection model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

356

return;

357

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

358

selection_executor_ = ModelExecutor::FromBuffer(model_->selection_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

359

if (!selection_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

360

TC3_LOG(ERROR) << "Could not initialize selection executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

361

return;

362

}

363

selection_feature_processor_.reset(

364

new FeatureProcessor(model_->selection_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

365

}

366

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

367

// Annotation requires the classification model for conflict resolution and

368

// scoring.

369

// Selection requires the classification model for conflict resolution.

370

if (model_enabled_for_annotation || model_enabled_for_classification ||

371

model_enabled_for_selection) {

372

if (!model_->classification_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

373

TC3_LOG(ERROR) << "No classification options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

378

TC3_LOG(ERROR) << "No classification feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()

383

->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

384

TC3_LOG(ERROR) << "No classification bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

385

return;

386

}

387

if (!model_->classification_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

388

TC3_LOG(ERROR) << "No clf model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

392

classification_executor_ =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

393

ModelExecutor::FromBuffer(model_->classification_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

394

if (!classification_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

395

TC3_LOG(ERROR) << "Could not initialize classification executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

classification_feature_processor_.reset(new FeatureProcessor(

400

model_->classification_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

401

}

402

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

403

// The embeddings need to be specified if the model is to be used for

404

// classification or selection.

405

if (model_enabled_for_annotation || model_enabled_for_classification ||

406

model_enabled_for_selection) {

407

if (!model_->embedding_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

408

TC3_LOG(ERROR) << "No embedding model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

409

return;

410

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

411

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

412

// Check that the embedding size of the selection and classification model

413

// matches, as they are using the same embeddings.

414

if (model_enabled_for_selection &&

415

(model_->selection_feature_options()->embedding_size() !=

416

model_->classification_feature_options()->embedding_size() ||

417

model_->selection_feature_options()->embedding_quantization_bits() !=

418

model_->classification_feature_options()

419

->embedding_quantization_bits())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

420

TC3_LOG(ERROR) << "Mismatching embedding size/quantization.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

421

return;

422

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

423

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

424

embedding_executor_ = TFLiteEmbeddingExecutor::FromBuffer(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

425

model_->embedding_model(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

426

model_->classification_feature_options()->embedding_size(),

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

427

model_->classification_feature_options()->embedding_quantization_bits(),

428

model_->embedding_pruning_mask());

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

429

if (!embedding_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

430

TC3_LOG(ERROR) << "Could not initialize embedding executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

431

return;

432

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

433

}

434

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

435

std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

436

if (model_->regex_model()) {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

437

if (!InitializeRegexModel(decompressor.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

438

TC3_LOG(ERROR) << "Could not initialize regex model.";

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

439

return;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

440

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

441

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

442

if (model_->grammar_datetime_model() &&

443

model_->grammar_datetime_model()->datetime_rules()) {

444

cfg_datetime_parser_.reset(new dates::CfgDatetimeAnnotator(

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

445

unilib_,

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

446

/*tokenizer_options=*/

447

model_->grammar_datetime_model()->grammar_tokenizer_options(),

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

448

calendarlib_,

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

449

/*datetime_rules=*/model_->grammar_datetime_model()->datetime_rules(),

450

model_->grammar_datetime_model()->target_classification_score(),

451

model_->grammar_datetime_model()->priority_score()));

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

452

if (!cfg_datetime_parser_) {

453

TC3_LOG(ERROR) << "Could not initialize context free grammar based "

454

"datetime parser.";

455

return;

456

}

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

457

}

458

459

if (model_->datetime_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

460

datetime_parser_ = DatetimeParser::Instance(

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

461

model_->datetime_model(), unilib_, calendarlib_, decompressor.get());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

462

if (!datetime_parser_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

463

TC3_LOG(ERROR) << "Could not initialize datetime parser.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return;

}

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

468

if (model_->output_options()) {

469

if (model_->output_options()->filtered_collections_annotation()) {

470

for (const auto collection :

471

*model_->output_options()->filtered_collections_annotation()) {

472

filtered_collections_annotation_.insert(collection->str());

473

}

474

}

475

if (model_->output_options()->filtered_collections_classification()) {

476

for (const auto collection :

477

*model_->output_options()->filtered_collections_classification()) {

478

filtered_collections_classification_.insert(collection->str());

479

}

480

}

481

if (model_->output_options()->filtered_collections_selection()) {

482

for (const auto collection :

483

*model_->output_options()->filtered_collections_selection()) {

484

filtered_collections_selection_.insert(collection->str());

}

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

489

if (model_->number_annotator_options() &&

490

model_->number_annotator_options()->enabled()) {

491

number_annotator_.reset(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

492

new NumberAnnotator(model_->number_annotator_options(), unilib_));

493

}

494

495

if (model_->money_parsing_options()) {

496

money_separators_ = FlatbuffersIntVectorToChar32UnorderedSet(

497

model_->money_parsing_options()->separators());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

498

}

499

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

500

if (model_->duration_annotator_options() &&

501

model_->duration_annotator_options()->enabled()) {

502

duration_annotator_.reset(

503

new DurationAnnotator(model_->duration_annotator_options(),

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

504

selection_feature_processor_.get(), unilib_));

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

505

}

506

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

507

if (model_->entity_data_schema()) {

508

entity_data_schema_ = LoadAndVerifyFlatbuffer<reflection::Schema>(

509

model_->entity_data_schema()->Data(),

510

model_->entity_data_schema()->size());

511

if (entity_data_schema_ == nullptr) {

512

TC3_LOG(ERROR) << "Could not load entity data schema data.";

return;

}

entity_data_builder_.reset(

517

new ReflectiveFlatbufferBuilder(entity_data_schema_));

518

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

519

entity_data_schema_ = nullptr;

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

520

entity_data_builder_ = nullptr;

521

}

522

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

523

if (model_->grammar_model()) {

524

grammar_annotator_.reset(new GrammarAnnotator(

525

unilib_, model_->grammar_model(), entity_data_builder_.get()));

526

}

527

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

528

if (model_->triggering_locales() &&

529

!ParseLocales(model_->triggering_locales()->c_str(),

530

&model_triggering_locales_)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

531

TC3_LOG(ERROR) << "Could not parse model supported locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

536

model_->triggering_options()->locales() != nullptr &&

537

!ParseLocales(model_->triggering_options()->locales()->c_str(),

538

&ml_model_triggering_locales_)) {

539

TC3_LOG(ERROR) << "Could not parse supported ML model locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

544

model_->triggering_options()->dictionary_locales() != nullptr &&

545

!ParseLocales(model_->triggering_options()->dictionary_locales()->c_str(),

546

&dictionary_locales_)) {

547

TC3_LOG(ERROR) << "Could not parse dictionary supported locales.";

return;

}

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

551

if (model_->conflict_resolution_options() != nullptr) {

552

prioritize_longest_annotation_ =

553

model_->conflict_resolution_options()->prioritize_longest_annotation();

554

do_conflict_resolution_in_raw_mode_ =

555

model_->conflict_resolution_options()

556

->do_conflict_resolution_in_raw_mode();

557

}

558

Chang Li

cac0b44

2020-05-21 15:09:37 +0100

[diff] [blame]

559

#ifdef TC3_EXPERIMENTAL

560

TC3_LOG(WARNING) << "Enabling experimental annotators.";

561

InitializeExperimentalAnnotators();

562

#endif

563

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

initialized_ = true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

567

// Compiles every pattern of the model's regex model and registers it.
//
// Each pattern is compiled with UncompressMakeRegexPattern (using
// `decompressor` and the model's lazy-compilation flag) and appended to
// `regex_patterns_`. The pattern's id - its zero-based position in the
// model's pattern list - is additionally recorded in the annotation /
// classification / selection id lists according to the pattern's enabled
// modes.
//
// Returns true on success, including when the regex model has no pattern
// list. Returns false as soon as one pattern fails to compile; patterns
// registered before the failure are not rolled back.
//
// The caller must ensure `model_->regex_model()` is non-null.
bool Annotator::InitializeRegexModel(ZlibDecompressor* decompressor) {
  const auto* regex_model = model_->regex_model();
  const auto* pattern_specs = regex_model->patterns();
  if (!pattern_specs) {
    return true;
  }

  // Initialize pattern recognizers.
  int next_pattern_id = 0;
  for (const auto* pattern_spec : *pattern_specs) {
    std::unique_ptr<UniLib::RegexPattern> compiled =
        UncompressMakeRegexPattern(*unilib_, pattern_spec->pattern(),
                                   pattern_spec->compressed_pattern(),
                                   regex_model->lazy_regex_compilation(),
                                   decompressor);
    if (!compiled) {
      TC3_LOG(INFO) << "Failed to load regex pattern";
      return false;
    }

    // Index the pattern under every mode it is enabled for.
    const auto modes = pattern_spec->enabled_modes();
    if (modes & ModeFlag_ANNOTATION) {
      annotation_regex_patterns_.push_back(next_pattern_id);
    }
    if (modes & ModeFlag_CLASSIFICATION) {
      classification_regex_patterns_.push_back(next_pattern_id);
    }
    if (modes & ModeFlag_SELECTION) {
      selection_regex_patterns_.push_back(next_pattern_id);
    }

    regex_patterns_.push_back({
        pattern_spec,
        std::move(compiled),
    });
    ++next_pattern_id;
  }

  return true;
}

603

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

604

bool Annotator::InitializeKnowledgeEngine(

605

const std::string& serialized_config) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

606

std::unique_ptr<KnowledgeEngine> knowledge_engine(new KnowledgeEngine());

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

607

if (!knowledge_engine->Initialize(serialized_config, unilib_)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

608

TC3_LOG(ERROR) << "Failed to initialize the knowledge engine.";

609

return false;

610

}

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

611

if (model_->triggering_options() != nullptr) {

612

knowledge_engine->SetPriorityScore(

613

model_->triggering_options()->knowledge_priority_score());

614

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

615

knowledge_engine_ = std::move(knowledge_engine);

return true;

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

619

bool Annotator::InitializeContactEngine(const std::string& serialized_config) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

620

std::unique_ptr<ContactEngine> contact_engine(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

621

new ContactEngine(selection_feature_processor_.get(), unilib_,

622

model_->contact_annotator_options()));

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

623

if (!contact_engine->Initialize(serialized_config)) {

624

TC3_LOG(ERROR) << "Failed to initialize the contact engine.";

625

return false;

626

}

627

contact_engine_ = std::move(contact_engine);

return true;

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

631

bool Annotator::InitializeInstalledAppEngine(

632

const std::string& serialized_config) {

633

std::unique_ptr<InstalledAppEngine> installed_app_engine(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

634

new InstalledAppEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

635

if (!installed_app_engine->Initialize(serialized_config)) {

636

TC3_LOG(ERROR) << "Failed to initialize the installed app engine.";

637

return false;

638

}

639

installed_app_engine_ = std::move(installed_app_engine);

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

643

// Remembers `lang_id` (not owned, may be null) and rebuilds the translate
// annotator accordingly: it is created only when a LangId is available and the
// model carries enabled translate annotator options; otherwise any existing
// translate annotator is dropped.
void Annotator::SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id) {
  lang_id_ = lang_id;

  // The model is only consulted when a LangId was actually supplied.
  bool translate_enabled = false;
  if (lang_id_ != nullptr) {
    const auto* translate_options = model_->translate_annotator_options();
    translate_enabled =
        translate_options != nullptr && translate_options->enabled();
  }

  if (!translate_enabled) {
    translate_annotator_.reset(nullptr);
    return;
  }
  translate_annotator_.reset(new TranslateAnnotator(
      model_->translate_annotator_options(), lang_id_, unilib_));
}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

654

bool Annotator::InitializePersonNameEngineFromUnownedBuffer(const void* buffer,

655

int size) {

656

const PersonNameModel* person_name_model =

657

LoadAndVerifyPersonNameModel(buffer, size);

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

658

659

if (person_name_model == nullptr) {

660

TC3_LOG(ERROR) << "Person name model verification failed.";

return false;

}

if (!person_name_model->enabled()) {

return true;

}

std::unique_ptr<PersonNameEngine> person_name_engine(

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

669

new PersonNameEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

670

if (!person_name_engine->Initialize(person_name_model)) {

671

TC3_LOG(ERROR) << "Failed to initialize the person name engine.";

672

return false;

673

}

674

person_name_engine_ = std::move(person_name_engine);

return true;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

678

bool Annotator::InitializePersonNameEngineFromScopedMmap(

679

const ScopedMmap& mmap) {

680

if (!mmap.handle().ok()) {

681

TC3_LOG(ERROR) << "Mmap for person name model failed.";

return false;

}

return InitializePersonNameEngineFromUnownedBuffer(mmap.handle().start(),

686

mmap.handle().num_bytes());

687

}

688

689

bool Annotator::InitializePersonNameEngineFromPath(const std::string& path) {

690

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

691

return InitializePersonNameEngineFromScopedMmap(*mmap);

692

}

693

694

bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset,

695

int size) {

696

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

697

return InitializePersonNameEngineFromScopedMmap(*mmap);

698

}

699

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

700

bool Annotator::InitializeExperimentalAnnotators() {

701

if (ExperimentalAnnotator::IsEnabled()) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame^]

702

experimental_annotator_.reset(new ExperimentalAnnotator(

703

model_->experimental_model(), *selection_feature_processor_, *unilib_));

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

return true;

}

return false;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

709

namespace {

710

711

int CountDigits(const std::string& str, CodepointSpan selection_indices) {

712

int count = 0;

713

int i = 0;

714

const UnicodeText unicode_str = UTF8ToUnicodeText(str, /*do_copy=*/false);

715

for (auto it = unicode_str.begin(); it != unicode_str.end(); ++it, ++i) {

716

if (i >= selection_indices.first && i < selection_indices.second &&

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

717

IsDigit(*it)) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

++count;

}

}

return count;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

724

} // namespace

725

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

726

namespace internal {

727

// Helper function, which if the initial 'span' contains only white-spaces,

728

// moves the selection to a single-codepoint selection on a left or right side

729

// of this space.

730

CodepointSpan SnapLeftIfWhitespaceSelection(CodepointSpan span,

731

const UnicodeText& context_unicode,

732

const UniLib& unilib) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

733

TC3_CHECK(ValidNonEmptySpan(span));

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

734

735

UnicodeText::const_iterator it;

736

737

// Check that the current selection is all whitespaces.

738

it = context_unicode.begin();

739

std::advance(it, span.first);

740

for (int i = 0; i < (span.second - span.first); ++i, ++it) {

741

if (!unilib.IsWhitespace(*it)) {

return span;

}

}

CodepointSpan result;

// Try moving left.

result = span;

it = context_unicode.begin();

751

std::advance(it, span.first);

752

while (it != context_unicode.begin() && unilib.IsWhitespace(*it)) {

--result.first;

--it;

}

result.second = result.first + 1;

757

if (!unilib.IsWhitespace(*it)) {

return result;

}

// If moving left didn't find a non-whitespace character, just return the

// original span.

return span;

}

} // namespace internal

766

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

767

bool Annotator::FilteredForAnnotation(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

768

return !span.classification.empty() &&

769

filtered_collections_annotation_.find(

770

span.classification[0].collection) !=

771

filtered_collections_annotation_.end();

772

}

773

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

774

bool Annotator::FilteredForClassification(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

775

const ClassificationResult& classification) const {

776

return filtered_collections_classification_.find(classification.collection) !=

777

filtered_collections_classification_.end();

778

}

779

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

780

bool Annotator::FilteredForSelection(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

781

return !span.classification.empty() &&

782

filtered_collections_selection_.find(

783

span.classification[0].collection) !=

784

filtered_collections_selection_.end();

785

}

786

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

787

namespace {

788

inline bool ClassifiedAsOther(

789

const std::vector<ClassificationResult>& classification) {

790

return !classification.empty() &&

791

classification[0].collection == Collections::Other();

792

}

793

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

794

} // namespace

795

796

float Annotator::GetPriorityScore(

797

const std::vector<ClassificationResult>& classification) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

798

if (!classification.empty() && !ClassifiedAsOther(classification)) {

799

return classification[0].priority_score;

800

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

801

if (model_->triggering_options() != nullptr) {

802

return model_->triggering_options()->other_collection_priority_score();

803

} else {

804

return -1000.0;

805

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

806

}

807

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

808

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

809

bool Annotator::VerifyRegexMatchCandidate(

810

const std::string& context, const VerificationOptions* verification_options,

811

const std::string& match, const UniLib::RegexMatcher* matcher) const {

812

if (verification_options == nullptr) {

813

return true;

814

}

815

if (verification_options->verify_luhn_checksum() &&

816

!VerifyLuhnChecksum(match)) {

817

return false;

818

}

819

const int lua_verifier = verification_options->lua_verifier();

820

if (lua_verifier >= 0) {

821

if (model_->regex_model()->lua_verifier() == nullptr ||

822

lua_verifier >= model_->regex_model()->lua_verifier()->size()) {

823

TC3_LOG(ERROR) << "Invalid lua verifier specified: " << lua_verifier;

return false;

}

return VerifyMatch(

context, matcher,

model_->regex_model()->lua_verifier()->Get(lua_verifier)->str());

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

833

// Suggests a (possibly expanded) selection for a click/selection at
// `click_indices` inside `context`.
//
// Candidates are gathered from the ML model, the selection regex patterns, the
// datetime chunker and every optional engine/annotator that is present;
// overlapping candidates are then resolved and the highest-priority surviving
// candidate that overlaps the click is returned.
//
// The original `click_indices` are returned unchanged whenever no suggestion
// can be made: the annotator is not initialized, the usecase is not SMART,
// selection is disabled in the model, the detected language is unsupported,
// the input span is invalid, any candidate source fails, or the chosen
// candidate's collection is filtered for selection.
CodepointSpan Annotator::SuggestSelection(
    const std::string& context, CodepointSpan click_indices,
    const SelectionOptions& options) const {
  // `click_indices` may be snapped below; keep the caller's value for all the
  // "no suggestion" returns and for the final overlap check.
  const CodepointSpan original_click_indices = click_indices;
  if (!initialized_) {
    TC3_LOG(ERROR) << "Not initialized";
    return original_click_indices;
  }
  if (options.annotation_usecase !=
      AnnotationUsecase_ANNOTATION_USECASE_SMART) {
    TC3_LOG(WARNING)
        << "Invoking SuggestSelection, which is not supported in RAW mode.";
    return original_click_indices;
  }
  if (!(model_->enabled_modes() & ModeFlag_SELECTION)) {
    return original_click_indices;
  }

  // A parse failure is only logged; the locale check below then runs on
  // whatever ParseLocales left in the vector.
  std::vector<Locale> detected_text_language_tags;
  if (!ParseLocales(options.detected_text_language_tags,
                    &detected_text_language_tags)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return original_click_indices;
  }

  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  if (!IsValidSpanInput(context_unicode, click_indices)) {
    TC3_VLOG(1)
        << "Trying to run SuggestSelection with invalid input, indices: "
        << click_indices.first << " " << click_indices.second;
    return original_click_indices;
  }

  if (model_->snap_whitespace_selections()) {
    // A purely white-space selection should expand to the multi-token
    // selection it is part of. Without snapping this would be a no-op,
    // because no token is found under the click. So 'click_indices' is moved
    // onto the neighbouring token on the left, which lets the click-finding
    // logic locate a token. Checking only the left side is enough: if the
    // white-space belongs to a multi-token selection, the tokens on both of
    // its sides must be selected anyway. The overlap check against
    // 'original_click_indices' at the bottom makes sure that a result which
    // does not contain the initial white-space is not returned.
    click_indices = internal::SnapLeftIfWhitespaceSelection(
        click_indices, context_unicode, *unilib_);
  }

  // Collect candidates from every available source. The order of the calls is
  // kept as-is because each one appends to `candidates`.
  std::vector<AnnotatedSpan> candidates;
  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());
  std::vector<Token> tokens;
  if (!ModelSuggestSelection(context_unicode, click_indices,
                             detected_text_language_tags, &interpreter_manager,
                             &tokens, &candidates)) {
    TC3_LOG(ERROR) << "Model suggest selection failed.";
    return original_click_indices;
  }
  if (!RegexChunk(context_unicode, selection_regex_patterns_, &candidates,
                  /*is_serialized_entity_data_enabled=*/false)) {
    TC3_LOG(ERROR) << "Regex suggest selection failed.";
    return original_click_indices;
  }
  // `context_unicode` already is the non-copying UnicodeText view of
  // `context`, so it is reused here instead of converting a second time.
  if (!DatetimeChunk(
          context_unicode,
          /*reference_time_ms_utc=*/0, /*reference_timezone=*/"",
          options.locales, ModeFlag_SELECTION, options.annotation_usecase,
          /*is_serialized_entity_data_enabled=*/false, &candidates)) {
    TC3_LOG(ERROR) << "Datetime suggest selection failed.";
    return original_click_indices;
  }
  if (knowledge_engine_ != nullptr &&
      !knowledge_engine_->Chunk(context, options.annotation_usecase,
                                options.location_context, Permissions(),
                                &candidates)) {
    TC3_LOG(ERROR) << "Knowledge suggest selection failed.";
    return original_click_indices;
  }
  if (contact_engine_ != nullptr &&
      !contact_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Contact suggest selection failed.";
    return original_click_indices;
  }
  if (installed_app_engine_ != nullptr &&
      !installed_app_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Installed app suggest selection failed.";
    return original_click_indices;
  }
  if (number_annotator_ != nullptr &&
      !number_annotator_->FindAll(context_unicode, options.annotation_usecase,
                                  &candidates)) {
    TC3_LOG(ERROR) << "Number annotator failed in suggest selection.";
    return original_click_indices;
  }
  if (duration_annotator_ != nullptr &&
      !duration_annotator_->FindAll(context_unicode, tokens,
                                    options.annotation_usecase, &candidates)) {
    TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
    return original_click_indices;
  }
  if (person_name_engine_ != nullptr &&
      !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Person name suggest selection failed.";
    return original_click_indices;
  }

  // The grammar annotator contributes at most one span, and only when it
  // reports success.
  AnnotatedSpan grammar_suggested_span;
  if (grammar_annotator_ != nullptr &&
      grammar_annotator_->SuggestSelection(detected_text_language_tags,
                                           context_unicode, click_indices,
                                           &grammar_suggested_span)) {
    candidates.push_back(grammar_suggested_span);
  }

  // The experimental annotator's suggestion is always added when it exists.
  if (experimental_annotator_ != nullptr) {
    candidates.push_back(experimental_annotator_->SuggestSelection(
        context_unicode, click_indices));
  }

  // Order the candidates by start position, so that every connected component
  // of overlapping spans forms one contiguous block for conflict resolution.
  std::sort(candidates.begin(), candidates.end(),
            [](const AnnotatedSpan& a, const AnnotatedSpan& b) {
              return a.span.first < b.span.first;
            });

  std::vector<int> ranked_indices;
  if (!ResolveConflicts(candidates, context, tokens,
                        detected_text_language_tags, options.annotation_usecase,
                        &interpreter_manager, &ranked_indices)) {
    TC3_LOG(ERROR) << "Couldn't resolve conflicts.";
    return original_click_indices;
  }

  // Visit the surviving candidates from highest to lowest priority score.
  std::sort(ranked_indices.begin(), ranked_indices.end(),
            [this, &candidates](int a, int b) {
              return GetPriorityScore(candidates[a].classification) >
                     GetPriorityScore(candidates[b].classification);
            });

  for (const int index : ranked_indices) {
    AnnotatedSpan& candidate = candidates[index];

    // The candidate has to cover both the (possibly snapped) click and the
    // click the caller originally passed in.
    if (!SpansOverlap(candidate.span, click_indices) ||
        !SpansOverlap(candidate.span, original_click_indices)) {
      continue;
    }

    // Classify the span now if it has no classification yet, the model asks
    // for suggested selections to always be classified, and there is a
    // collection filter that needs the classification.
    if (candidate.classification.empty() &&
        model_->selection_options()->always_classify_suggested_selection() &&
        !filtered_collections_selection_.empty()) {
      if (!ModelClassifyText(context, detected_text_language_tags,
                             candidate.span, &interpreter_manager,
                             /*embedding_cache=*/nullptr,
                             &candidate.classification)) {
        return original_click_indices;
      }
    }

    // A filtered collection ends the search: no lower-priority candidate is
    // tried.
    if (FilteredForSelection(candidate)) {
      return original_click_indices;
    }

    return candidate.span;
  }

  return original_click_indices;
}

namespace {

// Helper function that returns the index of the first candidate that

1014

// transitively does not overlap with the candidate on 'start_index'. If the end

1015

// of 'candidates' is reached, it returns the index that points right behind the

1016

// array.

1017

int FirstNonOverlappingSpanIndex(const std::vector<AnnotatedSpan>& candidates,

1018

int start_index) {

1019

int first_non_overlapping = start_index + 1;

1020

CodepointSpan conflicting_span = candidates[start_index].span;

1021

while (

1022

first_non_overlapping < candidates.size() &&

1023

SpansOverlap(conflicting_span, candidates[first_non_overlapping].span)) {

1024

// Grow the span to include the current one.

1025

conflicting_span.second = std::max(

1026

conflicting_span.second, candidates[first_non_overlapping].span.second);

1027

1028

++first_non_overlapping;

1029

}

1030

return first_non_overlapping;

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1034

bool Annotator::ResolveConflicts(

1035

const std::vector<AnnotatedSpan>& candidates, const std::string& context,

1036

const std::vector<Token>& cached_tokens,

1037

const std::vector<Locale>& detected_text_language_tags,

1038

AnnotationUsecase annotation_usecase,

1039

InterpreterManager* interpreter_manager, std::vector<int>* result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1040

result->clear();

1041

result->reserve(candidates.size());

1042

for (int i = 0; i < candidates.size();) {

1043

int first_non_overlapping =

1044

FirstNonOverlappingSpanIndex(candidates, /*start_index=*/i);

1045

1046

const bool conflict_found = first_non_overlapping != (i + 1);

1047

if (conflict_found) {

1048

std::vector<int> candidate_indices;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1049

if (!ResolveConflict(context, cached_tokens, candidates,

1050

detected_text_language_tags, i,

1051

first_non_overlapping, annotation_usecase,

1052

interpreter_manager, &candidate_indices)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1053

return false;

1054

}

1055

result->insert(result->end(), candidate_indices.begin(),

1056

candidate_indices.end());

1057

} else {

1058

result->push_back(i);

1059

}

1060

1061

// Skip over the whole conflicting group/go to next candidate.

1062

i = first_non_overlapping;

}

return true;

}

namespace {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1068

// Returns true, if the given two sources do conflict in given annotation

1069

// usecase.

1070

// - In SMART usecase, all sources do conflict, because there's only 1 possible

1071

// annotation for a given span.

1072

// - In RAW usecase, certain annotations are allowed to overlap (e.g. datetime

1073

// and duration), while others not (e.g. duration and number).

1074

bool DoSourcesConflict(AnnotationUsecase annotation_usecase,

1075

const AnnotatedSpan::Source source1,

1076

const AnnotatedSpan::Source source2) {

1077

uint32 source_mask =

1078

(1 << static_cast<int>(source1)) | (1 << static_cast<int>(source2));

1079

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1080

switch (annotation_usecase) {

1081

case AnnotationUsecase_ANNOTATION_USECASE_SMART:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1082

// In the SMART mode, all annotations conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1083

return true;

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1084

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1085

case AnnotationUsecase_ANNOTATION_USECASE_RAW:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1086

// DURATION and DATETIME do not conflict. E.g. "let's meet in 3 hours",

1087

// can have two non-conflicting annotations: "in 3 hours" (datetime), "3

1088

// hours" (duration).

1089

if ((source_mask &

1090

(1 << static_cast<int>(AnnotatedSpan::Source::DURATION))) &&

1091

(source_mask &

1092

(1 << static_cast<int>(AnnotatedSpan::Source::DATETIME)))) {

1093

return false;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1094

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1095

1096

// A KNOWLEDGE entity does not conflict with anything.

1097

if ((source_mask &

1098

(1 << static_cast<int>(AnnotatedSpan::Source::KNOWLEDGE)))) {

return false;

}

Tony Mak

2020-03-27 13:58:00 +0000

[diff] [blame]

1102

// A PERSONNAME entity does not conflict with anything.

1103

if ((source_mask &

1104

(1 << static_cast<int>(AnnotatedSpan::Source::PERSON_NAME)))) {

return false;

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1108

// Entities from other sources can conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1109

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1114

bool Annotator::ResolveConflict(

1115

const std::string& context, const std::vector<Token>& cached_tokens,

1116

const std::vector<AnnotatedSpan>& candidates,

1117

const std::vector<Locale>& detected_text_language_tags, int start_index,

1118

int end_index, AnnotationUsecase annotation_usecase,

1119

InterpreterManager* interpreter_manager,

1120

std::vector<int>* chosen_indices) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1121

std::vector<int> conflicting_indices;

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1122

std::unordered_map<int, std::pair<float, int>> scores_lengths;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1123

for (int i = start_index; i < end_index; ++i) {

1124

conflicting_indices.push_back(i);

1125

if (!candidates[i].classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1126

scores_lengths[i] = {

1127

GetPriorityScore(candidates[i].classification),

1128

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

continue;

}

// OPTIMIZATION: So that we don't have to classify all the ML model

1133

// spans apriori, we wait until we get here, when they conflict with

1134

// something and we need the actual classification scores. So if the

1135

// candidate conflicts and comes from the model, we need to run a

1136

// classification to determine its priority:

1137

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1138

if (!ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1139

candidates[i].span, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1140

/*embedding_cache=*/nullptr, &classification)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (!classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1145

scores_lengths[i] = {

1146

GetPriorityScore(classification),

1147

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1151

std::sort(

1152

conflicting_indices.begin(), conflicting_indices.end(),

1153

[this, &scores_lengths, candidates, conflicting_indices](int i, int j) {

1154

if (scores_lengths[i].first == scores_lengths[j].first &&

1155

prioritize_longest_annotation_) {

1156

return scores_lengths[i].second > scores_lengths[j].second;

1157

}

1158

return scores_lengths[i].first > scores_lengths[j].first;

1159

});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1160

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1161

// Here we keep a set of indices that were chosen, per-source, to enable

1162

// effective computation.

1163

std::unordered_map<AnnotatedSpan::Source, SortedIntSet>

1164

chosen_indices_for_source_map;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1165

1166

// Greedily place the candidates if they don't conflict with the already

1167

// placed ones.

1168

for (int i = 0; i < conflicting_indices.size(); ++i) {

1169

const int considered_candidate = conflicting_indices[i];

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1170

1171

// See if there is a conflict between the candidate and all already placed

1172

// candidates.

1173

bool conflict = false;

1174

SortedIntSet* chosen_indices_for_source_ptr = nullptr;

1175

for (auto& source_set_pair : chosen_indices_for_source_map) {

1176

if (source_set_pair.first == candidates[considered_candidate].source) {

1177

chosen_indices_for_source_ptr = &source_set_pair.second;

1178

}

1179

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1180

const bool needs_conflict_resolution =

1181

annotation_usecase == AnnotationUsecase_ANNOTATION_USECASE_SMART ||

1182

(annotation_usecase == AnnotationUsecase_ANNOTATION_USECASE_RAW &&

1183

do_conflict_resolution_in_raw_mode_);

1184

if (needs_conflict_resolution &&

1185

DoSourcesConflict(annotation_usecase, source_set_pair.first,

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1186

candidates[considered_candidate].source) &&

1187

DoesCandidateConflict(considered_candidate, candidates,

1188

source_set_pair.second)) {

1189

conflict = true;

1190

break;

1191

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1192

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1193

1194

// Skip the candidate if a conflict was found.

if (conflict) {

continue;

}

// If the set of indices for the current source doesn't exist yet,

1200

// initialize it.

1201

if (chosen_indices_for_source_ptr == nullptr) {

1202

SortedIntSet new_set([&candidates](int a, int b) {

1203

return candidates[a].span.first < candidates[b].span.first;

1204

});

1205

chosen_indices_for_source_map[candidates[considered_candidate].source] =

1206

std::move(new_set);

1207

chosen_indices_for_source_ptr =

1208

&chosen_indices_for_source_map[candidates[considered_candidate]

.source];

}

// Place the candidate to the output and to the per-source conflict set.

1213

chosen_indices->push_back(considered_candidate);

1214

chosen_indices_for_source_ptr->insert(considered_candidate);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1215

}

1216

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1217

std::sort(chosen_indices->begin(), chosen_indices->end());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1222

bool Annotator::ModelSuggestSelection(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1223

const UnicodeText& context_unicode, CodepointSpan click_indices,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1224

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1225

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1226

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1227

if (model_->triggering_options() == nullptr ||

1228

!(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1232

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1233

ml_model_triggering_locales_,

1234

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1238

int click_pos;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1239

*tokens = selection_feature_processor_->Tokenize(context_unicode);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1240

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1241

context_unicode, click_indices,

1242

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1243

tokens, &click_pos);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1244

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1245

TC3_VLOG(1) << "Could not calculate the click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1246

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1247

}

1248

1249

const int symmetry_context_size =

1250

model_->selection_options()->symmetry_context_size();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1251

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1252

bounds_sensitive_features = selection_feature_processor_->GetOptions()

1253

->bounds_sensitive_features();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1254

1255

// The symmetry context span is the clicked token with symmetry_context_size

1256

// tokens on either side.

1257

const TokenSpan symmetry_context_span = IntersectTokenSpans(

1258

ExpandTokenSpan(SingleTokenSpan(click_pos),

1259

/*num_tokens_left=*/symmetry_context_size,

1260

/*num_tokens_right=*/symmetry_context_size),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1261

{0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1262

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1263

// Compute the extraction span based on the model type.

1264

TokenSpan extraction_span;

1265

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1266

// The extraction span is the symmetry context span expanded to include

1267

// max_selection_span tokens on either side, which is how far a selection

1268

// can stretch from the click, plus a relevant number of tokens outside of

1269

// the bounds of the selection.

1270

const int max_selection_span =

1271

selection_feature_processor_->GetOptions()->max_selection_span();

1272

extraction_span =

1273

ExpandTokenSpan(symmetry_context_span,

1274

/*num_tokens_left=*/max_selection_span +

1275

bounds_sensitive_features->num_tokens_before(),

1276

/*num_tokens_right=*/max_selection_span +

1277

bounds_sensitive_features->num_tokens_after());

1278

} else {

1279

// The extraction span is the symmetry context span expanded to include

1280

// context_size tokens on either side.

1281

const int context_size =

1282

selection_feature_processor_->GetOptions()->context_size();

1283

extraction_span = ExpandTokenSpan(symmetry_context_span,

1284

/*num_tokens_left=*/context_size,

1285

/*num_tokens_right=*/context_size);

1286

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1287

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1288

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1289

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1290

*tokens, extraction_span)) {

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1294

std::unique_ptr<CachedFeatures> cached_features;

1295

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1296

*tokens, extraction_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1297

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1298

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1299

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1300

selection_feature_processor_->EmbeddingSize() +

1301

selection_feature_processor_->DenseFeaturesCount(),

1302

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1303

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Produce selection model candidates.

1308

std::vector<TokenSpan> chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1309

if (!ModelChunk(tokens->size(), /*span_of_interest=*/symmetry_context_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1310

interpreter_manager->SelectionInterpreter(), *cached_features,

1311

&chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1312

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

for (const TokenSpan& chunk : chunks) {

1317

AnnotatedSpan candidate;

1318

candidate.span = selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1319

context_unicode, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1320

if (model_->selection_options()->strip_unpaired_brackets()) {

1321

candidate.span =

1322

StripUnpairedBrackets(context_unicode, candidate.span, *unilib_);

1323

}

1324

1325

// Only output non-empty spans.

1326

if (candidate.span.first != candidate.span.second) {

1327

result->push_back(candidate);

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1333

bool Annotator::ModelClassifyText(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1334

const std::string& context,

1335

const std::vector<Locale>& detected_text_language_tags,

1336

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1337

FeatureProcessor::EmbeddingCache* embedding_cache,

1338

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1339

return ModelClassifyText(context, {}, detected_text_language_tags,

1340

selection_indices, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1341

embedding_cache, classification_results);

}

namespace internal {

std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,

1346

CodepointSpan selection_indices,

1347

TokenSpan tokens_around_selection_to_copy) {

1348

const auto first_selection_token = std::upper_bound(

1349

cached_tokens.begin(), cached_tokens.end(), selection_indices.first,

1350

[](int selection_start, const Token& token) {

1351

return selection_start < token.end;

1352

});

1353

const auto last_selection_token = std::lower_bound(

1354

cached_tokens.begin(), cached_tokens.end(), selection_indices.second,

1355

[](const Token& token, int selection_end) {

1356

return token.start < selection_end;

1357

});

1358

1359

const int64 first_token = std::max(

1360

static_cast<int64>(0),

1361

static_cast<int64>((first_selection_token - cached_tokens.begin()) -

1362

tokens_around_selection_to_copy.first));

1363

const int64 last_token = std::min(

1364

static_cast<int64>(cached_tokens.size()),

1365

static_cast<int64>((last_selection_token - cached_tokens.begin()) +

1366

tokens_around_selection_to_copy.second));

1367

1368

std::vector<Token> tokens;

1369

tokens.reserve(last_token - first_token);

1370

for (int i = first_token; i < last_token; ++i) {

1371

tokens.push_back(cached_tokens[i]);

}

return tokens;

}

} // namespace internal

1376

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1377

// Returns an upper bound on how many tokens to the left (`first`) and to the
// right (`second`) of the selection are needed for classification. The result
// is used to decide how many cached tokens to copy around the selection.
//
// All values are read from the classification feature processor's options, the
// same options that ModelClassifyText uses to build its extraction span.
TokenSpan Annotator::ClassifyTextUpperBoundNeededTokens() const {
  const auto* classification_options =
      classification_feature_processor_->GetOptions();
  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features =
          classification_options->bounds_sensitive_features();
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    // The extraction span is the selection span expanded to include a relevant
    // number of tokens outside of the bounds of the selection.
    return {bounds_sensitive_features->num_tokens_before(),
            bounds_sensitive_features->num_tokens_after()};
  }

  // The extraction span is the clicked token with context_size tokens on
  // either side. This must be the classification processor's context size
  // (not the selection processor's), because that is the value the
  // classification extraction span is built from.
  const int context_size = classification_options->context_size();
  return {context_size, context_size};
}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1396

namespace {

1397

// Sorts the classification results from high score to low score.

1398

void SortClassificationResults(

1399

std::vector<ClassificationResult>* classification_results) {

1400

std::sort(classification_results->begin(), classification_results->end(),

1401

[](const ClassificationResult& a, const ClassificationResult& b) {

1402

return a.score > b.score;

});

}

} // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1407

bool Annotator::ModelClassifyText(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1408

const std::string& context, const std::vector<Token>& cached_tokens,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1409

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1410

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1411

FeatureProcessor::EmbeddingCache* embedding_cache,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1412

std::vector<ClassificationResult>* classification_results) const {

1413

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1414

return ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1415

selection_indices, interpreter_manager,

1416

embedding_cache, classification_results, &tokens);

1417

}

1418

1419

bool Annotator::ModelClassifyText(

1420

const std::string& context, const std::vector<Token>& cached_tokens,

1421

const std::vector<Locale>& detected_text_language_tags,

1422

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1423

FeatureProcessor::EmbeddingCache* embedding_cache,

1424

std::vector<ClassificationResult>* classification_results,

1425

std::vector<Token>* tokens) const {

1426

if (model_->triggering_options() == nullptr ||

1427

!(model_->triggering_options()->enabled_modes() &

1428

ModeFlag_CLASSIFICATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1432

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1433

ml_model_triggering_locales_,

1434

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1438

if (cached_tokens.empty()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1439

*tokens = classification_feature_processor_->Tokenize(context);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1440

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1441

*tokens = internal::CopyCachedTokens(cached_tokens, selection_indices,

1442

ClassifyTextUpperBoundNeededTokens());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1443

}

1444

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1445

int click_pos;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1446

classification_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1447

context, selection_indices,

1448

classification_feature_processor_->GetOptions()

1449

->only_use_line_with_click(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1450

tokens, &click_pos);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1451

const TokenSpan selection_token_span =

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1452

CodepointSpanToTokenSpan(*tokens, selection_indices);

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1453

const int selection_num_tokens = TokenSpanSize(selection_token_span);

1454

if (model_->classification_options()->max_num_tokens() > 0 &&

1455

model_->classification_options()->max_num_tokens() <

1456

selection_num_tokens) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1457

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1461

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

1462

bounds_sensitive_features =

1463

classification_feature_processor_->GetOptions()

1464

->bounds_sensitive_features();

1465

if (selection_token_span.first == kInvalidIndex ||

1466

selection_token_span.second == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1467

TC3_LOG(ERROR) << "Could not determine span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Compute the extraction span based on the model type.

1472

TokenSpan extraction_span;

1473

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1474

// The extraction span is the selection span expanded to include a relevant

1475

// number of tokens outside of the bounds of the selection.

1476

extraction_span = ExpandTokenSpan(

1477

selection_token_span,

1478

/*num_tokens_left=*/bounds_sensitive_features->num_tokens_before(),

1479

/*num_tokens_right=*/bounds_sensitive_features->num_tokens_after());

1480

} else {

1481

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1482

TC3_LOG(ERROR) << "Couldn't choose a click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1483

return false;

1484

}

1485

// The extraction span is the clicked token with context_size tokens on

1486

// either side.

1487

const int context_size =

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1488

classification_feature_processor_->GetOptions()->context_size();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1489

extraction_span = ExpandTokenSpan(SingleTokenSpan(click_pos),

1490

/*num_tokens_left=*/context_size,

1491

/*num_tokens_right=*/context_size);

1492

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1493

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1494

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1495

if (!classification_feature_processor_->HasEnoughSupportedCodepoints(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1496

*tokens, extraction_span)) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1497

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1501

std::unique_ptr<CachedFeatures> cached_features;

1502

if (!classification_feature_processor_->ExtractFeatures(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1503

*tokens, extraction_span, selection_indices,

1504

embedding_executor_.get(), embedding_cache,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1505

classification_feature_processor_->EmbeddingSize() +

1506

classification_feature_processor_->DenseFeaturesCount(),

1507

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1508

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1509

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1510

}

1511

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1512

std::vector<float> features;

1513

features.reserve(cached_features->OutputFeaturesSize());

1514

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1515

cached_features->AppendBoundsSensitiveFeaturesForSpan(selection_token_span,

1516

&features);

1517

} else {

1518

cached_features->AppendClickContextFeaturesForClick(click_pos, &features);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1519

}

1520

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1521

TensorView<float> logits = classification_executor_->ComputeLogits(

1522

TensorView<float>(features.data(),

1523

{1, static_cast<int>(features.size())}),

1524

interpreter_manager->ClassificationInterpreter());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1525

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1526

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (logits.dims() != 2 || logits.dim(0) != 1 ||

1531

logits.dim(1) != classification_feature_processor_->NumCollections()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1532

TC3_LOG(ERROR) << "Mismatching output";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

const std::vector<float> scores =

1537

ComputeSoftmax(logits.data(), logits.dim(1));

1538

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1539

if (scores.empty()) {

1540

*classification_results = {{Collections::Other(), 1.0}};

1541

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1542

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1543

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1544

const int best_score_index =

1545

std::max_element(scores.begin(), scores.end()) - scores.begin();

1546

const std::string top_collection =

1547

classification_feature_processor_->LabelToCollection(best_score_index);

1548

1549

// Sanity checks.

1550

if (top_collection == Collections::Phone()) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1551

const int digit_count = CountDigits(context, selection_indices);

1552

if (digit_count <

1553

model_->classification_options()->phone_min_num_digits() ||

1554

digit_count >

1555

model_->classification_options()->phone_max_num_digits()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1556

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1557

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1558

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1559

} else if (top_collection == Collections::Address()) {

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1560

if (selection_num_tokens <

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1561

model_->classification_options()->address_min_num_tokens()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1562

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1563

return true;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1564

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1565

} else if (top_collection == Collections::Dictionary()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1566

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1567

dictionary_locales_,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1568

/*default_value=*/false)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1569

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1570

return true;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1571

}

1572

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1573

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1574

*classification_results = {{top_collection, /*arg_score=*/1.0,

1575

/*arg_priority_score=*/scores[best_score_index]}};

1576

1577

// For some entities, we might want to clamp the priority score, for better

1578

// conflict resolution between entities.

1579

if (model_->triggering_options() != nullptr &&

1580

model_->triggering_options()->collection_to_priority() != nullptr) {

1581

if (auto entry =

1582

model_->triggering_options()->collection_to_priority()->LookupByKey(

1583

top_collection.c_str())) {

1584

(*classification_results)[0].priority_score *= entry->value();

1585

}

1586

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1587

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1588

}

1589

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1590

bool Annotator::RegexClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1591

const std::string& context, CodepointSpan selection_indices,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1592

std::vector<ClassificationResult>* classification_result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1593

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1594

UTF8ToUnicodeText(context, /*do_copy=*/false)

1595

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1596

const UnicodeText selection_text_unicode(

1597

UTF8ToUnicodeText(selection_text, /*do_copy=*/false));

1598

1599

// Check whether any of the regular expressions match.

1600

for (const int pattern_id : classification_regex_patterns_) {

1601

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

1602

const std::unique_ptr<UniLib::RegexMatcher> matcher =

1603

regex_pattern.pattern->Matcher(selection_text_unicode);

1604

int status = UniLib::RegexMatcher::kNoError;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1605

bool matches;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1606

if (regex_pattern.config->use_approximate_matching()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1607

matches = matcher->ApproximatelyMatches(&status);

1608

} else {

1609

matches = matcher->Matches(&status);

1610

}

1611

if (status != UniLib::RegexMatcher::kNoError) {

1612

return false;

1613

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1614

if (matches && VerifyRegexMatchCandidate(

1615

context, regex_pattern.config->verification_options(),

1616

selection_text, matcher.get())) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1617

classification_result->push_back(

1618

{regex_pattern.config->collection_name()->str(),

1619

regex_pattern.config->target_classification_score(),

1620

regex_pattern.config->priority_score()});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1621

if (!SerializedEntityDataFromRegexMatch(

1622

regex_pattern.config, matcher.get(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1623

&classification_result->back().serialized_entity_data)) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1624

TC3_LOG(ERROR) << "Could not get entity data.";

1625

return false;

1626

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1630

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1631

}

1632

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1633

namespace {

1634

std::string PickCollectionForDatetime(

1635

const DatetimeParseResult& datetime_parse_result) {

1636

switch (datetime_parse_result.granularity) {

1637

case GRANULARITY_HOUR:

1638

case GRANULARITY_MINUTE:

1639

case GRANULARITY_SECOND:

1640

return Collections::DateTime();

1641

default:

1642

return Collections::Date();

1643

}

1644

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1645

1646

std::string CreateDatetimeSerializedEntityData(

1647

const DatetimeParseResult& parse_result) {

1648

EntityDataT entity_data;

1649

entity_data.datetime.reset(new EntityData_::DatetimeT());

1650

entity_data.datetime->time_ms_utc = parse_result.time_ms_utc;

1651

entity_data.datetime->granularity =

1652

static_cast<EntityData_::Datetime_::Granularity>(

1653

parse_result.granularity);

1654

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1655

for (const auto& c : parse_result.datetime_components) {

1656

EntityData_::Datetime_::DatetimeComponentT datetime_component;

1657

datetime_component.absolute_value = c.value;

1658

datetime_component.relative_count = c.relative_count;

1659

datetime_component.component_type =

1660

static_cast<EntityData_::Datetime_::DatetimeComponent_::ComponentType>(

1661

c.component_type);

1662

datetime_component.relation_type =

1663

EntityData_::Datetime_::DatetimeComponent_::RelationType_ABSOLUTE;

1664

if (c.relative_qualifier !=

1665

DatetimeComponent::RelativeQualifier::UNSPECIFIED) {

1666

datetime_component.relation_type =

1667

EntityData_::Datetime_::DatetimeComponent_::RelationType_RELATIVE;

1668

}

1669

entity_data.datetime->datetime_component.emplace_back(

1670

new EntityData_::Datetime_::DatetimeComponentT(datetime_component));

1671

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1672

flatbuffers::FlatBufferBuilder builder;

1673

FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));

1674

return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),

1675

builder.GetSize());

1676

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1677

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1678

} // namespace

1679

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1680

bool Annotator::DatetimeClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1681

const std::string& context, CodepointSpan selection_indices,

1682

const ClassificationOptions& options,

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1683

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1684

if (!datetime_parser_ && !cfg_datetime_parser_) {

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1685

return true;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1686

}

1687

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1688

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1689

UTF8ToUnicodeText(context, /*do_copy=*/false)

1690

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1691

1692

std::vector<DatetimeParseResultSpan> datetime_spans;

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1693

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1694

if (cfg_datetime_parser_) {

1695

if (!(model_->grammar_datetime_model()->enabled_modes() &

1696

ModeFlag_CLASSIFICATION)) {

1697

return true;

1698

}

1699

std::vector<Locale> parsed_locales;

1700

ParseLocales(options.locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1701

cfg_datetime_parser_->Parse(

1702

selection_text,

1703

ToDateAnnotationOptions(

1704

model_->grammar_datetime_model()->annotation_options(),

1705

options.reference_timezone, options.reference_time_ms_utc),

1706

parsed_locales, &datetime_spans);

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1707

}

1708

1709

if (datetime_parser_) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1710

if (!datetime_parser_->Parse(selection_text, options.reference_time_ms_utc,

1711

options.reference_timezone, options.locales,

1712

ModeFlag_CLASSIFICATION,

1713

options.annotation_usecase,

1714

/*anchor_start_end=*/true, &datetime_spans)) {

1715

TC3_LOG(ERROR) << "Error during parsing datetime.";

1716

return false;

1717

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1718

}

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1719

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1720

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

1721

// Only consider the result valid if the selection and extracted datetime

1722

// spans exactly match.

1723

if (std::make_pair(datetime_span.span.first + selection_indices.first,

1724

datetime_span.span.second + selection_indices.first) ==

1725

selection_indices) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1726

for (const DatetimeParseResult& parse_result : datetime_span.data) {

1727

classification_results->emplace_back(

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1728

PickCollectionForDatetime(parse_result),

1729

datetime_span.target_classification_score);

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1730

classification_results->back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1731

classification_results->back().serialized_entity_data =

1732

CreateDatetimeSerializedEntityData(parse_result);

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1733

classification_results->back().priority_score =

1734

datetime_span.priority_score;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1735

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1736

return true;

1737

}

1738

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1739

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1740

}

1741

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1742

std::vector<ClassificationResult> Annotator::ClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1743

const std::string& context, CodepointSpan selection_indices,

1744

const ClassificationOptions& options) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1745

if (!initialized_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1746

TC3_LOG(ERROR) << "Not initialized";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1747

return {};

1748

}

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1749

if (options.annotation_usecase !=

1750

AnnotationUsecase_ANNOTATION_USECASE_SMART) {

1751

TC3_LOG(WARNING)

1752

<< "Invoking ClassifyText, which is not supported in RAW mode.";

1753

return {};

1754

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1755

if (!(model_->enabled_modes() & ModeFlag_CLASSIFICATION)) {

return {};

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1759

std::vector<Locale> detected_text_language_tags;

1760

if (!ParseLocales(options.detected_text_language_tags,

1761

&detected_text_language_tags)) {

1762

TC3_LOG(WARNING)

1763

<< "Failed to parse the detected_text_language_tags in options: "

1764

<< options.detected_text_language_tags;

1765

}

1766

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1767

model_triggering_locales_,

1768

/*default_value=*/true)) {

return {};

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

1772

if (!IsValidSpanInput(UTF8ToUnicodeText(context, /*do_copy=*/false),

1773

selection_indices)) {

1774

TC3_VLOG(1) << "Trying to run ClassifyText with invalid input: "

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1775

<< std::get<0>(selection_indices) << " "

1776

<< std::get<1>(selection_indices);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1780

// We'll accumulate a list of candidates, and pick the best candidate in the

1781

// end.

1782

std::vector<AnnotatedSpan> candidates;

1783

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1784

// Try the knowledge engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1785

// TODO(b/126579108): Propagate error status.

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1786

ClassificationResult knowledge_result;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1787

if (knowledge_engine_ &&

1788

knowledge_engine_->ClassifyText(

1789

context, selection_indices, options.annotation_usecase,

Tony Mak

90d5567

2020-04-15 18:20:44 +0100

[diff] [blame]

1790

options.location_context, Permissions(), &knowledge_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1791

candidates.push_back({selection_indices, {knowledge_result}});

1792

candidates.back().source = AnnotatedSpan::Source::KNOWLEDGE;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1793

}

1794

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1795

AddContactMetadataToKnowledgeClassificationResults(&candidates);

1796

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1797

// Try the contact engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1798

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1799

ClassificationResult contact_result;

1800

if (contact_engine_ && contact_engine_->ClassifyText(

1801

context, selection_indices, &contact_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1802

candidates.push_back({selection_indices, {contact_result}});

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1803

}

1804

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1805

// Try the person name engine.

1806

ClassificationResult person_name_result;

1807

if (person_name_engine_ &&

1808

person_name_engine_->ClassifyText(context, selection_indices,

1809

&person_name_result)) {

1810

candidates.push_back({selection_indices, {person_name_result}});

Tony Mak

d0ae7c6

2020-03-27 13:58:00 +0000

[diff] [blame]

1811

candidates.back().source = AnnotatedSpan::Source::PERSON_NAME;

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1812

}

1813

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1814

// Try the installed app engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1815

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1816

ClassificationResult installed_app_result;

1817

if (installed_app_engine_ &&

1818

installed_app_engine_->ClassifyText(context, selection_indices,

1819

&installed_app_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1820

candidates.push_back({selection_indices, {installed_app_result}});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1821

}

1822

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1823

// Try the regular expression models.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1824

std::vector<ClassificationResult> regex_results;

1825

if (!RegexClassifyText(context, selection_indices, &regex_results)) {

1826

return {};

1827

}

1828

for (const ClassificationResult& result : regex_results) {

1829

candidates.push_back({selection_indices, {result}});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1830

}

1831

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1832

// Try the date model.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1833

//

1834

// DatetimeClassifyText only returns the first result, which can however have

1835

// more interpretations. They are inserted in the candidates as a single

1836

// AnnotatedSpan, so that they get treated together by the conflict resolution

1837

// algorithm.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1838

std::vector<ClassificationResult> datetime_results;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1839

if (!DatetimeClassifyText(context, selection_indices, options,

1840

&datetime_results)) {

1841

return {};

1842

}

1843

if (!datetime_results.empty()) {

1844

candidates.push_back({selection_indices, std::move(datetime_results)});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1845

candidates.back().source = AnnotatedSpan::Source::DATETIME;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1846

}

1847

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1848

const UnicodeText context_unicode =

1849

UTF8ToUnicodeText(context, /*do_copy=*/false);

1850

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1851

// Try the number annotator.

1852

// TODO(b/126579108): Propagate error status.

1853

ClassificationResult number_annotator_result;

1854

if (number_annotator_ &&

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1855

number_annotator_->ClassifyText(context_unicode, selection_indices,

1856

options.annotation_usecase,

1857

&number_annotator_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1858

candidates.push_back({selection_indices, {number_annotator_result}});

1859

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1860

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1861

// Try the duration annotator.

1862

ClassificationResult duration_annotator_result;

1863

if (duration_annotator_ &&

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1864

duration_annotator_->ClassifyText(context_unicode, selection_indices,

1865

options.annotation_usecase,

1866

&duration_annotator_result)) {

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1867

candidates.push_back({selection_indices, {duration_annotator_result}});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1868

candidates.back().source = AnnotatedSpan::Source::DURATION;

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1869

}

1870

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1871

// Try the translate annotator.

1872

ClassificationResult translate_annotator_result;

1873

if (translate_annotator_ &&

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1874

translate_annotator_->ClassifyText(context_unicode, selection_indices,

1875

options.user_familiar_language_tags,

1876

&translate_annotator_result)) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1877

candidates.push_back({selection_indices, {translate_annotator_result}});

1878

}

1879

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1880

// Try the grammar model.

1881

ClassificationResult grammar_annotator_result;

1882

if (grammar_annotator_ && grammar_annotator_->ClassifyText(

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1883

detected_text_language_tags, context_unicode,

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1884

selection_indices, &grammar_annotator_result)) {

1885

candidates.push_back({selection_indices, {grammar_annotator_result}});

1886

}

1887

Tony Mak

2020-05-01 12:41:31 +0100

[diff] [blame]

1888

ClassificationResult experimental_annotator_result;

1889

if (experimental_annotator_ &&

1890

experimental_annotator_->ClassifyText(context_unicode, selection_indices,

1891

&experimental_annotator_result)) {

1892

candidates.push_back({selection_indices, {experimental_annotator_result}});

1893

}

1894

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1895

// Try the ML model.

1896

//

1897

// The output of the model is considered as an exclusive 1-of-N choice. That's

1898

// why it's inserted as only 1 AnnotatedSpan into candidates, as opposed to 1

1899

// span for each candidate, like e.g. the regex model.

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1900

InterpreterManager interpreter_manager(selection_executor_.get(),

1901

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1902

std::vector<ClassificationResult> model_results;

1903

std::vector<Token> tokens;

1904

if (!ModelClassifyText(

1905

context, /*cached_tokens=*/{}, detected_text_language_tags,

1906

selection_indices, &interpreter_manager,

1907

/*embedding_cache=*/nullptr, &model_results, &tokens)) {

1908

return {};

1909

}

1910

if (!model_results.empty()) {

1911

candidates.push_back({selection_indices, std::move(model_results)});

1912

}

1913

1914

std::vector<int> candidate_indices;

1915

if (!ResolveConflicts(candidates, context, tokens,

1916

detected_text_language_tags, options.annotation_usecase,

1917

&interpreter_manager, &candidate_indices)) {

1918

TC3_LOG(ERROR) << "Couldn't resolve conflicts.";

return {};

}

std::vector<ClassificationResult> results;

1923

for (const int i : candidate_indices) {

1924

for (const ClassificationResult& result : candidates[i].classification) {

1925

if (!FilteredForClassification(result)) {

1926

results.push_back(result);

1927

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1928

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1929

}

1930

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1931

// Sort results according to score.

1932

std::sort(results.begin(), results.end(),

1933

[](const ClassificationResult& a, const ClassificationResult& b) {

1934

return a.score > b.score;

1935

});

1936

1937

if (results.empty()) {

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1938

results = {{Collections::Other(), 1.0}};

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1939

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1940

return results;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1941

}

1942

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1943

bool Annotator::ModelAnnotate(

1944

const std::string& context,

1945

const std::vector<Locale>& detected_text_language_tags,

1946

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

1947

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1948

if (model_->triggering_options() == nullptr ||

1949

!(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1953

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1954

ml_model_triggering_locales_,

1955

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1959

const UnicodeText context_unicode = UTF8ToUnicodeText(context,

1960

/*do_copy=*/false);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1961

std::vector<UnicodeTextRange> lines;

1962

if (!selection_feature_processor_->GetOptions()->only_use_line_with_click()) {

1963

lines.push_back({context_unicode.begin(), context_unicode.end()});

1964

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1965

lines = selection_feature_processor_->SplitContext(

1966

context_unicode, selection_feature_processor_->GetOptions()

1967

->use_pipe_character_for_newline());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1968

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1969

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1970

const float min_annotate_confidence =

1971

(model_->triggering_options() != nullptr

1972

? model_->triggering_options()->min_annotate_confidence()

1973

: 0.f);

1974

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1975

for (const UnicodeTextRange& line : lines) {

Tony Mak

408c6b8

2019-03-08 17:57:27 +0000

[diff] [blame]

1976

FeatureProcessor::EmbeddingCache embedding_cache;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1977

const std::string line_str =

1978

UnicodeText::UTF8Substring(line.first, line.second);

1979

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1980

*tokens = selection_feature_processor_->Tokenize(line_str);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1981

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1982

line_str, {0, std::distance(line.first, line.second)},

1983

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1984

tokens,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1985

/*click_pos=*/nullptr);

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1986

const TokenSpan full_line_span = {0, tokens->size()};

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1987

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1988

// TODO(zilka): Add support for greater granularity of this check.

1989

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1990

*tokens, full_line_span)) {

continue;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1994

std::unique_ptr<CachedFeatures> cached_features;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1995

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1996

*tokens, full_line_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1997

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1998

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1999

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2000

selection_feature_processor_->EmbeddingSize() +

2001

selection_feature_processor_->DenseFeaturesCount(),

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2002

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2003

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2004

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2005

}

2006

2007

std::vector<TokenSpan> local_chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2008

if (!ModelChunk(tokens->size(), /*span_of_interest=*/full_line_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2009

interpreter_manager->SelectionInterpreter(),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2010

*cached_features, &local_chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2011

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2012

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2013

}

2014

2015

const int offset = std::distance(context_unicode.begin(), line.first);

2016

for (const TokenSpan& chunk : local_chunks) {

2017

const CodepointSpan codepoint_span =

2018

selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2019

line_str, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2020

2021

// Skip empty spans.

2022

if (codepoint_span.first != codepoint_span.second) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2023

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2024

if (!ModelClassifyText(line_str, *tokens, detected_text_language_tags,

2025

codepoint_span, interpreter_manager,

2026

&embedding_cache, &classification)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2027

TC3_LOG(ERROR) << "Could not classify text: "

2028

<< (codepoint_span.first + offset) << " "

2029

<< (codepoint_span.second + offset);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return false;

}

// Do not include the span if it's classified as "other".

2034

if (!classification.empty() && !ClassifiedAsOther(classification) &&

2035

classification[0].score >= min_annotate_confidence) {

2036

AnnotatedSpan result_span;

2037

result_span.span = {codepoint_span.first + offset,

2038

codepoint_span.second + offset};

2039

result_span.classification = std::move(classification);

2040

result->push_back(std::move(result_span));

2041

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2042

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2043

}

2044

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2048

// Accessor intended for tests (per its name): returns the raw pointer held by
// `selection_feature_processor_`.
const FeatureProcessor* Annotator::SelectionFeatureProcessorForTests() const {
  const FeatureProcessor* const processor = selection_feature_processor_.get();
  return processor;
}

2051

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2052

// Accessor intended for tests (per its name): returns the raw pointer held by
// `classification_feature_processor_`.
const FeatureProcessor* Annotator::ClassificationFeatureProcessorForTests()
    const {
  const FeatureProcessor* const processor =
      classification_feature_processor_.get();
  return processor;
}

2056

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2057

// Accessor intended for tests (per its name): returns the raw pointer held by
// `datetime_parser_`.
const DatetimeParser* Annotator::DatetimeParserForTests() const {
  const DatetimeParser* const parser = datetime_parser_.get();
  return parser;
}

2060

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2061

void Annotator::RemoveNotEnabledEntityTypes(

2062

const EnabledEntityTypes& is_entity_type_enabled,

2063

std::vector<AnnotatedSpan>* annotated_spans) const {

2064

for (AnnotatedSpan& annotated_span : *annotated_spans) {

2065

std::vector<ClassificationResult>& classifications =

2066

annotated_span.classification;

2067

classifications.erase(

2068

std::remove_if(classifications.begin(), classifications.end(),

2069

[&is_entity_type_enabled](

2070

const ClassificationResult& classification_result) {

2071

return !is_entity_type_enabled(

2072

classification_result.collection);

2073

}),

2074

classifications.end());

2075

}

2076

annotated_spans->erase(

2077

std::remove_if(annotated_spans->begin(), annotated_spans->end(),

2078

[](const AnnotatedSpan& annotated_span) {

2079

return annotated_span.classification.empty();

2080

}),

2081

annotated_spans->end());

2082

}

2083

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2084

void Annotator::AddContactMetadataToKnowledgeClassificationResults(

2085

std::vector<AnnotatedSpan>* candidates) const {

2086

if (candidates == nullptr || contact_engine_ == nullptr) {

2087

return;

2088

}

2089

for (auto& candidate : *candidates) {

2090

for (auto& classification_result : candidate.classification) {

2091

contact_engine_->AddContactMetadataToKnowledgeClassificationResult(

2092

&classification_result);

}

}

}

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2097

// Annotates a single piece of text.
//
// `candidates` is an in/out parameter: it may already contain spans on entry
// (the annotators below append to it), and on success it is replaced by the
// final, conflict-resolved, deduplicated and filtered list of annotations,
// each with its classifications sorted by SortClassificationResults.
//
// Returns:
//   UNAVAILABLE       if annotation mode is not enabled in the model or none
//                     of the detected language tags is supported,
//   INVALID_ARGUMENT  if `context` is not valid UTF8,
//   INTERNAL          if any of the annotators or the conflict resolution
//                     fails,
//   Status::OK        otherwise.
Status Annotator::AnnotateSingleInput(
    const std::string& context, const AnnotationOptions& options,
    std::vector<AnnotatedSpan>* candidates) const {
  if (!(model_->enabled_modes() & ModeFlag_ANNOTATION)) {
    return Status(StatusCode::UNAVAILABLE, "Model annotation was not enabled.");
  }

  // Converted once and shared by every annotator below that takes a
  // UnicodeText.
  const UnicodeText context_unicode =
      UTF8ToUnicodeText(context, /*do_copy=*/false);
  if (!context_unicode.is_valid()) {
    return Status(StatusCode::INVALID_ARGUMENT,
                  "Context string isn't valid UTF8.");
  }

  // A locale parsing failure is only logged; whatever was parsed is still
  // used for the triggering check below.
  std::vector<Locale> detected_text_language_tags;
  if (!ParseLocales(options.detected_text_language_tags,
                    &detected_text_language_tags)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return Status(
        StatusCode::UNAVAILABLE,
        "The detected language tags are not in the supported locales.");
  }

  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());

  // Annotate with the selection model.
  std::vector<Token> tokens;
  if (!ModelAnnotate(context, detected_text_language_tags, &interpreter_manager,
                     &tokens, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run ModelAnnotate.");
  }

  // Annotate with the regular expression models.
  if (!RegexChunk(context_unicode, annotation_regex_patterns_, candidates,
                  options.is_serialized_entity_data_enabled)) {
    return Status(StatusCode::INTERNAL, "Couldn't run RegexChunk.");
  }

  // Annotate with the datetime model.
  const EnabledEntityTypes is_entity_type_enabled(options.entity_types);
  if ((is_entity_type_enabled(Collections::Date()) ||
       is_entity_type_enabled(Collections::DateTime())) &&
      !DatetimeChunk(context_unicode, options.reference_time_ms_utc,
                     options.reference_timezone, options.locales,
                     ModeFlag_ANNOTATION, options.annotation_usecase,
                     options.is_serialized_entity_data_enabled, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run DatetimeChunk.");
  }

  // Annotate with the contact engine.
  if (contact_engine_ &&
      !contact_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run contact engine Chunk.");
  }

  // Annotate with the installed app engine.
  if (installed_app_engine_ &&
      !installed_app_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run installed app engine Chunk.");
  }

  // Annotate with the number annotator.
  if (number_annotator_ != nullptr &&
      !number_annotator_->FindAll(context_unicode, options.annotation_usecase,
                                  candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run number annotator FindAll.");
  }

  // Annotate with the duration annotator.
  if (is_entity_type_enabled(Collections::Duration()) &&
      duration_annotator_ != nullptr &&
      !duration_annotator_->FindAll(context_unicode, tokens,
                                    options.annotation_usecase, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run duration annotator FindAll.");
  }

  // Annotate with the person name engine.
  if (is_entity_type_enabled(Collections::PersonName()) &&
      person_name_engine_ &&
      !person_name_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run person name engine Chunk.");
  }

  // Annotate with the grammar annotators.
  if (grammar_annotator_ != nullptr &&
      !grammar_annotator_->Annotate(detected_text_language_tags,
                                    context_unicode, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run grammar annotators.");
  }

  // Annotate with the experimental annotator.
  if (experimental_annotator_ != nullptr &&
      !experimental_annotator_->Annotate(context_unicode, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run experimental annotator.");
  }

  // Sort candidates according to their position in the input, so that the next
  // code can assume that any connected component of overlapping spans forms a
  // contiguous block.
  // Also sort them according to the end position and collection, so that the
  // deduplication code below can assume that same spans and classifications
  // form contiguous blocks.
  // Candidates without any classification (which the filtering loop further
  // down explicitly allows for) are ordered before classified candidates with
  // the same span instead of indexing into an empty vector.
  std::sort(candidates->begin(), candidates->end(),
            [](const AnnotatedSpan& a, const AnnotatedSpan& b) {
              if (a.span.first != b.span.first) {
                return a.span.first < b.span.first;
              }

              if (a.span.second != b.span.second) {
                return a.span.second < b.span.second;
              }

              const bool a_unclassified = a.classification.empty();
              const bool b_unclassified = b.classification.empty();
              if (a_unclassified || b_unclassified) {
                return a_unclassified && !b_unclassified;
              }

              return a.classification[0].collection <
                     b.classification[0].collection;
            });

  std::vector<int> candidate_indices;
  if (!ResolveConflicts(*candidates, context, tokens,
                        detected_text_language_tags, options.annotation_usecase,
                        &interpreter_manager, &candidate_indices)) {
    return Status(StatusCode::INTERNAL, "Couldn't resolve conflicts.");
  }

  // Remove candidates that overlap exactly and have the same collection.
  // This can e.g. happen for phone coming from both ML model and regex.
  // Two unclassified candidates with the same span count as duplicates, an
  // unclassified and a classified one do not; this keeps the predicate an
  // equivalence relation, as std::unique requires.
  candidate_indices.erase(
      std::unique(candidate_indices.begin(), candidate_indices.end(),
                  [&candidates](const int a_index, const int b_index) {
                    const AnnotatedSpan& a = (*candidates)[a_index];
                    const AnnotatedSpan& b = (*candidates)[b_index];
                    if (!(a.span == b.span)) {
                      return false;
                    }
                    const bool a_unclassified = a.classification.empty();
                    const bool b_unclassified = b.classification.empty();
                    if (a_unclassified || b_unclassified) {
                      return a_unclassified && b_unclassified;
                    }
                    return a.classification[0].collection ==
                           b.classification[0].collection;
                  }),
      candidate_indices.end());

  std::vector<AnnotatedSpan> result;
  result.reserve(candidate_indices.size());
  for (const int i : candidate_indices) {
    if ((*candidates)[i].classification.empty() ||
        ClassifiedAsOther((*candidates)[i].classification) ||
        FilteredForAnnotation((*candidates)[i])) {
      continue;
    }
    result.push_back(std::move((*candidates)[i]));
  }

  // We generate all candidates and remove them later (with the exception of
  // date/time/duration entities) because there are complex interdependencies
  // between the entity types. E.g., the TLD of an email can be interpreted as a
  // URL, but most likely a user of the API does not want such annotations if
  // "url" is enabled and "email" is not.
  RemoveNotEnabledEntityTypes(is_entity_type_enabled, &result);

  for (AnnotatedSpan& annotated_span : result) {
    SortClassificationResults(&annotated_span.classification);
  }
  // `result` is not used afterwards, so hand its storage over instead of
  // copying every span.
  *candidates = std::move(result);
  return Status::OK;
}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2269

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame]

2270

// Annotates several text fragments.
//
// The knowledge engine, if present, annotates all fragments in a single call;
// all other annotators run on each fragment independently through
// AnnotateSingleInput. A fragment that carries its own datetime options
// overrides the reference time and timezone from `options` for that fragment
// only.
//
// On success the result holds one list of annotations per input fragment, in
// input order. Returns an INTERNAL status if the knowledge engine fails or
// changes the number of annotation lists, or the first non-OK status returned
// by AnnotateSingleInput.
StatusOr<std::vector<std::vector<AnnotatedSpan>>>
Annotator::AnnotateStructuredInput(
    const std::vector<InputFragment>& string_fragments,
    const AnnotationOptions& options) const {
  std::vector<std::vector<AnnotatedSpan>> annotation_candidates(
      string_fragments.size());

  std::vector<std::string> text_to_annotate;
  text_to_annotate.reserve(string_fragments.size());
  for (const auto& string_fragment : string_fragments) {
    text_to_annotate.push_back(string_fragment.text);
  }

  // KnowledgeEngine is special, because it supports annotation of multiple
  // fragments at once.
  if (knowledge_engine_ &&
      !knowledge_engine_
           ->ChunkMultipleSpans(text_to_annotate, options.annotation_usecase,
                                options.location_context, options.permissions,
                                &annotation_candidates)
           .ok()) {
    return Status(StatusCode::INTERNAL, "Couldn't run knowledge engine Chunk.");
  }
  // The annotator engines shouldn't change the number of annotation vectors.
  if (annotation_candidates.size() != text_to_annotate.size()) {
    TC3_LOG(ERROR) << "Received " << text_to_annotate.size()
                   << " texts to annotate but generated a different number of "
                      "lists of annotations:"
                   << annotation_candidates.size();
    return Status(StatusCode::INTERNAL,
                  "Number of annotation candidates differs from "
                  "number of texts to annotate.");
  }

  // Other annotators run on each fragment independently.
  for (std::size_t i = 0; i < text_to_annotate.size(); ++i) {
    // Per-fragment copy, so that a fragment-specific reference time does not
    // leak into the following fragments.
    AnnotationOptions annotation_options = options;
    if (string_fragments[i].datetime_options.has_value()) {
      const DatetimeOptions& reference_datetime =
          string_fragments[i].datetime_options.value();
      annotation_options.reference_time_ms_utc =
          reference_datetime.reference_time_ms_utc;
      annotation_options.reference_timezone =
          reference_datetime.reference_timezone;
    }

    // At this point the candidates of the fragment can only come from the
    // knowledge engine call above.
    AddContactMetadataToKnowledgeClassificationResults(
        &annotation_candidates[i]);

    Status annotation_status = AnnotateSingleInput(
        text_to_annotate[i], annotation_options, &annotation_candidates[i]);
    if (!annotation_status.ok()) {
      return annotation_status;
    }
  }
  return annotation_candidates;
}

2327

2328

std::vector<AnnotatedSpan> Annotator::Annotate(

2329

const std::string& context, const AnnotationOptions& options) const {

2330

std::vector<InputFragment> string_fragments;

2331

string_fragments.push_back({.text = context});

2332

StatusOr<std::vector<std::vector<AnnotatedSpan>>> annotations =

2333

AnnotateStructuredInput(string_fragments, options);

2334

if (!annotations.ok()) {

2335

TC3_LOG(ERROR) << "Returned error when calling AnnotateStructuredInput: "

2336

<< annotations.status().error_message();

2337

return {};

2338

}

2339

return annotations.ValueOrDie()[0];

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2340

}

2341

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2342

// Determines the codepoint span selected by a regex match.
//
// If the pattern config declares no capturing groups, the span of capturing
// group 1 is used. Otherwise the result is the smallest span covering every
// matched group whose config has extend_selection() set.
//
// Returns {kInvalidIndex, kInvalidIndex} if the matcher reports an error, or
// if no group contributed to the selection.
CodepointSpan Annotator::ComputeSelectionBoundaries(
    const UniLib::RegexMatcher* match,
    const RegexModel_::Pattern* config) const {
  const CodepointSpan invalid_span = {kInvalidIndex, kInvalidIndex};
  const auto* groups = config->capturing_group();

  if (groups == nullptr) {
    // Use first capturing group to specify the selection.
    int status = UniLib::RegexMatcher::kNoError;
    const int selection_start = match->Start(1, &status);
    const int selection_end = match->End(1, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    return {selection_start, selection_end};
  }

  CodepointSpan selection = invalid_span;
  const int group_count = groups->size();
  for (int group_id = 0; group_id < group_count; ++group_id) {
    if (!groups->Get(group_id)->extend_selection()) {
      continue;
    }

    // Check match and adjust bounds.
    int status = UniLib::RegexMatcher::kNoError;
    const int group_start = match->Start(group_id, &status);
    const int group_end = match->End(group_id, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    if (group_start == kInvalidIndex || group_end == kInvalidIndex) {
      // Skip groups for which the matcher reports no valid position.
      continue;
    }

    if (selection.first == kInvalidIndex) {
      // First contributing group seeds the selection.
      selection = {group_start, group_end};
      continue;
    }
    selection.first = std::min(selection.first, group_start);
    selection.second = std::max(selection.second, group_end);
  }
  return selection;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2383

// Returns true if the pattern can contribute entity data: either the pattern
// itself carries (serialized) entity data, or at least one of its capturing
// groups has an entity field path or (serialized) entity data.
bool Annotator::HasEntityData(const RegexModel_::Pattern* pattern) const {
  const bool pattern_has_data = pattern->serialized_entity_data() != nullptr ||
                                pattern->entity_data() != nullptr;
  if (pattern_has_data) {
    return true;
  }

  const auto* groups = pattern->capturing_group();
  if (groups == nullptr) {
    return false;
  }

  for (const CapturingGroup* group : *groups) {
    const bool group_has_data = group->entity_field_path() != nullptr ||
                                group->serialized_entity_data() != nullptr ||
                                group->entity_data() != nullptr;
    if (group_has_data) {
      return true;
    }
  }
  return false;
}

bool Annotator::SerializedEntityDataFromRegexMatch(

2403

const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,

2404

std::string* serialized_entity_data) const {

2405

if (!HasEntityData(pattern)) {

2406

serialized_entity_data->clear();

2407

return true;

2408

}

2409

TC3_CHECK(entity_data_builder_ != nullptr);

2410

2411

std::unique_ptr<ReflectiveFlatbuffer> entity_data =

2412

entity_data_builder_->NewRoot();

2413

2414

TC3_CHECK(entity_data != nullptr);

2415

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2416

// Set fixed entity data.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2417

if (pattern->serialized_entity_data() != nullptr) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2418

entity_data->MergeFromSerializedFlatbuffer(

2419

StringPiece(pattern->serialized_entity_data()->c_str(),

2420

pattern->serialized_entity_data()->size()));

2421

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2422

if (pattern->entity_data() != nullptr) {

2423

entity_data->MergeFrom(

2424

reinterpret_cast<const flatbuffers::Table*>(pattern->entity_data()));

2425

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2426

2427

// Add entity data from rule capturing groups.

2428

if (pattern->capturing_group() != nullptr) {

2429

const int num_groups = pattern->capturing_group()->size();

2430

for (int i = 0; i < num_groups; i++) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2431

const CapturingGroup* group = pattern->capturing_group()->Get(i);

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2432

2433

// Check whether the group matched.

2434

Optional<std::string> group_match_text =

2435

GetCapturingGroupText(matcher, /*group_id=*/i);

2436

if (!group_match_text.has_value()) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2437

continue;

2438

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2439

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2440

// Set fixed entity data from capturing group match.

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2441

if (group->serialized_entity_data() != nullptr) {

2442

entity_data->MergeFromSerializedFlatbuffer(

2443

StringPiece(group->serialized_entity_data()->c_str(),

2444

group->serialized_entity_data()->size()));

2445

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2446

if (group->entity_data() != nullptr) {

2447

entity_data->MergeFrom(reinterpret_cast<const flatbuffers::Table*>(

2448

pattern->entity_data()));

2449

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2450

2451

// Set entity field from capturing group text.

2452

if (group->entity_field_path() != nullptr) {

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

2453

UnicodeText normalized_group_match_text =

2454

UTF8ToUnicodeText(group_match_text.value(), /*do_copy=*/false);

2455

2456

// Apply normalization if specified.

2457

if (group->normalization_options() != nullptr) {

2458

normalized_group_match_text =

Tony Mak

2020-04-29 13:41:53 +0100

[diff] [blame]

2459

NormalizeText(*unilib_, group->normalization_options(),

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

2460

normalized_group_match_text);

2461

}

2462

2463

if (!entity_data->ParseAndSet(

2464

group->entity_field_path(),

2465

normalized_group_match_text.ToUTF8String())) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2466

TC3_LOG(ERROR)

2467

<< "Could not set entity data from rule capturing group.";

2468

return false;

2469

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

}

}

}

*serialized_entity_data = entity_data->Serialize();

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2478

// Returns the codepoints of `amount` that precede `it_decimal_separator`,
// with every codepoint contained in `decimal_separators` dropped.
//
// Passing amount.end() as `it_decimal_separator` processes the whole amount.
UnicodeText RemoveMoneySeparators(
    const std::unordered_set<char32>& decimal_separators,
    const UnicodeText& amount,
    UnicodeText::const_iterator it_decimal_separator) {
  UnicodeText whole_amount;
  for (auto it = amount.begin();
       it != amount.end() && it != it_decimal_separator; ++it) {
    // Use the set's own hash lookup rather than a linear std::find scan.
    if (decimal_separators.count(static_cast<char32>(*it)) == 0) {
      whole_amount.push_back(*it);
    }
  }
  return whole_amount;
}

bool Annotator::ParseAndFillInMoneyAmount(

2494

std::string* serialized_entity_data) const {

2495

std::unique_ptr<EntityDataT> data =

2496

LoadAndVerifyMutableFlatbuffer<libtextclassifier3::EntityData>(

2497

*serialized_entity_data);

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2498

if (data == nullptr) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame^]

2499

if (model_->version() >= 706) {

2500

// This way of parsing money entity data is enabled for models newer than

2501

// v706, consequently logging errors only for them (b/156634162).

2502

TC3_LOG(ERROR)

2503

<< "Data field is null when trying to parse Money Entity Data";

2504

}

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2505

return false;

2506

}

2507

if (data->money->unnormalized_amount.empty()) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame^]

2508

if (model_->version() >= 706) {

2509

// This way of parsing money entity data is enabled for models newer than

2510

// v706, consequently logging errors only for them (b/156634162).

2511

TC3_LOG(ERROR)

2512

<< "Data unnormalized_amount is empty when trying to parse "

2513

"Money Entity Data";

2514

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

return false;

}

UnicodeText amount =

UTF8ToUnicodeText(data->money->unnormalized_amount, /*do_copy=*/false);

2520

int separator_back_index = 0;

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2521

auto it_decimal_separator = --amount.end();

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2522

for (; it_decimal_separator != amount.begin();

2523

--it_decimal_separator, ++separator_back_index) {

2524

if (std::find(money_separators_.begin(), money_separators_.end(),

2525

static_cast<char32>(*it_decimal_separator)) !=

2526

money_separators_.end()) {

break;

}

}

// If there are 3 digits after the last separator, we consider that a

2532

// thousands separator => the number is an int (e.g. 1.234 is considered int).

2533

// If there is no separator in number, also that number is an int.

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2534

if (separator_back_index == 3 || it_decimal_separator == amount.begin()) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2535

it_decimal_separator = amount.end();

2536

}

2537

2538

if (!unilib_->ParseInt32(RemoveMoneySeparators(money_separators_, amount,

2539

it_decimal_separator),

2540

&data->money->amount_whole_part)) {

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2541

TC3_LOG(ERROR) << "Could not parse the money whole part as int32 from the "

2542

"amount: "

2543

<< data->money->unnormalized_amount;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2544

return false;

2545

}

2546

if (it_decimal_separator == amount.end()) {

2547

data->money->amount_decimal_part = 0;

2548

} else {

2549

const int amount_codepoints_size = amount.size_codepoints();

2550

if (!unilib_->ParseInt32(

2551

UnicodeText::Substring(

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2552

amount, amount_codepoints_size - separator_back_index,

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2553

amount_codepoints_size, /*do_copy=*/false),

2554

&data->money->amount_decimal_part)) {

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2555

TC3_LOG(ERROR) << "Could not parse the money decimal part as int32 from "

2556

"the amount: "

2557

<< data->money->unnormalized_amount;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

return false;

}

}

*serialized_entity_data =

2563

PackFlatbuffer<libtextclassifier3::EntityData>(data.get());

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2567

bool Annotator::RegexChunk(const UnicodeText& context_unicode,

2568

const std::vector<int>& rules,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2569

std::vector<AnnotatedSpan>* result,

2570

bool is_serialized_entity_data_enabled) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2571

for (int pattern_id : rules) {

2572

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

2573

const auto matcher = regex_pattern.pattern->Matcher(context_unicode);

2574

if (!matcher) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2575

TC3_LOG(ERROR) << "Could not get regex matcher for pattern: "

2576

<< pattern_id;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

int status = UniLib::RegexMatcher::kNoError;

2581

while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2582

if (regex_pattern.config->verification_options()) {

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2583

if (!VerifyRegexMatchCandidate(

2584

context_unicode.ToUTF8String(),

2585

regex_pattern.config->verification_options(),

2586

matcher->Group(1, &status).ToUTF8String(), matcher.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2587

continue;

2588

}

2589

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2590

2591

std::string serialized_entity_data;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2592

if (is_serialized_entity_data_enabled) {

2593

if (!SerializedEntityDataFromRegexMatch(

2594

regex_pattern.config, matcher.get(), &serialized_entity_data)) {

2595

TC3_LOG(ERROR) << "Could not get entity data.";

2596

return false;

2597

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2598

2599

// Further parsing unnormalized_amount for money into amount_whole_part

2600

// and amount_decimal_part. Can't do this with regexes because we cannot

2601

// have empty groups (amount_decimal_part might be an empty group).

2602

if (regex_pattern.config->collection_name()->str() ==

2603

Collections::Money()) {

2604

if (!ParseAndFillInMoneyAmount(&serialized_entity_data)) {

Tony Mak

2020-05-28 15:25:17 +0100

[diff] [blame^]

2605

if (model_->version() >= 706) {

2606

// This way of parsing money entity data is enabled for models

2607

// newer than v706 => logging errors only for them (b/156634162).

2608

TC3_LOG(ERROR) << "Could not parse and fill in money amount.";

2609

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2610

}

2611

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2612

}

2613

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2614

result->emplace_back();

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2615

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2616

// Selection/annotation regular expressions need to specify a capturing

2617

// group specifying the selection.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2618

result->back().span =

2619

ComputeSelectionBoundaries(matcher.get(), regex_pattern.config);

2620

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2621

result->back().classification = {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2622

{regex_pattern.config->collection_name()->str(),

2623

regex_pattern.config->target_classification_score(),

2624

regex_pattern.config->priority_score()}};

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2625

2626

result->back().classification[0].serialized_entity_data =

2627

serialized_entity_data;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2633

bool Annotator::ModelChunk(int num_tokens, const TokenSpan& span_of_interest,

2634

tflite::Interpreter* selection_interpreter,

2635

const CachedFeatures& cached_features,

2636

std::vector<TokenSpan>* chunks) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2637

const int max_selection_span =

2638

selection_feature_processor_->GetOptions()->max_selection_span();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2639

// The inference span is the span of interest expanded to include

2640

// max_selection_span tokens on either side, which is how far a selection can

2641

// stretch from the click.

2642

const TokenSpan inference_span = IntersectTokenSpans(

2643

ExpandTokenSpan(span_of_interest,

2644

/*num_tokens_left=*/max_selection_span,

2645

/*num_tokens_right=*/max_selection_span),

2646

{0, num_tokens});

2647

2648

std::vector<ScoredChunk> scored_chunks;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2649

if (selection_feature_processor_->GetOptions()->bounds_sensitive_features() &&

2650

selection_feature_processor_->GetOptions()

2651

->bounds_sensitive_features()

2652

->enabled()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2653

if (!ModelBoundsSensitiveScoreChunks(

2654

num_tokens, span_of_interest, inference_span, cached_features,

2655

selection_interpreter, &scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

} else {

if (!ModelClickContextScoreChunks(num_tokens, span_of_interest,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2660

cached_features, selection_interpreter,

2661

&scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2662

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2663

}

2664

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2665

std::sort(scored_chunks.rbegin(), scored_chunks.rend(),

2666

[](const ScoredChunk& lhs, const ScoredChunk& rhs) {

2667

return lhs.score < rhs.score;

2668

});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2669

2670

// Traverse the candidate chunks from highest-scoring to lowest-scoring. Pick

2671

// them greedily as long as they do not overlap with any previously picked

2672

// chunks.

2673

std::vector<bool> token_used(TokenSpanSize(inference_span));

2674

chunks->clear();

2675

for (const ScoredChunk& scored_chunk : scored_chunks) {

2676

bool feasible = true;

2677

for (int i = scored_chunk.token_span.first;

2678

i < scored_chunk.token_span.second; ++i) {

2679

if (token_used[i - inference_span.first]) {

feasible = false;

break;

}

}

if (!feasible) {

continue;

}

for (int i = scored_chunk.token_span.first;

2690

i < scored_chunk.token_span.second; ++i) {

2691

token_used[i - inference_span.first] = true;

2692

}

2693

2694

chunks->push_back(scored_chunk.token_span);

2695

}

2696

2697

std::sort(chunks->begin(), chunks->end());

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2702

namespace {
// Stores `value` under `key` if the key is absent from the map; otherwise
// raises the stored value to `value` when `value` is larger. An existing
// larger (or equal) value is left untouched.
template <typename Map>
void UpdateMax(Map* map, typename Map::key_type key,
               typename Map::mapped_type value) {
  // insert() is a no-op on an existing key and hands back its position.
  const auto insertion = map->insert(typename Map::value_type(key, value));
  if (insertion.second) {
    return;
  }
  auto& stored = insertion.first->second;
  if (stored < value) {
    stored = value;
  }
}
}  // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2717

bool Annotator::ModelClickContextScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2718

int num_tokens, const TokenSpan& span_of_interest,

2719

const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2720

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2721

std::vector<ScoredChunk>* scored_chunks) const {

2722

const int max_batch_size = model_->selection_options()->batch_size();

2723

2724

std::vector<float> all_features;

2725

std::map<TokenSpan, float> chunk_scores;

2726

for (int batch_start = span_of_interest.first;

2727

batch_start < span_of_interest.second; batch_start += max_batch_size) {

2728

const int batch_end =

2729

std::min(batch_start + max_batch_size, span_of_interest.second);

2730

2731

// Prepare features for the whole batch.

2732

all_features.clear();

2733

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2734

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2735

cached_features.AppendClickContextFeaturesForClick(click_pos,

&all_features);

}

// Run batched inference.

2740

const int batch_size = batch_end - batch_start;

2741

const int features_size = cached_features.OutputFeaturesSize();

2742

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2743

TensorView<float>(all_features.data(), {batch_size, features_size}),

2744

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2745

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2746

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2747

return false;

2748

}

2749

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2750

logits.dim(1) !=

2751

selection_feature_processor_->GetSelectionLabelCount()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2752

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2758

const std::vector<float> scores = ComputeSoftmax(

2759

logits.data() + logits.dim(1) * (click_pos - batch_start),

2760

logits.dim(1));

2761

for (int j = 0;

2762

j < selection_feature_processor_->GetSelectionLabelCount(); ++j) {

2763

TokenSpan relative_token_span;

2764

if (!selection_feature_processor_->LabelToTokenSpan(

2765

j, &relative_token_span)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2766

TC3_LOG(ERROR) << "Couldn't map the label to a token span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2767

return false;

2768

}

2769

const TokenSpan candidate_span = ExpandTokenSpan(

2770

SingleTokenSpan(click_pos), relative_token_span.first,

2771

relative_token_span.second);

2772

if (candidate_span.first >= 0 && candidate_span.second <= num_tokens) {

2773

UpdateMax(&chunk_scores, candidate_span, scores[j]);

}

}

}

}

scored_chunks->clear();

2780

scored_chunks->reserve(chunk_scores.size());

2781

for (const auto& entry : chunk_scores) {

2782

scored_chunks->push_back(ScoredChunk{entry.first, entry.second});

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2788

bool Annotator::ModelBoundsSensitiveScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2789

int num_tokens, const TokenSpan& span_of_interest,

2790

const TokenSpan& inference_span, const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2791

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2792

std::vector<ScoredChunk>* scored_chunks) const {

2793

const int max_selection_span =

2794

selection_feature_processor_->GetOptions()->max_selection_span();

2795

const int max_chunk_length = selection_feature_processor_->GetOptions()

2796

->selection_reduced_output_space()

2797

? max_selection_span + 1

2798

: 2 * max_selection_span + 1;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2799

const bool score_single_token_spans_as_zero =

2800

selection_feature_processor_->GetOptions()

2801

->bounds_sensitive_features()

2802

->score_single_token_spans_as_zero();

2803

2804

scored_chunks->clear();

2805

if (score_single_token_spans_as_zero) {

2806

scored_chunks->reserve(TokenSpanSize(span_of_interest));

2807

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2808

2809

// Prepare all chunk candidates into one batch:

2810

// - Are contained in the inference span

2811

// - Have a non-empty intersection with the span of interest

2812

// - Are at least one token long

2813

// - Are not longer than the maximum chunk length

2814

std::vector<TokenSpan> candidate_spans;

2815

for (int start = inference_span.first; start < span_of_interest.second;

2816

++start) {

2817

const int leftmost_end_index = std::max(start, span_of_interest.first) + 1;

2818

for (int end = leftmost_end_index;

2819

end <= inference_span.second && end - start <= max_chunk_length;

2820

++end) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2821

const TokenSpan candidate_span = {start, end};

2822

if (score_single_token_spans_as_zero &&

2823

TokenSpanSize(candidate_span) == 1) {

2824

// Do not include the single token span in the batch, add a zero score

2825

// for it directly to the output.

2826

scored_chunks->push_back(ScoredChunk{candidate_span, 0.0f});

2827

} else {

2828

candidate_spans.push_back(candidate_span);

2829

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

const int max_batch_size = model_->selection_options()->batch_size();

2834

2835

std::vector<float> all_features;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2836

scored_chunks->reserve(scored_chunks->size() + candidate_spans.size());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2837

for (int batch_start = 0; batch_start < candidate_spans.size();

2838

batch_start += max_batch_size) {

2839

const int batch_end = std::min(batch_start + max_batch_size,

2840

static_cast<int>(candidate_spans.size()));

2841

2842

// Prepare features for the whole batch.

2843

all_features.clear();

2844

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2845

for (int i = batch_start; i < batch_end; ++i) {

2846

cached_features.AppendBoundsSensitiveFeaturesForSpan(candidate_spans[i],

&all_features);

}

// Run batched inference.

2851

const int batch_size = batch_end - batch_start;

2852

const int features_size = cached_features.OutputFeaturesSize();

2853

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2854

TensorView<float>(all_features.data(), {batch_size, features_size}),

2855

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2856

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2857

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2858

return false;

2859

}

2860

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2861

logits.dim(1) != 1) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2862

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int i = batch_start; i < batch_end; ++i) {

2868

scored_chunks->push_back(

2869

ScoredChunk{candidate_spans[i], logits.data()[i - batch_start]});

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2876

bool Annotator::DatetimeChunk(const UnicodeText& context_unicode,

2877

int64 reference_time_ms_utc,

2878

const std::string& reference_timezone,

2879

const std::string& locales, ModeFlag mode,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2880

AnnotationUsecase annotation_usecase,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2881

bool is_serialized_entity_data_enabled,

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2882

std::vector<AnnotatedSpan>* result) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2883

std::vector<DatetimeParseResultSpan> datetime_spans;

2884

if (cfg_datetime_parser_) {

2885

if (!(model_->grammar_datetime_model()->enabled_modes() & mode)) {

2886

return true;

2887

}

2888

std::vector<Locale> parsed_locales;

2889

ParseLocales(locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2890

cfg_datetime_parser_->Parse(

2891

context_unicode.ToUTF8String(),

2892

ToDateAnnotationOptions(

2893

model_->grammar_datetime_model()->annotation_options(),

2894

reference_timezone, reference_time_ms_utc),

2895

parsed_locales, &datetime_spans);

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

2896

}

2897

2898

if (datetime_parser_) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2899

if (!datetime_parser_->Parse(context_unicode, reference_time_ms_utc,

2900

reference_timezone, locales, mode,

2901

annotation_usecase,

2902

/*anchor_start_end=*/false, &datetime_spans)) {

2903

return false;

2904

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2905

}

2906

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2907

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2908

AnnotatedSpan annotated_span;

2909

annotated_span.span = datetime_span.span;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2910

for (const DatetimeParseResult& parse_result : datetime_span.data) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2911

annotated_span.classification.emplace_back(

2912

PickCollectionForDatetime(parse_result),

2913

datetime_span.target_classification_score,

2914

datetime_span.priority_score);

2915

annotated_span.classification.back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2916

if (is_serialized_entity_data_enabled) {

2917

annotated_span.classification.back().serialized_entity_data =

2918

CreateDatetimeSerializedEntityData(parse_result);

2919

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2920

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

2921

annotated_span.source = AnnotatedSpan::Source::DATETIME;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2922

result->push_back(std::move(annotated_span));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

return true;

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2927

// Returns the model pointer held by this annotator.
const Model* Annotator::model() const {
  return model_;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2928

// Returns the entity data reflection schema held by this annotator.
const reflection::Schema* Annotator::entity_data_schema() const { return entity_data_schema_; }

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2931

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2932

// Interprets `buffer` (of `size` bytes) as a model via LoadAndVerifyModel().
// A null buffer yields nullptr without attempting to load anything.
const Model* ViewModel(const void* buffer, int size) {
  return buffer == nullptr ? nullptr : LoadAndVerifyModel(buffer, size);
}

2939

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2940

bool Annotator::LookUpKnowledgeEntity(

2941

const std::string& id, std::string* serialized_knowledge_result) const {

2942

return knowledge_engine_ &&

2943

knowledge_engine_->LookUpEntity(id, serialized_knowledge_result);

2944

}

2945

Tony Mak