Blame - native/annotator/annotator.cc - platform/external/libtextclassifier

2018-01-24 11:11:20 +0100

[diff] [blame]

1

/*

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

3

*

4

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

17

#include "annotator/annotator.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

18

19

#include <algorithm>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

20

#include <cmath>

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

21

#include <cstddef>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

22

#include <iterator>

23

#include <numeric>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

24

#include <string>

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

25

#include <unordered_map>

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

26

#include <vector>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

27

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

28

#include "annotator/collections.h"

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

29

#include "annotator/model_generated.h"

30

#include "annotator/types.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

31

#include "utils/base/logging.h"

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame^]

32

#include "utils/base/status.h"

33

#include "utils/base/statusor.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

34

#include "utils/checksum.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

35

#include "utils/i18n/locale.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

36

#include "utils/math/softmax.h"

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

37

#include "utils/normalization.h"

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

38

#include "utils/optional.h"

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

39

#include "utils/regex-match.h"

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

40

#include "utils/strings/numbers.h"

41

#include "utils/strings/split.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

42

#include "utils/utf8/unicodetext.h"

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

43

#include "utils/utf8/unilib-common.h"

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

44

#include "utils/zlib/zlib_regex.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

45

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

46

namespace libtextclassifier3 {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

47

48

using SortedIntSet = std::set<int, std::function<bool(int, int)>>;

49

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

50

// Names of the built-in collections. Each constant is a reference to a
// heap-allocated string that is never deleted.
// NOTE(review): presumably this is deliberate, so the strings stay valid
// regardless of static destruction order - confirm before "fixing" the leak.
const std::string& Annotator::kPhoneCollection = *new std::string("phone");
const std::string& Annotator::kAddressCollection = *new std::string("address");
const std::string& Annotator::kDateCollection = *new std::string("date");
const std::string& Annotator::kUrlCollection = *new std::string("url");
const std::string& Annotator::kEmailCollection = *new std::string("email");

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

60

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

61

namespace {

62

// Checks that the `size` bytes starting at `addr` hold a well-formed Model
// flatbuffer.
//
// Returns a pointer to the Model inside the caller's buffer (no copy is made),
// or nullptr if the buffer is missing, empty, or fails verification.
const Model* LoadAndVerifyModel(const void* addr, int size) {
  // `size` is a signed int. Reject non-positive values (and a null buffer)
  // before they reach the verifier, where a negative size would be converted
  // to a very large unsigned buffer length.
  if (addr == nullptr || size <= 0) {
    return nullptr;
  }

  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr), size);
  if (!VerifyModelBuffer(verifier)) {
    return nullptr;
  }
  return GetModel(addr);
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

70

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

71

// Checks that the `size` bytes starting at `addr` hold a well-formed
// PersonNameModel flatbuffer.
//
// Returns a pointer to the PersonNameModel inside the caller's buffer (no copy
// is made), or nullptr if the buffer is missing, empty, or fails verification.
const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr,
                                                    int size) {
  // `size` is a signed int. Reject non-positive values (and a null buffer)
  // before they reach the verifier, where a negative size would be converted
  // to a very large unsigned buffer length.
  if (addr == nullptr || size <= 0) {
    return nullptr;
  }

  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr), size);
  if (!VerifyPersonNameModelBuffer(verifier)) {
    return nullptr;
  }
  return GetPersonNameModel(addr);
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

81

// If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will

82

// create a new instance, assign ownership to owned_lib, and return it.

83

const UniLib* MaybeCreateUnilib(const UniLib* lib,

84

std::unique_ptr<UniLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new UniLib);

89

return owned_lib->get();

}

}

// As above, but for CalendarLib.

94

const CalendarLib* MaybeCreateCalendarlib(

95

const CalendarLib* lib, std::unique_ptr<CalendarLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new CalendarLib);

100

return owned_lib->get();

}

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

104

// Returns whether the provided input is valid:

105

// * Valid utf8 text.

106

// * Sane span indices.

107

bool IsValidSpanInput(const UnicodeText& context, const CodepointSpan span) {

108

if (!context.is_valid()) {

109

return false;

110

}

111

return (span.first >= 0 && span.first < span.second &&

112

span.second <= context.size_codepoints());

113

}

114

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

115

std::unordered_set<char32> FlatbuffersIntVectorToChar32UnorderedSet(

116

const flatbuffers::Vector<int32_t>* ints) {

117

if (ints == nullptr) {

118

return {};

119

}

120

std::unordered_set<char32> ints_set;

121

for (auto value : *ints) {

122

ints_set.insert(static_cast<char32>(value));

}

return ints_set;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

127

// Builds a DateAnnotationOptions from the flatbuffer annotation options stored
// in the model plus the per-request reference time and timezone.
//
// The reference timestamp and timezone are always set. The remaining fields
// are copied from `fb_annotation_options` only when it is non-null; otherwise
// they keep whatever DateAnnotationOptions default-constructs them to.
DateAnnotationOptions ToDateAnnotationOptions(
    const GrammarDatetimeModel_::AnnotationOptions* fb_annotation_options,
    const std::string& reference_timezone, const int64 reference_time_ms_utc) {
  DateAnnotationOptions options;
  options.base_timestamp_millis = reference_time_ms_utc;
  options.reference_timezone = reference_timezone;

  // Nothing else to copy when the model carries no annotation options.
  if (fb_annotation_options == nullptr) {
    return options;
  }

  options.enable_special_day_offset =
      fb_annotation_options->enable_special_day_offset();
  options.merge_adjacent_components =
      fb_annotation_options->merge_adjacent_components();
  options.enable_date_range = fb_annotation_options->enable_date_range();
  options.include_preposition = fb_annotation_options->include_preposition();

  // Both string lists are optional in the flatbuffer.
  const auto* extra_dates = fb_annotation_options->extra_requested_dates();
  if (extra_dates != nullptr) {
    for (const auto& extra_date : *extra_dates) {
      options.extra_requested_dates.push_back(extra_date->str());
    }
  }

  const auto* spans_to_ignore = fb_annotation_options->ignored_spans();
  if (spans_to_ignore != nullptr) {
    for (const auto& span_to_ignore : *spans_to_ignore) {
      options.ignored_spans.push_back(span_to_ignore->str());
    }
  }

  return options;
}

157

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

158

} // namespace

159

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

160

// Returns the TFLite interpreter for the selection model, creating it from
// the selection executor on first use. The executor must be set (enforced
// with TC3_CHECK). If the interpreter cannot be built, an error is logged and
// nullptr is returned; the next call will try again.
tflite::Interpreter* InterpreterManager::SelectionInterpreter() {
  if (selection_interpreter_) {
    return selection_interpreter_.get();
  }

  TC3_CHECK(selection_executor_);
  selection_interpreter_ = selection_executor_->CreateInterpreter();
  if (!selection_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return selection_interpreter_.get();
}

170

171

// Returns the TFLite interpreter for the classification model, creating it
// from the classification executor on first use. The executor must be set
// (enforced with TC3_CHECK). If the interpreter cannot be built, an error is
// logged and nullptr is returned; the next call will try again.
tflite::Interpreter* InterpreterManager::ClassificationInterpreter() {
  if (classification_interpreter_) {
    return classification_interpreter_.get();
  }

  TC3_CHECK(classification_executor_);
  classification_interpreter_ = classification_executor_->CreateInterpreter();
  if (!classification_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return classification_interpreter_.get();
}

181

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

182

std::unique_ptr<Annotator> Annotator::FromUnownedBuffer(

183

const char* buffer, int size, const UniLib* unilib,

184

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

185

const Model* model = LoadAndVerifyModel(buffer, size);

186

if (model == nullptr) {

return nullptr;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

190

auto classifier =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

191

std::unique_ptr<Annotator>(new Annotator(model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

192

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

199

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

200

std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib,

201

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

202

if (!(*mmap)->handle().ok()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

203

TC3_VLOG(1) << "Mmap failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

208

(*mmap)->handle().num_bytes());

209

if (!model) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

210

TC3_LOG(ERROR) << "Model verification failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

214

auto classifier = std::unique_ptr<Annotator>(

215

new Annotator(mmap, model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

216

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

223

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

224

std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,

225

std::unique_ptr<CalendarLib> calendarlib) {

226

if (!(*mmap)->handle().ok()) {

227

TC3_VLOG(1) << "Mmap failed.";

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

232

(*mmap)->handle().num_bytes());

233

if (model == nullptr) {

234

TC3_LOG(ERROR) << "Model verification failed.";

return nullptr;

}

auto classifier = std::unique_ptr<Annotator>(

239

new Annotator(mmap, model, std::move(unilib), std::move(calendarlib)));

240

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

247

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

248

int fd, int offset, int size, const UniLib* unilib,

249

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

250

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

251

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

252

}

253

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

254

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

255

int fd, int offset, int size, std::unique_ptr<UniLib> unilib,

256

std::unique_ptr<CalendarLib> calendarlib) {

257

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

258

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

259

}

260

261

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

262

int fd, const UniLib* unilib, const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

263

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

264

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

265

}

266

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

267

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

268

int fd, std::unique_ptr<UniLib> unilib,

269

std::unique_ptr<CalendarLib> calendarlib) {

270

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

271

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

272

}

273

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

274

std::unique_ptr<Annotator> Annotator::FromPath(const std::string& path,

275

const UniLib* unilib,

276

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

277

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

278

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

279

}

280

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

281

std::unique_ptr<Annotator> Annotator::FromPath(

282

const std::string& path, std::unique_ptr<UniLib> unilib,

283

std::unique_ptr<CalendarLib> calendarlib) {

284

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

285

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

286

}

287

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

288

Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,

289

const UniLib* unilib, const CalendarLib* calendarlib)

290

: model_(model),

291

mmap_(std::move(*mmap)),

292

owned_unilib_(nullptr),

293

unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),

294

owned_calendarlib_(nullptr),

295

calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {

296

ValidateAndInitialize();

297

}

298

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

299

Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,

300

std::unique_ptr<UniLib> unilib,

301

std::unique_ptr<CalendarLib> calendarlib)

302

: model_(model),

303

mmap_(std::move(*mmap)),

304

owned_unilib_(std::move(unilib)),

305

unilib_(owned_unilib_.get()),

306

owned_calendarlib_(std::move(calendarlib)),

307

calendarlib_(owned_calendarlib_.get()) {

308

ValidateAndInitialize();

309

}

310

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

311

Annotator::Annotator(const Model* model, const UniLib* unilib,

312

const CalendarLib* calendarlib)

313

: model_(model),

314

owned_unilib_(nullptr),

315

unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),

316

owned_calendarlib_(nullptr),

317

calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {

318

ValidateAndInitialize();

319

}

320

321

void Annotator::ValidateAndInitialize() {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

322

initialized_ = false;

323

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

324

if (model_ == nullptr) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

325

TC3_LOG(ERROR) << "No model specified.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

329

const bool model_enabled_for_annotation =

330

(model_->triggering_options() != nullptr &&

331

(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION));

332

const bool model_enabled_for_classification =

333

(model_->triggering_options() != nullptr &&

334

(model_->triggering_options()->enabled_modes() &

335

ModeFlag_CLASSIFICATION));

336

const bool model_enabled_for_selection =

337

(model_->triggering_options() != nullptr &&

338

(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION));

339

340

// Annotation requires the selection model.

341

if (model_enabled_for_annotation || model_enabled_for_selection) {

342

if (!model_->selection_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

343

TC3_LOG(ERROR) << "No selection options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

344

return;

345

}

346

if (!model_->selection_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

347

TC3_LOG(ERROR) << "No selection feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

348

return;

349

}

350

if (!model_->selection_feature_options()->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

351

TC3_LOG(ERROR) << "No selection bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

352

return;

353

}

354

if (!model_->selection_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

355

TC3_LOG(ERROR) << "No selection model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

356

return;

357

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

358

selection_executor_ = ModelExecutor::FromBuffer(model_->selection_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

359

if (!selection_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

360

TC3_LOG(ERROR) << "Could not initialize selection executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

361

return;

362

}

363

selection_feature_processor_.reset(

364

new FeatureProcessor(model_->selection_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

365

}

366

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

367

// Annotation requires the classification model for conflict resolution and

368

// scoring.

369

// Selection requires the classification model for conflict resolution.

370

if (model_enabled_for_annotation || model_enabled_for_classification ||

371

model_enabled_for_selection) {

372

if (!model_->classification_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

373

TC3_LOG(ERROR) << "No classification options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

378

TC3_LOG(ERROR) << "No classification feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()

383

->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

384

TC3_LOG(ERROR) << "No classification bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

385

return;

386

}

387

if (!model_->classification_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

388

TC3_LOG(ERROR) << "No clf model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

392

classification_executor_ =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

393

ModelExecutor::FromBuffer(model_->classification_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

394

if (!classification_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

395

TC3_LOG(ERROR) << "Could not initialize classification executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

classification_feature_processor_.reset(new FeatureProcessor(

400

model_->classification_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

401

}

402

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

403

// The embeddings need to be specified if the model is to be used for

404

// classification or selection.

405

if (model_enabled_for_annotation || model_enabled_for_classification ||

406

model_enabled_for_selection) {

407

if (!model_->embedding_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

408

TC3_LOG(ERROR) << "No embedding model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

409

return;

410

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

411

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

412

// Check that the embedding size of the selection and classification model

413

// matches, as they are using the same embeddings.

414

if (model_enabled_for_selection &&

415

(model_->selection_feature_options()->embedding_size() !=

416

model_->classification_feature_options()->embedding_size() ||

417

model_->selection_feature_options()->embedding_quantization_bits() !=

418

model_->classification_feature_options()

419

->embedding_quantization_bits())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

420

TC3_LOG(ERROR) << "Mismatching embedding size/quantization.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

421

return;

422

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

423

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

424

embedding_executor_ = TFLiteEmbeddingExecutor::FromBuffer(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

425

model_->embedding_model(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

426

model_->classification_feature_options()->embedding_size(),

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

427

model_->classification_feature_options()->embedding_quantization_bits(),

428

model_->embedding_pruning_mask());

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

429

if (!embedding_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

430

TC3_LOG(ERROR) << "Could not initialize embedding executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

431

return;

432

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

433

}

434

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

435

std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

436

if (model_->regex_model()) {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

437

if (!InitializeRegexModel(decompressor.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

438

TC3_LOG(ERROR) << "Could not initialize regex model.";

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

439

return;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

440

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

441

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

442

if (model_->grammar_datetime_model() &&

443

model_->grammar_datetime_model()->datetime_rules()) {

444

cfg_datetime_parser_.reset(new dates::CfgDatetimeAnnotator(

445

*unilib_,

446

/*tokenizer_options=*/

447

model_->grammar_datetime_model()->grammar_tokenizer_options(),

448

*calendarlib_,

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

449

/*datetime_rules=*/model_->grammar_datetime_model()->datetime_rules(),

450

model_->grammar_datetime_model()->target_classification_score(),

451

model_->grammar_datetime_model()->priority_score()));

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

452

if (!cfg_datetime_parser_) {

453

TC3_LOG(ERROR) << "Could not initialize context free grammar based "

454

"datetime parser.";

455

return;

456

}

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

457

}

458

459

if (model_->datetime_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

460

datetime_parser_ = DatetimeParser::Instance(

461

model_->datetime_model(), *unilib_, *calendarlib_, decompressor.get());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

462

if (!datetime_parser_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

463

TC3_LOG(ERROR) << "Could not initialize datetime parser.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return;

}

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

468

if (model_->output_options()) {

469

if (model_->output_options()->filtered_collections_annotation()) {

470

for (const auto collection :

471

*model_->output_options()->filtered_collections_annotation()) {

472

filtered_collections_annotation_.insert(collection->str());

473

}

474

}

475

if (model_->output_options()->filtered_collections_classification()) {

476

for (const auto collection :

477

*model_->output_options()->filtered_collections_classification()) {

478

filtered_collections_classification_.insert(collection->str());

479

}

480

}

481

if (model_->output_options()->filtered_collections_selection()) {

482

for (const auto collection :

483

*model_->output_options()->filtered_collections_selection()) {

484

filtered_collections_selection_.insert(collection->str());

}

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

489

if (model_->number_annotator_options() &&

490

model_->number_annotator_options()->enabled()) {

491

number_annotator_.reset(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

492

new NumberAnnotator(model_->number_annotator_options(), unilib_));

493

}

494

495

if (model_->money_parsing_options()) {

496

money_separators_ = FlatbuffersIntVectorToChar32UnorderedSet(

497

model_->money_parsing_options()->separators());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

498

}

499

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

500

if (model_->duration_annotator_options() &&

501

model_->duration_annotator_options()->enabled()) {

502

duration_annotator_.reset(

503

new DurationAnnotator(model_->duration_annotator_options(),

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

504

selection_feature_processor_.get(), unilib_));

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

505

}

506

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

507

if (model_->entity_data_schema()) {

508

entity_data_schema_ = LoadAndVerifyFlatbuffer<reflection::Schema>(

509

model_->entity_data_schema()->Data(),

510

model_->entity_data_schema()->size());

511

if (entity_data_schema_ == nullptr) {

512

TC3_LOG(ERROR) << "Could not load entity data schema data.";

return;

}

entity_data_builder_.reset(

517

new ReflectiveFlatbufferBuilder(entity_data_schema_));

518

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

519

entity_data_schema_ = nullptr;

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

520

entity_data_builder_ = nullptr;

521

}

522

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

523

if (model_->grammar_model()) {

524

grammar_annotator_.reset(new GrammarAnnotator(

525

unilib_, model_->grammar_model(), entity_data_builder_.get()));

526

}

527

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

528

if (model_->triggering_locales() &&

529

!ParseLocales(model_->triggering_locales()->c_str(),

530

&model_triggering_locales_)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

531

TC3_LOG(ERROR) << "Could not parse model supported locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

536

model_->triggering_options()->locales() != nullptr &&

537

!ParseLocales(model_->triggering_options()->locales()->c_str(),

538

&ml_model_triggering_locales_)) {

539

TC3_LOG(ERROR) << "Could not parse supported ML model locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

544

model_->triggering_options()->dictionary_locales() != nullptr &&

545

!ParseLocales(model_->triggering_options()->dictionary_locales()->c_str(),

546

&dictionary_locales_)) {

547

TC3_LOG(ERROR) << "Could not parse dictionary supported locales.";

return;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

initialized_ = true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

554

// Compiles all regex patterns of the model's regex model and records, per
// mode (annotation / classification / selection), the ids of the patterns
// enabled for it. A pattern's id is its position in the model's pattern list.
//
// The caller must ensure model_->regex_model() is non-null. `decompressor` is
// forwarded to UncompressMakeRegexPattern.
//
// Returns true on success, including when the model has no patterns, and
// false as soon as one pattern fails to load. Patterns registered before the
// failure stay registered.
bool Annotator::InitializeRegexModel(ZlibDecompressor* decompressor) {
  const auto* regex_model = model_->regex_model();
  const auto* patterns = regex_model->patterns();
  if (!patterns) {
    return true;
  }

  // Initialize pattern recognizers.
  int regex_pattern_id = 0;
  for (const auto& regex_pattern : *patterns) {
    std::unique_ptr<UniLib::RegexPattern> compiled_pattern =
        UncompressMakeRegexPattern(*unilib_, regex_pattern->pattern(),
                                   regex_pattern->compressed_pattern(),
                                   regex_model->lazy_regex_compilation(),
                                   decompressor);
    if (!compiled_pattern) {
      TC3_LOG(INFO) << "Failed to load regex pattern";
      return false;
    }

    // A pattern may be enabled for any combination of modes.
    const auto pattern_modes = regex_pattern->enabled_modes();
    if (pattern_modes & ModeFlag_ANNOTATION) {
      annotation_regex_patterns_.push_back(regex_pattern_id);
    }
    if (pattern_modes & ModeFlag_CLASSIFICATION) {
      classification_regex_patterns_.push_back(regex_pattern_id);
    }
    if (pattern_modes & ModeFlag_SELECTION) {
      selection_regex_patterns_.push_back(regex_pattern_id);
    }

    regex_patterns_.push_back({
        regex_pattern,
        std::move(compiled_pattern),
    });
    ++regex_pattern_id;
  }

  return true;
}

590

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

591

bool Annotator::InitializeKnowledgeEngine(

592

const std::string& serialized_config) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

593

std::unique_ptr<KnowledgeEngine> knowledge_engine(new KnowledgeEngine());

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

594

if (!knowledge_engine->Initialize(serialized_config, unilib_)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

595

TC3_LOG(ERROR) << "Failed to initialize the knowledge engine.";

596

return false;

597

}

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

598

if (model_->triggering_options() != nullptr) {

599

knowledge_engine->SetPriorityScore(

600

model_->triggering_options()->knowledge_priority_score());

601

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

602

knowledge_engine_ = std::move(knowledge_engine);

return true;

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

606

bool Annotator::InitializeContactEngine(const std::string& serialized_config) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

607

std::unique_ptr<ContactEngine> contact_engine(

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

608

new ContactEngine(selection_feature_processor_.get(), unilib_,

609

model_->contact_annotator_options()));

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

610

if (!contact_engine->Initialize(serialized_config)) {

611

TC3_LOG(ERROR) << "Failed to initialize the contact engine.";

612

return false;

613

}

614

contact_engine_ = std::move(contact_engine);

return true;

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

618

bool Annotator::InitializeInstalledAppEngine(

619

const std::string& serialized_config) {

620

std::unique_ptr<InstalledAppEngine> installed_app_engine(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

621

new InstalledAppEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

622

if (!installed_app_engine->Initialize(serialized_config)) {

623

TC3_LOG(ERROR) << "Failed to initialize the installed app engine.";

624

return false;

625

}

626

installed_app_engine_ = std::move(installed_app_engine);

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

630

// Stores 'lang_id' (may be nullptr) and (re)creates the translate annotator.
// The translate annotator exists only while a LangId is set and the model has
// translate annotator options that are enabled; otherwise it is cleared.
void Annotator::SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id) {
  lang_id_ = lang_id;

  // Without a language identifier there is nothing to translate from.
  if (lang_id_ == nullptr) {
    translate_annotator_.reset(nullptr);
    return;
  }

  const auto* translate_options = model_->translate_annotator_options();
  if (translate_options == nullptr || !translate_options->enabled()) {
    translate_annotator_.reset(nullptr);
    return;
  }

  translate_annotator_.reset(
      new TranslateAnnotator(translate_options, lang_id_, unilib_));
}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

641

bool Annotator::InitializePersonNameEngineFromUnownedBuffer(const void* buffer,

642

int size) {

643

const PersonNameModel* person_name_model =

644

LoadAndVerifyPersonNameModel(buffer, size);

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

645

646

if (person_name_model == nullptr) {

647

TC3_LOG(ERROR) << "Person name model verification failed.";

return false;

}

if (!person_name_model->enabled()) {

return true;

}

std::unique_ptr<PersonNameEngine> person_name_engine(

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

656

new PersonNameEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

657

if (!person_name_engine->Initialize(person_name_model)) {

658

TC3_LOG(ERROR) << "Failed to initialize the person name engine.";

659

return false;

660

}

661

person_name_engine_ = std::move(person_name_engine);

return true;

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

665

bool Annotator::InitializePersonNameEngineFromScopedMmap(

666

const ScopedMmap& mmap) {

667

if (!mmap.handle().ok()) {

668

TC3_LOG(ERROR) << "Mmap for person name model failed.";

return false;

}

return InitializePersonNameEngineFromUnownedBuffer(mmap.handle().start(),

673

mmap.handle().num_bytes());

674

}

675

676

bool Annotator::InitializePersonNameEngineFromPath(const std::string& path) {

677

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

678

return InitializePersonNameEngineFromScopedMmap(*mmap);

679

}

680

681

bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset,

682

int size) {

683

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

684

return InitializePersonNameEngineFromScopedMmap(*mmap);

685

}

686

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

687

namespace {

688

689

int CountDigits(const std::string& str, CodepointSpan selection_indices) {

690

int count = 0;

691

int i = 0;

692

const UnicodeText unicode_str = UTF8ToUnicodeText(str, /*do_copy=*/false);

693

for (auto it = unicode_str.begin(); it != unicode_str.end(); ++it, ++i) {

694

if (i >= selection_indices.first && i < selection_indices.second &&

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

695

IsDigit(*it)) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

++count;

}

}

return count;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

702

} // namespace

703

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

704

namespace internal {

705

// Helper function, which if the initial 'span' contains only white-spaces,

706

// moves the selection to a single-codepoint selection on a left or right side

707

// of this space.

708

CodepointSpan SnapLeftIfWhitespaceSelection(CodepointSpan span,

709

const UnicodeText& context_unicode,

710

const UniLib& unilib) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

711

TC3_CHECK(ValidNonEmptySpan(span));

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

712

713

UnicodeText::const_iterator it;

714

715

// Check that the current selection is all whitespaces.

716

it = context_unicode.begin();

717

std::advance(it, span.first);

718

for (int i = 0; i < (span.second - span.first); ++i, ++it) {

719

if (!unilib.IsWhitespace(*it)) {

return span;

}

}

CodepointSpan result;

// Try moving left.

result = span;

it = context_unicode.begin();

729

std::advance(it, span.first);

730

while (it != context_unicode.begin() && unilib.IsWhitespace(*it)) {

--result.first;

--it;

}

result.second = result.first + 1;

735

if (!unilib.IsWhitespace(*it)) {

return result;

}

// If moving left didn't find a non-whitespace character, just return the

// original span.

return span;

}

} // namespace internal

744

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

745

bool Annotator::FilteredForAnnotation(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

746

return !span.classification.empty() &&

747

filtered_collections_annotation_.find(

748

span.classification[0].collection) !=

749

filtered_collections_annotation_.end();

750

}

751

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

752

bool Annotator::FilteredForClassification(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

753

const ClassificationResult& classification) const {

754

return filtered_collections_classification_.find(classification.collection) !=

755

filtered_collections_classification_.end();

756

}

757

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

758

bool Annotator::FilteredForSelection(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

759

return !span.classification.empty() &&

760

filtered_collections_selection_.find(

761

span.classification[0].collection) !=

762

filtered_collections_selection_.end();

763

}

764

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

765

namespace {

766

inline bool ClassifiedAsOther(

767

const std::vector<ClassificationResult>& classification) {

768

return !classification.empty() &&

769

classification[0].collection == Collections::Other();

770

}

771

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

772

} // namespace

773

774

float Annotator::GetPriorityScore(

775

const std::vector<ClassificationResult>& classification) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

776

if (!classification.empty() && !ClassifiedAsOther(classification)) {

777

return classification[0].priority_score;

778

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

779

if (model_->triggering_options() != nullptr) {

780

return model_->triggering_options()->other_collection_priority_score();

781

} else {

782

return -1000.0;

783

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

784

}

785

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

786

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

787

bool Annotator::VerifyRegexMatchCandidate(

788

const std::string& context, const VerificationOptions* verification_options,

789

const std::string& match, const UniLib::RegexMatcher* matcher) const {

790

if (verification_options == nullptr) {

791

return true;

792

}

793

if (verification_options->verify_luhn_checksum() &&

794

!VerifyLuhnChecksum(match)) {

795

return false;

796

}

797

const int lua_verifier = verification_options->lua_verifier();

798

if (lua_verifier >= 0) {

799

if (model_->regex_model()->lua_verifier() == nullptr ||

800

lua_verifier >= model_->regex_model()->lua_verifier()->size()) {

801

TC3_LOG(ERROR) << "Invalid lua verifier specified: " << lua_verifier;

return false;

}

return VerifyMatch(

context, matcher,

model_->regex_model()->lua_verifier()->Get(lua_verifier)->str());

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

811

// Suggests a selection span for a click/selection at 'click_indices' inside
// 'context'.
//
// Candidates are collected from the ML model, the selection regex patterns,
// the datetime chunker and every optional engine/annotator that is present
// (knowledge, contact, installed app, number, duration, person name, grammar).
// Conflicts between overlapping candidates are resolved, the survivors are
// ordered by priority score, and the span of the first survivor overlapping
// the click is returned.
//
// The input 'click_indices' are returned unchanged whenever the annotator is
// not initialized, selection mode or the detected locales are not supported,
// the input span is invalid, any candidate source fails, the chosen candidate
// is filtered out, or no candidate overlaps the click.
CodepointSpan Annotator::SuggestSelection(
    const std::string& context, CodepointSpan click_indices,
    const SelectionOptions& options) const {
  // 'click_indices' may get snapped below; remember what the caller passed in.
  const CodepointSpan unmodified_click_indices = click_indices;

  if (!initialized_) {
    TC3_LOG(ERROR) << "Not initialized";
    return unmodified_click_indices;
  }
  if (!(model_->enabled_modes() & ModeFlag_SELECTION)) {
    return unmodified_click_indices;
  }

  // A parse failure is only logged; whatever ended up in the vector is used
  // for the locale check below.
  std::vector<Locale> detected_text_language_tags;
  if (!ParseLocales(options.detected_text_language_tags,
                    &detected_text_language_tags)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return unmodified_click_indices;
  }

  const UnicodeText context_unicode =
      UTF8ToUnicodeText(context, /*do_copy=*/false);

  if (!IsValidSpanInput(context_unicode, click_indices)) {
    TC3_VLOG(1)
        << "Trying to run SuggestSelection with invalid input, indices: "
        << click_indices.first << " " << click_indices.second;
    return unmodified_click_indices;
  }

  if (model_->snap_whitespace_selections()) {
    // A selection made purely of whitespace should expand to the
    // multi-token selection it would have been a part of. No token is found
    // for such a click, so the click is first moved onto a neighbouring token
    // to let the click-finding logic work. Snapping only to the left is
    // enough: if the whitespace belongs to a multi-token selection, the tokens
    // on both of its sides are selected. The overlap check at the bottom of
    // this function returns the original indices when the suggested span does
    // not contain the original whitespace.
    click_indices = internal::SnapLeftIfWhitespaceSelection(
        click_indices, context_unicode, *unilib_);
  }

  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());
  std::vector<Token> tokens;
  std::vector<AnnotatedSpan> candidates;

  // Gather candidates from every available source. A failure of any of them
  // aborts the suggestion.
  if (!ModelSuggestSelection(context_unicode, click_indices,
                             detected_text_language_tags, &interpreter_manager,
                             &tokens, &candidates)) {
    TC3_LOG(ERROR) << "Model suggest selection failed.";
    return unmodified_click_indices;
  }
  if (!RegexChunk(context_unicode, selection_regex_patterns_, &candidates,
                  /*is_serialized_entity_data_enabled=*/false)) {
    TC3_LOG(ERROR) << "Regex suggest selection failed.";
    return unmodified_click_indices;
  }
  if (!DatetimeChunk(UTF8ToUnicodeText(context, /*do_copy=*/false),
                     /*reference_time_ms_utc=*/0, /*reference_timezone=*/"",
                     options.locales, ModeFlag_SELECTION,
                     options.annotation_usecase,
                     /*is_serialized_entity_data_enabled=*/false,
                     &candidates)) {
    TC3_LOG(ERROR) << "Datetime suggest selection failed.";
    return unmodified_click_indices;
  }
  if (knowledge_engine_ != nullptr &&
      !knowledge_engine_->Chunk(context, options.annotation_usecase,
                                options.location_context, &candidates)) {
    TC3_LOG(ERROR) << "Knowledge suggest selection failed.";
    return unmodified_click_indices;
  }
  if (contact_engine_ != nullptr &&
      !contact_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Contact suggest selection failed.";
    return unmodified_click_indices;
  }
  if (installed_app_engine_ != nullptr &&
      !installed_app_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Installed app suggest selection failed.";
    return unmodified_click_indices;
  }
  if (number_annotator_ != nullptr &&
      !number_annotator_->FindAll(context_unicode, options.annotation_usecase,
                                  &candidates)) {
    TC3_LOG(ERROR) << "Number annotator failed in suggest selection.";
    return unmodified_click_indices;
  }
  if (duration_annotator_ != nullptr &&
      !duration_annotator_->FindAll(context_unicode, tokens,
                                    options.annotation_usecase, &candidates)) {
    TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
    return unmodified_click_indices;
  }
  if (person_name_engine_ != nullptr &&
      !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Person name suggest selection failed.";
    return unmodified_click_indices;
  }

  // The grammar annotator contributes at most one extra candidate; a false
  // return simply means it has nothing to add.
  AnnotatedSpan grammar_suggested_span;
  if (grammar_annotator_ != nullptr &&
      grammar_annotator_->SuggestSelection(detected_text_language_tags,
                                           context_unicode, click_indices,
                                           &grammar_suggested_span)) {
    candidates.push_back(grammar_suggested_span);
  }

  // Order the candidates by start position so that every connected component
  // of overlapping spans forms a contiguous block, as conflict resolution
  // expects.
  std::sort(candidates.begin(), candidates.end(),
            [](const AnnotatedSpan& lhs, const AnnotatedSpan& rhs) {
              return lhs.span.first < rhs.span.first;
            });

  std::vector<int> surviving_indices;
  if (!ResolveConflicts(candidates, context, tokens,
                        detected_text_language_tags, options.annotation_usecase,
                        &interpreter_manager, &surviving_indices)) {
    TC3_LOG(ERROR) << "Couldn't resolve conflicts.";
    return unmodified_click_indices;
  }

  // Highest priority first.
  std::sort(surviving_indices.begin(), surviving_indices.end(),
            [this, &candidates](int lhs, int rhs) {
              return GetPriorityScore(candidates[lhs].classification) >
                     GetPriorityScore(candidates[rhs].classification);
            });

  for (const int index : surviving_indices) {
    AnnotatedSpan& candidate = candidates[index];

    // The candidate has to cover both the (possibly snapped) click and the
    // click as originally given.
    const bool covers_click =
        SpansOverlap(candidate.span, click_indices) &&
        SpansOverlap(candidate.span, unmodified_click_indices);
    if (!covers_click) {
      continue;
    }

    // Run the model classification if it is missing but requested and a
    // collection filter for selection is specified.
    if (candidate.classification.empty() &&
        model_->selection_options()->always_classify_suggested_selection() &&
        !filtered_collections_selection_.empty()) {
      if (!ModelClassifyText(context, detected_text_language_tags,
                             candidate.span, &interpreter_manager,
                             /*embedding_cache=*/nullptr,
                             &candidate.classification)) {
        return unmodified_click_indices;
      }
    }

    // A filtered classification cancels the suggestion altogether.
    if (FilteredForSelection(candidate)) {
      return unmodified_click_indices;
    }

    return candidate.span;
  }

  return unmodified_click_indices;
}

namespace {

// Helper function that returns the index of the first candidate that

980

// transitively does not overlap with the candidate on 'start_index'. If the end

981

// of 'candidates' is reached, it returns the index that points right behind the

982

// array.

983

int FirstNonOverlappingSpanIndex(const std::vector<AnnotatedSpan>& candidates,

984

int start_index) {

985

int first_non_overlapping = start_index + 1;

986

CodepointSpan conflicting_span = candidates[start_index].span;

987

while (

988

first_non_overlapping < candidates.size() &&

989

SpansOverlap(conflicting_span, candidates[first_non_overlapping].span)) {

990

// Grow the span to include the current one.

991

conflicting_span.second = std::max(

992

conflicting_span.second, candidates[first_non_overlapping].span.second);

993

994

++first_non_overlapping;

995

}

996

return first_non_overlapping;

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1000

bool Annotator::ResolveConflicts(

1001

const std::vector<AnnotatedSpan>& candidates, const std::string& context,

1002

const std::vector<Token>& cached_tokens,

1003

const std::vector<Locale>& detected_text_language_tags,

1004

AnnotationUsecase annotation_usecase,

1005

InterpreterManager* interpreter_manager, std::vector<int>* result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1006

result->clear();

1007

result->reserve(candidates.size());

1008

for (int i = 0; i < candidates.size();) {

1009

int first_non_overlapping =

1010

FirstNonOverlappingSpanIndex(candidates, /*start_index=*/i);

1011

1012

const bool conflict_found = first_non_overlapping != (i + 1);

1013

if (conflict_found) {

1014

std::vector<int> candidate_indices;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1015

if (!ResolveConflict(context, cached_tokens, candidates,

1016

detected_text_language_tags, i,

1017

first_non_overlapping, annotation_usecase,

1018

interpreter_manager, &candidate_indices)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1019

return false;

1020

}

1021

result->insert(result->end(), candidate_indices.begin(),

1022

candidate_indices.end());

1023

} else {

1024

result->push_back(i);

1025

}

1026

1027

// Skip over the whole conflicting group/go to next candidate.

1028

i = first_non_overlapping;

}

return true;

}

namespace {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1034

// Returns true, if the given two sources do conflict in given annotation

1035

// usecase.

1036

// - In SMART usecase, all sources do conflict, because there's only 1 possible

1037

// annotation for a given span.

1038

// - In RAW usecase, certain annotations are allowed to overlap (e.g. datetime

1039

// and duration), while others not (e.g. duration and number).

1040

bool DoSourcesConflict(AnnotationUsecase annotation_usecase,

1041

const AnnotatedSpan::Source source1,

1042

const AnnotatedSpan::Source source2) {

1043

uint32 source_mask =

1044

(1 << static_cast<int>(source1)) | (1 << static_cast<int>(source2));

1045

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1046

switch (annotation_usecase) {

1047

case AnnotationUsecase_ANNOTATION_USECASE_SMART:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1048

// In the SMART mode, all annotations conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1049

return true;

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1050

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1051

case AnnotationUsecase_ANNOTATION_USECASE_RAW:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1052

// DURATION and DATETIME do not conflict. E.g. "let's meet in 3 hours",

1053

// can have two non-conflicting annotations: "in 3 hours" (datetime), "3

1054

// hours" (duration).

1055

if ((source_mask &

1056

(1 << static_cast<int>(AnnotatedSpan::Source::DURATION))) &&

1057

(source_mask &

1058

(1 << static_cast<int>(AnnotatedSpan::Source::DATETIME)))) {

1059

return false;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1060

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1061

1062

// A KNOWLEDGE entity does not conflict with anything.

1063

if ((source_mask &

1064

(1 << static_cast<int>(AnnotatedSpan::Source::KNOWLEDGE)))) {

return false;

}

Tony Mak

2020-03-27 13:58:00 +0000

[diff] [blame]

1068

// A PERSONNAME entity does not conflict with anything.

1069

if ((source_mask &

1070

(1 << static_cast<int>(AnnotatedSpan::Source::PERSON_NAME)))) {

return false;

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1074

// Entities from other sources can conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1075

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1080

bool Annotator::ResolveConflict(

1081

const std::string& context, const std::vector<Token>& cached_tokens,

1082

const std::vector<AnnotatedSpan>& candidates,

1083

const std::vector<Locale>& detected_text_language_tags, int start_index,

1084

int end_index, AnnotationUsecase annotation_usecase,

1085

InterpreterManager* interpreter_manager,

1086

std::vector<int>* chosen_indices) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1087

std::vector<int> conflicting_indices;

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1088

std::unordered_map<int, std::pair<float, int>> scores_lengths;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1089

for (int i = start_index; i < end_index; ++i) {

1090

conflicting_indices.push_back(i);

1091

if (!candidates[i].classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1092

scores_lengths[i] = {

1093

GetPriorityScore(candidates[i].classification),

1094

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

continue;

}

// OPTIMIZATION: So that we don't have to classify all the ML model

1099

// spans apriori, we wait until we get here, when they conflict with

1100

// something and we need the actual classification scores. So if the

1101

// candidate conflicts and comes from the model, we need to run a

1102

// classification to determine its priority:

1103

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1104

if (!ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1105

candidates[i].span, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1106

/*embedding_cache=*/nullptr, &classification)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (!classification.empty()) {

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1111

scores_lengths[i] = {

1112

GetPriorityScore(classification),

1113

candidates[i].span.second - candidates[i].span.first};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2020-01-09 12:32:17 +0000

[diff] [blame]

1117

const bool prioritize_longest_annotation =

1118

model_->triggering_options() != nullptr &&

1119

model_->triggering_options()->prioritize_longest_annotation();

1120

std::sort(conflicting_indices.begin(), conflicting_indices.end(),

1121

[&scores_lengths, candidates, conflicting_indices,

1122

prioritize_longest_annotation](int i, int j) {

1123

if (scores_lengths[i].first == scores_lengths[j].first &&

1124

prioritize_longest_annotation) {

1125

return scores_lengths[i].second > scores_lengths[j].second;

1126

}

1127

return scores_lengths[i].first > scores_lengths[j].first;

1128

});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1129

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1130

// Here we keep a set of indices that were chosen, per-source, to enable

1131

// effective computation.

1132

std::unordered_map<AnnotatedSpan::Source, SortedIntSet>

1133

chosen_indices_for_source_map;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1134

1135

// Greedily place the candidates if they don't conflict with the already

1136

// placed ones.

1137

for (int i = 0; i < conflicting_indices.size(); ++i) {

1138

const int considered_candidate = conflicting_indices[i];

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1139

1140

// See if there is a conflict between the candidate and all already placed

1141

// candidates.

1142

bool conflict = false;

1143

SortedIntSet* chosen_indices_for_source_ptr = nullptr;

1144

for (auto& source_set_pair : chosen_indices_for_source_map) {

1145

if (source_set_pair.first == candidates[considered_candidate].source) {

1146

chosen_indices_for_source_ptr = &source_set_pair.second;

1147

}

1148

1149

if (DoSourcesConflict(annotation_usecase, source_set_pair.first,

1150

candidates[considered_candidate].source) &&

1151

DoesCandidateConflict(considered_candidate, candidates,

1152

source_set_pair.second)) {

1153

conflict = true;

1154

break;

1155

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1156

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1157

1158

// Skip the candidate if a conflict was found.

if (conflict) {

continue;

}

// If the set of indices for the current source doesn't exist yet,

1164

// initialize it.

1165

if (chosen_indices_for_source_ptr == nullptr) {

1166

SortedIntSet new_set([&candidates](int a, int b) {

1167

return candidates[a].span.first < candidates[b].span.first;

1168

});

1169

chosen_indices_for_source_map[candidates[considered_candidate].source] =

1170

std::move(new_set);

1171

chosen_indices_for_source_ptr =

1172

&chosen_indices_for_source_map[candidates[considered_candidate]

.source];

}

// Place the candidate to the output and to the per-source conflict set.

1177

chosen_indices->push_back(considered_candidate);

1178

chosen_indices_for_source_ptr->insert(considered_candidate);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1179

}

1180

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1181

std::sort(chosen_indices->begin(), chosen_indices->end());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1186

bool Annotator::ModelSuggestSelection(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1187

const UnicodeText& context_unicode, CodepointSpan click_indices,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1188

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1189

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1190

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1191

if (model_->triggering_options() == nullptr ||

1192

!(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1196

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1197

ml_model_triggering_locales_,

1198

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1202

int click_pos;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1203

*tokens = selection_feature_processor_->Tokenize(context_unicode);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1204

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1205

context_unicode, click_indices,

1206

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1207

tokens, &click_pos);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1208

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1209

TC3_VLOG(1) << "Could not calculate the click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1210

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1211

}

1212

1213

const int symmetry_context_size =

1214

model_->selection_options()->symmetry_context_size();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1215

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1216

bounds_sensitive_features = selection_feature_processor_->GetOptions()

1217

->bounds_sensitive_features();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1218

1219

// The symmetry context span is the clicked token with symmetry_context_size

1220

// tokens on either side.

1221

const TokenSpan symmetry_context_span = IntersectTokenSpans(

1222

ExpandTokenSpan(SingleTokenSpan(click_pos),

1223

/*num_tokens_left=*/symmetry_context_size,

1224

/*num_tokens_right=*/symmetry_context_size),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1225

{0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1226

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1227

// Compute the extraction span based on the model type.

1228

TokenSpan extraction_span;

1229

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1230

// The extraction span is the symmetry context span expanded to include

1231

// max_selection_span tokens on either side, which is how far a selection

1232

// can stretch from the click, plus a relevant number of tokens outside of

1233

// the bounds of the selection.

1234

const int max_selection_span =

1235

selection_feature_processor_->GetOptions()->max_selection_span();

1236

extraction_span =

1237

ExpandTokenSpan(symmetry_context_span,

1238

/*num_tokens_left=*/max_selection_span +

1239

bounds_sensitive_features->num_tokens_before(),

1240

/*num_tokens_right=*/max_selection_span +

1241

bounds_sensitive_features->num_tokens_after());

1242

} else {

1243

// The extraction span is the symmetry context span expanded to include

1244

// context_size tokens on either side.

1245

const int context_size =

1246

selection_feature_processor_->GetOptions()->context_size();

1247

extraction_span = ExpandTokenSpan(symmetry_context_span,

1248

/*num_tokens_left=*/context_size,

1249

/*num_tokens_right=*/context_size);

1250

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1251

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1252

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1253

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1254

*tokens, extraction_span)) {

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1258

std::unique_ptr<CachedFeatures> cached_features;

1259

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1260

*tokens, extraction_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1261

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1262

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1263

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1264

selection_feature_processor_->EmbeddingSize() +

1265

selection_feature_processor_->DenseFeaturesCount(),

1266

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1267

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Produce selection model candidates.

1272

std::vector<TokenSpan> chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1273

if (!ModelChunk(tokens->size(), /*span_of_interest=*/symmetry_context_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1274

interpreter_manager->SelectionInterpreter(), *cached_features,

1275

&chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1276

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

for (const TokenSpan& chunk : chunks) {

1281

AnnotatedSpan candidate;

1282

candidate.span = selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1283

context_unicode, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1284

if (model_->selection_options()->strip_unpaired_brackets()) {

1285

candidate.span =

1286

StripUnpairedBrackets(context_unicode, candidate.span, *unilib_);

1287

}

1288

1289

// Only output non-empty spans.

1290

if (candidate.span.first != candidate.span.second) {

1291

result->push_back(candidate);

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1297

bool Annotator::ModelClassifyText(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1298

const std::string& context,

1299

const std::vector<Locale>& detected_text_language_tags,

1300

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1301

FeatureProcessor::EmbeddingCache* embedding_cache,

1302

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1303

return ModelClassifyText(context, {}, detected_text_language_tags,

1304

selection_indices, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1305

embedding_cache, classification_results);

}

namespace internal {

std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,

1310

CodepointSpan selection_indices,

1311

TokenSpan tokens_around_selection_to_copy) {

1312

const auto first_selection_token = std::upper_bound(

1313

cached_tokens.begin(), cached_tokens.end(), selection_indices.first,

1314

[](int selection_start, const Token& token) {

1315

return selection_start < token.end;

1316

});

1317

const auto last_selection_token = std::lower_bound(

1318

cached_tokens.begin(), cached_tokens.end(), selection_indices.second,

1319

[](const Token& token, int selection_end) {

1320

return token.start < selection_end;

1321

});

1322

1323

const int64 first_token = std::max(

1324

static_cast<int64>(0),

1325

static_cast<int64>((first_selection_token - cached_tokens.begin()) -

1326

tokens_around_selection_to_copy.first));

1327

const int64 last_token = std::min(

1328

static_cast<int64>(cached_tokens.size()),

1329

static_cast<int64>((last_selection_token - cached_tokens.begin()) +

1330

tokens_around_selection_to_copy.second));

1331

1332

std::vector<Token> tokens;

1333

tokens.reserve(last_token - first_token);

1334

for (int i = first_token; i < last_token; ++i) {

1335

tokens.push_back(cached_tokens[i]);

}

return tokens;

}

} // namespace internal

1340

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1341

// Returns how many tokens before (`first`) and after (`second`) the selection
// are needed when classifying text, according to the options of the
// classification feature processor.
TokenSpan Annotator::ClassifyTextUpperBoundNeededTokens() const {
  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features =
          classification_feature_processor_->GetOptions()
              ->bounds_sensitive_features();
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    // The extraction span is the selection span expanded to include a relevant
    // number of tokens outside of the bounds of the selection.
    return {bounds_sensitive_features->num_tokens_before(),
            bounds_sensitive_features->num_tokens_after()};
  }

  // The extraction span is the clicked token with context_size tokens on
  // either side. The value has to come from the classification feature
  // processor, because that is the one ModelClassifyText uses to build its
  // extraction span.
  const int context_size =
      classification_feature_processor_->GetOptions()->context_size();
  return {context_size, context_size};
}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1360

namespace {

1361

// Sorts the classification results from high score to low score.

1362

void SortClassificationResults(

1363

std::vector<ClassificationResult>* classification_results) {

1364

std::sort(classification_results->begin(), classification_results->end(),

1365

[](const ClassificationResult& a, const ClassificationResult& b) {

1366

return a.score > b.score;

});

}

} // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1371

bool Annotator::ModelClassifyText(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1372

const std::string& context, const std::vector<Token>& cached_tokens,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1373

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1374

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1375

FeatureProcessor::EmbeddingCache* embedding_cache,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1376

std::vector<ClassificationResult>* classification_results) const {

1377

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1378

return ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1379

selection_indices, interpreter_manager,

1380

embedding_cache, classification_results, &tokens);

1381

}

1382

1383

bool Annotator::ModelClassifyText(

1384

const std::string& context, const std::vector<Token>& cached_tokens,

1385

const std::vector<Locale>& detected_text_language_tags,

1386

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1387

FeatureProcessor::EmbeddingCache* embedding_cache,

1388

std::vector<ClassificationResult>* classification_results,

1389

std::vector<Token>* tokens) const {

1390

if (model_->triggering_options() == nullptr ||

1391

!(model_->triggering_options()->enabled_modes() &

1392

ModeFlag_CLASSIFICATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1396

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1397

ml_model_triggering_locales_,

1398

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1402

if (cached_tokens.empty()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1403

*tokens = classification_feature_processor_->Tokenize(context);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1404

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1405

*tokens = internal::CopyCachedTokens(cached_tokens, selection_indices,

1406

ClassifyTextUpperBoundNeededTokens());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1407

}

1408

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1409

int click_pos;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1410

classification_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1411

context, selection_indices,

1412

classification_feature_processor_->GetOptions()

1413

->only_use_line_with_click(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1414

tokens, &click_pos);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1415

const TokenSpan selection_token_span =

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1416

CodepointSpanToTokenSpan(*tokens, selection_indices);

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1417

const int selection_num_tokens = TokenSpanSize(selection_token_span);

1418

if (model_->classification_options()->max_num_tokens() > 0 &&

1419

model_->classification_options()->max_num_tokens() <

1420

selection_num_tokens) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1421

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1425

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

1426

bounds_sensitive_features =

1427

classification_feature_processor_->GetOptions()

1428

->bounds_sensitive_features();

1429

if (selection_token_span.first == kInvalidIndex ||

1430

selection_token_span.second == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1431

TC3_LOG(ERROR) << "Could not determine span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Compute the extraction span based on the model type.

1436

TokenSpan extraction_span;

1437

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1438

// The extraction span is the selection span expanded to include a relevant

1439

// number of tokens outside of the bounds of the selection.

1440

extraction_span = ExpandTokenSpan(

1441

selection_token_span,

1442

/*num_tokens_left=*/bounds_sensitive_features->num_tokens_before(),

1443

/*num_tokens_right=*/bounds_sensitive_features->num_tokens_after());

1444

} else {

1445

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1446

TC3_LOG(ERROR) << "Couldn't choose a click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1447

return false;

1448

}

1449

// The extraction span is the clicked token with context_size tokens on

1450

// either side.

1451

const int context_size =

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1452

classification_feature_processor_->GetOptions()->context_size();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1453

extraction_span = ExpandTokenSpan(SingleTokenSpan(click_pos),

1454

/*num_tokens_left=*/context_size,

1455

/*num_tokens_right=*/context_size);

1456

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1457

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1458

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1459

if (!classification_feature_processor_->HasEnoughSupportedCodepoints(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1460

*tokens, extraction_span)) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1461

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1465

std::unique_ptr<CachedFeatures> cached_features;

1466

if (!classification_feature_processor_->ExtractFeatures(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1467

*tokens, extraction_span, selection_indices,

1468

embedding_executor_.get(), embedding_cache,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1469

classification_feature_processor_->EmbeddingSize() +

1470

classification_feature_processor_->DenseFeaturesCount(),

1471

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1472

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1473

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1474

}

1475

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1476

std::vector<float> features;

1477

features.reserve(cached_features->OutputFeaturesSize());

1478

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1479

cached_features->AppendBoundsSensitiveFeaturesForSpan(selection_token_span,

1480

&features);

1481

} else {

1482

cached_features->AppendClickContextFeaturesForClick(click_pos, &features);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1483

}

1484

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1485

TensorView<float> logits = classification_executor_->ComputeLogits(

1486

TensorView<float>(features.data(),

1487

{1, static_cast<int>(features.size())}),

1488

interpreter_manager->ClassificationInterpreter());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1489

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1490

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (logits.dims() != 2 || logits.dim(0) != 1 ||

1495

logits.dim(1) != classification_feature_processor_->NumCollections()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1496

TC3_LOG(ERROR) << "Mismatching output";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

const std::vector<float> scores =

1501

ComputeSoftmax(logits.data(), logits.dim(1));

1502

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1503

if (scores.empty()) {

1504

*classification_results = {{Collections::Other(), 1.0}};

1505

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1506

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1507

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1508

const int best_score_index =

1509

std::max_element(scores.begin(), scores.end()) - scores.begin();

1510

const std::string top_collection =

1511

classification_feature_processor_->LabelToCollection(best_score_index);

1512

1513

// Sanity checks.

1514

if (top_collection == Collections::Phone()) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1515

const int digit_count = CountDigits(context, selection_indices);

1516

if (digit_count <

1517

model_->classification_options()->phone_min_num_digits() ||

1518

digit_count >

1519

model_->classification_options()->phone_max_num_digits()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1520

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1521

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1522

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1523

} else if (top_collection == Collections::Address()) {

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1524

if (selection_num_tokens <

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1525

model_->classification_options()->address_min_num_tokens()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1526

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1527

return true;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1528

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1529

} else if (top_collection == Collections::Dictionary()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1530

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1531

dictionary_locales_,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1532

/*default_value=*/false)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1533

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1534

return true;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1535

}

1536

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1537

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1538

*classification_results = {{top_collection, /*arg_score=*/1.0,

1539

/*arg_priority_score=*/scores[best_score_index]}};

1540

1541

// For some entities, we might want to clamp the priority score, for better

1542

// conflict resolution between entities.

1543

if (model_->triggering_options() != nullptr &&

1544

model_->triggering_options()->collection_to_priority() != nullptr) {

1545

if (auto entry =

1546

model_->triggering_options()->collection_to_priority()->LookupByKey(

1547

top_collection.c_str())) {

1548

(*classification_results)[0].priority_score *= entry->value();

1549

}

1550

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1551

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1552

}

1553

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1554

bool Annotator::RegexClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1555

const std::string& context, CodepointSpan selection_indices,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1556

std::vector<ClassificationResult>* classification_result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1557

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1558

UTF8ToUnicodeText(context, /*do_copy=*/false)

1559

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1560

const UnicodeText selection_text_unicode(

1561

UTF8ToUnicodeText(selection_text, /*do_copy=*/false));

1562

1563

// Check whether any of the regular expressions match.

1564

for (const int pattern_id : classification_regex_patterns_) {

1565

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

1566

const std::unique_ptr<UniLib::RegexMatcher> matcher =

1567

regex_pattern.pattern->Matcher(selection_text_unicode);

1568

int status = UniLib::RegexMatcher::kNoError;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1569

bool matches;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1570

if (regex_pattern.config->use_approximate_matching()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1571

matches = matcher->ApproximatelyMatches(&status);

1572

} else {

1573

matches = matcher->Matches(&status);

1574

}

1575

if (status != UniLib::RegexMatcher::kNoError) {

1576

return false;

1577

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1578

if (matches && VerifyRegexMatchCandidate(

1579

context, regex_pattern.config->verification_options(),

1580

selection_text, matcher.get())) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1581

classification_result->push_back(

1582

{regex_pattern.config->collection_name()->str(),

1583

regex_pattern.config->target_classification_score(),

1584

regex_pattern.config->priority_score()});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1585

if (!SerializedEntityDataFromRegexMatch(

1586

regex_pattern.config, matcher.get(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1587

&classification_result->back().serialized_entity_data)) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1588

TC3_LOG(ERROR) << "Could not get entity data.";

1589

return false;

1590

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1594

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1595

}

1596

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1597

namespace {

1598

std::string PickCollectionForDatetime(

1599

const DatetimeParseResult& datetime_parse_result) {

1600

switch (datetime_parse_result.granularity) {

1601

case GRANULARITY_HOUR:

1602

case GRANULARITY_MINUTE:

1603

case GRANULARITY_SECOND:

1604

return Collections::DateTime();

1605

default:

1606

return Collections::Date();

1607

}

1608

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1609

1610

std::string CreateDatetimeSerializedEntityData(

1611

const DatetimeParseResult& parse_result) {

1612

EntityDataT entity_data;

1613

entity_data.datetime.reset(new EntityData_::DatetimeT());

1614

entity_data.datetime->time_ms_utc = parse_result.time_ms_utc;

1615

entity_data.datetime->granularity =

1616

static_cast<EntityData_::Datetime_::Granularity>(

1617

parse_result.granularity);

1618

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1619

for (const auto& c : parse_result.datetime_components) {

1620

EntityData_::Datetime_::DatetimeComponentT datetime_component;

1621

datetime_component.absolute_value = c.value;

1622

datetime_component.relative_count = c.relative_count;

1623

datetime_component.component_type =

1624

static_cast<EntityData_::Datetime_::DatetimeComponent_::ComponentType>(

1625

c.component_type);

1626

datetime_component.relation_type =

1627

EntityData_::Datetime_::DatetimeComponent_::RelationType_ABSOLUTE;

1628

if (c.relative_qualifier !=

1629

DatetimeComponent::RelativeQualifier::UNSPECIFIED) {

1630

datetime_component.relation_type =

1631

EntityData_::Datetime_::DatetimeComponent_::RelationType_RELATIVE;

1632

}

1633

entity_data.datetime->datetime_component.emplace_back(

1634

new EntityData_::Datetime_::DatetimeComponentT(datetime_component));

1635

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1636

flatbuffers::FlatBufferBuilder builder;

1637

FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));

1638

return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),

1639

builder.GetSize());

1640

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1641

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1642

} // namespace

1643

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1644

bool Annotator::DatetimeClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1645

const std::string& context, CodepointSpan selection_indices,

1646

const ClassificationOptions& options,

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1647

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1648

if (!datetime_parser_ && !cfg_datetime_parser_) {

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1649

return true;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1650

}

1651

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1652

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1653

UTF8ToUnicodeText(context, /*do_copy=*/false)

1654

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1655

1656

std::vector<DatetimeParseResultSpan> datetime_spans;

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1657

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1658

if (cfg_datetime_parser_) {

1659

if (!(model_->grammar_datetime_model()->enabled_modes() &

1660

ModeFlag_CLASSIFICATION)) {

1661

return true;

1662

}

1663

std::vector<Locale> parsed_locales;

1664

ParseLocales(options.locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1665

cfg_datetime_parser_->Parse(

1666

selection_text,

1667

ToDateAnnotationOptions(

1668

model_->grammar_datetime_model()->annotation_options(),

1669

options.reference_timezone, options.reference_time_ms_utc),

1670

parsed_locales, &datetime_spans);

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1671

}

1672

1673

if (datetime_parser_) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1674

if (!datetime_parser_->Parse(selection_text, options.reference_time_ms_utc,

1675

options.reference_timezone, options.locales,

1676

ModeFlag_CLASSIFICATION,

1677

options.annotation_usecase,

1678

/*anchor_start_end=*/true, &datetime_spans)) {

1679

TC3_LOG(ERROR) << "Error during parsing datetime.";

1680

return false;

1681

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1682

}

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

1683

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1684

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

1685

// Only consider the result valid if the selection and extracted datetime

1686

// spans exactly match.

1687

if (std::make_pair(datetime_span.span.first + selection_indices.first,

1688

datetime_span.span.second + selection_indices.first) ==

1689

selection_indices) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1690

for (const DatetimeParseResult& parse_result : datetime_span.data) {

1691

classification_results->emplace_back(

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1692

PickCollectionForDatetime(parse_result),

1693

datetime_span.target_classification_score);

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1694

classification_results->back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1695

classification_results->back().serialized_entity_data =

1696

CreateDatetimeSerializedEntityData(parse_result);

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1697

classification_results->back().priority_score =

1698

datetime_span.priority_score;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1699

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1700

return true;

1701

}

1702

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1703

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1704

}

1705

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1706

std::vector<ClassificationResult> Annotator::ClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1707

const std::string& context, CodepointSpan selection_indices,

1708

const ClassificationOptions& options) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1709

if (!initialized_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1710

TC3_LOG(ERROR) << "Not initialized";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1714

if (!(model_->enabled_modes() & ModeFlag_CLASSIFICATION)) {

return {};

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1718

std::vector<Locale> detected_text_language_tags;

1719

if (!ParseLocales(options.detected_text_language_tags,

1720

&detected_text_language_tags)) {

1721

TC3_LOG(WARNING)

1722

<< "Failed to parse the detected_text_language_tags in options: "

1723

<< options.detected_text_language_tags;

1724

}

1725

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1726

model_triggering_locales_,

1727

/*default_value=*/true)) {

return {};

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame]

1731

if (!IsValidSpanInput(UTF8ToUnicodeText(context, /*do_copy=*/false),

1732

selection_indices)) {

1733

TC3_VLOG(1) << "Trying to run ClassifyText with invalid input: "

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1734

<< std::get<0>(selection_indices) << " "

1735

<< std::get<1>(selection_indices);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1739

// We'll accumulate a list of candidates, and pick the best candidate in the

1740

// end.

1741

std::vector<AnnotatedSpan> candidates;

1742

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1743

// Try the knowledge engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1744

// TODO(b/126579108): Propagate error status.

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1745

ClassificationResult knowledge_result;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1746

if (knowledge_engine_ &&

1747

knowledge_engine_->ClassifyText(

1748

context, selection_indices, options.annotation_usecase,

1749

options.location_context, &knowledge_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1750

candidates.push_back({selection_indices, {knowledge_result}});

1751

candidates.back().source = AnnotatedSpan::Source::KNOWLEDGE;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1752

}

1753

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1754

AddContactMetadataToKnowledgeClassificationResults(&candidates);

1755

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1756

// Try the contact engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1757

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1758

ClassificationResult contact_result;

1759

if (contact_engine_ && contact_engine_->ClassifyText(

1760

context, selection_indices, &contact_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1761

candidates.push_back({selection_indices, {contact_result}});

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1762

}

1763

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1764

// Try the person name engine.

1765

ClassificationResult person_name_result;

1766

if (person_name_engine_ &&

1767

person_name_engine_->ClassifyText(context, selection_indices,

1768

&person_name_result)) {

1769

candidates.push_back({selection_indices, {person_name_result}});

Tony Mak

d0ae7c6

2020-03-27 13:58:00 +0000

[diff] [blame]

1770

candidates.back().source = AnnotatedSpan::Source::PERSON_NAME;

Tony Mak

2020-01-08 17:30:51 +0000

[diff] [blame]

1771

}

1772

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1773

// Try the installed app engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1774

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1775

ClassificationResult installed_app_result;

1776

if (installed_app_engine_ &&

1777

installed_app_engine_->ClassifyText(context, selection_indices,

1778

&installed_app_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1779

candidates.push_back({selection_indices, {installed_app_result}});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1780

}

1781

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1782

// Try the regular expression models.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1783

std::vector<ClassificationResult> regex_results;

1784

if (!RegexClassifyText(context, selection_indices, &regex_results)) {

1785

return {};

1786

}

1787

for (const ClassificationResult& result : regex_results) {

1788

candidates.push_back({selection_indices, {result}});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1789

}

1790

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1791

// Try the date model.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1792

//

1793

// DatetimeClassifyText only returns the first result, which can however have

1794

// more interpretations. They are inserted in the candidates as a single

1795

// AnnotatedSpan, so that they get treated together by the conflict resolution

1796

// algorithm.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1797

std::vector<ClassificationResult> datetime_results;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1798

if (!DatetimeClassifyText(context, selection_indices, options,

1799

&datetime_results)) {

1800

return {};

1801

}

1802

if (!datetime_results.empty()) {

1803

candidates.push_back({selection_indices, std::move(datetime_results)});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1804

candidates.back().source = AnnotatedSpan::Source::DATETIME;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1805

}

1806

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1807

// Try the number annotator.

1808

// TODO(b/126579108): Propagate error status.

1809

ClassificationResult number_annotator_result;

1810

if (number_annotator_ &&

1811

number_annotator_->ClassifyText(

1812

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1813

options.annotation_usecase, &number_annotator_result)) {

1814

candidates.push_back({selection_indices, {number_annotator_result}});

1815

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1816

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1817

// Try the duration annotator.

1818

ClassificationResult duration_annotator_result;

1819

if (duration_annotator_ &&

1820

duration_annotator_->ClassifyText(

1821

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1822

options.annotation_usecase, &duration_annotator_result)) {

1823

candidates.push_back({selection_indices, {duration_annotator_result}});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1824

candidates.back().source = AnnotatedSpan::Source::DURATION;

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1825

}

1826

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

1827

// Try the translate annotator.

1828

ClassificationResult translate_annotator_result;

1829

if (translate_annotator_ &&

1830

translate_annotator_->ClassifyText(

1831

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1832

options.user_familiar_language_tags, &translate_annotator_result)) {

1833

candidates.push_back({selection_indices, {translate_annotator_result}});

1834

}

1835

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

1836

// Try the grammar model.

1837

ClassificationResult grammar_annotator_result;

1838

if (grammar_annotator_ && grammar_annotator_->ClassifyText(

1839

detected_text_language_tags,

1840

UTF8ToUnicodeText(context, /*do_copy=*/false),

1841

selection_indices, &grammar_annotator_result)) {

1842

candidates.push_back({selection_indices, {grammar_annotator_result}});

1843

}

1844

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1845

// Try the ML model.

1846

//

1847

// The output of the model is considered as an exclusive 1-of-N choice. That's

1848

// why it's inserted as only 1 AnnotatedSpan into candidates, as opposed to 1

1849

// span for each candidate, like e.g. the regex model.

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1850

InterpreterManager interpreter_manager(selection_executor_.get(),

1851

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1852

std::vector<ClassificationResult> model_results;

1853

std::vector<Token> tokens;

1854

if (!ModelClassifyText(

1855

context, /*cached_tokens=*/{}, detected_text_language_tags,

1856

selection_indices, &interpreter_manager,

1857

/*embedding_cache=*/nullptr, &model_results, &tokens)) {

1858

return {};

1859

}

1860

if (!model_results.empty()) {

1861

candidates.push_back({selection_indices, std::move(model_results)});

1862

}

1863

1864

std::vector<int> candidate_indices;

1865

if (!ResolveConflicts(candidates, context, tokens,

1866

detected_text_language_tags, options.annotation_usecase,

1867

&interpreter_manager, &candidate_indices)) {

1868

TC3_LOG(ERROR) << "Couldn't resolve conflicts.";

return {};

}

std::vector<ClassificationResult> results;

1873

for (const int i : candidate_indices) {

1874

for (const ClassificationResult& result : candidates[i].classification) {

1875

if (!FilteredForClassification(result)) {

1876

results.push_back(result);

1877

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1878

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1879

}

1880

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1881

// Sort results according to score.

1882

std::sort(results.begin(), results.end(),

1883

[](const ClassificationResult& a, const ClassificationResult& b) {

1884

return a.score > b.score;

1885

});

1886

1887

if (results.empty()) {

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1888

results = {{Collections::Other(), 1.0}};

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1889

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1890

return results;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1891

}

1892

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1893

bool Annotator::ModelAnnotate(

1894

const std::string& context,

1895

const std::vector<Locale>& detected_text_language_tags,

1896

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

1897

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1898

if (model_->triggering_options() == nullptr ||

1899

!(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1903

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1904

ml_model_triggering_locales_,

1905

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1909

const UnicodeText context_unicode = UTF8ToUnicodeText(context,

1910

/*do_copy=*/false);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1911

std::vector<UnicodeTextRange> lines;

1912

if (!selection_feature_processor_->GetOptions()->only_use_line_with_click()) {

1913

lines.push_back({context_unicode.begin(), context_unicode.end()});

1914

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1915

lines = selection_feature_processor_->SplitContext(

1916

context_unicode, selection_feature_processor_->GetOptions()

1917

->use_pipe_character_for_newline());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1918

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1919

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1920

const float min_annotate_confidence =

1921

(model_->triggering_options() != nullptr

1922

? model_->triggering_options()->min_annotate_confidence()

1923

: 0.f);

1924

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1925

for (const UnicodeTextRange& line : lines) {

Tony Mak

408c6b8

2019-03-08 17:57:27 +0000

[diff] [blame]

1926

FeatureProcessor::EmbeddingCache embedding_cache;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1927

const std::string line_str =

1928

UnicodeText::UTF8Substring(line.first, line.second);

1929

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1930

*tokens = selection_feature_processor_->Tokenize(line_str);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1931

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1932

line_str, {0, std::distance(line.first, line.second)},

1933

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1934

tokens,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1935

/*click_pos=*/nullptr);

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1936

const TokenSpan full_line_span = {0, tokens->size()};

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1937

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1938

// TODO(zilka): Add support for greater granularity of this check.

1939

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1940

*tokens, full_line_span)) {

continue;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1944

std::unique_ptr<CachedFeatures> cached_features;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1945

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1946

*tokens, full_line_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1947

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1948

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1949

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1950

selection_feature_processor_->EmbeddingSize() +

1951

selection_feature_processor_->DenseFeaturesCount(),

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1952

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1953

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1954

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1955

}

1956

1957

std::vector<TokenSpan> local_chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1958

if (!ModelChunk(tokens->size(), /*span_of_interest=*/full_line_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1959

interpreter_manager->SelectionInterpreter(),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1960

*cached_features, &local_chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1961

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1962

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1963

}

1964

1965

const int offset = std::distance(context_unicode.begin(), line.first);

1966

for (const TokenSpan& chunk : local_chunks) {

1967

const CodepointSpan codepoint_span =

1968

selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1969

line_str, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1970

1971

// Skip empty spans.

1972

if (codepoint_span.first != codepoint_span.second) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1973

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1974

if (!ModelClassifyText(line_str, *tokens, detected_text_language_tags,

1975

codepoint_span, interpreter_manager,

1976

&embedding_cache, &classification)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1977

TC3_LOG(ERROR) << "Could not classify text: "

1978

<< (codepoint_span.first + offset) << " "

1979

<< (codepoint_span.second + offset);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return false;

}

// Do not include the span if it's classified as "other".

1984

if (!classification.empty() && !ClassifiedAsOther(classification) &&

1985

classification[0].score >= min_annotate_confidence) {

1986

AnnotatedSpan result_span;

1987

result_span.span = {codepoint_span.first + offset,

1988

codepoint_span.second + offset};

1989

result_span.classification = std::move(classification);

1990

result->push_back(std::move(result_span));

1991

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1992

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1993

}

1994

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1998

// Test-only accessor: returns the pointer held by
// `selection_feature_processor_` (obtained via `.get()`, so the annotator
// keeps ownership).
const FeatureProcessor* Annotator::SelectionFeatureProcessorForTests() const {
  const FeatureProcessor* const processor = selection_feature_processor_.get();
  return processor;
}

2001

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2002

// Test-only accessor: returns the pointer held by
// `classification_feature_processor_` (obtained via `.get()`, so the annotator
// keeps ownership).
const FeatureProcessor* Annotator::ClassificationFeatureProcessorForTests()
    const {
  const FeatureProcessor* const processor =
      classification_feature_processor_.get();
  return processor;
}

2006

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2007

// Test-only accessor: returns the pointer held by `datetime_parser_`
// (obtained via `.get()`, so the annotator keeps ownership).
const DatetimeParser* Annotator::DatetimeParserForTests() const {
  const DatetimeParser* const parser = datetime_parser_.get();
  return parser;
}

2010

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2011

void Annotator::RemoveNotEnabledEntityTypes(

2012

const EnabledEntityTypes& is_entity_type_enabled,

2013

std::vector<AnnotatedSpan>* annotated_spans) const {

2014

for (AnnotatedSpan& annotated_span : *annotated_spans) {

2015

std::vector<ClassificationResult>& classifications =

2016

annotated_span.classification;

2017

classifications.erase(

2018

std::remove_if(classifications.begin(), classifications.end(),

2019

[&is_entity_type_enabled](

2020

const ClassificationResult& classification_result) {

2021

return !is_entity_type_enabled(

2022

classification_result.collection);

2023

}),

2024

classifications.end());

2025

}

2026

annotated_spans->erase(

2027

std::remove_if(annotated_spans->begin(), annotated_spans->end(),

2028

[](const AnnotatedSpan& annotated_span) {

2029

return annotated_span.classification.empty();

2030

}),

2031

annotated_spans->end());

2032

}

2033

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2034

void Annotator::AddContactMetadataToKnowledgeClassificationResults(

2035

std::vector<AnnotatedSpan>* candidates) const {

2036

if (candidates == nullptr || contact_engine_ == nullptr) {

2037

return;

2038

}

2039

for (auto& candidate : *candidates) {

2040

for (auto& classification_result : candidate.classification) {

2041

contact_engine_->AddContactMetadataToKnowledgeClassificationResult(

2042

&classification_result);

}

}

}

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame^]

2047

// Runs all configured annotators over a single UTF-8 `context` string and
// stores the conflict-resolved, filtered annotations in `*candidates`.
//
// `*candidates` is an in/out parameter: the annotators are handed the same
// vector to add their spans to, so spans already present on entry take part in
// conflict resolution together with the newly produced ones. On success the
// vector is replaced by the final result, in which candidates selected by
// conflict resolution that share the same span are merged into one
// AnnotatedSpan with sorted classifications. On failure the vector may hold
// partially collected candidates.
//
// Returns:
//   UNAVAILABLE       if annotation mode is not enabled in the model or none
//                     of the detected language tags is a supported locale.
//   INVALID_ARGUMENT  if `context` is not valid UTF-8.
//   INTERNAL          if any annotator or the conflict resolution fails.
//   Status::OK        otherwise.
Status Annotator::AnnotateSingleInput(
    const std::string& context, const AnnotationOptions& options,
    std::vector<AnnotatedSpan>* candidates) const {
  if (!(model_->enabled_modes() & ModeFlag_ANNOTATION)) {
    return Status(StatusCode::UNAVAILABLE, "Model annotation was not enabled.");
  }

  const UnicodeText context_unicode =
      UTF8ToUnicodeText(context, /*do_copy=*/false);
  if (!context_unicode.is_valid()) {
    return Status(StatusCode::INVALID_ARGUMENT,
                  "Context string isn't valid UTF8.");
  }

  // A parse failure is only logged; the locale check below still runs on
  // whatever ended up in `detected_text_language_tags`.
  std::vector<Locale> detected_text_language_tags;
  if (!ParseLocales(options.detected_text_language_tags,
                    &detected_text_language_tags)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return Status(
        StatusCode::UNAVAILABLE,
        "The detected language tags are not in the supported locales.");
  }

  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());

  // Annotate with the selection model.
  std::vector<Token> tokens;
  if (!ModelAnnotate(context, detected_text_language_tags, &interpreter_manager,
                     &tokens, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run ModelAnnotate.");
  }

  // Annotate with the regular expression models.
  if (!RegexChunk(context_unicode, annotation_regex_patterns_, candidates,
                  options.is_serialized_entity_data_enabled)) {
    return Status(StatusCode::INTERNAL, "Couldn't run RegexChunk.");
  }

  // Annotate with the datetime model. It only runs when date or datetime
  // entities were requested.
  const EnabledEntityTypes is_entity_type_enabled(options.entity_types);
  if ((is_entity_type_enabled(Collections::Date()) ||
       is_entity_type_enabled(Collections::DateTime())) &&
      !DatetimeChunk(context_unicode, options.reference_time_ms_utc,
                     options.reference_timezone, options.locales,
                     ModeFlag_ANNOTATION, options.annotation_usecase,
                     options.is_serialized_entity_data_enabled, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run DatetimeChunk.");
  }

  // Annotate with the contact engine.
  if (contact_engine_ &&
      !contact_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run contact engine Chunk.");
  }

  // Annotate with the installed app engine.
  if (installed_app_engine_ &&
      !installed_app_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run installed app engine Chunk.");
  }

  // Annotate with the number annotator.
  if (number_annotator_ != nullptr &&
      !number_annotator_->FindAll(context_unicode, options.annotation_usecase,
                                  candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run number annotator FindAll.");
  }

  // Annotate with the duration annotator.
  if (is_entity_type_enabled(Collections::Duration()) &&
      duration_annotator_ != nullptr &&
      !duration_annotator_->FindAll(context_unicode, tokens,
                                    options.annotation_usecase, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run duration annotator FindAll.");
  }

  // Annotate with the person name engine.
  if (is_entity_type_enabled(Collections::PersonName()) &&
      person_name_engine_ &&
      !person_name_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run person name engine Chunk.");
  }

  // Annotate with the grammar annotators.
  if (grammar_annotator_ != nullptr &&
      !grammar_annotator_->Annotate(detected_text_language_tags,
                                    context_unicode, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run grammar annotators.");
  }

  // Sort candidates according to their position in the input, so that the next
  // code can assume that any connected component of overlapping spans forms a
  // contiguous block.
  std::sort(candidates->begin(), candidates->end(),
            [](const AnnotatedSpan& a, const AnnotatedSpan& b) {
              return a.span.first < b.span.first;
            });

  std::vector<int> candidate_indices;
  if (!ResolveConflicts(*candidates, context, tokens,
                        detected_text_language_tags, options.annotation_usecase,
                        &interpreter_manager, &candidate_indices)) {
    return Status(StatusCode::INTERNAL, "Couldn't resolve conflicts.");
  }

  // Merge the selected candidates: consecutive selected candidates with the
  // same span are folded into one AnnotatedSpan carrying all of their
  // classifications.
  std::vector<AnnotatedSpan> result;
  result.reserve(candidate_indices.size());
  AnnotatedSpan aggregated_span;
  for (const int i : candidate_indices) {
    AnnotatedSpan& candidate = (*candidates)[i];
    if (candidate.span != aggregated_span.span) {
      // A new span starts: flush the previous aggregate if it collected
      // anything, then start over. The moved-from object is reassigned right
      // away, before any further use.
      if (!aggregated_span.classification.empty()) {
        result.push_back(std::move(aggregated_span));
      }
      aggregated_span =
          AnnotatedSpan(candidate.span, /*arg_classification=*/{});
    }
    if (candidate.classification.empty() ||
        ClassifiedAsOther(candidate.classification) ||
        FilteredForAnnotation(candidate)) {
      continue;
    }
    for (ClassificationResult& classification : candidate.classification) {
      aggregated_span.classification.push_back(std::move(classification));
    }
  }
  if (!aggregated_span.classification.empty()) {
    result.push_back(std::move(aggregated_span));
  }

  // We generate all candidates and remove them later (with the exception of
  // date/time/duration entities) because there are complex interdependencies
  // between the entity types. E.g., the TLD of an email can be interpreted as a
  // URL, but most likely a user of the API does not want such annotations if
  // "url" is enabled and "email" is not.
  RemoveNotEnabledEntityTypes(is_entity_type_enabled, &result);

  for (AnnotatedSpan& annotated_span : result) {
    SortClassificationResults(&annotated_span.classification);
  }

  // `result` is not used afterwards, so hand its storage over instead of
  // deep-copying every span and classification.
  *candidates = std::move(result);
  return Status::OK;
}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2203

Tony Mak

2020-03-31 11:13:06 +0100

[diff] [blame^]

2204

// Annotates several text fragments and returns one list of annotations per
// fragment, in the same order as `string_fragments`.
//
// The knowledge engine (if configured) is run once over all fragments; every
// other annotator is run per fragment through AnnotateSingleInput. A
// fragment's own `datetime_options`, when set, override the reference time and
// timezone of `options` for that fragment only.
//
// Returns an INTERNAL status when the knowledge engine fails or produces a
// different number of annotation lists than there are fragments, and otherwise
// forwards the first non-OK status returned by AnnotateSingleInput.
StatusOr<std::vector<std::vector<AnnotatedSpan>>>
Annotator::AnnotateStructuredInput(
    const std::vector<InputFragment>& string_fragments,
    const AnnotationOptions& options) const {
  std::vector<std::vector<AnnotatedSpan>> annotation_candidates(
      string_fragments.size());

  std::vector<std::string> text_to_annotate;
  text_to_annotate.reserve(string_fragments.size());
  for (const auto& string_fragment : string_fragments) {
    text_to_annotate.push_back(string_fragment.text);
  }

  // KnowledgeEngine is special, because it supports annotation of multiple
  // fragments at once.
  if (knowledge_engine_ &&
      !knowledge_engine_
           ->ChunkMultipleSpans(text_to_annotate, options.annotation_usecase,
                                options.location_context,
                                &annotation_candidates)
           .ok()) {
    return Status(StatusCode::INTERNAL, "Couldn't run knowledge engine Chunk.");
  }
  // The annotator engines shouldn't change the number of annotation vectors.
  if (annotation_candidates.size() != text_to_annotate.size()) {
    TC3_LOG(ERROR) << "Received " << text_to_annotate.size()
                   << " texts to annotate but generated a different number of "
                      "lists of annotations:"
                   << annotation_candidates.size();
    return Status(StatusCode::INTERNAL,
                  "Number of annotation candidates differs from "
                  "number of texts to annotate.");
  }

  // Other annotators run on each fragment independently.
  // The index is unsigned to match the container's size type.
  for (std::size_t i = 0; i < text_to_annotate.size(); ++i) {
    AnnotationOptions annotation_options = options;
    if (string_fragments[i].datetime_options.has_value()) {
      // Only two fields are read, so bind by reference instead of copying.
      const DatetimeOptions& reference_datetime =
          string_fragments[i].datetime_options.value();
      annotation_options.reference_time_ms_utc =
          reference_datetime.reference_time_ms_utc;
      annotation_options.reference_timezone =
          reference_datetime.reference_timezone;
    }

    AddContactMetadataToKnowledgeClassificationResults(
        &annotation_candidates[i]);

    Status annotation_status = AnnotateSingleInput(
        text_to_annotate[i], annotation_options, &annotation_candidates[i]);
    if (!annotation_status.ok()) {
      return annotation_status;
    }
  }
  return annotation_candidates;
}

2261

2262

std::vector<AnnotatedSpan> Annotator::Annotate(

2263

const std::string& context, const AnnotationOptions& options) const {

2264

std::vector<InputFragment> string_fragments;

2265

string_fragments.push_back({.text = context});

2266

StatusOr<std::vector<std::vector<AnnotatedSpan>>> annotations =

2267

AnnotateStructuredInput(string_fragments, options);

2268

if (!annotations.ok()) {

2269

TC3_LOG(ERROR) << "Returned error when calling AnnotateStructuredInput: "

2270

<< annotations.status().error_message();

2271

return {};

2272

}

2273

return annotations.ValueOrDie()[0];

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2274

}

2275

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2276

// Derives the selection span from a regex match.
//
// If the pattern config declares no capturing groups, the span of capturing
// group 1 is returned. Otherwise the result is the smallest span covering all
// matched groups that are flagged with `extend_selection`; groups whose start
// or end is kInvalidIndex are ignored.
//
// Returns {kInvalidIndex, kInvalidIndex} if the matcher reports an error, or
// if no flagged group contributed a span.
CodepointSpan Annotator::ComputeSelectionBoundaries(
    const UniLib::RegexMatcher* match,
    const RegexModel_::Pattern* config) const {
  const CodepointSpan invalid_span = {kInvalidIndex, kInvalidIndex};
  const auto* groups = config->capturing_group();

  if (groups == nullptr) {
    // Use first capturing group to specify the selection.
    int status = UniLib::RegexMatcher::kNoError;
    const CodepointSpan first_group = {match->Start(1, &status),
                                       match->End(1, &status)};
    return status == UniLib::RegexMatcher::kNoError ? first_group
                                                    : invalid_span;
  }

  CodepointSpan selection = invalid_span;
  const int group_count = groups->size();
  for (int group = 0; group < group_count; ++group) {
    if (!groups->Get(group)->extend_selection()) {
      continue;
    }

    // Check match and adjust bounds.
    int status = UniLib::RegexMatcher::kNoError;
    const int begin = match->Start(group, &status);
    const int end = match->End(group, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    if (begin == kInvalidIndex || end == kInvalidIndex) {
      continue;
    }

    if (selection.first == kInvalidIndex) {
      // First contributing group: take its span as is.
      selection = {begin, end};
      continue;
    }
    selection.first = std::min(selection.first, begin);
    selection.second = std::max(selection.second, end);
  }
  return selection;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2317

// Returns true if the pattern carries entity data of any kind: fixed data on
// the pattern itself (serialized or table form), or, on any of its capturing
// groups, an entity field path or fixed entity data.
bool Annotator::HasEntityData(const RegexModel_::Pattern* pattern) const {
  const bool pattern_has_fixed_data =
      pattern->serialized_entity_data() != nullptr ||
      pattern->entity_data() != nullptr;
  if (pattern_has_fixed_data) {
    return true;
  }

  if (pattern->capturing_group() == nullptr) {
    return false;
  }

  for (const CapturingGroup* group : *pattern->capturing_group()) {
    const bool group_has_data = group->entity_field_path() != nullptr ||
                                group->serialized_entity_data() != nullptr ||
                                group->entity_data() != nullptr;
    if (group_has_data) {
      return true;
    }
  }
  return false;
}

bool Annotator::SerializedEntityDataFromRegexMatch(

2337

const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,

2338

std::string* serialized_entity_data) const {

2339

if (!HasEntityData(pattern)) {

2340

serialized_entity_data->clear();

2341

return true;

2342

}

2343

TC3_CHECK(entity_data_builder_ != nullptr);

2344

2345

std::unique_ptr<ReflectiveFlatbuffer> entity_data =

2346

entity_data_builder_->NewRoot();

2347

2348

TC3_CHECK(entity_data != nullptr);

2349

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2350

// Set fixed entity data.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2351

if (pattern->serialized_entity_data() != nullptr) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2352

entity_data->MergeFromSerializedFlatbuffer(

2353

StringPiece(pattern->serialized_entity_data()->c_str(),

2354

pattern->serialized_entity_data()->size()));

2355

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2356

if (pattern->entity_data() != nullptr) {

2357

entity_data->MergeFrom(

2358

reinterpret_cast<const flatbuffers::Table*>(pattern->entity_data()));

2359

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2360

2361

// Add entity data from rule capturing groups.

2362

if (pattern->capturing_group() != nullptr) {

2363

const int num_groups = pattern->capturing_group()->size();

2364

for (int i = 0; i < num_groups; i++) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2365

const CapturingGroup* group = pattern->capturing_group()->Get(i);

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2366

2367

// Check whether the group matched.

2368

Optional<std::string> group_match_text =

2369

GetCapturingGroupText(matcher, /*group_id=*/i);

2370

if (!group_match_text.has_value()) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2371

continue;

2372

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2373

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2374

// Set fixed entity data from capturing group match.

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2375

if (group->serialized_entity_data() != nullptr) {

2376

entity_data->MergeFromSerializedFlatbuffer(

2377

StringPiece(group->serialized_entity_data()->c_str(),

2378

group->serialized_entity_data()->size()));

2379

}

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2380

if (group->entity_data() != nullptr) {

2381

entity_data->MergeFrom(reinterpret_cast<const flatbuffers::Table*>(

2382

pattern->entity_data()));

2383

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2384

2385

// Set entity field from capturing group text.

2386

if (group->entity_field_path() != nullptr) {

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

2387

UnicodeText normalized_group_match_text =

2388

UTF8ToUnicodeText(group_match_text.value(), /*do_copy=*/false);

2389

2390

// Apply normalization if specified.

2391

if (group->normalization_options() != nullptr) {

2392

normalized_group_match_text =

2393

NormalizeText(unilib_, group->normalization_options(),

2394

normalized_group_match_text);

2395

}

2396

2397

if (!entity_data->ParseAndSet(

2398

group->entity_field_path(),

2399

normalized_group_match_text.ToUTF8String())) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2400

TC3_LOG(ERROR)

2401

<< "Could not set entity data from rule capturing group.";

2402

return false;

2403

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

}

}

}

*serialized_entity_data = entity_data->Serialize();

return true;

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2412

// Returns a copy of the leading part of `amount` with all separator
// codepoints removed.
//
// Args:
//   decimal_separators: set of codepoints to drop from the output.
//   amount: the amount text to filter.
//   it_decimal_separator: iterator into `amount` at which copying stops
//     (exclusive); pass amount.end() to process the whole text.
UnicodeText RemoveMoneySeparators(
    const std::unordered_set<char32>& decimal_separators,
    const UnicodeText& amount,
    UnicodeText::const_iterator it_decimal_separator) {
  UnicodeText whole_amount;
  for (auto it = amount.begin();
       it != amount.end() && it != it_decimal_separator; ++it) {
    // Use the set's own lookup rather than a linear std::find over it.
    if (decimal_separators.find(static_cast<char32>(*it)) ==
        decimal_separators.end()) {
      whole_amount.push_back(*it);
    }
  }
  return whole_amount;
}

bool Annotator::ParseAndFillInMoneyAmount(

2428

std::string* serialized_entity_data) const {

2429

std::unique_ptr<EntityDataT> data =

2430

LoadAndVerifyMutableFlatbuffer<libtextclassifier3::EntityData>(

2431

*serialized_entity_data);

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2432

if (data == nullptr) {

2433

TC3_LOG(ERROR)

2434

<< "Data field is null when trying to parse Money Entity Data";

2435

return false;

2436

}

2437

if (data->money->unnormalized_amount.empty()) {

2438

TC3_LOG(ERROR) << "Data unnormalized_amount is empty when trying to parse "

2439

"Money Entity Data";

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

return false;

}

UnicodeText amount =

UTF8ToUnicodeText(data->money->unnormalized_amount, /*do_copy=*/false);

2445

int separator_back_index = 0;

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2446

auto it_decimal_separator = --amount.end();

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2447

for (; it_decimal_separator != amount.begin();

2448

--it_decimal_separator, ++separator_back_index) {

2449

if (std::find(money_separators_.begin(), money_separators_.end(),

2450

static_cast<char32>(*it_decimal_separator)) !=

2451

money_separators_.end()) {

break;

}

}

// If there are 3 digits after the last separator, we consider that a

2457

// thousands separator => the number is an int (e.g. 1.234 is considered int).

2458

// If there is no separator in number, also that number is an int.

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2459

if (separator_back_index == 3 || it_decimal_separator == amount.begin()) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2460

it_decimal_separator = amount.end();

2461

}

2462

2463

if (!unilib_->ParseInt32(RemoveMoneySeparators(money_separators_, amount,

2464

it_decimal_separator),

2465

&data->money->amount_whole_part)) {

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2466

TC3_LOG(ERROR) << "Could not parse the money whole part as int32 from the "

2467

"amount: "

2468

<< data->money->unnormalized_amount;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2469

return false;

2470

}

2471

if (it_decimal_separator == amount.end()) {

2472

data->money->amount_decimal_part = 0;

2473

} else {

2474

const int amount_codepoints_size = amount.size_codepoints();

2475

if (!unilib_->ParseInt32(

2476

UnicodeText::Substring(

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2477

amount, amount_codepoints_size - separator_back_index,

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2478

amount_codepoints_size, /*do_copy=*/false),

2479

&data->money->amount_decimal_part)) {

Tony Mak

2020-03-17 16:30:19 +0000

[diff] [blame]

2480

TC3_LOG(ERROR) << "Could not parse the money decimal part as int32 from "

2481

"the amount: "

2482

<< data->money->unnormalized_amount;

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

return false;

}

}

*serialized_entity_data =

2488

PackFlatbuffer<libtextclassifier3::EntityData>(data.get());

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2492

bool Annotator::RegexChunk(const UnicodeText& context_unicode,

2493

const std::vector<int>& rules,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2494

std::vector<AnnotatedSpan>* result,

2495

bool is_serialized_entity_data_enabled) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2496

for (int pattern_id : rules) {

2497

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

2498

const auto matcher = regex_pattern.pattern->Matcher(context_unicode);

2499

if (!matcher) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2500

TC3_LOG(ERROR) << "Could not get regex matcher for pattern: "

2501

<< pattern_id;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

int status = UniLib::RegexMatcher::kNoError;

2506

while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2507

if (regex_pattern.config->verification_options()) {

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2508

if (!VerifyRegexMatchCandidate(

2509

context_unicode.ToUTF8String(),

2510

regex_pattern.config->verification_options(),

2511

matcher->Group(1, &status).ToUTF8String(), matcher.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2512

continue;

2513

}

2514

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2515

2516

std::string serialized_entity_data;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2517

if (is_serialized_entity_data_enabled) {

2518

if (!SerializedEntityDataFromRegexMatch(

2519

regex_pattern.config, matcher.get(), &serialized_entity_data)) {

2520

TC3_LOG(ERROR) << "Could not get entity data.";

2521

return false;

2522

}

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2523

2524

// Further parsing unnormalized_amount for money into amount_whole_part

2525

// and amount_decimal_part. Can't do this with regexes because we cannot

2526

// have empty groups (amount_decimal_part might be an empty group).

2527

if (regex_pattern.config->collection_name()->str() ==

2528

Collections::Money()) {

2529

if (!ParseAndFillInMoneyAmount(&serialized_entity_data)) {

2530

TC3_LOG(ERROR) << "Could not parse and fill in money amount.";

2531

}

2532

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2533

}

2534

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2535

result->emplace_back();

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2536

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2537

// Selection/annotation regular expressions need to specify a capturing

2538

// group specifying the selection.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2539

result->back().span =

2540

ComputeSelectionBoundaries(matcher.get(), regex_pattern.config);

2541

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2542

result->back().classification = {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2543

{regex_pattern.config->collection_name()->str(),

2544

regex_pattern.config->target_classification_score(),

2545

regex_pattern.config->priority_score()}};

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2546

2547

result->back().classification[0].serialized_entity_data =

2548

serialized_entity_data;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2554

bool Annotator::ModelChunk(int num_tokens, const TokenSpan& span_of_interest,

2555

tflite::Interpreter* selection_interpreter,

2556

const CachedFeatures& cached_features,

2557

std::vector<TokenSpan>* chunks) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2558

const int max_selection_span =

2559

selection_feature_processor_->GetOptions()->max_selection_span();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2560

// The inference span is the span of interest expanded to include

2561

// max_selection_span tokens on either side, which is how far a selection can

2562

// stretch from the click.

2563

const TokenSpan inference_span = IntersectTokenSpans(

2564

ExpandTokenSpan(span_of_interest,

2565

/*num_tokens_left=*/max_selection_span,

2566

/*num_tokens_right=*/max_selection_span),

2567

{0, num_tokens});

2568

2569

std::vector<ScoredChunk> scored_chunks;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2570

if (selection_feature_processor_->GetOptions()->bounds_sensitive_features() &&

2571

selection_feature_processor_->GetOptions()

2572

->bounds_sensitive_features()

2573

->enabled()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2574

if (!ModelBoundsSensitiveScoreChunks(

2575

num_tokens, span_of_interest, inference_span, cached_features,

2576

selection_interpreter, &scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

} else {

if (!ModelClickContextScoreChunks(num_tokens, span_of_interest,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2581

cached_features, selection_interpreter,

2582

&scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2583

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2584

}

2585

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2586

std::sort(scored_chunks.rbegin(), scored_chunks.rend(),

2587

[](const ScoredChunk& lhs, const ScoredChunk& rhs) {

2588

return lhs.score < rhs.score;

2589

});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2590

2591

// Traverse the candidate chunks from highest-scoring to lowest-scoring. Pick

2592

// them greedily as long as they do not overlap with any previously picked

2593

// chunks.

2594

std::vector<bool> token_used(TokenSpanSize(inference_span));

2595

chunks->clear();

2596

for (const ScoredChunk& scored_chunk : scored_chunks) {

2597

bool feasible = true;

2598

for (int i = scored_chunk.token_span.first;

2599

i < scored_chunk.token_span.second; ++i) {

2600

if (token_used[i - inference_span.first]) {

feasible = false;

break;

}

}

if (!feasible) {

continue;

}

for (int i = scored_chunk.token_span.first;

2611

i < scored_chunk.token_span.second; ++i) {

2612

token_used[i - inference_span.first] = true;

2613

}

2614

2615

chunks->push_back(scored_chunk.token_span);

2616

}

2617

2618

std::sort(chunks->begin(), chunks->end());

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2623

namespace {
// Updates the value at the given key in the map to the maximum of the current
// value and the given value, or simply inserts the value if the key is not yet
// there.
//
// Args:
//   map: associative container with unique keys (e.g. std::map); must not be
//     null.
//   key: key whose value is updated or inserted.
//   value: candidate value for that key.
template <typename Map>
void UpdateMax(Map* map, typename Map::key_type key,
               typename Map::mapped_type value) {
  // insert() performs a single lookup: it leaves an existing entry untouched
  // and reports through `.second` whether a new entry was created.
  const auto insertion = map->insert(typename Map::value_type(key, value));
  if (!insertion.second) {
    auto& current = insertion.first->second;
    current = std::max(current, value);
  }
}
}  // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2638

bool Annotator::ModelClickContextScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2639

int num_tokens, const TokenSpan& span_of_interest,

2640

const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2641

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2642

std::vector<ScoredChunk>* scored_chunks) const {

2643

const int max_batch_size = model_->selection_options()->batch_size();

2644

2645

std::vector<float> all_features;

2646

std::map<TokenSpan, float> chunk_scores;

2647

for (int batch_start = span_of_interest.first;

2648

batch_start < span_of_interest.second; batch_start += max_batch_size) {

2649

const int batch_end =

2650

std::min(batch_start + max_batch_size, span_of_interest.second);

2651

2652

// Prepare features for the whole batch.

2653

all_features.clear();

2654

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2655

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2656

cached_features.AppendClickContextFeaturesForClick(click_pos,

&all_features);

}

// Run batched inference.

2661

const int batch_size = batch_end - batch_start;

2662

const int features_size = cached_features.OutputFeaturesSize();

2663

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2664

TensorView<float>(all_features.data(), {batch_size, features_size}),

2665

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2666

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2667

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2668

return false;

2669

}

2670

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2671

logits.dim(1) !=

2672

selection_feature_processor_->GetSelectionLabelCount()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2673

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2679

const std::vector<float> scores = ComputeSoftmax(

2680

logits.data() + logits.dim(1) * (click_pos - batch_start),

2681

logits.dim(1));

2682

for (int j = 0;

2683

j < selection_feature_processor_->GetSelectionLabelCount(); ++j) {

2684

TokenSpan relative_token_span;

2685

if (!selection_feature_processor_->LabelToTokenSpan(

2686

j, &relative_token_span)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2687

TC3_LOG(ERROR) << "Couldn't map the label to a token span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2688

return false;

2689

}

2690

const TokenSpan candidate_span = ExpandTokenSpan(

2691

SingleTokenSpan(click_pos), relative_token_span.first,

2692

relative_token_span.second);

2693

if (candidate_span.first >= 0 && candidate_span.second <= num_tokens) {

2694

UpdateMax(&chunk_scores, candidate_span, scores[j]);

}

}

}

}

scored_chunks->clear();

2701

scored_chunks->reserve(chunk_scores.size());

2702

for (const auto& entry : chunk_scores) {

2703

scored_chunks->push_back(ScoredChunk{entry.first, entry.second});

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2709

bool Annotator::ModelBoundsSensitiveScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2710

int num_tokens, const TokenSpan& span_of_interest,

2711

const TokenSpan& inference_span, const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2712

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2713

std::vector<ScoredChunk>* scored_chunks) const {

2714

const int max_selection_span =

2715

selection_feature_processor_->GetOptions()->max_selection_span();

2716

const int max_chunk_length = selection_feature_processor_->GetOptions()

2717

->selection_reduced_output_space()

2718

? max_selection_span + 1

2719

: 2 * max_selection_span + 1;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2720

const bool score_single_token_spans_as_zero =

2721

selection_feature_processor_->GetOptions()

2722

->bounds_sensitive_features()

2723

->score_single_token_spans_as_zero();

2724

2725

scored_chunks->clear();

2726

if (score_single_token_spans_as_zero) {

2727

scored_chunks->reserve(TokenSpanSize(span_of_interest));

2728

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2729

2730

// Prepare all chunk candidates into one batch:

2731

// - Are contained in the inference span

2732

// - Have a non-empty intersection with the span of interest

2733

// - Are at least one token long

2734

// - Are not longer than the maximum chunk length

2735

std::vector<TokenSpan> candidate_spans;

2736

for (int start = inference_span.first; start < span_of_interest.second;

2737

++start) {

2738

const int leftmost_end_index = std::max(start, span_of_interest.first) + 1;

2739

for (int end = leftmost_end_index;

2740

end <= inference_span.second && end - start <= max_chunk_length;

2741

++end) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2742

const TokenSpan candidate_span = {start, end};

2743

if (score_single_token_spans_as_zero &&

2744

TokenSpanSize(candidate_span) == 1) {

2745

// Do not include the single token span in the batch, add a zero score

2746

// for it directly to the output.

2747

scored_chunks->push_back(ScoredChunk{candidate_span, 0.0f});

2748

} else {

2749

candidate_spans.push_back(candidate_span);

2750

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

const int max_batch_size = model_->selection_options()->batch_size();

2755

2756

std::vector<float> all_features;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2757

scored_chunks->reserve(scored_chunks->size() + candidate_spans.size());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2758

for (int batch_start = 0; batch_start < candidate_spans.size();

2759

batch_start += max_batch_size) {

2760

const int batch_end = std::min(batch_start + max_batch_size,

2761

static_cast<int>(candidate_spans.size()));

2762

2763

// Prepare features for the whole batch.

2764

all_features.clear();

2765

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2766

for (int i = batch_start; i < batch_end; ++i) {

2767

cached_features.AppendBoundsSensitiveFeaturesForSpan(candidate_spans[i],

&all_features);

}

// Run batched inference.

2772

const int batch_size = batch_end - batch_start;

2773

const int features_size = cached_features.OutputFeaturesSize();

2774

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2775

TensorView<float>(all_features.data(), {batch_size, features_size}),

2776

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2777

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2778

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2779

return false;

2780

}

2781

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2782

logits.dim(1) != 1) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2783

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int i = batch_start; i < batch_end; ++i) {

2789

scored_chunks->push_back(

2790

ScoredChunk{candidate_spans[i], logits.data()[i - batch_start]});

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2797

bool Annotator::DatetimeChunk(const UnicodeText& context_unicode,

2798

int64 reference_time_ms_utc,

2799

const std::string& reference_timezone,

2800

const std::string& locales, ModeFlag mode,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2801

AnnotationUsecase annotation_usecase,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2802

bool is_serialized_entity_data_enabled,

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2803

std::vector<AnnotatedSpan>* result) const {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2804

std::vector<DatetimeParseResultSpan> datetime_spans;

2805

if (cfg_datetime_parser_) {

2806

if (!(model_->grammar_datetime_model()->enabled_modes() & mode)) {

2807

return true;

2808

}

2809

std::vector<Locale> parsed_locales;

2810

ParseLocales(locales, &parsed_locales);

Tony Mak

2020-03-12 18:29:35 +0000

[diff] [blame]

2811

cfg_datetime_parser_->Parse(

2812

context_unicode.ToUTF8String(),

2813

ToDateAnnotationOptions(

2814

model_->grammar_datetime_model()->annotation_options(),

2815

reference_timezone, reference_time_ms_utc),

2816

parsed_locales, &datetime_spans);

Tony Mak

2020-03-19 21:52:02 +0000

[diff] [blame]

2817

}

2818

2819

if (datetime_parser_) {

Tony Mak

2020-02-07 18:31:16 +0000

[diff] [blame]

2820

if (!datetime_parser_->Parse(context_unicode, reference_time_ms_utc,

2821

reference_timezone, locales, mode,

2822

annotation_usecase,

2823

/*anchor_start_end=*/false, &datetime_spans)) {

2824

return false;

2825

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2826

}

2827

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2828

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2829

AnnotatedSpan annotated_span;

2830

annotated_span.span = datetime_span.span;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2831

for (const DatetimeParseResult& parse_result : datetime_span.data) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2832

annotated_span.classification.emplace_back(

2833

PickCollectionForDatetime(parse_result),

2834

datetime_span.target_classification_score,

2835

datetime_span.priority_score);

2836

annotated_span.classification.back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2837

if (is_serialized_entity_data_enabled) {

2838

annotated_span.classification.back().serialized_entity_data =

2839

CreateDatetimeSerializedEntityData(parse_result);

2840

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2841

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

2842

annotated_span.source = AnnotatedSpan::Source::DATETIME;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2843

result->push_back(std::move(annotated_span));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

return true;

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2848

// Returns the model pointer held by this annotator.
const Model* Annotator::model() const {
  return model_;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2849

// Returns the entity data schema pointer held by this annotator.
const reflection::Schema* Annotator::entity_data_schema() const {
  const reflection::Schema* schema = entity_data_schema_;
  return schema;
}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2852

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2853

// Returns the result of LoadAndVerifyModel for the given buffer of `size`
// bytes, or nullptr if `buffer` itself is null.
const Model* ViewModel(const void* buffer, int size) {
  return buffer == nullptr ? nullptr : LoadAndVerifyModel(buffer, size);
}

2860

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2861

bool Annotator::LookUpKnowledgeEntity(

2862

const std::string& id, std::string* serialized_knowledge_result) const {

2863

return knowledge_engine_ &&

2864

knowledge_engine_->LookUpEntity(id, serialized_knowledge_result);

2865

}

2866

Tony Mak