Blame - native/annotator/annotator.cc - platform/external/libtextclassifier

2018-01-24 11:11:20 +0100

[diff] [blame]

1

/*

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

3

*

4

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

17

#include "annotator/annotator.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

#include <algorithm>

#include <cctype>

#include <cmath>

#include <iterator>

#include <numeric>

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

24

#include <unordered_map>

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

25

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

26

#include "annotator/collections.h"

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

27

#include "annotator/model_generated.h"

28

#include "annotator/types.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

29

#include "utils/base/logging.h"

30

#include "utils/checksum.h"

31

#include "utils/math/softmax.h"

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

32

#include "utils/normalization.h"

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

33

#include "utils/optional.h"

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

34

#include "utils/regex-match.h"

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

35

#include "utils/utf8/unicodetext.h"

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

36

#include "utils/zlib/zlib_regex.h"

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

37

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

38

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

39

namespace libtextclassifier3 {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

40

41

using SortedIntSet = std::set<int, std::function<bool(int, int)>>;

42

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

43

// Names of the built-in collections. Each constant is a reference bound to a
// heap-allocated std::string; no matching delete appears in this part of the
// file.
const std::string& Annotator::kPhoneCollection = *new std::string("phone");
const std::string& Annotator::kAddressCollection = *new std::string("address");
const std::string& Annotator::kDateCollection = *new std::string("date");
const std::string& Annotator::kUrlCollection = *new std::string("url");
const std::string& Annotator::kEmailCollection = *new std::string("email");

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

53

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

54

namespace {

55

// Runs the flatbuffer verifier over the `size` bytes starting at `addr`.
// Returns the model root when verification succeeds and nullptr otherwise.
const Model* LoadAndVerifyModel(const void* addr, int size) {
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr), size);
  return VerifyModelBuffer(verifier) ? GetModel(addr) : nullptr;
}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

63

64

// If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will

65

// create a new instance, assign ownership to owned_lib, and return it.

66

const UniLib* MaybeCreateUnilib(const UniLib* lib,

67

std::unique_ptr<UniLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new UniLib);

72

return owned_lib->get();

}

}

// As above, but for CalendarLib.

77

const CalendarLib* MaybeCreateCalendarlib(

78

const CalendarLib* lib, std::unique_ptr<CalendarLib>* owned_lib) {

if (lib) {

return lib;

} else {

owned_lib->reset(new CalendarLib);

83

return owned_lib->get();

}

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame^]

87

// Returns whether the provided input is valid:

88

// * Valid utf8 text.

89

// * Sane span indices.

90

bool IsValidSpanInput(const UnicodeText& context, const CodepointSpan span) {

91

if (!context.is_valid()) {

92

return false;

93

}

94

return (span.first >= 0 && span.first < span.second &&

95

span.second <= context.size_codepoints());

96

}

97

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

98

} // namespace

99

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

100

// Returns the selection interpreter, creating it from the selection executor
// on first use. If creation fails an error is logged and nullptr is returned;
// since nothing is cached in that case, a later call tries again.
tflite::Interpreter* InterpreterManager::SelectionInterpreter() {
  if (selection_interpreter_) {
    return selection_interpreter_.get();
  }

  TC3_CHECK(selection_executor_);
  selection_interpreter_ = selection_executor_->CreateInterpreter();
  if (!selection_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return selection_interpreter_.get();
}

110

111

// Returns the classification interpreter, creating it from the classification
// executor on first use. If creation fails an error is logged and nullptr is
// returned; since nothing is cached in that case, a later call tries again.
tflite::Interpreter* InterpreterManager::ClassificationInterpreter() {
  if (classification_interpreter_) {
    return classification_interpreter_.get();
  }

  TC3_CHECK(classification_executor_);
  classification_interpreter_ = classification_executor_->CreateInterpreter();
  if (!classification_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return classification_interpreter_.get();
}

121

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

122

std::unique_ptr<Annotator> Annotator::FromUnownedBuffer(

123

const char* buffer, int size, const UniLib* unilib,

124

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

125

const Model* model = LoadAndVerifyModel(buffer, size);

126

if (model == nullptr) {

return nullptr;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

130

auto classifier =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

131

std::unique_ptr<Annotator>(new Annotator(model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

132

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-11-20 20:39:04 +0000

[diff] [blame]

139

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

140

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

141

std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib,

142

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

143

if (!(*mmap)->handle().ok()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

144

TC3_VLOG(1) << "Mmap failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

149

(*mmap)->handle().num_bytes());

150

if (!model) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

151

TC3_LOG(ERROR) << "Model verification failed.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return nullptr;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

155

auto classifier = std::unique_ptr<Annotator>(

156

new Annotator(mmap, model, unilib, calendarlib));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

157

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

164

std::unique_ptr<Annotator> Annotator::FromScopedMmap(

165

std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,

166

std::unique_ptr<CalendarLib> calendarlib) {

167

if (!(*mmap)->handle().ok()) {

168

TC3_VLOG(1) << "Mmap failed.";

return nullptr;

}

const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),

173

(*mmap)->handle().num_bytes());

174

if (model == nullptr) {

175

TC3_LOG(ERROR) << "Model verification failed.";

return nullptr;

}

auto classifier = std::unique_ptr<Annotator>(

180

new Annotator(mmap, model, std::move(unilib), std::move(calendarlib)));

181

if (!classifier->IsInitialized()) {

return nullptr;

}

return classifier;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

188

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

189

int fd, int offset, int size, const UniLib* unilib,

190

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

191

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

192

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

193

}

194

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

195

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

196

int fd, int offset, int size, std::unique_ptr<UniLib> unilib,

197

std::unique_ptr<CalendarLib> calendarlib) {

198

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));

199

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

200

}

201

202

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

203

int fd, const UniLib* unilib, const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

204

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

205

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

206

}

207

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

208

std::unique_ptr<Annotator> Annotator::FromFileDescriptor(

209

int fd, std::unique_ptr<UniLib> unilib,

210

std::unique_ptr<CalendarLib> calendarlib) {

211

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));

212

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

213

}

214

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

215

std::unique_ptr<Annotator> Annotator::FromPath(const std::string& path,

216

const UniLib* unilib,

217

const CalendarLib* calendarlib) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

218

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

219

return FromScopedMmap(&mmap, unilib, calendarlib);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

220

}

221

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

222

std::unique_ptr<Annotator> Annotator::FromPath(

223

const std::string& path, std::unique_ptr<UniLib> unilib,

224

std::unique_ptr<CalendarLib> calendarlib) {

225

std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));

226

return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));

227

}

228

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

229

// Constructs an Annotator that takes over the mapping in `*mmap` (the caller's
// unique_ptr is moved from). `unilib` / `calendarlib` are used as given when
// non-null; when null, a new instance is created and owned by this object.
// Finishes by running ValidateAndInitialize().
//
// NOTE(review): the unilib_/calendarlib_ initializers write through
// &owned_unilib_ / &owned_calendarlib_, so this presumably relies on the
// owned_* members being declared before their raw-pointer counterparts in the
// class -- confirm against the header.
Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,
                     const UniLib* unilib, const CalendarLib* calendarlib)
    : model_(model),
      mmap_(std::move(*mmap)),
      owned_unilib_(nullptr),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(nullptr),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  ValidateAndInitialize();
}

239

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

240

// Constructs an Annotator that takes over the mapping in `*mmap` as well as
// ownership of `unilib` and `calendarlib`. The raw unilib_ / calendarlib_
// pointers are taken from the owned instances, so they are null if a null
// unique_ptr was passed in. Finishes by running ValidateAndInitialize().
//
// NOTE(review): unilib_ / calendarlib_ read owned_unilib_ /
// owned_calendarlib_ in their initializers, so this presumably relies on the
// owned_* members being declared first in the class -- confirm against the
// header.
Annotator::Annotator(std::unique_ptr<ScopedMmap>* mmap, const Model* model,
                     std::unique_ptr<UniLib> unilib,
                     std::unique_ptr<CalendarLib> calendarlib)
    : model_(model),
      mmap_(std::move(*mmap)),
      owned_unilib_(std::move(unilib)),
      unilib_(owned_unilib_.get()),
      owned_calendarlib_(std::move(calendarlib)),
      calendarlib_(owned_calendarlib_.get()) {
  ValidateAndInitialize();
}

251

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

252

// Constructs an Annotator around an already-verified `model` without taking
// any mmap. `unilib` / `calendarlib` are used as given when non-null; when
// null, a new instance is created and owned by this object. Finishes by
// running ValidateAndInitialize().
//
// NOTE(review): the unilib_/calendarlib_ initializers write through
// &owned_unilib_ / &owned_calendarlib_, so this presumably relies on the
// owned_* members being declared before their raw-pointer counterparts in the
// class -- confirm against the header.
Annotator::Annotator(const Model* model, const UniLib* unilib,
                     const CalendarLib* calendarlib)
    : model_(model),
      owned_unilib_(nullptr),
      unilib_(MaybeCreateUnilib(unilib, &owned_unilib_)),
      owned_calendarlib_(nullptr),
      calendarlib_(MaybeCreateCalendarlib(calendarlib, &owned_calendarlib_)) {
  ValidateAndInitialize();
}

261

262

void Annotator::ValidateAndInitialize() {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

263

initialized_ = false;

264

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

265

if (model_ == nullptr) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

266

TC3_LOG(ERROR) << "No model specified.";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

270

const bool model_enabled_for_annotation =

271

(model_->triggering_options() != nullptr &&

272

(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION));

273

const bool model_enabled_for_classification =

274

(model_->triggering_options() != nullptr &&

275

(model_->triggering_options()->enabled_modes() &

276

ModeFlag_CLASSIFICATION));

277

const bool model_enabled_for_selection =

278

(model_->triggering_options() != nullptr &&

279

(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION));

280

281

// Annotation requires the selection model.

282

if (model_enabled_for_annotation || model_enabled_for_selection) {

283

if (!model_->selection_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

284

TC3_LOG(ERROR) << "No selection options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

285

return;

286

}

287

if (!model_->selection_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

288

TC3_LOG(ERROR) << "No selection feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

289

return;

290

}

291

if (!model_->selection_feature_options()->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

292

TC3_LOG(ERROR) << "No selection bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

293

return;

294

}

295

if (!model_->selection_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

296

TC3_LOG(ERROR) << "No selection model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

297

return;

298

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

299

selection_executor_ = ModelExecutor::FromBuffer(model_->selection_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

300

if (!selection_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

301

TC3_LOG(ERROR) << "Could not initialize selection executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

302

return;

303

}

304

selection_feature_processor_.reset(

305

new FeatureProcessor(model_->selection_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

306

}

307

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

308

// Annotation requires the classification model for conflict resolution and

309

// scoring.

310

// Selection requires the classification model for conflict resolution.

311

if (model_enabled_for_annotation || model_enabled_for_classification ||

312

model_enabled_for_selection) {

313

if (!model_->classification_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

314

TC3_LOG(ERROR) << "No classification options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

319

TC3_LOG(ERROR) << "No classification feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

if (!model_->classification_feature_options()

324

->bounds_sensitive_features()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

325

TC3_LOG(ERROR) << "No classification bounds sensitive feature options.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

326

return;

327

}

328

if (!model_->classification_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

329

TC3_LOG(ERROR) << "No clf model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

333

classification_executor_ =

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

334

ModelExecutor::FromBuffer(model_->classification_model());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

335

if (!classification_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

336

TC3_LOG(ERROR) << "Could not initialize classification executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return;

}

classification_feature_processor_.reset(new FeatureProcessor(

341

model_->classification_feature_options(), unilib_));

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

342

}

343

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

344

// The embeddings need to be specified if the model is to be used for

345

// classification or selection.

346

if (model_enabled_for_annotation || model_enabled_for_classification ||

347

model_enabled_for_selection) {

348

if (!model_->embedding_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

349

TC3_LOG(ERROR) << "No embedding model.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

350

return;

351

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

352

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

353

// Check that the embedding size of the selection and classification model

354

// matches, as they are using the same embeddings.

355

if (model_enabled_for_selection &&

356

(model_->selection_feature_options()->embedding_size() !=

357

model_->classification_feature_options()->embedding_size() ||

358

model_->selection_feature_options()->embedding_quantization_bits() !=

359

model_->classification_feature_options()

360

->embedding_quantization_bits())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

361

TC3_LOG(ERROR) << "Mismatching embedding size/quantization.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

362

return;

363

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

364

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

365

embedding_executor_ = TFLiteEmbeddingExecutor::FromBuffer(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

366

model_->embedding_model(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

367

model_->classification_feature_options()->embedding_size(),

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

368

model_->classification_feature_options()->embedding_quantization_bits(),

369

model_->embedding_pruning_mask());

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

370

if (!embedding_executor_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

371

TC3_LOG(ERROR) << "Could not initialize embedding executor.";

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

372

return;

373

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

374

}

375

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

376

std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

377

if (model_->regex_model()) {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

378

if (!InitializeRegexModel(decompressor.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

379

TC3_LOG(ERROR) << "Could not initialize regex model.";

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

380

return;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

381

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

382

}

383

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

384

if (model_->datetime_model()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

385

datetime_parser_ = DatetimeParser::Instance(

386

model_->datetime_model(), *unilib_, *calendarlib_, decompressor.get());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

387

if (!datetime_parser_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

388

TC3_LOG(ERROR) << "Could not initialize datetime parser.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return;

}

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

393

if (model_->output_options()) {

394

if (model_->output_options()->filtered_collections_annotation()) {

395

for (const auto collection :

396

*model_->output_options()->filtered_collections_annotation()) {

397

filtered_collections_annotation_.insert(collection->str());

398

}

399

}

400

if (model_->output_options()->filtered_collections_classification()) {

401

for (const auto collection :

402

*model_->output_options()->filtered_collections_classification()) {

403

filtered_collections_classification_.insert(collection->str());

404

}

405

}

406

if (model_->output_options()->filtered_collections_selection()) {

407

for (const auto collection :

408

*model_->output_options()->filtered_collections_selection()) {

409

filtered_collections_selection_.insert(collection->str());

}

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

414

if (model_->number_annotator_options() &&

415

model_->number_annotator_options()->enabled()) {

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

416

if (selection_feature_processor_ == nullptr) {

417

TC3_LOG(ERROR)

418

<< "Could not initialize NumberAnnotator without a feature processor";

return;

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

422

number_annotator_.reset(

423

new NumberAnnotator(model_->number_annotator_options(),

424

selection_feature_processor_.get()));

425

}

426

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

427

if (model_->duration_annotator_options() &&

428

model_->duration_annotator_options()->enabled()) {

429

duration_annotator_.reset(

430

new DurationAnnotator(model_->duration_annotator_options(),

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

431

selection_feature_processor_.get(), unilib_));

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

432

}

433

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

434

if (model_->entity_data_schema()) {

435

entity_data_schema_ = LoadAndVerifyFlatbuffer<reflection::Schema>(

436

model_->entity_data_schema()->Data(),

437

model_->entity_data_schema()->size());

438

if (entity_data_schema_ == nullptr) {

439

TC3_LOG(ERROR) << "Could not load entity data schema data.";

return;

}

entity_data_builder_.reset(

444

new ReflectiveFlatbufferBuilder(entity_data_schema_));

445

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

446

entity_data_schema_ = nullptr;

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

447

entity_data_builder_ = nullptr;

448

}

449

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

450

if (model_->triggering_locales() &&

451

!ParseLocales(model_->triggering_locales()->c_str(),

452

&model_triggering_locales_)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

453

TC3_LOG(ERROR) << "Could not parse model supported locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

458

model_->triggering_options()->locales() != nullptr &&

459

!ParseLocales(model_->triggering_options()->locales()->c_str(),

460

&ml_model_triggering_locales_)) {

461

TC3_LOG(ERROR) << "Could not parse supported ML model locales.";

return;

}

if (model_->triggering_options() != nullptr &&

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

466

model_->triggering_options()->dictionary_locales() != nullptr &&

467

!ParseLocales(model_->triggering_options()->dictionary_locales()->c_str(),

468

&dictionary_locales_)) {

469

TC3_LOG(ERROR) << "Could not parse dictionary supported locales.";

return;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

initialized_ = true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

476

// Compiles every pattern of the model's regex model and records, per mode,
// the indices of the patterns enabled for it. Returns true when there are no
// patterns or all of them compiled; returns false as soon as one pattern
// fails to load. Patterns handled before the failure stay registered.
bool Annotator::InitializeRegexModel(ZlibDecompressor* decompressor) {
  const auto* regex_model = model_->regex_model();
  if (!regex_model->patterns()) {
    return true;
  }

  // Index of the current pattern within regex_patterns_.
  int pattern_index = 0;
  for (const auto& pattern_spec : *regex_model->patterns()) {
    std::unique_ptr<UniLib::RegexPattern> compiled =
        UncompressMakeRegexPattern(
            *unilib_, pattern_spec->pattern(),
            pattern_spec->compressed_pattern(),
            regex_model->lazy_regex_compilation(), decompressor);
    if (!compiled) {
      TC3_LOG(INFO) << "Failed to load regex pattern";
      return false;
    }

    // A single pattern may be enabled for several modes at once.
    if (pattern_spec->enabled_modes() & ModeFlag_ANNOTATION) {
      annotation_regex_patterns_.push_back(pattern_index);
    }
    if (pattern_spec->enabled_modes() & ModeFlag_CLASSIFICATION) {
      classification_regex_patterns_.push_back(pattern_index);
    }
    if (pattern_spec->enabled_modes() & ModeFlag_SELECTION) {
      selection_regex_patterns_.push_back(pattern_index);
    }

    regex_patterns_.push_back({
        pattern_spec,
        std::move(compiled),
    });
    ++pattern_index;
  }

  return true;
}

512

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

513

bool Annotator::InitializeKnowledgeEngine(

514

const std::string& serialized_config) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

515

std::unique_ptr<KnowledgeEngine> knowledge_engine(new KnowledgeEngine());

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

516

if (!knowledge_engine->Initialize(serialized_config)) {

517

TC3_LOG(ERROR) << "Failed to initialize the knowledge engine.";

518

return false;

519

}

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

520

if (model_->triggering_options() != nullptr) {

521

knowledge_engine->SetPriorityScore(

522

model_->triggering_options()->knowledge_priority_score());

523

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

524

knowledge_engine_ = std::move(knowledge_engine);

return true;

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

528

bool Annotator::InitializeContactEngine(const std::string& serialized_config) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

529

std::unique_ptr<ContactEngine> contact_engine(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

530

new ContactEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

531

if (!contact_engine->Initialize(serialized_config)) {

532

TC3_LOG(ERROR) << "Failed to initialize the contact engine.";

533

return false;

534

}

535

contact_engine_ = std::move(contact_engine);

return true;

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

539

bool Annotator::InitializeInstalledAppEngine(

540

const std::string& serialized_config) {

541

std::unique_ptr<InstalledAppEngine> installed_app_engine(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

542

new InstalledAppEngine(selection_feature_processor_.get(), unilib_));

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

543

if (!installed_app_engine->Initialize(serialized_config)) {

544

TC3_LOG(ERROR) << "Failed to initialize the installed app engine.";

545

return false;

546

}

547

installed_app_engine_ = std::move(installed_app_engine);

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

551

namespace {

552

553

int CountDigits(const std::string& str, CodepointSpan selection_indices) {

554

int count = 0;

555

int i = 0;

556

const UnicodeText unicode_str = UTF8ToUnicodeText(str, /*do_copy=*/false);

557

for (auto it = unicode_str.begin(); it != unicode_str.end(); ++it, ++i) {

558

if (i >= selection_indices.first && i < selection_indices.second &&

isdigit(*it)) {

++count;

}

}

return count;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

566

} // namespace

567

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

568

namespace internal {

569

// Helper function, which if the initial 'span' contains only white-spaces,

570

// moves the selection to a single-codepoint selection on a left or right side

571

// of this space.

572

CodepointSpan SnapLeftIfWhitespaceSelection(CodepointSpan span,

573

const UnicodeText& context_unicode,

574

const UniLib& unilib) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

575

TC3_CHECK(ValidNonEmptySpan(span));

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

576

577

UnicodeText::const_iterator it;

578

579

// Check that the current selection is all whitespaces.

580

it = context_unicode.begin();

581

std::advance(it, span.first);

582

for (int i = 0; i < (span.second - span.first); ++i, ++it) {

583

if (!unilib.IsWhitespace(*it)) {

return span;

}

}

CodepointSpan result;

// Try moving left.

result = span;

it = context_unicode.begin();

593

std::advance(it, span.first);

594

while (it != context_unicode.begin() && unilib.IsWhitespace(*it)) {

--result.first;

--it;

}

result.second = result.first + 1;

599

if (!unilib.IsWhitespace(*it)) {

return result;

}

// If moving left didn't find a non-whitespace character, just return the

// original span.

return span;

}

} // namespace internal

608

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

609

bool Annotator::FilteredForAnnotation(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

610

return !span.classification.empty() &&

611

filtered_collections_annotation_.find(

612

span.classification[0].collection) !=

613

filtered_collections_annotation_.end();

614

}

615

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

616

bool Annotator::FilteredForClassification(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

617

const ClassificationResult& classification) const {

618

return filtered_collections_classification_.find(classification.collection) !=

619

filtered_collections_classification_.end();

620

}

621

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

622

bool Annotator::FilteredForSelection(const AnnotatedSpan& span) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

623

return !span.classification.empty() &&

624

filtered_collections_selection_.find(

625

span.classification[0].collection) !=

626

filtered_collections_selection_.end();

627

}

628

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

629

namespace {

630

inline bool ClassifiedAsOther(

631

const std::vector<ClassificationResult>& classification) {

632

return !classification.empty() &&

633

classification[0].collection == Collections::Other();

634

}

635

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

636

} // namespace

637

638

float Annotator::GetPriorityScore(

639

const std::vector<ClassificationResult>& classification) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

640

if (!classification.empty() && !ClassifiedAsOther(classification)) {

641

return classification[0].priority_score;

642

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

643

if (model_->triggering_options() != nullptr) {

644

return model_->triggering_options()->other_collection_priority_score();

645

} else {

646

return -1000.0;

647

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

648

}

649

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

650

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

651

bool Annotator::VerifyRegexMatchCandidate(

652

const std::string& context, const VerificationOptions* verification_options,

653

const std::string& match, const UniLib::RegexMatcher* matcher) const {

654

if (verification_options == nullptr) {

655

return true;

656

}

657

if (verification_options->verify_luhn_checksum() &&

658

!VerifyLuhnChecksum(match)) {

659

return false;

660

}

661

const int lua_verifier = verification_options->lua_verifier();

662

if (lua_verifier >= 0) {

663

if (model_->regex_model()->lua_verifier() == nullptr ||

664

lua_verifier >= model_->regex_model()->lua_verifier()->size()) {

665

TC3_LOG(ERROR) << "Invalid lua verifier specified: " << lua_verifier;

return false;

}

return VerifyMatch(

context, matcher,

model_->regex_model()->lua_verifier()->Get(lua_verifier)->str());

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

675

// Suggests a selection for a click/selection at 'click_indices' in 'context'.
//
// Candidates are collected from the selection model, the selection regex
// patterns, the datetime chunker and the optional knowledge, contact,
// installed-app, number and duration engines. Conflicts between overlapping
// candidates are resolved, the survivors are ordered by descending priority
// score, and the span of the first one overlapping both the (possibly
// whitespace-snapped) click and the original click is returned.
//
// The original 'click_indices' are returned unchanged whenever the annotator
// is not initialized, selection is disabled, the detected locales are not
// supported, the input span is invalid, any candidate source fails, the
// chosen candidate is filtered for selection, or no candidate overlaps.
CodepointSpan Annotator::SuggestSelection(
    const std::string& context, CodepointSpan click_indices,
    const SelectionOptions& options) const {
  // Span handed back on every bail-out path.
  const CodepointSpan requested_span = click_indices;

  if (!initialized_) {
    TC3_LOG(ERROR) << "Not initialized";
    return requested_span;
  }
  if (!(model_->enabled_modes() & ModeFlag_SELECTION)) {
    return requested_span;
  }

  // A parse failure is only logged; the support check below still runs on
  // whatever ended up in 'text_locales'.
  std::vector<Locale> text_locales;
  if (!ParseLocales(options.detected_text_language_tags, &text_locales)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(text_locales, model_triggering_locales_,
                                    /*default_value=*/true)) {
    return requested_span;
  }

  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);

  if (!IsValidSpanInput(context_unicode, click_indices)) {
    TC3_VLOG(1)
        << "Trying to run SuggestSelection with invalid input, indices: "
        << click_indices.first << " " << click_indices.second;
    return requested_span;
  }

  if (model_->snap_whitespace_selections()) {
    // A purely white-space selection should expand to the multi-token
    // selection it would have been part of. Without adjusting the click no
    // token would be found, so the click is moved onto a neighbouring token.
    // Snapping only to the left is enough: if the white-space is part of a
    // multi-token selection, the tokens on both sides are selected. The
    // overlap check against the original click at the bottom rejects a result
    // that does not contain the initial white-space.
    click_indices = internal::SnapLeftIfWhitespaceSelection(
        click_indices, context_unicode, *unilib_);
  }

  std::vector<AnnotatedSpan> candidates;
  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());
  std::vector<Token> tokens;

  // Gather candidates from every source; a failing source aborts the
  // suggestion.
  if (!ModelSuggestSelection(context_unicode, click_indices, text_locales,
                             &interpreter_manager, &tokens, &candidates)) {
    TC3_LOG(ERROR) << "Model suggest selection failed.";
    return requested_span;
  }
  if (!RegexChunk(context_unicode, selection_regex_patterns_, &candidates,
                  /*is_serialized_entity_data_enabled=*/false)) {
    TC3_LOG(ERROR) << "Regex suggest selection failed.";
    return requested_span;
  }
  if (!DatetimeChunk(
          UTF8ToUnicodeText(context, /*do_copy=*/false),
          /*reference_time_ms_utc=*/0, /*reference_timezone=*/"",
          options.locales, ModeFlag_SELECTION, options.annotation_usecase,
          /*is_serialized_entity_data_enabled=*/false, &candidates)) {
    TC3_LOG(ERROR) << "Datetime suggest selection failed.";
    return requested_span;
  }
  if (knowledge_engine_ != nullptr &&
      !knowledge_engine_->Chunk(context, options.annotation_usecase,
                                &candidates)) {
    TC3_LOG(ERROR) << "Knowledge suggest selection failed.";
    return requested_span;
  }
  if (contact_engine_ != nullptr &&
      !contact_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Contact suggest selection failed.";
    return requested_span;
  }
  if (installed_app_engine_ != nullptr &&
      !installed_app_engine_->Chunk(context_unicode, tokens, &candidates)) {
    TC3_LOG(ERROR) << "Installed app suggest selection failed.";
    return requested_span;
  }
  if (number_annotator_ != nullptr &&
      !number_annotator_->FindAll(context_unicode, options.annotation_usecase,
                                  &candidates)) {
    TC3_LOG(ERROR) << "Number annotator failed in suggest selection.";
    return requested_span;
  }
  if (duration_annotator_ != nullptr &&
      !duration_annotator_->FindAll(context_unicode, tokens,
                                    options.annotation_usecase, &candidates)) {
    TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
    return requested_span;
  }

  // Order candidates by start position so that every connected component of
  // overlapping spans forms a contiguous block for conflict resolution.
  std::sort(candidates.begin(), candidates.end(),
            [](const AnnotatedSpan& lhs, const AnnotatedSpan& rhs) {
              return lhs.span.first < rhs.span.first;
            });

  std::vector<int> ranked_indices;
  if (!ResolveConflicts(candidates, context, tokens, text_locales,
                        options.annotation_usecase, &interpreter_manager,
                        &ranked_indices)) {
    TC3_LOG(ERROR) << "Couldn't resolve conflicts.";
    return requested_span;
  }

  // Highest priority score first.
  std::sort(ranked_indices.begin(), ranked_indices.end(),
            [this, &candidates](int lhs, int rhs) {
              return GetPriorityScore(candidates[lhs].classification) >
                     GetPriorityScore(candidates[rhs].classification);
            });

  for (const int index : ranked_indices) {
    AnnotatedSpan& candidate = candidates[index];
    if (!SpansOverlap(candidate.span, click_indices) ||
        !SpansOverlap(candidate.span, requested_span)) {
      continue;
    }

    // Run model classification if it is not present but requested and a
    // selection collection filter is specified.
    const bool needs_classification =
        candidate.classification.empty() &&
        model_->selection_options()->always_classify_suggested_selection() &&
        !filtered_collections_selection_.empty();
    if (needs_classification &&
        !ModelClassifyText(context, text_locales, candidate.span,
                           &interpreter_manager,
                           /*embedding_cache=*/nullptr,
                           &candidate.classification)) {
      return requested_span;
    }

    // A filtered classification means no suggestion at all.
    if (FilteredForSelection(candidate)) {
      return requested_span;
    }

    return candidate.span;
  }

  return requested_span;
}

namespace {

// Helper function that returns the index of the first candidate that

831

// transitively does not overlap with the candidate on 'start_index'. If the end

832

// of 'candidates' is reached, it returns the index that points right behind the

833

// array.

834

int FirstNonOverlappingSpanIndex(const std::vector<AnnotatedSpan>& candidates,

835

int start_index) {

836

int first_non_overlapping = start_index + 1;

837

CodepointSpan conflicting_span = candidates[start_index].span;

838

while (

839

first_non_overlapping < candidates.size() &&

840

SpansOverlap(conflicting_span, candidates[first_non_overlapping].span)) {

841

// Grow the span to include the current one.

842

conflicting_span.second = std::max(

843

conflicting_span.second, candidates[first_non_overlapping].span.second);

844

845

++first_non_overlapping;

846

}

847

return first_non_overlapping;

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

851

bool Annotator::ResolveConflicts(

852

const std::vector<AnnotatedSpan>& candidates, const std::string& context,

853

const std::vector<Token>& cached_tokens,

854

const std::vector<Locale>& detected_text_language_tags,

855

AnnotationUsecase annotation_usecase,

856

InterpreterManager* interpreter_manager, std::vector<int>* result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

857

result->clear();

858

result->reserve(candidates.size());

859

for (int i = 0; i < candidates.size();) {

860

int first_non_overlapping =

861

FirstNonOverlappingSpanIndex(candidates, /*start_index=*/i);

862

863

const bool conflict_found = first_non_overlapping != (i + 1);

864

if (conflict_found) {

865

std::vector<int> candidate_indices;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

866

if (!ResolveConflict(context, cached_tokens, candidates,

867

detected_text_language_tags, i,

868

first_non_overlapping, annotation_usecase,

869

interpreter_manager, &candidate_indices)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

870

return false;

871

}

872

result->insert(result->end(), candidate_indices.begin(),

873

candidate_indices.end());

874

} else {

875

result->push_back(i);

876

}

877

878

// Skip over the whole conflicting group/go to next candidate.

879

i = first_non_overlapping;

}

return true;

}

namespace {

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

885

// Returns true, if the given two sources do conflict in given annotation

886

// usecase.

887

// - In SMART usecase, all sources do conflict, because there's only 1 possible

888

// annotation for a given span.

889

// - In RAW usecase, certain annotations are allowed to overlap (e.g. datetime

890

// and duration), while others not (e.g. duration and number).

891

bool DoSourcesConflict(AnnotationUsecase annotation_usecase,

892

const AnnotatedSpan::Source source1,

893

const AnnotatedSpan::Source source2) {

894

uint32 source_mask =

895

(1 << static_cast<int>(source1)) | (1 << static_cast<int>(source2));

896

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

897

switch (annotation_usecase) {

898

case AnnotationUsecase_ANNOTATION_USECASE_SMART:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

899

// In the SMART mode, all annotations conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

900

return true;

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

901

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

902

case AnnotationUsecase_ANNOTATION_USECASE_RAW:

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

903

// DURATION and DATETIME do not conflict. E.g. "let's meet in 3 hours",

904

// can have two non-conflicting annotations: "in 3 hours" (datetime), "3

905

// hours" (duration).

906

if ((source_mask &

907

(1 << static_cast<int>(AnnotatedSpan::Source::DURATION))) &&

908

(source_mask &

909

(1 << static_cast<int>(AnnotatedSpan::Source::DATETIME)))) {

910

return false;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

911

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

912

913

// A KNOWLEDGE entity does not conflict with anything.

914

if ((source_mask &

915

(1 << static_cast<int>(AnnotatedSpan::Source::KNOWLEDGE)))) {

return false;

}

// Entities from other sources can conflict.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

920

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

} // namespace

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

925

bool Annotator::ResolveConflict(

926

const std::string& context, const std::vector<Token>& cached_tokens,

927

const std::vector<AnnotatedSpan>& candidates,

928

const std::vector<Locale>& detected_text_language_tags, int start_index,

929

int end_index, AnnotationUsecase annotation_usecase,

930

InterpreterManager* interpreter_manager,

931

std::vector<int>* chosen_indices) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

932

std::vector<int> conflicting_indices;

933

std::unordered_map<int, float> scores;

934

for (int i = start_index; i < end_index; ++i) {

935

conflicting_indices.push_back(i);

936

if (!candidates[i].classification.empty()) {

937

scores[i] = GetPriorityScore(candidates[i].classification);

continue;

}

// OPTIMIZATION: So that we don't have to classify all the ML model

942

// spans apriori, we wait until we get here, when they conflict with

943

// something and we need the actual classification scores. So if the

944

// candidate conflicts and comes from the model, we need to run a

945

// classification to determine its priority:

946

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

947

if (!ModelClassifyText(context, cached_tokens, detected_text_language_tags,

948

candidates[i].span, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

949

/*embedding_cache=*/nullptr, &classification)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (!classification.empty()) {

954

scores[i] = GetPriorityScore(classification);

}

}

std::sort(conflicting_indices.begin(), conflicting_indices.end(),

959

[&scores](int i, int j) { return scores[i] > scores[j]; });

960

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

961

// Here we keep a set of indices that were chosen, per-source, to enable

962

// effective computation.

963

std::unordered_map<AnnotatedSpan::Source, SortedIntSet>

964

chosen_indices_for_source_map;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

965

966

// Greedily place the candidates if they don't conflict with the already

967

// placed ones.

968

for (int i = 0; i < conflicting_indices.size(); ++i) {

969

const int considered_candidate = conflicting_indices[i];

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

970

971

// See if there is a conflict between the candidate and all already placed

972

// candidates.

973

bool conflict = false;

974

SortedIntSet* chosen_indices_for_source_ptr = nullptr;

975

for (auto& source_set_pair : chosen_indices_for_source_map) {

976

if (source_set_pair.first == candidates[considered_candidate].source) {

977

chosen_indices_for_source_ptr = &source_set_pair.second;

978

}

979

980

if (DoSourcesConflict(annotation_usecase, source_set_pair.first,

981

candidates[considered_candidate].source) &&

982

DoesCandidateConflict(considered_candidate, candidates,

983

source_set_pair.second)) {

984

conflict = true;

985

break;

986

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

987

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

988

989

// Skip the candidate if a conflict was found.

if (conflict) {

continue;

}

// If the set of indices for the current source doesn't exist yet,

995

// initialize it.

996

if (chosen_indices_for_source_ptr == nullptr) {

997

SortedIntSet new_set([&candidates](int a, int b) {

998

return candidates[a].span.first < candidates[b].span.first;

999

});

1000

chosen_indices_for_source_map[candidates[considered_candidate].source] =

1001

std::move(new_set);

1002

chosen_indices_for_source_ptr =

1003

&chosen_indices_for_source_map[candidates[considered_candidate]

.source];

}

// Place the candidate to the output and to the per-source conflict set.

1008

chosen_indices->push_back(considered_candidate);

1009

chosen_indices_for_source_ptr->insert(considered_candidate);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1010

}

1011

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1012

std::sort(chosen_indices->begin(), chosen_indices->end());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1017

bool Annotator::ModelSuggestSelection(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1018

const UnicodeText& context_unicode, CodepointSpan click_indices,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1019

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1020

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1021

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1022

if (model_->triggering_options() == nullptr ||

1023

!(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1027

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1028

ml_model_triggering_locales_,

1029

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1033

int click_pos;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1034

*tokens = selection_feature_processor_->Tokenize(context_unicode);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1035

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1036

context_unicode, click_indices,

1037

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1038

tokens, &click_pos);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1039

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1040

TC3_VLOG(1) << "Could not calculate the click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1041

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1042

}

1043

1044

const int symmetry_context_size =

1045

model_->selection_options()->symmetry_context_size();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1046

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1047

bounds_sensitive_features = selection_feature_processor_->GetOptions()

1048

->bounds_sensitive_features();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1049

1050

// The symmetry context span is the clicked token with symmetry_context_size

1051

// tokens on either side.

1052

const TokenSpan symmetry_context_span = IntersectTokenSpans(

1053

ExpandTokenSpan(SingleTokenSpan(click_pos),

1054

/*num_tokens_left=*/symmetry_context_size,

1055

/*num_tokens_right=*/symmetry_context_size),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1056

{0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1057

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1058

// Compute the extraction span based on the model type.

1059

TokenSpan extraction_span;

1060

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1061

// The extraction span is the symmetry context span expanded to include

1062

// max_selection_span tokens on either side, which is how far a selection

1063

// can stretch from the click, plus a relevant number of tokens outside of

1064

// the bounds of the selection.

1065

const int max_selection_span =

1066

selection_feature_processor_->GetOptions()->max_selection_span();

1067

extraction_span =

1068

ExpandTokenSpan(symmetry_context_span,

1069

/*num_tokens_left=*/max_selection_span +

1070

bounds_sensitive_features->num_tokens_before(),

1071

/*num_tokens_right=*/max_selection_span +

1072

bounds_sensitive_features->num_tokens_after());

1073

} else {

1074

// The extraction span is the symmetry context span expanded to include

1075

// context_size tokens on either side.

1076

const int context_size =

1077

selection_feature_processor_->GetOptions()->context_size();

1078

extraction_span = ExpandTokenSpan(symmetry_context_span,

1079

/*num_tokens_left=*/context_size,

1080

/*num_tokens_right=*/context_size);

1081

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1082

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1083

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1084

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1085

*tokens, extraction_span)) {

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1089

std::unique_ptr<CachedFeatures> cached_features;

1090

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1091

*tokens, extraction_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1092

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1093

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1094

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1095

selection_feature_processor_->EmbeddingSize() +

1096

selection_feature_processor_->DenseFeaturesCount(),

1097

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1098

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Produce selection model candidates.

1103

std::vector<TokenSpan> chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1104

if (!ModelChunk(tokens->size(), /*span_of_interest=*/symmetry_context_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1105

interpreter_manager->SelectionInterpreter(), *cached_features,

1106

&chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1107

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

for (const TokenSpan& chunk : chunks) {

1112

AnnotatedSpan candidate;

1113

candidate.span = selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1114

context_unicode, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1115

if (model_->selection_options()->strip_unpaired_brackets()) {

1116

candidate.span =

1117

StripUnpairedBrackets(context_unicode, candidate.span, *unilib_);

1118

}

1119

1120

// Only output non-empty spans.

1121

if (candidate.span.first != candidate.span.second) {

1122

result->push_back(candidate);

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1128

bool Annotator::ModelClassifyText(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1129

const std::string& context,

1130

const std::vector<Locale>& detected_text_language_tags,

1131

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1132

FeatureProcessor::EmbeddingCache* embedding_cache,

1133

std::vector<ClassificationResult>* classification_results) const {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1134

return ModelClassifyText(context, {}, detected_text_language_tags,

1135

selection_indices, interpreter_manager,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1136

embedding_cache, classification_results);

}

namespace internal {

std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,

1141

CodepointSpan selection_indices,

1142

TokenSpan tokens_around_selection_to_copy) {

1143

const auto first_selection_token = std::upper_bound(

1144

cached_tokens.begin(), cached_tokens.end(), selection_indices.first,

1145

[](int selection_start, const Token& token) {

1146

return selection_start < token.end;

1147

});

1148

const auto last_selection_token = std::lower_bound(

1149

cached_tokens.begin(), cached_tokens.end(), selection_indices.second,

1150

[](const Token& token, int selection_end) {

1151

return token.start < selection_end;

1152

});

1153

1154

const int64 first_token = std::max(

1155

static_cast<int64>(0),

1156

static_cast<int64>((first_selection_token - cached_tokens.begin()) -

1157

tokens_around_selection_to_copy.first));

1158

const int64 last_token = std::min(

1159

static_cast<int64>(cached_tokens.size()),

1160

static_cast<int64>((last_selection_token - cached_tokens.begin()) +

1161

tokens_around_selection_to_copy.second));

1162

1163

std::vector<Token> tokens;

1164

tokens.reserve(last_token - first_token);

1165

for (int i = first_token; i < last_token; ++i) {

1166

tokens.push_back(cached_tokens[i]);

}

return tokens;

}

} // namespace internal

1171

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1172

// Returns how many tokens to the left (first) and to the right (second) of a
// selection are needed, at most, for classification.
TokenSpan Annotator::ClassifyTextUpperBoundNeededTokens() const {
  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features =
          classification_feature_processor_->GetOptions()
              ->bounds_sensitive_features();
  const bool use_bounds_sensitive_features =
      bounds_sensitive_features && bounds_sensitive_features->enabled();

  if (!use_bounds_sensitive_features) {
    // The extraction span is the clicked token with context_size tokens on
    // either side.
    // NOTE(review): context_size is read from selection_feature_processor_,
    // while the rest of this function uses classification_feature_processor_
    // - confirm this is intended.
    const int context_size =
        selection_feature_processor_->GetOptions()->context_size();
    return {context_size, context_size};
  }

  // The extraction span is the selection span expanded by the configured
  // number of tokens outside of the selection bounds.
  return {bounds_sensitive_features->num_tokens_before(),
          bounds_sensitive_features->num_tokens_after()};
}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1191

namespace {

1192

// Sorts the classification results from high score to low score.

1193

void SortClassificationResults(

1194

std::vector<ClassificationResult>* classification_results) {

1195

std::sort(classification_results->begin(), classification_results->end(),

1196

[](const ClassificationResult& a, const ClassificationResult& b) {

1197

return a.score > b.score;

});

}

} // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1202

bool Annotator::ModelClassifyText(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1203

const std::string& context, const std::vector<Token>& cached_tokens,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1204

const std::vector<Locale>& detected_text_language_tags,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1205

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1206

FeatureProcessor::EmbeddingCache* embedding_cache,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1207

std::vector<ClassificationResult>* classification_results) const {

1208

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1209

return ModelClassifyText(context, cached_tokens, detected_text_language_tags,

1210

selection_indices, interpreter_manager,

1211

embedding_cache, classification_results, &tokens);

1212

}

1213

1214

bool Annotator::ModelClassifyText(

1215

const std::string& context, const std::vector<Token>& cached_tokens,

1216

const std::vector<Locale>& detected_text_language_tags,

1217

CodepointSpan selection_indices, InterpreterManager* interpreter_manager,

1218

FeatureProcessor::EmbeddingCache* embedding_cache,

1219

std::vector<ClassificationResult>* classification_results,

1220

std::vector<Token>* tokens) const {

1221

if (model_->triggering_options() == nullptr ||

1222

!(model_->triggering_options()->enabled_modes() &

1223

ModeFlag_CLASSIFICATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1227

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1228

ml_model_triggering_locales_,

1229

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1233

if (cached_tokens.empty()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1234

*tokens = classification_feature_processor_->Tokenize(context);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1235

} else {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1236

*tokens = internal::CopyCachedTokens(cached_tokens, selection_indices,

1237

ClassifyTextUpperBoundNeededTokens());

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1238

}

1239

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1240

int click_pos;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1241

classification_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1242

context, selection_indices,

1243

classification_feature_processor_->GetOptions()

1244

->only_use_line_with_click(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1245

tokens, &click_pos);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1246

const TokenSpan selection_token_span =

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1247

CodepointSpanToTokenSpan(*tokens, selection_indices);

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1248

const int selection_num_tokens = TokenSpanSize(selection_token_span);

1249

if (model_->classification_options()->max_num_tokens() > 0 &&

1250

model_->classification_options()->max_num_tokens() <

1251

selection_num_tokens) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1252

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1256

const FeatureProcessorOptions_::BoundsSensitiveFeatures*

1257

bounds_sensitive_features =

1258

classification_feature_processor_->GetOptions()

1259

->bounds_sensitive_features();

1260

if (selection_token_span.first == kInvalidIndex ||

1261

selection_token_span.second == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1262

TC3_LOG(ERROR) << "Could not determine span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Compute the extraction span based on the model type.

1267

TokenSpan extraction_span;

1268

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1269

// The extraction span is the selection span expanded to include a relevant

1270

// number of tokens outside of the bounds of the selection.

1271

extraction_span = ExpandTokenSpan(

1272

selection_token_span,

1273

/*num_tokens_left=*/bounds_sensitive_features->num_tokens_before(),

1274

/*num_tokens_right=*/bounds_sensitive_features->num_tokens_after());

1275

} else {

1276

if (click_pos == kInvalidIndex) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1277

TC3_LOG(ERROR) << "Couldn't choose a click position.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1278

return false;

1279

}

1280

// The extraction span is the clicked token with context_size tokens on

1281

// either side.

1282

const int context_size =

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1283

classification_feature_processor_->GetOptions()->context_size();

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1284

extraction_span = ExpandTokenSpan(SingleTokenSpan(click_pos),

1285

/*num_tokens_left=*/context_size,

1286

/*num_tokens_right=*/context_size);

1287

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1288

extraction_span = IntersectTokenSpans(extraction_span, {0, tokens->size()});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1289

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1290

if (!classification_feature_processor_->HasEnoughSupportedCodepoints(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1291

*tokens, extraction_span)) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1292

*classification_results = {{Collections::Other(), 1.0}};

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1296

std::unique_ptr<CachedFeatures> cached_features;

1297

if (!classification_feature_processor_->ExtractFeatures(

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1298

*tokens, extraction_span, selection_indices,

1299

embedding_executor_.get(), embedding_cache,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1300

classification_feature_processor_->EmbeddingSize() +

1301

classification_feature_processor_->DenseFeaturesCount(),

1302

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1303

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1304

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1305

}

1306

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1307

std::vector<float> features;

1308

features.reserve(cached_features->OutputFeaturesSize());

1309

if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {

1310

cached_features->AppendBoundsSensitiveFeaturesForSpan(selection_token_span,

1311

&features);

1312

} else {

1313

cached_features->AppendClickContextFeaturesForClick(click_pos, &features);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1314

}

1315

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1316

TensorView<float> logits = classification_executor_->ComputeLogits(

1317

TensorView<float>(features.data(),

1318

{1, static_cast<int>(features.size())}),

1319

interpreter_manager->ClassificationInterpreter());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1320

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1321

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

if (logits.dims() != 2 || logits.dim(0) != 1 ||

1326

logits.dim(1) != classification_feature_processor_->NumCollections()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1327

TC3_LOG(ERROR) << "Mismatching output";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

const std::vector<float> scores =

1332

ComputeSoftmax(logits.data(), logits.dim(1));

1333

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1334

if (scores.empty()) {

1335

*classification_results = {{Collections::Other(), 1.0}};

1336

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1337

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1338

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1339

const int best_score_index =

1340

std::max_element(scores.begin(), scores.end()) - scores.begin();

1341

const std::string top_collection =

1342

classification_feature_processor_->LabelToCollection(best_score_index);

1343

1344

// Sanity checks.

1345

if (top_collection == Collections::Phone()) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1346

const int digit_count = CountDigits(context, selection_indices);

1347

if (digit_count <

1348

model_->classification_options()->phone_min_num_digits() ||

1349

digit_count >

1350

model_->classification_options()->phone_max_num_digits()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1351

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1352

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1353

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1354

} else if (top_collection == Collections::Address()) {

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1355

if (selection_num_tokens <

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1356

model_->classification_options()->address_min_num_tokens()) {

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1357

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1358

return true;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1359

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1360

} else if (top_collection == Collections::Dictionary()) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1361

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1362

dictionary_locales_,

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1363

/*default_value=*/false)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1364

*classification_results = {{Collections::Other(), 1.0}};

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1365

return true;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1366

}

1367

}

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1368

1369

*classification_results = {{top_collection, 1.0, scores[best_score_index]}};

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1370

return true;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1371

}

1372

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1373

bool Annotator::RegexClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1374

const std::string& context, CodepointSpan selection_indices,

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1375

std::vector<ClassificationResult>* classification_result) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1376

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1377

UTF8ToUnicodeText(context, /*do_copy=*/false)

1378

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1379

const UnicodeText selection_text_unicode(

1380

UTF8ToUnicodeText(selection_text, /*do_copy=*/false));

1381

1382

// Check whether any of the regular expressions match.

1383

for (const int pattern_id : classification_regex_patterns_) {

1384

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

1385

const std::unique_ptr<UniLib::RegexMatcher> matcher =

1386

regex_pattern.pattern->Matcher(selection_text_unicode);

1387

int status = UniLib::RegexMatcher::kNoError;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1388

bool matches;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1389

if (regex_pattern.config->use_approximate_matching()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1390

matches = matcher->ApproximatelyMatches(&status);

1391

} else {

1392

matches = matcher->Matches(&status);

1393

}

1394

if (status != UniLib::RegexMatcher::kNoError) {

1395

return false;

1396

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1397

if (matches && VerifyRegexMatchCandidate(

1398

context, regex_pattern.config->verification_options(),

1399

selection_text, matcher.get())) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1400

classification_result->push_back(

1401

{regex_pattern.config->collection_name()->str(),

1402

regex_pattern.config->target_classification_score(),

1403

regex_pattern.config->priority_score()});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1404

if (!SerializedEntityDataFromRegexMatch(

1405

regex_pattern.config, matcher.get(),

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1406

&classification_result->back().serialized_entity_data)) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1407

TC3_LOG(ERROR) << "Could not get entity data.";

1408

return false;

1409

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1413

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1414

}

1415

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1416

namespace {

1417

std::string PickCollectionForDatetime(

1418

const DatetimeParseResult& datetime_parse_result) {

1419

switch (datetime_parse_result.granularity) {

1420

case GRANULARITY_HOUR:

1421

case GRANULARITY_MINUTE:

1422

case GRANULARITY_SECOND:

1423

return Collections::DateTime();

1424

default:

1425

return Collections::Date();

1426

}

1427

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1428

1429

std::string CreateDatetimeSerializedEntityData(

1430

const DatetimeParseResult& parse_result) {

1431

EntityDataT entity_data;

1432

entity_data.datetime.reset(new EntityData_::DatetimeT());

1433

entity_data.datetime->time_ms_utc = parse_result.time_ms_utc;

1434

entity_data.datetime->granularity =

1435

static_cast<EntityData_::Datetime_::Granularity>(

1436

parse_result.granularity);

1437

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1438

for (const auto& c : parse_result.datetime_components) {

1439

EntityData_::Datetime_::DatetimeComponentT datetime_component;

1440

datetime_component.absolute_value = c.value;

1441

datetime_component.relative_count = c.relative_count;

1442

datetime_component.component_type =

1443

static_cast<EntityData_::Datetime_::DatetimeComponent_::ComponentType>(

1444

c.component_type);

1445

datetime_component.relation_type =

1446

EntityData_::Datetime_::DatetimeComponent_::RelationType_ABSOLUTE;

1447

if (c.relative_qualifier !=

1448

DatetimeComponent::RelativeQualifier::UNSPECIFIED) {

1449

datetime_component.relation_type =

1450

EntityData_::Datetime_::DatetimeComponent_::RelationType_RELATIVE;

1451

}

1452

entity_data.datetime->datetime_component.emplace_back(

1453

new EntityData_::Datetime_::DatetimeComponentT(datetime_component));

1454

}

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1455

flatbuffers::FlatBufferBuilder builder;

1456

FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));

1457

return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),

1458

builder.GetSize());

1459

}

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1460

} // namespace

1461

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1462

bool Annotator::DatetimeClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1463

const std::string& context, CodepointSpan selection_indices,

1464

const ClassificationOptions& options,

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1465

std::vector<ClassificationResult>* classification_results) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1466

if (!datetime_parser_) {

return false;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1470

const std::string selection_text =

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1471

UTF8ToUnicodeText(context, /*do_copy=*/false)

1472

.UTF8Substring(selection_indices.first, selection_indices.second);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1473

1474

std::vector<DatetimeParseResultSpan> datetime_spans;

1475

if (!datetime_parser_->Parse(selection_text, options.reference_time_ms_utc,

1476

options.reference_timezone, options.locales,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1477

ModeFlag_CLASSIFICATION,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1478

options.annotation_usecase,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1479

/*anchor_start_end=*/true, &datetime_spans)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1480

TC3_LOG(ERROR) << "Error during parsing datetime.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1481

return false;

1482

}

1483

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

1484

// Only consider the result valid if the selection and extracted datetime

1485

// spans exactly match.

1486

if (std::make_pair(datetime_span.span.first + selection_indices.first,

1487

datetime_span.span.second + selection_indices.first) ==

1488

selection_indices) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1489

for (const DatetimeParseResult& parse_result : datetime_span.data) {

1490

classification_results->emplace_back(

Tony Mak

2019-02-01 14:52:10 +0000

[diff] [blame]

1491

PickCollectionForDatetime(parse_result),

1492

datetime_span.target_classification_score);

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1493

classification_results->back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1494

classification_results->back().serialized_entity_data =

1495

CreateDatetimeSerializedEntityData(parse_result);

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1496

classification_results->back().priority_score =

1497

datetime_span.priority_score;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1498

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1499

return true;

1500

}

1501

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1502

return true;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1503

}

1504

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1505

std::vector<ClassificationResult> Annotator::ClassifyText(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1506

const std::string& context, CodepointSpan selection_indices,

1507

const ClassificationOptions& options) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1508

if (!initialized_) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1509

TC3_LOG(ERROR) << "Not initialized";

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1513

if (!(model_->enabled_modes() & ModeFlag_CLASSIFICATION)) {

return {};

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1517

std::vector<Locale> detected_text_language_tags;

1518

if (!ParseLocales(options.detected_text_language_tags,

1519

&detected_text_language_tags)) {

1520

TC3_LOG(WARNING)

1521

<< "Failed to parse the detected_text_language_tags in options: "

1522

<< options.detected_text_language_tags;

1523

}

1524

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1525

model_triggering_locales_,

1526

/*default_value=*/true)) {

return {};

}

Tony Mak

2019-11-13 15:39:57 +0000

[diff] [blame^]

1530

if (!IsValidSpanInput(UTF8ToUnicodeText(context, /*do_copy=*/false),

1531

selection_indices)) {

1532

TC3_VLOG(1) << "Trying to run ClassifyText with invalid input: "

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1533

<< std::get<0>(selection_indices) << " "

1534

<< std::get<1>(selection_indices);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1538

// We'll accumulate a list of candidates, and pick the best candidate in the

1539

// end.

1540

std::vector<AnnotatedSpan> candidates;

1541

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1542

// Try the knowledge engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1543

// TODO(b/126579108): Propagate error status.

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1544

ClassificationResult knowledge_result;

1545

if (knowledge_engine_ && knowledge_engine_->ClassifyText(

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1546

context, selection_indices,

1547

options.annotation_usecase, &knowledge_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1548

candidates.push_back({selection_indices, {knowledge_result}});

1549

candidates.back().source = AnnotatedSpan::Source::KNOWLEDGE;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1550

}

1551

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1552

AddContactMetadataToKnowledgeClassificationResults(&candidates);

1553

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1554

// Try the contact engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1555

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1556

ClassificationResult contact_result;

1557

if (contact_engine_ && contact_engine_->ClassifyText(

1558

context, selection_indices, &contact_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1559

candidates.push_back({selection_indices, {contact_result}});

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1560

}

1561

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1562

// Try the installed app engine.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1563

// TODO(b/126579108): Propagate error status.

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1564

ClassificationResult installed_app_result;

1565

if (installed_app_engine_ &&

1566

installed_app_engine_->ClassifyText(context, selection_indices,

1567

&installed_app_result)) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1568

candidates.push_back({selection_indices, {installed_app_result}});

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1569

}

1570

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1571

// Try the regular expression models.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1572

std::vector<ClassificationResult> regex_results;

1573

if (!RegexClassifyText(context, selection_indices, &regex_results)) {

1574

return {};

1575

}

1576

for (const ClassificationResult& result : regex_results) {

1577

candidates.push_back({selection_indices, {result}});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1578

}

1579

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1580

// Try the date model.

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1581

//

1582

// DatetimeClassifyText only returns the first result, which can however have

1583

// more interpretations. They are inserted in the candidates as a single

1584

// AnnotatedSpan, so that they get treated together by the conflict resolution

1585

// algorithm.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1586

std::vector<ClassificationResult> datetime_results;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1587

if (!DatetimeClassifyText(context, selection_indices, options,

1588

&datetime_results)) {

1589

return {};

1590

}

1591

if (!datetime_results.empty()) {

1592

candidates.push_back({selection_indices, std::move(datetime_results)});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1593

candidates.back().source = AnnotatedSpan::Source::DATETIME;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1594

}

1595

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1596

// Try the number annotator.

1597

// TODO(b/126579108): Propagate error status.

1598

ClassificationResult number_annotator_result;

1599

if (number_annotator_ &&

1600

number_annotator_->ClassifyText(

1601

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1602

options.annotation_usecase, &number_annotator_result)) {

1603

candidates.push_back({selection_indices, {number_annotator_result}});

1604

}

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1605

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1606

// Try the duration annotator.

1607

ClassificationResult duration_annotator_result;

1608

if (duration_annotator_ &&

1609

duration_annotator_->ClassifyText(

1610

UTF8ToUnicodeText(context, /*do_copy=*/false), selection_indices,

1611

options.annotation_usecase, &duration_annotator_result)) {

1612

candidates.push_back({selection_indices, {duration_annotator_result}});

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1613

candidates.back().source = AnnotatedSpan::Source::DURATION;

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1614

}

1615

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1616

// Try the ML model.

1617

//

1618

// The output of the model is considered as an exclusive 1-of-N choice. That's

1619

// why it's inserted as only 1 AnnotatedSpan into candidates, as opposed to 1

1620

// span for each candidate, like e.g. the regex model.

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1621

InterpreterManager interpreter_manager(selection_executor_.get(),

1622

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1623

std::vector<ClassificationResult> model_results;

1624

std::vector<Token> tokens;

1625

if (!ModelClassifyText(

1626

context, /*cached_tokens=*/{}, detected_text_language_tags,

1627

selection_indices, &interpreter_manager,

1628

/*embedding_cache=*/nullptr, &model_results, &tokens)) {

1629

return {};

1630

}

1631

if (!model_results.empty()) {

1632

candidates.push_back({selection_indices, std::move(model_results)});

1633

}

1634

1635

std::vector<int> candidate_indices;

1636

if (!ResolveConflicts(candidates, context, tokens,

1637

detected_text_language_tags, options.annotation_usecase,

1638

&interpreter_manager, &candidate_indices)) {

1639

TC3_LOG(ERROR) << "Couldn't resolve conflicts.";

return {};

}

std::vector<ClassificationResult> results;

1644

for (const int i : candidate_indices) {

1645

for (const ClassificationResult& result : candidates[i].classification) {

1646

if (!FilteredForClassification(result)) {

1647

results.push_back(result);

1648

}

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1649

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1650

}

1651

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1652

// Sort results according to score.

1653

std::sort(results.begin(), results.end(),

1654

[](const ClassificationResult& a, const ClassificationResult& b) {

1655

return a.score > b.score;

1656

});

1657

1658

if (results.empty()) {

Tony Mak

2019-04-30 09:34:45 +0100

[diff] [blame]

1659

results = {{Collections::Other(), 1.0}};

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1660

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1661

return results;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1662

}

1663

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1664

bool Annotator::ModelAnnotate(

1665

const std::string& context,

1666

const std::vector<Locale>& detected_text_language_tags,

1667

InterpreterManager* interpreter_manager, std::vector<Token>* tokens,

1668

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1669

if (model_->triggering_options() == nullptr ||

1670

!(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION)) {

return true;

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1674

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1675

ml_model_triggering_locales_,

1676

/*default_value=*/true)) {

return true;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1680

const UnicodeText context_unicode = UTF8ToUnicodeText(context,

1681

/*do_copy=*/false);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1682

std::vector<UnicodeTextRange> lines;

1683

if (!selection_feature_processor_->GetOptions()->only_use_line_with_click()) {

1684

lines.push_back({context_unicode.begin(), context_unicode.end()});

1685

} else {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1686

lines = selection_feature_processor_->SplitContext(

1687

context_unicode, selection_feature_processor_->GetOptions()

1688

->use_pipe_character_for_newline());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1689

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1690

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1691

const float min_annotate_confidence =

1692

(model_->triggering_options() != nullptr

1693

? model_->triggering_options()->min_annotate_confidence()

1694

: 0.f);

1695

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1696

for (const UnicodeTextRange& line : lines) {

Tony Mak

408c6b8

2019-03-08 17:57:27 +0000

[diff] [blame]

1697

FeatureProcessor::EmbeddingCache embedding_cache;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1698

const std::string line_str =

1699

UnicodeText::UTF8Substring(line.first, line.second);

1700

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1701

*tokens = selection_feature_processor_->Tokenize(line_str);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1702

selection_feature_processor_->RetokenizeAndFindClick(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1703

line_str, {0, std::distance(line.first, line.second)},

1704

selection_feature_processor_->GetOptions()->only_use_line_with_click(),

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1705

tokens,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1706

/*click_pos=*/nullptr);

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1707

const TokenSpan full_line_span = {0, tokens->size()};

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1708

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

1709

// TODO(zilka): Add support for greater granularity of this check.

1710

if (!selection_feature_processor_->HasEnoughSupportedCodepoints(

1711

*tokens, full_line_span)) {

continue;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1715

std::unique_ptr<CachedFeatures> cached_features;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1716

if (!selection_feature_processor_->ExtractFeatures(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1717

*tokens, full_line_span,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1718

/*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},

1719

embedding_executor_.get(),

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1720

/*embedding_cache=*/nullptr,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1721

selection_feature_processor_->EmbeddingSize() +

1722

selection_feature_processor_->DenseFeaturesCount(),

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1723

&cached_features)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1724

TC3_LOG(ERROR) << "Could not extract features.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1725

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1726

}

1727

1728

std::vector<TokenSpan> local_chunks;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1729

if (!ModelChunk(tokens->size(), /*span_of_interest=*/full_line_span,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1730

interpreter_manager->SelectionInterpreter(),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1731

*cached_features, &local_chunks)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1732

TC3_LOG(ERROR) << "Could not chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1733

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1734

}

1735

1736

const int offset = std::distance(context_unicode.begin(), line.first);

1737

for (const TokenSpan& chunk : local_chunks) {

1738

const CodepointSpan codepoint_span =

1739

selection_feature_processor_->StripBoundaryCodepoints(

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1740

line_str, TokenSpanToCodepointSpan(*tokens, chunk));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1741

1742

// Skip empty spans.

1743

if (codepoint_span.first != codepoint_span.second) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1744

std::vector<ClassificationResult> classification;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1745

if (!ModelClassifyText(line_str, *tokens, detected_text_language_tags,

1746

codepoint_span, interpreter_manager,

1747

&embedding_cache, &classification)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1748

TC3_LOG(ERROR) << "Could not classify text: "

1749

<< (codepoint_span.first + offset) << " "

1750

<< (codepoint_span.second + offset);

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

return false;

}

// Do not include the span if it's classified as "other".

1755

if (!classification.empty() && !ClassifiedAsOther(classification) &&

1756

classification[0].score >= min_annotate_confidence) {

1757

AnnotatedSpan result_span;

1758

result_span.span = {codepoint_span.first + offset,

1759

codepoint_span.second + offset};

1760

result_span.classification = std::move(classification);

1761

result->push_back(std::move(result_span));

1762

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1763

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

1764

}

1765

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1769

// Test-only accessor: exposes the raw pointer held by
// selection_feature_processor_. No ownership is transferred.
const FeatureProcessor* Annotator::SelectionFeatureProcessorForTests() const {
  return selection_feature_processor_.get();
}

1772

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1773

// Test-only accessor: exposes the raw pointer held by
// classification_feature_processor_. No ownership is transferred.
const FeatureProcessor* Annotator::ClassificationFeatureProcessorForTests()
    const {
  return classification_feature_processor_.get();
}

1777

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1778

// Test-only accessor: exposes the raw pointer held by datetime_parser_.
// No ownership is transferred.
const DatetimeParser* Annotator::DatetimeParserForTests() const {
  return datetime_parser_.get();
}

1781

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1782

void Annotator::RemoveNotEnabledEntityTypes(

1783

const EnabledEntityTypes& is_entity_type_enabled,

1784

std::vector<AnnotatedSpan>* annotated_spans) const {

1785

for (AnnotatedSpan& annotated_span : *annotated_spans) {

1786

std::vector<ClassificationResult>& classifications =

1787

annotated_span.classification;

1788

classifications.erase(

1789

std::remove_if(classifications.begin(), classifications.end(),

1790

[&is_entity_type_enabled](

1791

const ClassificationResult& classification_result) {

1792

return !is_entity_type_enabled(

1793

classification_result.collection);

1794

}),

1795

classifications.end());

1796

}

1797

annotated_spans->erase(

1798

std::remove_if(annotated_spans->begin(), annotated_spans->end(),

1799

[](const AnnotatedSpan& annotated_span) {

1800

return annotated_span.classification.empty();

1801

}),

1802

annotated_spans->end());

1803

}

1804

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1805

void Annotator::AddContactMetadataToKnowledgeClassificationResults(

1806

std::vector<AnnotatedSpan>* candidates) const {

1807

if (candidates == nullptr || contact_engine_ == nullptr) {

1808

return;

1809

}

1810

for (auto& candidate : *candidates) {

1811

for (auto& classification_result : candidate.classification) {

1812

contact_engine_->AddContactMetadataToKnowledgeClassificationResult(

1813

&classification_result);

}

}

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1818

std::vector<AnnotatedSpan> Annotator::Annotate(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1819

const std::string& context, const AnnotationOptions& options) const {

1820

std::vector<AnnotatedSpan> candidates;

1821

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

1822

if (!(model_->enabled_modes() & ModeFlag_ANNOTATION)) {

return {};

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1826

const UnicodeText context_unicode =

1827

UTF8ToUnicodeText(context, /*do_copy=*/false);

1828

if (!context_unicode.is_valid()) {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1832

std::vector<Locale> detected_text_language_tags;

1833

if (!ParseLocales(options.detected_text_language_tags,

1834

&detected_text_language_tags)) {

1835

TC3_LOG(WARNING)

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1836

<< "Failed to parse the detected_text_language_tags in options: "

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1837

<< options.detected_text_language_tags;

1838

}

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

1839

if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,

1840

model_triggering_locales_,

1841

/*default_value=*/true)) {

return {};

}

InterpreterManager interpreter_manager(selection_executor_.get(),

1846

classification_executor_.get());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1847

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1848

// Annotate with the selection model.

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

1849

std::vector<Token> tokens;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1850

if (!ModelAnnotate(context, detected_text_language_tags, &interpreter_manager,

1851

&tokens, &candidates)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1852

TC3_LOG(ERROR) << "Couldn't run ModelAnnotate.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

// Annotate with the regular expression models.

1857

if (!RegexChunk(UTF8ToUnicodeText(context, /*do_copy=*/false),

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1858

annotation_regex_patterns_, &candidates,

1859

options.is_serialized_entity_data_enabled)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1860

TC3_LOG(ERROR) << "Couldn't run RegexChunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

// Annotate with the datetime model.

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1865

const EnabledEntityTypes is_entity_type_enabled(options.entity_types);

1866

if ((is_entity_type_enabled(Collections::Date()) ||

1867

is_entity_type_enabled(Collections::DateTime())) &&

1868

!DatetimeChunk(UTF8ToUnicodeText(context, /*do_copy=*/false),

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1869

options.reference_time_ms_utc, options.reference_timezone,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1870

options.locales, ModeFlag_ANNOTATION,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1871

options.annotation_usecase,

1872

options.is_serialized_entity_data_enabled, &candidates)) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1873

TC3_LOG(ERROR) << "Couldn't run DatetimeChunk.";

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1877

// Annotate with the knowledge engine into a temporary vector.

1878

std::vector<AnnotatedSpan> knowledge_candidates;

1879

if (knowledge_engine_ &&

1880

!knowledge_engine_->Chunk(context, options.annotation_usecase,

1881

&knowledge_candidates)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1882

TC3_LOG(ERROR) << "Couldn't run knowledge engine Chunk.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

1886

AddContactMetadataToKnowledgeClassificationResults(&knowledge_candidates);

1887

1888

// Move the knowledge candidates to the full candidate list, and erase

1889

// knowledge_candidates.

1890

candidates.insert(candidates.end(),

1891

std::make_move_iterator(knowledge_candidates.begin()),

1892

std::make_move_iterator(knowledge_candidates.end()));

1893

knowledge_candidates.clear();

1894

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1895

// Annotate with the contact engine.

1896

if (contact_engine_ &&

1897

!contact_engine_->Chunk(context_unicode, tokens, &candidates)) {

1898

TC3_LOG(ERROR) << "Couldn't run contact engine Chunk.";

return {};

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

1902

// Annotate with the installed app engine.

1903

if (installed_app_engine_ &&

1904

!installed_app_engine_->Chunk(context_unicode, tokens, &candidates)) {

1905

TC3_LOG(ERROR) << "Couldn't run installed app engine Chunk.";

return {};

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1909

// Annotate with the number annotator.

1910

if (number_annotator_ != nullptr &&

1911

!number_annotator_->FindAll(context_unicode, options.annotation_usecase,

1912

&candidates)) {

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1913

TC3_LOG(ERROR) << "Couldn't run number annotator FindAll.";

return {};

}

// Annotate with the duration annotator.

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1918

if (is_entity_type_enabled(Collections::Duration()) &&

1919

duration_annotator_ != nullptr &&

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

1920

!duration_annotator_->FindAll(context_unicode, tokens,

1921

options.annotation_usecase, &candidates)) {

Tony Mak

2019-03-20 17:35:13 +0000

[diff] [blame]

1922

TC3_LOG(ERROR) << "Couldn't run duration annotator FindAll.";

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

return {};

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1926

// Sort candidates according to their position in the input, so that the next

1927

// code can assume that any connected component of overlapping spans forms a

1928

// contiguous block.

1929

std::sort(candidates.begin(), candidates.end(),

1930

[](const AnnotatedSpan& a, const AnnotatedSpan& b) {

1931

return a.span.first < b.span.first;

1932

});

1933

1934

std::vector<int> candidate_indices;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1935

if (!ResolveConflicts(candidates, context, tokens,

1936

detected_text_language_tags, options.annotation_usecase,

1937

&interpreter_manager, &candidate_indices)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

1938

TC3_LOG(ERROR) << "Couldn't resolve conflicts.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return {};

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1942

std::vector<AnnotatedSpan> result;

1943

result.reserve(candidate_indices.size());

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1944

AnnotatedSpan aggregated_span;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1945

for (const int i : candidate_indices) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1946

if (candidates[i].span != aggregated_span.span) {

1947

if (!aggregated_span.classification.empty()) {

1948

result.push_back(std::move(aggregated_span));

1949

}

1950

aggregated_span =

1951

AnnotatedSpan(candidates[i].span, /*arg_classification=*/{});

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1952

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1953

if (candidates[i].classification.empty() ||

1954

ClassifiedAsOther(candidates[i].classification) ||

1955

FilteredForAnnotation(candidates[i])) {

1956

continue;

1957

}

1958

for (ClassificationResult& classification : candidates[i].classification) {

1959

aggregated_span.classification.push_back(std::move(classification));

1960

}

1961

}

1962

if (!aggregated_span.classification.empty()) {

1963

result.push_back(std::move(aggregated_span));

1964

}

1965

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

1966

// We generate all candidates and remove them later (with the exception of

1967

// date/time/duration entities) because there are complex interdependencies

1968

// between the entity types. E.g., the TLD of an email can be interpreted as a

1969

// URL, but most likely a user of the API does not want such annotations if

1970

// "url" is enabled and "email" is not.

1971

RemoveNotEnabledEntityTypes(is_entity_type_enabled, &result);

1972

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

1973

for (AnnotatedSpan& annotated_span : result) {

1974

SortClassificationResults(&annotated_span.classification);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

1975

}

1976

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

return result;

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

1980

// Derives the selection span for a regex match.
//
// If `config` declares no capturing groups, the span of capturing group 1 of
// `match` is used. Otherwise the result is the smallest span covering every
// matched group that is flagged with extend_selection().
//
// Returns {kInvalidIndex, kInvalidIndex} if the matcher reports an error, or
// if capturing groups are configured but none of the flagged ones matched.
CodepointSpan Annotator::ComputeSelectionBoundaries(
    const UniLib::RegexMatcher* match,
    const RegexModel_::Pattern* config) const {
  const CodepointSpan invalid_span = {kInvalidIndex, kInvalidIndex};
  const auto* groups = config->capturing_group();

  if (groups == nullptr) {
    // Use first capturing group to specify the selection.
    int status = UniLib::RegexMatcher::kNoError;
    const int start = match->Start(1, &status);
    const int end = match->End(1, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    return CodepointSpan{start, end};
  }

  CodepointSpan selection = invalid_span;
  const int group_count = groups->size();
  for (int group_id = 0; group_id < group_count; ++group_id) {
    if (!groups->Get(group_id)->extend_selection()) {
      continue;
    }

    // Check match and adjust bounds.
    int status = UniLib::RegexMatcher::kNoError;
    const int group_start = match->Start(group_id, &status);
    const int group_end = match->End(group_id, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return invalid_span;
    }
    if (group_start == kInvalidIndex || group_end == kInvalidIndex) {
      // The group did not take part in the match.
      continue;
    }

    if (selection.first == kInvalidIndex) {
      // First contributing group: take its bounds as they are.
      selection = {group_start, group_end};
      continue;
    }
    selection.first = std::min(selection.first, group_start);
    selection.second = std::max(selection.second, group_end);
  }
  return selection;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2021

// Returns true if `pattern` can contribute entity data: either it carries
// pattern-level serialized entity data, or at least one of its capturing
// groups specifies an entity field path or serialized entity data.
bool Annotator::HasEntityData(const RegexModel_::Pattern* pattern) const {
  if (pattern->serialized_entity_data() != nullptr) {
    return true;
  }
  if (pattern->capturing_group() == nullptr) {
    return false;
  }
  for (const RegexModel_::Pattern_::CapturingGroup* group :
       *pattern->capturing_group()) {
    const bool sets_field = group->entity_field_path() != nullptr;
    const bool has_static_data = group->serialized_entity_data() != nullptr;
    if (sets_field || has_static_data) {
      return true;
    }
  }
  return false;
}

bool Annotator::SerializedEntityDataFromRegexMatch(

2040

const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,

2041

std::string* serialized_entity_data) const {

2042

if (!HasEntityData(pattern)) {

2043

serialized_entity_data->clear();

2044

return true;

2045

}

2046

TC3_CHECK(entity_data_builder_ != nullptr);

2047

2048

std::unique_ptr<ReflectiveFlatbuffer> entity_data =

2049

entity_data_builder_->NewRoot();

2050

2051

TC3_CHECK(entity_data != nullptr);

2052

2053

// Set static entity data.

2054

if (pattern->serialized_entity_data() != nullptr) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2055

entity_data->MergeFromSerializedFlatbuffer(

2056

StringPiece(pattern->serialized_entity_data()->c_str(),

2057

pattern->serialized_entity_data()->size()));

2058

}

2059

2060

// Add entity data from rule capturing groups.

2061

if (pattern->capturing_group() != nullptr) {

2062

const int num_groups = pattern->capturing_group()->size();

2063

for (int i = 0; i < num_groups; i++) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2064

const RegexModel_::Pattern_::CapturingGroup* group =

2065

pattern->capturing_group()->Get(i);

2066

2067

// Check whether the group matched.

2068

Optional<std::string> group_match_text =

2069

GetCapturingGroupText(matcher, /*group_id=*/i);

2070

if (!group_match_text.has_value()) {

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2071

continue;

2072

}

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2073

2074

// Set static entity data from capturing group match.

2075

if (group->serialized_entity_data() != nullptr) {

2076

entity_data->MergeFromSerializedFlatbuffer(

2077

StringPiece(group->serialized_entity_data()->c_str(),

2078

group->serialized_entity_data()->size()));

2079

}

2080

2081

// Set entity field from capturing group text.

2082

if (group->entity_field_path() != nullptr) {

Tony Mak

2019-10-15 15:29:22 +0100

[diff] [blame]

2083

UnicodeText normalized_group_match_text =

2084

UTF8ToUnicodeText(group_match_text.value(), /*do_copy=*/false);

2085

2086

// Apply normalization if specified.

2087

if (group->normalization_options() != nullptr) {

2088

normalized_group_match_text =

2089

NormalizeText(unilib_, group->normalization_options(),

2090

normalized_group_match_text);

2091

}

2092

2093

if (!entity_data->ParseAndSet(

2094

group->entity_field_path(),

2095

normalized_group_match_text.ToUTF8String())) {

Tony Mak

2019-09-12 15:40:32 +0100

[diff] [blame]

2096

TC3_LOG(ERROR)

2097

<< "Could not set entity data from rule capturing group.";

2098

return false;

2099

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

}

}

}

*serialized_entity_data = entity_data->Serialize();

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2108

bool Annotator::RegexChunk(const UnicodeText& context_unicode,

2109

const std::vector<int>& rules,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2110

std::vector<AnnotatedSpan>* result,

2111

bool is_serialized_entity_data_enabled) const {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2112

for (int pattern_id : rules) {

2113

const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];

2114

const auto matcher = regex_pattern.pattern->Matcher(context_unicode);

2115

if (!matcher) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2116

TC3_LOG(ERROR) << "Could not get regex matcher for pattern: "

2117

<< pattern_id;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

int status = UniLib::RegexMatcher::kNoError;

2122

while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2123

if (regex_pattern.config->verification_options()) {

Tony Mak

2019-03-26 14:04:00 +0000

[diff] [blame]

2124

if (!VerifyRegexMatchCandidate(

2125

context_unicode.ToUTF8String(),

2126

regex_pattern.config->verification_options(),

2127

matcher->Group(1, &status).ToUTF8String(), matcher.get())) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2128

continue;

2129

}

2130

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2131

2132

std::string serialized_entity_data;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2133

if (is_serialized_entity_data_enabled) {

2134

if (!SerializedEntityDataFromRegexMatch(

2135

regex_pattern.config, matcher.get(), &serialized_entity_data)) {

2136

TC3_LOG(ERROR) << "Could not get entity data.";

2137

return false;

2138

}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2139

}

2140

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2141

result->emplace_back();

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2142

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2143

// Selection/annotation regular expressions need to specify a capturing

2144

// group specifying the selection.

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2145

result->back().span =

2146

ComputeSelectionBoundaries(matcher.get(), regex_pattern.config);

2147

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2148

result->back().classification = {

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2149

{regex_pattern.config->collection_name()->str(),

2150

regex_pattern.config->target_classification_score(),

2151

regex_pattern.config->priority_score()}};

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2152

2153

result->back().classification[0].serialized_entity_data =

2154

serialized_entity_data;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2160

bool Annotator::ModelChunk(int num_tokens, const TokenSpan& span_of_interest,

2161

tflite::Interpreter* selection_interpreter,

2162

const CachedFeatures& cached_features,

2163

std::vector<TokenSpan>* chunks) const {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2164

const int max_selection_span =

2165

selection_feature_processor_->GetOptions()->max_selection_span();

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2166

// The inference span is the span of interest expanded to include

2167

// max_selection_span tokens on either side, which is how far a selection can

2168

// stretch from the click.

2169

const TokenSpan inference_span = IntersectTokenSpans(

2170

ExpandTokenSpan(span_of_interest,

2171

/*num_tokens_left=*/max_selection_span,

2172

/*num_tokens_right=*/max_selection_span),

2173

{0, num_tokens});

2174

2175

std::vector<ScoredChunk> scored_chunks;

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2176

if (selection_feature_processor_->GetOptions()->bounds_sensitive_features() &&

2177

selection_feature_processor_->GetOptions()

2178

->bounds_sensitive_features()

2179

->enabled()) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2180

if (!ModelBoundsSensitiveScoreChunks(

2181

num_tokens, span_of_interest, inference_span, cached_features,

2182

selection_interpreter, &scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

} else {

if (!ModelClickContextScoreChunks(num_tokens, span_of_interest,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2187

cached_features, selection_interpreter,

2188

&scored_chunks)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2189

return false;

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2190

}

2191

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2192

std::sort(scored_chunks.rbegin(), scored_chunks.rend(),

2193

[](const ScoredChunk& lhs, const ScoredChunk& rhs) {

2194

return lhs.score < rhs.score;

2195

});

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2196

2197

// Traverse the candidate chunks from highest-scoring to lowest-scoring. Pick

2198

// them greedily as long as they do not overlap with any previously picked

2199

// chunks.

2200

std::vector<bool> token_used(TokenSpanSize(inference_span));

2201

chunks->clear();

2202

for (const ScoredChunk& scored_chunk : scored_chunks) {

2203

bool feasible = true;

2204

for (int i = scored_chunk.token_span.first;

2205

i < scored_chunk.token_span.second; ++i) {

2206

if (token_used[i - inference_span.first]) {

feasible = false;

break;

}

}

if (!feasible) {

continue;

}

for (int i = scored_chunk.token_span.first;

2217

i < scored_chunk.token_span.second; ++i) {

2218

token_used[i - inference_span.first] = true;

2219

}

2220

2221

chunks->push_back(scored_chunk.token_span);

2222

}

2223

2224

std::sort(chunks->begin(), chunks->end());

return true;

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2229

namespace {
// Updates the value at the given key in the map to maximum of the current value
// and the given value, or simply inserts the value if the key is not yet there.
//
// Implemented with a single emplace(): when the key is new, the value is
// inserted directly; when the key already exists, emplace() leaves the map
// untouched and returns an iterator to the existing entry, which is then
// raised to the maximum. This avoids the second lookup of find() + operator[]
// and does not require the mapped type to be default-constructible.
template <typename Map>
void UpdateMax(Map* map, typename Map::key_type key,
               typename Map::mapped_type value) {
  const auto insertion = map->emplace(key, value);
  if (!insertion.second) {
    auto& current = insertion.first->second;
    current = std::max(current, value);
  }
}
}  // namespace

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2244

bool Annotator::ModelClickContextScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2245

int num_tokens, const TokenSpan& span_of_interest,

2246

const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2247

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2248

std::vector<ScoredChunk>* scored_chunks) const {

2249

const int max_batch_size = model_->selection_options()->batch_size();

2250

2251

std::vector<float> all_features;

2252

std::map<TokenSpan, float> chunk_scores;

2253

for (int batch_start = span_of_interest.first;

2254

batch_start < span_of_interest.second; batch_start += max_batch_size) {

2255

const int batch_end =

2256

std::min(batch_start + max_batch_size, span_of_interest.second);

2257

2258

// Prepare features for the whole batch.

2259

all_features.clear();

2260

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2261

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2262

cached_features.AppendClickContextFeaturesForClick(click_pos,

&all_features);

}

// Run batched inference.

2267

const int batch_size = batch_end - batch_start;

2268

const int features_size = cached_features.OutputFeaturesSize();

2269

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2270

TensorView<float>(all_features.data(), {batch_size, features_size}),

2271

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2272

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2273

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2274

return false;

2275

}

2276

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2277

logits.dim(1) !=

2278

selection_feature_processor_->GetSelectionLabelCount()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2279

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int click_pos = batch_start; click_pos < batch_end; ++click_pos) {

2285

const std::vector<float> scores = ComputeSoftmax(

2286

logits.data() + logits.dim(1) * (click_pos - batch_start),

2287

logits.dim(1));

2288

for (int j = 0;

2289

j < selection_feature_processor_->GetSelectionLabelCount(); ++j) {

2290

TokenSpan relative_token_span;

2291

if (!selection_feature_processor_->LabelToTokenSpan(

2292

j, &relative_token_span)) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2293

TC3_LOG(ERROR) << "Couldn't map the label to a token span.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2294

return false;

2295

}

2296

const TokenSpan candidate_span = ExpandTokenSpan(

2297

SingleTokenSpan(click_pos), relative_token_span.first,

2298

relative_token_span.second);

2299

if (candidate_span.first >= 0 && candidate_span.second <= num_tokens) {

2300

UpdateMax(&chunk_scores, candidate_span, scores[j]);

}

}

}

}

scored_chunks->clear();

2307

scored_chunks->reserve(chunk_scores.size());

2308

for (const auto& entry : chunk_scores) {

2309

scored_chunks->push_back(ScoredChunk{entry.first, entry.second});

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2315

bool Annotator::ModelBoundsSensitiveScoreChunks(

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2316

int num_tokens, const TokenSpan& span_of_interest,

2317

const TokenSpan& inference_span, const CachedFeatures& cached_features,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2318

tflite::Interpreter* selection_interpreter,

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2319

std::vector<ScoredChunk>* scored_chunks) const {

2320

const int max_selection_span =

2321

selection_feature_processor_->GetOptions()->max_selection_span();

2322

const int max_chunk_length = selection_feature_processor_->GetOptions()

2323

->selection_reduced_output_space()

2324

? max_selection_span + 1

2325

: 2 * max_selection_span + 1;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2326

const bool score_single_token_spans_as_zero =

2327

selection_feature_processor_->GetOptions()

2328

->bounds_sensitive_features()

2329

->score_single_token_spans_as_zero();

2330

2331

scored_chunks->clear();

2332

if (score_single_token_spans_as_zero) {

2333

scored_chunks->reserve(TokenSpanSize(span_of_interest));

2334

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2335

2336

// Prepare all chunk candidates into one batch:

2337

// - Are contained in the inference span

2338

// - Have a non-empty intersection with the span of interest

2339

// - Are at least one token long

2340

// - Are not longer than the maximum chunk length

2341

std::vector<TokenSpan> candidate_spans;

2342

for (int start = inference_span.first; start < span_of_interest.second;

2343

++start) {

2344

const int leftmost_end_index = std::max(start, span_of_interest.first) + 1;

2345

for (int end = leftmost_end_index;

2346

end <= inference_span.second && end - start <= max_chunk_length;

2347

++end) {

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2348

const TokenSpan candidate_span = {start, end};

2349

if (score_single_token_spans_as_zero &&

2350

TokenSpanSize(candidate_span) == 1) {

2351

// Do not include the single token span in the batch, add a zero score

2352

// for it directly to the output.

2353

scored_chunks->push_back(ScoredChunk{candidate_span, 0.0f});

2354

} else {

2355

candidate_spans.push_back(candidate_span);

2356

}

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

}

const int max_batch_size = model_->selection_options()->batch_size();

2361

2362

std::vector<float> all_features;

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2363

scored_chunks->reserve(scored_chunks->size() + candidate_spans.size());

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2364

for (int batch_start = 0; batch_start < candidate_spans.size();

2365

batch_start += max_batch_size) {

2366

const int batch_end = std::min(batch_start + max_batch_size,

2367

static_cast<int>(candidate_spans.size()));

2368

2369

// Prepare features for the whole batch.

2370

all_features.clear();

2371

all_features.reserve(max_batch_size * cached_features.OutputFeaturesSize());

2372

for (int i = batch_start; i < batch_end; ++i) {

2373

cached_features.AppendBoundsSensitiveFeaturesForSpan(candidate_spans[i],

&all_features);

}

// Run batched inference.

2378

const int batch_size = batch_end - batch_start;

2379

const int features_size = cached_features.OutputFeaturesSize();

2380

TensorView<float> logits = selection_executor_->ComputeLogits(

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2381

TensorView<float>(all_features.data(), {batch_size, features_size}),

2382

selection_interpreter);

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2383

if (!logits.is_valid()) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2384

TC3_LOG(ERROR) << "Couldn't compute logits.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2385

return false;

2386

}

2387

if (logits.dims() != 2 || logits.dim(0) != batch_size ||

2388

logits.dim(1) != 1) {

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2389

TC3_LOG(ERROR) << "Mismatching output.";

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

return false;

}

// Save results.

for (int i = batch_start; i < batch_end; ++i) {

2395

scored_chunks->push_back(

2396

ScoredChunk{candidate_spans[i], logits.data()[i - batch_start]});

}

}

return true;

}

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2403

bool Annotator::DatetimeChunk(const UnicodeText& context_unicode,

2404

int64 reference_time_ms_utc,

2405

const std::string& reference_timezone,

2406

const std::string& locales, ModeFlag mode,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2407

AnnotationUsecase annotation_usecase,

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2408

bool is_serialized_entity_data_enabled,

Tony Mak

2018-09-17 11:48:50 +0100

[diff] [blame]

2409

std::vector<AnnotatedSpan>* result) const {

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2410

if (!datetime_parser_) {

Lukas Zilka

2018-04-25 11:38:51 +0200

[diff] [blame]

2411

return true;

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2412

}

2413

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2414

std::vector<DatetimeParseResultSpan> datetime_spans;

2415

if (!datetime_parser_->Parse(context_unicode, reference_time_ms_utc,

Lukas Zilka

2018-03-08 14:48:21 +0100

[diff] [blame]

2416

reference_timezone, locales, mode,

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2417

annotation_usecase,

Lukas Zilka

2018-03-28 18:09:48 +0200

[diff] [blame]

2418

/*anchor_start_end=*/false, &datetime_spans)) {

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

2419

return false;

2420

}

2421

for (const DatetimeParseResultSpan& datetime_span : datetime_spans) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2422

AnnotatedSpan annotated_span;

2423

annotated_span.span = datetime_span.span;

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2424

for (const DatetimeParseResult& parse_result : datetime_span.data) {

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2425

annotated_span.classification.emplace_back(

2426

PickCollectionForDatetime(parse_result),

2427

datetime_span.target_classification_score,

2428

datetime_span.priority_score);

2429

annotated_span.classification.back().datetime_parse_result = parse_result;

Tony Mak

2019-04-10 16:12:15 +0100

[diff] [blame]

2430

if (is_serialized_entity_data_enabled) {

2431

annotated_span.classification.back().serialized_entity_data =

2432

CreateDatetimeSerializedEntityData(parse_result);

2433

}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2434

}

Tony Mak

2019-03-22 13:36:41 +0000

[diff] [blame]

2435

annotated_span.source = AnnotatedSpan::Source::DATETIME;

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2436

result->push_back(std::move(annotated_span));

Lukas Zilka

2018-02-09 10:25:19 +0100

[diff] [blame]

}

return true;

}

Tony Mak

2019-03-04 15:58:11 +0000

[diff] [blame]

2441

// Accessor for the model pointer held by this annotator.
const Model* Annotator::model() const {
  return model_;
}

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2442

// Accessor for the entity data reflection schema pointer held by this
// annotator.
const reflection::Schema* Annotator::entity_data_schema() const {
  const reflection::Schema* const schema = entity_data_schema_;
  return schema;
}

Tony Mak

2019-01-16 15:56:48 +0000

[diff] [blame]

2445

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

2446

// Returns the result of LoadAndVerifyModel() for the given buffer and size,
// or nullptr when `buffer` is null.
const Model* ViewModel(const void* buffer, int size) {
  if (buffer != nullptr) {
    return LoadAndVerifyModel(buffer, size);
  }
  return nullptr;
}

2453

Tony Mak

2019-02-20 18:25:39 +0000

[diff] [blame]

2454

bool Annotator::LookUpKnowledgeEntity(

2455

const std::string& id, std::string* serialized_knowledge_result) const {

2456

return knowledge_engine_ &&

2457

knowledge_engine_->LookUpEntity(id, serialized_knowledge_result);

2458

}

2459

Tony Mak