Blame - token-feature-extractor.cc - platform/external/libtextclassifier

2017-03-10 12:29:15 +0100

[diff] [blame]

/*

*

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

17

#include "token-feature-extractor.h"

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

18

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

19

#include <cctype>

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

20

#include <string>

21

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

22

#include "util/base/logging.h"

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

23

#include "util/hash/farmhash.h"

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

24

#include "util/strings/stringpiece.h"

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

25

#include "util/utf8/unicodetext.h"

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

26

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

27

namespace libtextclassifier2 {

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

28

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

29

namespace {

30

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

31

std::string RemapTokenAscii(const std::string& token,

32

const TokenFeatureExtractorOptions& options) {

33

if (!options.remap_digits && !options.lowercase_tokens) {

return token;

}

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

37

std::string copy = token;

38

for (int i = 0; i < token.size(); ++i) {

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

39

if (options.remap_digits && isdigit(copy[i])) {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

40

copy[i] = '0';

41

}

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

42

if (options.lowercase_tokens) {

43

copy[i] = tolower(copy[i]);

44

}

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

}

return copy;

}

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

49

void RemapTokenUnicode(const std::string& token,

50

const TokenFeatureExtractorOptions& options,

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

51

const UniLib& unilib, UnicodeText* remapped) {

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

52

if (!options.remap_digits && !options.lowercase_tokens) {

53

// Leave remapped untouched.

54

return;

55

}

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

56

57

UnicodeText word = UTF8ToUnicodeText(token, /*do_copy=*/false);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

58

remapped->clear();

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

59

for (auto it = word.begin(); it != word.end(); ++it) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

60

if (options.remap_digits && unilib.IsDigit(*it)) {

61

remapped->AppendCodepoint('0');

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

62

} else if (options.lowercase_tokens) {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

63

remapped->AppendCodepoint(unilib.ToLower(*it));

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

64

} else {

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

65

remapped->AppendCodepoint(*it);

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

}

}

}

} // namespace

// Stores |options| and |unilib|, then builds one regex pattern per entry of
// options.regexp_features (in the same order) via
// UniLib::CreateRegexPattern and keeps them in regex_patterns_.
TokenFeatureExtractor::TokenFeatureExtractor(
    const TokenFeatureExtractorOptions& options, const UniLib& unilib)
    : options_(options), unilib_(unilib) {
  for (const std::string& regexp_source : options.regexp_features) {
    // Each stored pattern is owned by a std::unique_ptr<UniLib::RegexPattern>.
    regex_patterns_.emplace_back(unilib_.CreateRegexPattern(regexp_source));
  }
}

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

80

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

81

// Maps |token| to an embedding bucket id.
//
// With an empty options_.allowed_chargrams, the result is the token's
// fingerprint modulo options_.num_buckets. Otherwise bucket 1 is returned for
// the literal "<PAD>", bucket 0 for any token missing from allowed_chargrams,
// and every other token is hashed into the remaining buckets, offset by the
// two reserved ones.
int TokenFeatureExtractor::HashToken(StringPiece token) const {
  const auto& allowed = options_.allowed_chargrams;
  if (allowed.empty()) {
    return tc2farmhash::Fingerprint64(token) % options_.num_buckets;
  }

  // Padding and out-of-vocabulary tokens have extra buckets reserved because
  // they are special and important tokens, and we don't want them to share
  // embedding with other charactergrams.
  // TODO(zilka): Experimentally verify.
  const int kNumExtraBuckets = 2;
  const std::string token_string = token.ToString();

  if (token_string == "<PAD>") {
    return 1;
  }
  if (allowed.find(token_string) == allowed.end()) {
    return 0;  // Out-of-vocabulary.
  }
  return (tc2farmhash::Fingerprint64(token) %
          (options_.num_buckets - kNumExtraBuckets)) +
         kNumExtraBuckets;
}

103

104

std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(

105

const Token& token) const {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

106

if (options_.unicode_aware_features) {

107

return ExtractCharactergramFeaturesUnicode(token);

108

} else {

109

return ExtractCharactergramFeaturesAscii(token);

}

}

std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(

114

const Token& token) const {

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

115

std::vector<int> result;

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

116

if (token.is_padding || token.value.empty()) {

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

117

result.push_back(HashToken("<PAD>"));

118

} else {

Matt Sharifi

2017-04-24 13:30:47 +0200

[diff] [blame]

119

const std::string word = RemapTokenAscii(token.value, options_);

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

120

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

121

// Trim words that are over max_word_length characters.

122

const int max_word_length = options_.max_word_length;

123

std::string feature_word;

124

if (word.size() > max_word_length) {

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

125

feature_word =

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

126

"^" + word.substr(0, max_word_length / 2) + "\1" +

127

word.substr(word.size() - max_word_length / 2, max_word_length / 2) +

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

128

"$";

129

} else {

130

// Add a prefix and suffix to the word.

131

feature_word = "^" + word + "$";

132

}

133

134

// Upper-bound the number of charactergram extracted to avoid resizing.

135

result.reserve(options_.chargram_orders.size() * feature_word.size());

136

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

137

if (options_.chargram_orders.empty()) {

138

result.push_back(HashToken(feature_word));

139

} else {

140

// Generate the character-grams.

141

for (int chargram_order : options_.chargram_orders) {

142

if (chargram_order == 1) {

143

for (int i = 1; i < feature_word.size() - 1; ++i) {

144

result.push_back(

145

HashToken(StringPiece(feature_word, /*offset=*/i, /*len=*/1)));

}

} else {

for (int i = 0;

i < static_cast<int>(feature_word.size()) - chargram_order + 1;

150

++i) {

151

result.push_back(HashToken(StringPiece(feature_word, /*offset=*/i,

152

/*len=*/chargram_order)));

153

}

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

}

}

}

}

return result;

}

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

161

std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode(

162

const Token& token) const {

163

std::vector<int> result;

Lukas Zilka

2017-04-06 15:54:24 +0200

[diff] [blame]

164

if (token.is_padding || token.value.empty()) {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

165

result.push_back(HashToken("<PAD>"));

166

} else {

167

UnicodeText word = UTF8ToUnicodeText(token.value, /*do_copy=*/false);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

168

RemapTokenUnicode(token.value, options_, unilib_, &word);

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

169

170

// Trim the word if needed by finding a left-cut point and right-cut point.

171

auto left_cut = word.begin();

172

auto right_cut = word.end();

173

for (int i = 0; i < options_.max_word_length / 2; i++) {

174

if (left_cut < right_cut) {

175

++left_cut;

176

}

177

if (left_cut < right_cut) {

--right_cut;

}

}

std::string feature_word;

183

if (left_cut == right_cut) {

184

feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$";

} else {

// clang-format off

feature_word = "^" +

word.UTF8Substring(word.begin(), left_cut) +

189

"\1" +

190

word.UTF8Substring(right_cut, word.end()) +

"$";

// clang-format on

}

const UnicodeText feature_word_unicode =

196

UTF8ToUnicodeText(feature_word, /*do_copy=*/false);

197

198

// Upper-bound the number of charactergram extracted to avoid resizing.

199

result.reserve(options_.chargram_orders.size() * feature_word.size());

200

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

201

if (options_.chargram_orders.empty()) {

202

result.push_back(HashToken(feature_word));

203

} else {

204

// Generate the character-grams.

205

for (int chargram_order : options_.chargram_orders) {

206

UnicodeText::const_iterator it_start = feature_word_unicode.begin();

207

UnicodeText::const_iterator it_end = feature_word_unicode.end();

208

if (chargram_order == 1) {

209

++it_start;

210

--it_end;

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

211

}

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

212

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

213

UnicodeText::const_iterator it_chargram_start = it_start;

214

UnicodeText::const_iterator it_chargram_end = it_start;

215

bool chargram_is_complete = true;

216

for (int i = 0; i < chargram_order; ++i) {

217

if (it_chargram_end == it_end) {

218

chargram_is_complete = false;

break;

}

++it_chargram_end;

}

if (!chargram_is_complete) {

continue;

}

for (; it_chargram_end <= it_end;

228

++it_chargram_start, ++it_chargram_end) {

229

const int length_bytes =

230

it_chargram_end.utf8_data() - it_chargram_start.utf8_data();

231

result.push_back(HashToken(

232

StringPiece(it_chargram_start.utf8_data(), length_bytes)));

233

}

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

}

}

}

return result;

}

Lukas Zilka

2017-04-07 19:55:11 +0200

[diff] [blame]

240

bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

241

std::vector<int>* sparse_features,

242

std::vector<float>* dense_features) const {

243

if (sparse_features == nullptr || dense_features == nullptr) {

return false;

}

*sparse_features = ExtractCharactergramFeatures(token);

248

249

if (options_.extract_case_feature) {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

250

if (options_.unicode_aware_features) {

251

UnicodeText token_unicode =

252

UTF8ToUnicodeText(token.value, /*do_copy=*/false);

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

253

const bool is_upper = unilib_.IsUpper(*token_unicode.begin());

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

254

if (!token.value.empty() && is_upper) {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

255

dense_features->push_back(1.0);

256

} else {

257

dense_features->push_back(-1.0);

258

}

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

259

} else {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

260

if (!token.value.empty() && isupper(*token.value.begin())) {

261

dense_features->push_back(1.0);

262

} else {

263

dense_features->push_back(-1.0);

264

}

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

}

}

if (options_.extract_selection_mask_feature) {

Lukas Zilka

6bb39a8

2017-04-07 19:55:11 +0200

[diff] [blame]

269

if (is_in_span) {

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

270

dense_features->push_back(1.0);

271

} else {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

272

if (options_.unicode_aware_features) {

273

dense_features->push_back(-1.0);

274

} else {

275

dense_features->push_back(0.0);

276

}

Matt Sharifi

2017-03-10 12:29:15 +0100

[diff] [blame]

}

}

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

280

// Add regexp features.

281

if (!regex_patterns_.empty()) {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

282

for (int i = 0; i < regex_patterns_.size(); ++i) {

283

if (!regex_patterns_[i].get()) {

284

dense_features->push_back(-1.0);

continue;

}

Lukas Zilka

2018-01-24 11:11:20 +0100

[diff] [blame]

288

if (regex_patterns_[i]->Matches(token.value)) {

Lukas Zilka

2017-04-03 17:32:27 +0200

[diff] [blame]

289

dense_features->push_back(1.0);

290

} else {

291

dense_features->push_back(-1.0);

292

}

293

}

294

}

Lukas Zilka

2017-10-11 10:50:05 +0200

[diff] [blame]

295

Matt Sharifi