blob: 758ef80eddee620ce9269d31e6dd68cc68777187 [file] [log] [blame]
Eugene Klyuchnikov771eb102015-11-27 11:27:11 +01001/* Copyright 2013 Google Inc. All Rights Reserved.
2
Eugene Klyuchnikov24ffa782015-12-11 11:11:51 +01003 Distributed under MIT license.
Eugene Klyuchnikov771eb102015-11-27 11:27:11 +01004 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
Zoltan Szabadka66098832015-06-12 16:45:17 +02007#include "./static_dict.h"
8
Eugene Kliuchnikov02829182016-06-03 10:51:04 +02009#include "../common/dictionary.h"
Eugene Kliuchnikovda254cf2017-12-12 14:33:12 +010010#include "../common/platform.h"
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050011#include "../common/transform.h"
12#include "./encoder_dict.h"
Zoltan Szabadka66098832015-06-12 16:45:17 +020013#include "./find_match_length.h"
Zoltan Szabadka66098832015-06-12 16:45:17 +020014
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020015#if defined(__cplusplus) || defined(c_plusplus)
16extern "C" {
17#endif
Zoltan Szabadka66098832015-06-12 16:45:17 +020018
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050019/* TODO: use BrotliTransforms.cutOffTransforms instead. */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020020static const uint8_t kOmitLastNTransforms[10] = {
21 0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
22};
23
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050024static BROTLI_INLINE uint32_t Hash(const uint8_t* data) {
Eugene Kliuchnikovda254cf2017-12-12 14:33:12 +010025 uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kDictHashMul32;
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +020026 /* The higher bits contain more mixture from the multiplication,
27 so we take our results from there. */
Zoltan Szabadka66098832015-06-12 16:45:17 +020028 return h >> (32 - kDictNumBits);
29}
30
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020031static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code,
32 uint32_t* matches) {
33 uint32_t match = (uint32_t)((distance << 5) + len_code);
34 matches[len] = BROTLI_MIN(uint32_t, matches[len], match);
Zoltan Szabadka66098832015-06-12 16:45:17 +020035}
36
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +010037static BROTLI_INLINE size_t DictMatchLength(const BrotliDictionary* dictionary,
38 const uint8_t* data,
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020039 size_t id,
40 size_t len,
41 size_t maxlen) {
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +010042 const size_t offset = dictionary->offsets_by_length[len] + len * id;
43 return FindMatchLengthWithLimit(&dictionary->data[offset], data,
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020044 BROTLI_MIN(size_t, len, maxlen));
Zoltan Szabadka66098832015-06-12 16:45:17 +020045}
46
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +010047static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary,
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020048 DictWord w, const uint8_t* data, size_t max_length) {
49 if (w.len > max_length) {
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020050 return BROTLI_FALSE;
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020051 } else {
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +010052 const size_t offset = dictionary->offsets_by_length[w.len] +
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020053 (size_t)w.len * (size_t)w.idx;
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +010054 const uint8_t* dict = &dictionary->data[offset];
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020055 if (w.transform == 0) {
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +020056 /* Match against base dictionary word. */
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020057 return
58 TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict, data, w.len) == w.len);
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020059 } else if (w.transform == 10) {
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +020060 /* Match against uppercase first transform.
61 Note that there are only ASCII uppercase words in the lookup table. */
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020062 return TO_BROTLI_BOOL(dict[0] >= 'a' && dict[0] <= 'z' &&
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020063 (dict[0] ^ 32) == data[0] &&
64 FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) ==
65 w.len - 1u);
66 } else {
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +020067 /* Match against uppercase all transform.
68 Note that there are only ASCII uppercase words in the lookup table. */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020069 size_t i;
70 for (i = 0; i < w.len; ++i) {
71 if (dict[i] >= 'a' && dict[i] <= 'z') {
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020072 if ((dict[i] ^ 32) != data[i]) return BROTLI_FALSE;
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020073 } else {
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020074 if (dict[i] != data[i]) return BROTLI_FALSE;
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020075 }
Zoltan Szabadka66098832015-06-12 16:45:17 +020076 }
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020077 return BROTLI_TRUE;
Zoltan Szabadka66098832015-06-12 16:45:17 +020078 }
Zoltan Szabadka66098832015-06-12 16:45:17 +020079 }
80}
81
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020082BROTLI_BOOL BrotliFindAllStaticDictionaryMatches(
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050083 const BrotliEncoderDictionary* dictionary, const uint8_t* data,
Eugene Kliuchnikovda254cf2017-12-12 14:33:12 +010084 size_t min_length, size_t max_length, uint32_t* matches) {
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020085 BROTLI_BOOL has_found_match = BROTLI_FALSE;
86 {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050087 size_t offset = dictionary->buckets[Hash(data)];
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020088 BROTLI_BOOL end = !offset;
89 while (!end) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050090 DictWord w = dictionary->dict_words[offset++];
Eugene Kliuchnikov8d3fdc12017-01-26 11:32:18 +010091 const size_t l = w.len & 0x1F;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050092 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
Zoltan Szabadka8844b7f2016-01-07 16:27:49 +010093 const size_t id = w.idx;
Eugene Kliuchnikov20481892016-07-26 14:41:59 +020094 end = !!(w.len & 0x80);
95 w.len = (uint8_t)l;
Zoltan Szabadka66098832015-06-12 16:45:17 +020096 if (w.transform == 0) {
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +010097 const size_t matchlen =
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -050098 DictMatchLength(dictionary->words, data, id, l, max_length);
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +020099 const uint8_t* s;
100 size_t minlen;
101 size_t maxlen;
102 size_t len;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500103 /* Transform "" + BROTLI_TRANSFORM_IDENTITY + "" */
Zoltan Szabadka66098832015-06-12 16:45:17 +0200104 if (matchlen == l) {
105 AddMatch(id, l, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200106 has_found_match = BROTLI_TRUE;
Zoltan Szabadka66098832015-06-12 16:45:17 +0200107 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500108 /* Transforms "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "" and
109 "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "ing " */
Zoltan Szabadka66098832015-06-12 16:45:17 +0200110 if (matchlen >= l - 1) {
111 AddMatch(id + 12 * n, l - 1, l, matches);
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200112 if (l + 2 < max_length &&
113 data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
Zoltan Szabadka66098832015-06-12 16:45:17 +0200114 data[l + 2] == ' ') {
115 AddMatch(id + 49 * n, l + 3, l, matches);
116 }
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200117 has_found_match = BROTLI_TRUE;
Zoltan Szabadka66098832015-06-12 16:45:17 +0200118 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500119 /* Transform "" + BROTLI_TRANSFORM_OMIT_LAST_# + "" (# = 2 .. 9) */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200120 minlen = min_length;
121 if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9);
122 maxlen = BROTLI_MIN(size_t, matchlen, l - 2);
123 for (len = minlen; len <= maxlen; ++len) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200124 AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200125 has_found_match = BROTLI_TRUE;
Zoltan Szabadka66098832015-06-12 16:45:17 +0200126 }
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200127 if (matchlen < l || l + 6 >= max_length) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200128 continue;
129 }
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200130 s = &data[l];
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500131 /* Transforms "" + BROTLI_TRANSFORM_IDENTITY + <suffix> */
Zoltan Szabadka66098832015-06-12 16:45:17 +0200132 if (s[0] == ' ') {
133 AddMatch(id + n, l + 1, l, matches);
134 if (s[1] == 'a') {
135 if (s[2] == ' ') {
136 AddMatch(id + 28 * n, l + 3, l, matches);
137 } else if (s[2] == 's') {
138 if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
139 } else if (s[2] == 't') {
140 if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
141 } else if (s[2] == 'n') {
142 if (s[3] == 'd' && s[4] == ' ') {
143 AddMatch(id + 10 * n, l + 5, l, matches);
144 }
145 }
146 } else if (s[1] == 'b') {
147 if (s[2] == 'y' && s[3] == ' ') {
148 AddMatch(id + 38 * n, l + 4, l, matches);
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200149 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200150 } else if (s[1] == 'i') {
151 if (s[2] == 'n') {
152 if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
153 } else if (s[2] == 's') {
154 if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
155 }
156 } else if (s[1] == 'f') {
157 if (s[2] == 'o') {
158 if (s[3] == 'r' && s[4] == ' ') {
159 AddMatch(id + 25 * n, l + 5, l, matches);
160 }
161 } else if (s[2] == 'r') {
162 if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
163 AddMatch(id + 37 * n, l + 6, l, matches);
164 }
165 }
166 } else if (s[1] == 'o') {
167 if (s[2] == 'f') {
168 if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
169 } else if (s[2] == 'n') {
170 if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
171 }
172 } else if (s[1] == 'n') {
173 if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
174 AddMatch(id + 80 * n, l + 5, l, matches);
175 }
176 } else if (s[1] == 't') {
177 if (s[2] == 'h') {
178 if (s[3] == 'e') {
179 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
180 } else if (s[3] == 'a') {
181 if (s[4] == 't' && s[5] == ' ') {
182 AddMatch(id + 29 * n, l + 6, l, matches);
183 }
184 }
185 } else if (s[2] == 'o') {
186 if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
187 }
188 } else if (s[1] == 'w') {
189 if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
190 AddMatch(id + 35 * n, l + 6, l, matches);
191 }
192 }
193 } else if (s[0] == '"') {
194 AddMatch(id + 19 * n, l + 1, l, matches);
195 if (s[1] == '>') {
196 AddMatch(id + 21 * n, l + 2, l, matches);
197 }
198 } else if (s[0] == '.') {
199 AddMatch(id + 20 * n, l + 1, l, matches);
200 if (s[1] == ' ') {
201 AddMatch(id + 31 * n, l + 2, l, matches);
202 if (s[2] == 'T' && s[3] == 'h') {
203 if (s[4] == 'e') {
204 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
205 } else if (s[4] == 'i') {
206 if (s[5] == 's' && s[6] == ' ') {
207 AddMatch(id + 75 * n, l + 7, l, matches);
208 }
209 }
210 }
211 }
212 } else if (s[0] == ',') {
213 AddMatch(id + 76 * n, l + 1, l, matches);
214 if (s[1] == ' ') {
215 AddMatch(id + 14 * n, l + 2, l, matches);
216 }
217 } else if (s[0] == '\n') {
218 AddMatch(id + 22 * n, l + 1, l, matches);
219 if (s[1] == '\t') {
220 AddMatch(id + 50 * n, l + 2, l, matches);
221 }
222 } else if (s[0] == ']') {
223 AddMatch(id + 24 * n, l + 1, l, matches);
224 } else if (s[0] == '\'') {
225 AddMatch(id + 36 * n, l + 1, l, matches);
226 } else if (s[0] == ':') {
227 AddMatch(id + 51 * n, l + 1, l, matches);
228 } else if (s[0] == '(') {
229 AddMatch(id + 57 * n, l + 1, l, matches);
230 } else if (s[0] == '=') {
231 if (s[1] == '"') {
232 AddMatch(id + 70 * n, l + 2, l, matches);
233 } else if (s[1] == '\'') {
234 AddMatch(id + 86 * n, l + 2, l, matches);
235 }
236 } else if (s[0] == 'a') {
237 if (s[1] == 'l' && s[2] == ' ') {
238 AddMatch(id + 84 * n, l + 3, l, matches);
239 }
240 } else if (s[0] == 'e') {
241 if (s[1] == 'd') {
242 if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
243 } else if (s[1] == 'r') {
244 if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
245 } else if (s[1] == 's') {
246 if (s[2] == 't' && s[3] == ' ') {
247 AddMatch(id + 95 * n, l + 4, l, matches);
248 }
249 }
250 } else if (s[0] == 'f') {
251 if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
252 AddMatch(id + 90 * n, l + 4, l, matches);
253 }
254 } else if (s[0] == 'i') {
255 if (s[1] == 'v') {
256 if (s[2] == 'e' && s[3] == ' ') {
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200257 AddMatch(id + 92 * n, l + 4, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200258 }
259 } else if (s[1] == 'z') {
260 if (s[2] == 'e' && s[3] == ' ') {
261 AddMatch(id + 100 * n, l + 4, l, matches);
262 }
263 }
264 } else if (s[0] == 'l') {
265 if (s[1] == 'e') {
266 if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
267 AddMatch(id + 93 * n, l + 5, l, matches);
268 }
269 } else if (s[1] == 'y') {
270 if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
271 }
272 } else if (s[0] == 'o') {
273 if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
274 AddMatch(id + 106 * n, l + 4, l, matches);
275 }
276 }
277 } else {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500278 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
279 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
280 transform. */
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200281 const BROTLI_BOOL is_all_caps =
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500282 TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST);
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200283 const uint8_t* s;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500284 if (!IsMatch(dictionary->words, w, data, max_length)) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200285 continue;
286 }
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200287 /* Transform "" + kUppercase{First,All} + "" */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200288 AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200289 has_found_match = BROTLI_TRUE;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200290 if (l + 1 >= max_length) {
291 continue;
292 }
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200293 /* Transforms "" + kUppercase{First,All} + <suffix> */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200294 s = &data[l];
Zoltan Szabadka66098832015-06-12 16:45:17 +0200295 if (s[0] == ' ') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200296 AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200297 } else if (s[0] == '"') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200298 AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200299 if (s[1] == '>') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200300 AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200301 }
302 } else if (s[0] == '.') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200303 AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200304 if (s[1] == ' ') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200305 AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200306 }
307 } else if (s[0] == ',') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200308 AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200309 if (s[1] == ' ') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200310 AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200311 }
312 } else if (s[0] == '\'') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200313 AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200314 } else if (s[0] == '(') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200315 AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200316 } else if (s[0] == '=') {
317 if (s[1] == '"') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200318 AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200319 } else if (s[1] == '\'') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200320 AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200321 }
322 }
323 }
324 }
325 }
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200326 /* Transforms with prefixes " " and "." */
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200327 if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) {
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200328 BROTLI_BOOL is_space = TO_BROTLI_BOOL(data[0] == ' ');
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500329 size_t offset = dictionary->buckets[Hash(&data[1])];
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200330 BROTLI_BOOL end = !offset;
331 while (!end) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500332 DictWord w = dictionary->dict_words[offset++];
Eugene Kliuchnikov8d3fdc12017-01-26 11:32:18 +0100333 const size_t l = w.len & 0x1F;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500334 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
Zoltan Szabadka8844b7f2016-01-07 16:27:49 +0100335 const size_t id = w.idx;
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200336 end = !!(w.len & 0x80);
337 w.len = (uint8_t)l;
Zoltan Szabadka66098832015-06-12 16:45:17 +0200338 if (w.transform == 0) {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200339 const uint8_t* s;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500340 if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200341 continue;
342 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500343 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + "" and
344 "." + BROTLI_TRANSFORM_IDENTITY + "" */
Zoltan Szabadka66098832015-06-12 16:45:17 +0200345 AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200346 has_found_match = BROTLI_TRUE;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200347 if (l + 2 >= max_length) {
348 continue;
349 }
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500350 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + <suffix> and
351 "." + BROTLI_TRANSFORM_IDENTITY + <suffix>
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200352 */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200353 s = &data[l + 1];
Zoltan Szabadka66098832015-06-12 16:45:17 +0200354 if (s[0] == ' ') {
355 AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
356 } else if (s[0] == '(') {
357 AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
358 } else if (is_space) {
359 if (s[0] == ',') {
360 AddMatch(id + 103 * n, l + 2, l, matches);
361 if (s[1] == ' ') {
362 AddMatch(id + 33 * n, l + 3, l, matches);
363 }
364 } else if (s[0] == '.') {
365 AddMatch(id + 71 * n, l + 2, l, matches);
366 if (s[1] == ' ') {
367 AddMatch(id + 52 * n, l + 3, l, matches);
368 }
369 } else if (s[0] == '=') {
370 if (s[1] == '"') {
371 AddMatch(id + 81 * n, l + 3, l, matches);
372 } else if (s[1] == '\'') {
373 AddMatch(id + 98 * n, l + 3, l, matches);
374 }
375 }
376 }
377 } else if (is_space) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500378 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
379 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
380 transform. */
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200381 const BROTLI_BOOL is_all_caps =
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500382 TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST);
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200383 const uint8_t* s;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500384 if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200385 continue;
386 }
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200387 /* Transforms " " + kUppercase{First,All} + "" */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200388 AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200389 has_found_match = BROTLI_TRUE;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200390 if (l + 2 >= max_length) {
391 continue;
392 }
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200393 /* Transforms " " + kUppercase{First,All} + <suffix> */
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200394 s = &data[l + 1];
Zoltan Szabadka66098832015-06-12 16:45:17 +0200395 if (s[0] == ' ') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200396 AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200397 } else if (s[0] == ',') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200398 if (!is_all_caps) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200399 AddMatch(id + 109 * n, l + 2, l, matches);
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200400 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200401 if (s[1] == ' ') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200402 AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200403 }
404 } else if (s[0] == '.') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200405 AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200406 if (s[1] == ' ') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200407 AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200408 }
409 } else if (s[0] == '=') {
410 if (s[1] == '"') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200411 AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200412 } else if (s[1] == '\'') {
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200413 AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches);
Zoltan Szabadka66098832015-06-12 16:45:17 +0200414 }
415 }
416 }
417 }
418 }
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200419 if (max_length >= 6) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500420 /* Transforms with prefixes "e ", "s ", ", " and "\xC2\xA0" */
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200421 if ((data[1] == ' ' &&
422 (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500423 (data[0] == 0xC2 && data[1] == 0xA0)) {
424 size_t offset = dictionary->buckets[Hash(&data[2])];
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200425 BROTLI_BOOL end = !offset;
426 while (!end) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500427 DictWord w = dictionary->dict_words[offset++];
Eugene Kliuchnikov8d3fdc12017-01-26 11:32:18 +0100428 const size_t l = w.len & 0x1F;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500429 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
Zoltan Szabadka8844b7f2016-01-07 16:27:49 +0100430 const size_t id = w.idx;
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200431 end = !!(w.len & 0x80);
432 w.len = (uint8_t)l;
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +0100433 if (w.transform == 0 &&
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500434 IsMatch(dictionary->words, w, &data[2], max_length - 2)) {
435 if (data[0] == 0xC2) {
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200436 AddMatch(id + 102 * n, l + 2, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200437 has_found_match = BROTLI_TRUE;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200438 } else if (l + 2 < max_length && data[l + 2] == ' ') {
Zoltan Szabadka8844b7f2016-01-07 16:27:49 +0100439 size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200440 AddMatch(id + t * n, l + 3, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200441 has_found_match = BROTLI_TRUE;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200442 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200443 }
444 }
445 }
446 }
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200447 if (max_length >= 9) {
Eugene Kliuchnikov352b0b22016-06-03 11:19:23 +0200448 /* Transforms with prefixes " the " and ".com/" */
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200449 if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
450 data[3] == 'e' && data[4] == ' ') ||
451 (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
452 data[3] == 'm' && data[4] == '/')) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500453 size_t offset = dictionary->buckets[Hash(&data[5])];
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200454 BROTLI_BOOL end = !offset;
455 while (!end) {
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500456 DictWord w = dictionary->dict_words[offset++];
Eugene Kliuchnikov8d3fdc12017-01-26 11:32:18 +0100457 const size_t l = w.len & 0x1F;
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500458 const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
Zoltan Szabadka8844b7f2016-01-07 16:27:49 +0100459 const size_t id = w.idx;
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200460 end = !!(w.len & 0x80);
461 w.len = (uint8_t)l;
Eugene Kliuchnikovcdca91b2017-03-06 14:22:45 +0100462 if (w.transform == 0 &&
Eugene Kliuchnikov35e69fc2018-02-26 09:04:36 -0500463 IsMatch(dictionary->words, w, &data[5], max_length - 5)) {
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200464 AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
Eugene Kliuchnikov20481892016-07-26 14:41:59 +0200465 has_found_match = BROTLI_TRUE;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200466 if (l + 5 < max_length) {
467 const uint8_t* s = &data[l + 5];
468 if (data[0] == ' ') {
469 if (l + 8 < max_length &&
470 s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
471 AddMatch(id + 62 * n, l + 9, l, matches);
472 if (l + 12 < max_length &&
473 s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
474 AddMatch(id + 73 * n, l + 13, l, matches);
475 }
476 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200477 }
478 }
479 }
480 }
481 }
482 }
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200483 return has_found_match;
Zoltan Szabadka66098832015-06-12 16:45:17 +0200484}
485
Eugene Kliuchnikovb972c672016-06-13 11:01:04 +0200486#if defined(__cplusplus) || defined(c_plusplus)
487} /* extern "C" */
488#endif