blob: abe975f7f19852d8e2c49a4e91bb1509522a0480 [file] [log] [blame]
Zoltan Szabadka66098832015-06-12 16:45:17 +02001#include "./static_dict.h"
2
3#include <algorithm>
4
5#include "./dictionary.h"
6#include "./find_match_length.h"
7#include "./static_dict_lut.h"
8#include "./transform.h"
9
10namespace brotli {
11
12inline uint32_t Hash(const uint8_t *data) {
13 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32;
14 // The higher bits contain more mixture from the multiplication,
15 // so we take our results from there.
16 return h >> (32 - kDictNumBits);
17}
18
19inline void AddMatch(int distance, int len, int len_code, int* matches) {
20 matches[len] = std::min(matches[len], (distance << 5) + len_code);
21}
22
Lode Vandevenne17ed2582015-08-10 13:13:58 +020023inline int DictMatchLength(const uint8_t* data, int id, int len, int maxlen) {
Zoltan Szabadka66098832015-06-12 16:45:17 +020024 const int offset = kBrotliDictionaryOffsetsByLength[len] + len * id;
Lode Vandevenne17ed2582015-08-10 13:13:58 +020025 return FindMatchLengthWithLimit(&kBrotliDictionary[offset], data,
26 std::min(len, maxlen));
Zoltan Szabadka66098832015-06-12 16:45:17 +020027}
28
Lode Vandevenne17ed2582015-08-10 13:13:58 +020029inline bool IsMatch(DictWord w, const uint8_t* data, int max_length) {
30 if (w.len > max_length) return false;
Zoltan Szabadka66098832015-06-12 16:45:17 +020031 const int offset = kBrotliDictionaryOffsetsByLength[w.len] + w.len * w.idx;
32 const uint8_t* dict = &kBrotliDictionary[offset];
33 if (w.transform == 0) {
34 // Match against base dictionary word.
35 return FindMatchLengthWithLimit(dict, data, w.len) == w.len;
36 } else if (w.transform == 10) {
37 // Match against uppercase first transform.
38 // Note that there are only ASCII uppercase words in the lookup table.
39 return (dict[0] >= 'a' && dict[0] <= 'z' &&
40 (dict[0] ^ 32) == data[0] &&
41 FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1) ==
42 w.len - 1);
43 } else {
44 // Match against uppercase all transform.
45 // Note that there are only ASCII uppercase words in the lookup table.
46 for (int i = 0; i < w.len; ++i) {
47 if (dict[i] >= 'a' && dict[i] <= 'z') {
48 if ((dict[i] ^ 32) != data[i]) return false;
49 } else {
50 if (dict[i] != data[i]) return false;
51 }
52 }
53 return true;
54 }
55}
56
57bool FindAllStaticDictionaryMatches(const uint8_t* data,
58 int min_length,
Lode Vandevenne17ed2582015-08-10 13:13:58 +020059 int max_length,
Zoltan Szabadka66098832015-06-12 16:45:17 +020060 int* matches) {
61 bool found_match = false;
62 uint32_t key = Hash(data);
63 uint32_t bucket = kStaticDictionaryBuckets[key];
64 if (bucket != 0) {
65 int num = bucket & 0xff;
66 int offset = bucket >> 8;
67 for (int i = 0; i < num; ++i) {
68 const DictWord w = kStaticDictionaryWords[offset + i];
69 const int l = w.len;
70 const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
71 const int id = w.idx;
72 if (w.transform == 0) {
Lode Vandevenne17ed2582015-08-10 13:13:58 +020073 const int matchlen = DictMatchLength(data, id, l, max_length);
Zoltan Szabadka66098832015-06-12 16:45:17 +020074 // Transform "" + kIdentity + ""
75 if (matchlen == l) {
76 AddMatch(id, l, l, matches);
77 found_match = true;
78 }
Marcin Karpinski21ac39f2015-09-21 21:04:07 +020079 // Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing "
Zoltan Szabadka66098832015-06-12 16:45:17 +020080 if (matchlen >= l - 1) {
81 AddMatch(id + 12 * n, l - 1, l, matches);
Lode Vandevenne17ed2582015-08-10 13:13:58 +020082 if (l + 2 < max_length &&
83 data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
Zoltan Szabadka66098832015-06-12 16:45:17 +020084 data[l + 2] == ' ') {
85 AddMatch(id + 49 * n, l + 3, l, matches);
86 }
87 found_match = true;
88 }
89 // Transform "" + kOmitLastN + "" (N = 2 .. 9)
90 int minlen = std::max<int>(min_length, l - 9);
91 int maxlen = std::min<int>(matchlen, l - 2);
92 for (int len = minlen; len <= maxlen; ++len) {
93 AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches);
94 found_match = true;
95 }
Lode Vandevenne17ed2582015-08-10 13:13:58 +020096 if (matchlen < l || l + 6 >= max_length) {
Zoltan Szabadka66098832015-06-12 16:45:17 +020097 continue;
98 }
99 const uint8_t* s = &data[l];
100 // Transforms "" + kIdentity + <suffix>
101 if (s[0] == ' ') {
102 AddMatch(id + n, l + 1, l, matches);
103 if (s[1] == 'a') {
104 if (s[2] == ' ') {
105 AddMatch(id + 28 * n, l + 3, l, matches);
106 } else if (s[2] == 's') {
107 if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
108 } else if (s[2] == 't') {
109 if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
110 } else if (s[2] == 'n') {
111 if (s[3] == 'd' && s[4] == ' ') {
112 AddMatch(id + 10 * n, l + 5, l, matches);
113 }
114 }
115 } else if (s[1] == 'b') {
116 if (s[2] == 'y' && s[3] == ' ') {
117 AddMatch(id + 38 * n, l + 4, l, matches);
118 }
119 } else if (s[1] == 'i') {
120 if (s[2] == 'n') {
121 if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
122 } else if (s[2] == 's') {
123 if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
124 }
125 } else if (s[1] == 'f') {
126 if (s[2] == 'o') {
127 if (s[3] == 'r' && s[4] == ' ') {
128 AddMatch(id + 25 * n, l + 5, l, matches);
129 }
130 } else if (s[2] == 'r') {
131 if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
132 AddMatch(id + 37 * n, l + 6, l, matches);
133 }
134 }
135 } else if (s[1] == 'o') {
136 if (s[2] == 'f') {
137 if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
138 } else if (s[2] == 'n') {
139 if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
140 }
141 } else if (s[1] == 'n') {
142 if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
143 AddMatch(id + 80 * n, l + 5, l, matches);
144 }
145 } else if (s[1] == 't') {
146 if (s[2] == 'h') {
147 if (s[3] == 'e') {
148 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
149 } else if (s[3] == 'a') {
150 if (s[4] == 't' && s[5] == ' ') {
151 AddMatch(id + 29 * n, l + 6, l, matches);
152 }
153 }
154 } else if (s[2] == 'o') {
155 if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
156 }
157 } else if (s[1] == 'w') {
158 if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
159 AddMatch(id + 35 * n, l + 6, l, matches);
160 }
161 }
162 } else if (s[0] == '"') {
163 AddMatch(id + 19 * n, l + 1, l, matches);
164 if (s[1] == '>') {
165 AddMatch(id + 21 * n, l + 2, l, matches);
166 }
167 } else if (s[0] == '.') {
168 AddMatch(id + 20 * n, l + 1, l, matches);
169 if (s[1] == ' ') {
170 AddMatch(id + 31 * n, l + 2, l, matches);
171 if (s[2] == 'T' && s[3] == 'h') {
172 if (s[4] == 'e') {
173 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
174 } else if (s[4] == 'i') {
175 if (s[5] == 's' && s[6] == ' ') {
176 AddMatch(id + 75 * n, l + 7, l, matches);
177 }
178 }
179 }
180 }
181 } else if (s[0] == ',') {
182 AddMatch(id + 76 * n, l + 1, l, matches);
183 if (s[1] == ' ') {
184 AddMatch(id + 14 * n, l + 2, l, matches);
185 }
186 } else if (s[0] == '\n') {
187 AddMatch(id + 22 * n, l + 1, l, matches);
188 if (s[1] == '\t') {
189 AddMatch(id + 50 * n, l + 2, l, matches);
190 }
191 } else if (s[0] == ']') {
192 AddMatch(id + 24 * n, l + 1, l, matches);
193 } else if (s[0] == '\'') {
194 AddMatch(id + 36 * n, l + 1, l, matches);
195 } else if (s[0] == ':') {
196 AddMatch(id + 51 * n, l + 1, l, matches);
197 } else if (s[0] == '(') {
198 AddMatch(id + 57 * n, l + 1, l, matches);
199 } else if (s[0] == '=') {
200 if (s[1] == '"') {
201 AddMatch(id + 70 * n, l + 2, l, matches);
202 } else if (s[1] == '\'') {
203 AddMatch(id + 86 * n, l + 2, l, matches);
204 }
205 } else if (s[0] == 'a') {
206 if (s[1] == 'l' && s[2] == ' ') {
207 AddMatch(id + 84 * n, l + 3, l, matches);
208 }
209 } else if (s[0] == 'e') {
210 if (s[1] == 'd') {
211 if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
212 } else if (s[1] == 'r') {
213 if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
214 } else if (s[1] == 's') {
215 if (s[2] == 't' && s[3] == ' ') {
216 AddMatch(id + 95 * n, l + 4, l, matches);
217 }
218 }
219 } else if (s[0] == 'f') {
220 if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
221 AddMatch(id + 90 * n, l + 4, l, matches);
222 }
223 } else if (s[0] == 'i') {
224 if (s[1] == 'v') {
225 if (s[2] == 'e' && s[3] == ' ') {
226 AddMatch(id + 92 * n, l + 4, l, matches);
227 }
228 } else if (s[1] == 'z') {
229 if (s[2] == 'e' && s[3] == ' ') {
230 AddMatch(id + 100 * n, l + 4, l, matches);
231 }
232 }
233 } else if (s[0] == 'l') {
234 if (s[1] == 'e') {
235 if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
236 AddMatch(id + 93 * n, l + 5, l, matches);
237 }
238 } else if (s[1] == 'y') {
239 if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
240 }
241 } else if (s[0] == 'o') {
242 if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
243 AddMatch(id + 106 * n, l + 4, l, matches);
244 }
245 }
246 } else {
247 // Set t=0 for kUppercaseFirst and t=1 for kUppercaseAll transform.
248 const int t = w.transform - 10;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200249 if (!IsMatch(w, data, max_length)) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200250 continue;
251 }
252 // Transform "" + kUppercase{First,All} + ""
253 AddMatch(id + (t ? 44 : 9) * n, l, l, matches);
254 found_match = true;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200255 if (l + 1 >= max_length) {
256 continue;
257 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200258 // Transforms "" + kUppercase{First,All} + <suffix>
259 const uint8_t* s = &data[l];
260 if (s[0] == ' ') {
261 AddMatch(id + (t ? 68 : 4) * n, l + 1, l, matches);
262 } else if (s[0] == '"') {
263 AddMatch(id + (t ? 87 : 66) * n, l + 1, l, matches);
264 if (s[1] == '>') {
265 AddMatch(id + (t ? 97 : 69) * n, l + 2, l, matches);
266 }
267 } else if (s[0] == '.') {
268 AddMatch(id + (t ? 101 : 79) * n, l + 1, l, matches);
269 if (s[1] == ' ') {
270 AddMatch(id + (t ? 114 : 88) * n, l + 2, l, matches);
271 }
272 } else if (s[0] == ',') {
273 AddMatch(id + (t ? 112 : 99) * n, l + 1, l, matches);
274 if (s[1] == ' ') {
275 AddMatch(id + (t ? 107 : 58) * n, l + 2, l, matches);
276 }
277 } else if (s[0] == '\'') {
278 AddMatch(id + (t ? 94 : 74) * n, l + 1, l, matches);
279 } else if (s[0] == '(') {
280 AddMatch(id + (t ? 113 : 78) * n, l + 1, l, matches);
281 } else if (s[0] == '=') {
282 if (s[1] == '"') {
283 AddMatch(id + (t ? 105 : 104) * n, l + 2, l, matches);
284 } else if (s[1] == '\'') {
285 AddMatch(id + (t ? 116 : 108) * n, l + 2, l, matches);
286 }
287 }
288 }
289 }
290 }
291 // Transforms with prefixes " " and "."
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200292 if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200293 bool is_space = (data[0] == ' ');
294 key = Hash(&data[1]);
295 bucket = kStaticDictionaryBuckets[key];
296 int num = bucket & 0xff;
297 int offset = bucket >> 8;
298 for (int i = 0; i < num; ++i) {
299 const DictWord w = kStaticDictionaryWords[offset + i];
300 const int l = w.len;
301 const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
302 const int id = w.idx;
303 if (w.transform == 0) {
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200304 if (!IsMatch(w, &data[1], max_length - 1)) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200305 continue;
306 }
307 // Transforms " " + kIdentity + "" and "." + kIdentity + ""
308 AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
309 found_match = true;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200310 if (l + 2 >= max_length) {
311 continue;
312 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200313 // Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix>
314 const uint8_t* s = &data[l + 1];
315 if (s[0] == ' ') {
316 AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
317 } else if (s[0] == '(') {
318 AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
319 } else if (is_space) {
320 if (s[0] == ',') {
321 AddMatch(id + 103 * n, l + 2, l, matches);
322 if (s[1] == ' ') {
323 AddMatch(id + 33 * n, l + 3, l, matches);
324 }
325 } else if (s[0] == '.') {
326 AddMatch(id + 71 * n, l + 2, l, matches);
327 if (s[1] == ' ') {
328 AddMatch(id + 52 * n, l + 3, l, matches);
329 }
330 } else if (s[0] == '=') {
331 if (s[1] == '"') {
332 AddMatch(id + 81 * n, l + 3, l, matches);
333 } else if (s[1] == '\'') {
334 AddMatch(id + 98 * n, l + 3, l, matches);
335 }
336 }
337 }
338 } else if (is_space) {
339 // Set t=0 for kUppercaseFirst and t=1 for kUppercaseAll transform.
340 const int t = w.transform - 10;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200341 if (!IsMatch(w, &data[1], max_length - 1)) {
Zoltan Szabadka66098832015-06-12 16:45:17 +0200342 continue;
343 }
344 // Transforms " " + kUppercase{First,All} + ""
345 AddMatch(id + (t ? 85 : 30) * n, l + 1, l, matches);
346 found_match = true;
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200347 if (l + 2 >= max_length) {
348 continue;
349 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200350 // Transforms " " + kUppercase{First,All} + <suffix>
351 const uint8_t* s = &data[l + 1];
352 if (s[0] == ' ') {
353 AddMatch(id + (t ? 83 : 15) * n, l + 2, l, matches);
354 } else if (s[0] == ',') {
355 if (t == 0) {
356 AddMatch(id + 109 * n, l + 2, l, matches);
357 }
358 if (s[1] == ' ') {
359 AddMatch(id + (t ? 111 : 65) * n, l + 3, l, matches);
360 }
361 } else if (s[0] == '.') {
362 AddMatch(id + (t ? 115 : 96) * n, l + 2, l, matches);
363 if (s[1] == ' ') {
364 AddMatch(id + (t ? 117 : 91) * n, l + 3, l, matches);
365 }
366 } else if (s[0] == '=') {
367 if (s[1] == '"') {
368 AddMatch(id + (t ? 110 : 118) * n, l + 3, l, matches);
369 } else if (s[1] == '\'') {
370 AddMatch(id + (t ? 119 : 120) * n, l + 3, l, matches);
371 }
372 }
373 }
374 }
375 }
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200376 if (max_length >= 6) {
377 // Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0"
378 if ((data[1] == ' ' &&
379 (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
Lode Vandevenne6511d6b2015-08-28 16:09:23 +0200380 (data[0] == 0xc2 && data[1] == 0xa0)) {
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200381 key = Hash(&data[2]);
382 bucket = kStaticDictionaryBuckets[key];
383 int num = bucket & 0xff;
384 int offset = bucket >> 8;
385 for (int i = 0; i < num; ++i) {
386 const DictWord w = kStaticDictionaryWords[offset + i];
387 const int l = w.len;
388 const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
389 const int id = w.idx;
390 if (w.transform == 0 && IsMatch(w, &data[2], max_length - 2)) {
Lode Vandevenne6511d6b2015-08-28 16:09:23 +0200391 if (data[0] == 0xc2) {
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200392 AddMatch(id + 102 * n, l + 2, l, matches);
393 found_match = true;
394 } else if (l + 2 < max_length && data[l + 2] == ' ') {
395 int t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
396 AddMatch(id + t * n, l + 3, l, matches);
397 found_match = true;
398 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200399 }
400 }
401 }
402 }
Lode Vandevenne17ed2582015-08-10 13:13:58 +0200403 if (max_length >= 9) {
404 // Transforms with prefixes " the " and ".com/"
405 if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
406 data[3] == 'e' && data[4] == ' ') ||
407 (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
408 data[3] == 'm' && data[4] == '/')) {
409 key = Hash(&data[5]);
410 bucket = kStaticDictionaryBuckets[key];
411 int num = bucket & 0xff;
412 int offset = bucket >> 8;
413 for (int i = 0; i < num; ++i) {
414 const DictWord w = kStaticDictionaryWords[offset + i];
415 const int l = w.len;
416 const int n = 1 << kBrotliDictionarySizeBitsByLength[l];
417 const int id = w.idx;
418 if (w.transform == 0 && IsMatch(w, &data[5], max_length - 5)) {
419 AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
420 found_match = true;
421 if (l + 5 < max_length) {
422 const uint8_t* s = &data[l + 5];
423 if (data[0] == ' ') {
424 if (l + 8 < max_length &&
425 s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
426 AddMatch(id + 62 * n, l + 9, l, matches);
427 if (l + 12 < max_length &&
428 s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
429 AddMatch(id + 73 * n, l + 13, l, matches);
430 }
431 }
Zoltan Szabadka66098832015-06-12 16:45:17 +0200432 }
433 }
434 }
435 }
436 }
437 }
438 return found_match;
439}
440
441} // namespace brotli