blob: bc3e1c4cf585175724a8bb3d5e25457263e6bccf [file] [log] [blame]
Zoltan Szabadka79e99af2013-10-23 13:06:13 +02001// Copyright 2010 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// A (forgetful) hash table to the data seen by the compressor, to
16// help create backward references to previous data.
17
18#ifndef BROTLI_ENC_HASH_H_
19#define BROTLI_ENC_HASH_H_
20
21#include <stddef.h>
22#include <stdint.h>
23#include <string.h>
24#include <sys/types.h>
25#include <algorithm>
Roderick Sheeter2e5995b2013-12-12 10:43:05 -080026#include <cstdlib>
Zoltan Szabadka278b8942014-03-20 14:32:35 +010027#include <memory>
Zoltan Szabadka2733d6c2014-02-17 14:25:36 +010028#include <string>
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020029
Zoltan Szabadka2733d6c2014-02-17 14:25:36 +010030#include "./transform.h"
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020031#include "./fast_log.h"
32#include "./find_match_length.h"
33#include "./port.h"
Zoltan Szabadka278b8942014-03-20 14:32:35 +010034#include "./static_dict.h"
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020035
36namespace brotli {
37
38// kHashMul32 multiplier has these properties:
39// * The multiplier must be odd. Otherwise we may lose the highest bit.
40// * No long streaks of 1s or 0s.
41// * There is no effort to ensure that it is a prime, the oddity is enough
42// for this use.
43// * The number has been tuned heuristically against compression benchmarks.
44static const uint32_t kHashMul32 = 0x1e35a7bd;
45
Zoltan Szabadka278b8942014-03-20 14:32:35 +010046template<int kShiftBits, int kMinLength>
47inline uint32_t Hash(const uint8_t *data) {
48 if (kMinLength <= 3) {
49 // If kMinLength is 2 or 3, we hash the first 3 bytes of data.
50 uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
51 // The higher bits contain more mixture from the multiplication,
52 // so we take our results from there.
53 return h >> (32 - kShiftBits);
54 } else {
55 // If kMinLength is at least 4, we hash the first 4 bytes of data.
56 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
57 // The higher bits contain more mixture from the multiplication,
58 // so we take our results from there.
59 return h >> (32 - kShiftBits);
60 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020061}
62
63// Usually, we always choose the longest backward reference. This function
64// allows for the exception of that rule.
65//
66// If we choose a backward reference that is further away, it will
67// usually be coded with more bits. We approximate this by assuming
68// log2(distance). If the distance can be expressed in terms of the
69// last four distances, we use some heuristic constants to estimate
70// the bits cost. For the first up to four literals we use the bit
71// cost of the literals from the literal cost model, after that we
72// use the average bit cost of the cost model.
73//
74// This function is used to sometimes discard a longer backward reference
75// when it is not much longer and the bit cost for encoding it is more
76// than the saved literals.
77inline double BackwardReferenceScore(double average_cost,
78 double start_cost4,
79 double start_cost3,
80 double start_cost2,
81 int copy_length,
Zoltan Szabadka278b8942014-03-20 14:32:35 +010082 int backward_reference_offset) {
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020083 double retval = 0;
84 switch (copy_length) {
85 case 2: retval = start_cost2; break;
86 case 3: retval = start_cost3; break;
87 default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
88 }
Zoltan Szabadka278b8942014-03-20 14:32:35 +010089 retval -= 1.20 * Log2Floor(backward_reference_offset);
90 return retval;
91}
92
93inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
94 double start_cost4,
95 double start_cost3,
96 double start_cost2,
97 int copy_length,
98 int distance_short_code) {
99 double retval = 0;
100 switch (copy_length) {
101 case 2: retval = start_cost2; break;
102 case 3: retval = start_cost3; break;
103 default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200104 }
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100105 static const double kDistanceShortCodeBitCost[16] = {
106 -0.6, 0.95, 1.17, 1.27,
107 0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
108 1.05, 1.05, 1.15, 1.15, 1.25, 1.25
109 };
110 retval -= kDistanceShortCodeBitCost[distance_short_code];
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200111 return retval;
112}
113
114// A (forgetful) hash table to the data seen by the compressor, to
115// help create backward references to previous data.
116//
117// This is a hash map of fixed size (kBucketSize) to a ring buffer of
118// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
119// index positions of the given hash key in the compressed data.
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100120template <int kBucketBits, int kBlockBits, int kMinLength>
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200121class HashLongestMatch {
122 public:
123 HashLongestMatch()
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100124 : last_distance1_(4),
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200125 last_distance2_(11),
126 last_distance3_(15),
127 last_distance4_(16),
128 insert_length_(0),
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100129 average_cost_(5.4),
130 static_dict_(NULL) {
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200131 Reset();
132 }
133 void Reset() {
134 std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
135 }
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100136 void SetStaticDictionary(const StaticDictionary *dict) {
137 static_dict_ = dict;
138 }
139 bool HasStaticDictionary() const {
140 return static_dict_ != NULL;
141 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200142
143 // Look at 3 bytes at data.
144 // Compute a hash from these, and store the value of ix at that position.
145 inline void Store(const uint8_t *data, const int ix) {
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100146 const uint32_t key = Hash<kBucketBits, kMinLength>(data);
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200147 const int minor_ix = num_[key] & kBlockMask;
148 buckets_[key][minor_ix] = ix;
149 ++num_[key];
150 }
151
152 // Store hashes for a range of data.
153 void StoreHashes(const uint8_t *data, size_t len, int startix, int mask) {
154 for (int p = 0; p < len; ++p) {
155 Store(&data[p & mask], startix + p);
156 }
157 }
158
159 // Find a longest backward match of &data[cur_ix] up to the length of
160 // max_length.
161 //
162 // Does not look for matches longer than max_length.
163 // Does not look for matches further away than max_backward.
164 // Writes the best found match length into best_len_out.
165 // Writes the index (&data[index]) offset from the start of the best match
166 // into best_distance_out.
167 // Write the score of the best match into best_score_out.
168 bool FindLongestMatch(const uint8_t * __restrict data,
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100169 const float * __restrict literal_cost,
170 const size_t ring_buffer_mask,
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200171 const uint32_t cur_ix,
172 uint32_t max_length,
173 const uint32_t max_backward,
174 size_t * __restrict best_len_out,
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800175 size_t * __restrict best_len_code_out,
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200176 size_t * __restrict best_distance_out,
Zoltan Szabadkacbd5cb52014-02-14 15:04:23 +0100177 double * __restrict best_score_out,
178 bool * __restrict in_dictionary) {
179 *in_dictionary = true;
180 *best_len_code_out = 0;
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100181 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
182 const double start_cost4 = literal_cost == NULL ? 20 :
183 literal_cost[cur_ix_masked] +
184 literal_cost[(cur_ix + 1) & ring_buffer_mask] +
185 literal_cost[(cur_ix + 2) & ring_buffer_mask] +
186 literal_cost[(cur_ix + 3) & ring_buffer_mask];
187 const double start_cost3 = literal_cost == NULL ? 15 :
188 literal_cost[cur_ix_masked] +
189 literal_cost[(cur_ix + 1) & ring_buffer_mask] +
190 literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
191 double start_cost2 = literal_cost == NULL ? 10 :
192 literal_cost[cur_ix_masked] +
193 literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200194 bool match_found = false;
195 // Don't accept a short copy from far away.
Zoltan Szabadka0829e372014-03-25 16:48:25 +0100196 double best_score = 8.115;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200197 if (insert_length_ < 4) {
Zoltan Szabadkacbd5cb52014-02-14 15:04:23 +0100198 double cost_diff[4] = { 0.10, 0.04, 0.02, 0.01 };
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200199 best_score += cost_diff[insert_length_];
200 }
201 size_t best_len = *best_len_out;
202 *best_len_out = 0;
203 size_t best_ix = 1;
204 // Try last distance first.
205 for (int i = 0; i < 16; ++i) {
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100206 size_t prev_ix = cur_ix;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200207 switch(i) {
208 case 0: prev_ix -= last_distance1_; break;
209 case 1: prev_ix -= last_distance2_; break;
210 case 2: prev_ix -= last_distance3_; break;
211 case 3: prev_ix -= last_distance4_; break;
212
213 case 4: prev_ix -= last_distance1_ - 1; break;
214 case 5: prev_ix -= last_distance1_ + 1; break;
215 case 6: prev_ix -= last_distance1_ - 2; break;
216 case 7: prev_ix -= last_distance1_ + 2; break;
217 case 8: prev_ix -= last_distance1_ - 3; break;
218 case 9: prev_ix -= last_distance1_ + 3; break;
219
220 case 10: prev_ix -= last_distance2_ - 1; break;
221 case 11: prev_ix -= last_distance2_ + 1; break;
222 case 12: prev_ix -= last_distance2_ - 2; break;
223 case 13: prev_ix -= last_distance2_ + 2; break;
224 case 14: prev_ix -= last_distance2_ - 3; break;
225 case 15: prev_ix -= last_distance2_ + 3; break;
226 }
227 if (prev_ix >= cur_ix) {
228 continue;
229 }
230 const size_t backward = cur_ix - prev_ix;
231 if (PREDICT_FALSE(backward > max_backward)) {
232 continue;
233 }
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100234 prev_ix &= ring_buffer_mask;
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100235 if (cur_ix_masked + best_len > ring_buffer_mask ||
236 prev_ix + best_len > ring_buffer_mask ||
237 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200238 continue;
239 }
240 const size_t len =
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100241 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
242 max_length);
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100243 if (len >= std::max(kMinLength, 3) ||
244 (kMinLength == 2 && len == 2 && i < 2)) {
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200245 // Comparing for >= 2 does not change the semantics, but just saves for
246 // a few unnecessary binary logarithms in backward reference score,
247 // since we are not interested in such short matches.
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100248 const double score = BackwardReferenceScoreUsingLastDistance(
249 average_cost_,
250 start_cost4,
251 start_cost3,
252 start_cost2,
253 len, i);
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200254 if (best_score < score) {
255 best_score = score;
256 best_len = len;
257 best_ix = backward;
258 *best_len_out = best_len;
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800259 *best_len_code_out = best_len;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200260 *best_distance_out = best_ix;
261 *best_score_out = best_score;
262 match_found = true;
Zoltan Szabadkacbd5cb52014-02-14 15:04:23 +0100263 *in_dictionary = backward > max_backward;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200264 }
265 }
266 }
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100267 if (kMinLength == 2) {
268 int stop = int(cur_ix) - 64;
269 if (stop < 0) { stop = 0; }
270 start_cost2 -= 1.0;
271 for (int i = cur_ix - 1; i > stop; --i) {
272 size_t prev_ix = i;
273 const size_t backward = cur_ix - prev_ix;
274 if (PREDICT_FALSE(backward > max_backward)) {
275 break;
276 }
277 prev_ix &= ring_buffer_mask;
278 if (data[cur_ix_masked] != data[prev_ix] ||
279 data[cur_ix_masked + 1] != data[prev_ix + 1]) {
280 continue;
281 }
282 int len = 2;
283 const double score = start_cost2 - 2.3 * Log2Floor(backward);
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200284
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100285 if (best_score < score) {
286 best_score = score;
287 best_len = len;
288 best_ix = backward;
289 *best_len_out = best_len;
290 *best_len_code_out = best_len;
291 *best_distance_out = best_ix;
292 match_found = true;
293 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200294 }
295 }
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100296 const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]);
297 const int * __restrict const bucket = &buckets_[key][0];
298 const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200299 for (int i = num_[key] - 1; i >= down; --i) {
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800300 int prev_ix = bucket[i & kBlockMask];
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100301 if (prev_ix >= 0) {
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800302 const size_t backward = cur_ix - prev_ix;
303 if (PREDICT_FALSE(backward > max_backward)) {
304 break;
305 }
306 prev_ix &= ring_buffer_mask;
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100307 if (cur_ix_masked + best_len > ring_buffer_mask ||
308 prev_ix + best_len > ring_buffer_mask ||
309 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800310 continue;
311 }
312 const size_t len =
313 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
314 max_length);
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100315 if (len >= std::max(kMinLength, 3)) {
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800316 // Comparing for >= 3 does not change the semantics, but just saves
317 // for a few unnecessary binary logarithms in backward reference
318 // score, since we are not interested in such short matches.
319 const double score = BackwardReferenceScore(average_cost_,
320 start_cost4,
321 start_cost3,
322 start_cost2,
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100323 len, backward);
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800324 if (best_score < score) {
325 best_score = score;
326 best_len = len;
327 best_ix = backward;
328 *best_len_out = best_len;
329 *best_len_code_out = best_len;
330 *best_distance_out = best_ix;
331 *best_score_out = best_score;
332 match_found = true;
Zoltan Szabadkacbd5cb52014-02-14 15:04:23 +0100333 *in_dictionary = false;
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800334 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200335 }
336 }
337 }
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100338 if (static_dict_ != NULL) {
339 // We decide based on first 4 bytes how many bytes to test for.
340 int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
341 int maxlen = static_dict_->GetLength(prefix);
342 for (int len = std::min<size_t>(maxlen, max_length);
343 len > best_len && len >= 4; --len) {
344 std::string snippet((const char *)&data[cur_ix_masked], len);
345 int copy_len_code;
346 int word_id;
347 if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
348 const size_t backward = max_backward + word_id + 1;
349 const double score = BackwardReferenceScore(average_cost_,
350 start_cost4,
351 start_cost3,
352 start_cost2,
353 len, backward);
354 if (best_score < score) {
355 best_score = score;
356 best_len = len;
357 best_ix = backward;
358 *best_len_out = best_len;
359 *best_len_code_out = copy_len_code;
360 *best_distance_out = best_ix;
361 *best_score_out = best_score;
362 match_found = true;
363 *in_dictionary = true;
364 }
365 }
366 }
367 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200368 return match_found;
369 }
370
371 void set_last_distance(int v) {
372 if (last_distance1_ != v) {
373 last_distance4_ = last_distance3_;
374 last_distance3_ = last_distance2_;
375 last_distance2_ = last_distance1_;
376 last_distance1_ = v;
377 }
378 }
379
380 int last_distance() const { return last_distance1_; }
381
382 void set_insert_length(int v) { insert_length_ = v; }
383
384 void set_average_cost(double v) { average_cost_ = v; }
385
386 private:
387 // Number of hash buckets.
388 static const uint32_t kBucketSize = 1 << kBucketBits;
389
390 // Only kBlockSize newest backward references are kept,
391 // and the older are forgotten.
392 static const uint32_t kBlockSize = 1 << kBlockBits;
393
394 // Mask for accessing entries in a block (in a ringbuffer manner).
395 static const uint32_t kBlockMask = (1 << kBlockBits) - 1;
396
397 // Number of entries in a particular bucket.
398 uint16_t num_[kBucketSize];
399
400 // Buckets containing kBlockSize of backward references.
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800401 int buckets_[kBucketSize][kBlockSize];
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200402
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200403 int last_distance1_;
404 int last_distance2_;
405 int last_distance3_;
406 int last_distance4_;
407
408 // Cost adjustment for how many literals we are planning to insert
409 // anyway.
410 int insert_length_;
411
412 double average_cost_;
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100413
414 const StaticDictionary *static_dict_;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200415};
416
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100417struct Hashers {
418 enum Type {
419 HASH_15_8_4 = 0,
420 HASH_15_8_2 = 1,
421 };
422
423 void Init(Type type) {
424 switch (type) {
425 case HASH_15_8_4:
426 hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>());
427 break;
428 case HASH_15_8_2:
429 hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>());
430 break;
431 default:
432 break;
433 }
434 }
435
436 void SetStaticDictionary(const StaticDictionary *dict) {
437 if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict);
438 if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict);
439 }
440
441 std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4;
442 std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2;
443};
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100444
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200445} // namespace brotli
446
447#endif // BROTLI_ENC_HASH_H_