blob: cb38e8ff36cec2a42dfd2775ec008570feb29711 [file] [log] [blame]
Zoltan Szabadka79e99af2013-10-23 13:06:13 +02001// Copyright 2010 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// A (forgetful) hash table to the data seen by the compressor, to
16// help create backward references to previous data.
17
18#ifndef BROTLI_ENC_HASH_H_
19#define BROTLI_ENC_HASH_H_
20
21#include <stddef.h>
22#include <stdint.h>
23#include <string.h>
24#include <sys/types.h>
25#include <algorithm>
Roderick Sheeter2e5995b2013-12-12 10:43:05 -080026#include <cstdlib>
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020027
28#include "./fast_log.h"
29#include "./find_match_length.h"
30#include "./port.h"
31
32namespace brotli {
33
// kHashMul32 multiplier has these properties:
// * The multiplier must be odd. Otherwise we may lose the highest bit.
// * No long streaks of 1s or 0s.
// * There is no effort to ensure that it is a prime, the oddity is enough
//   for this use.
// * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd;

// Hashes the three bytes data[0], data[1] and data[2] into a key of
// `bits` bits.
//
// The bytes are combined explicitly in little-endian order, so the key is
// the same on every host byte order and only the three hashed bytes are
// ever read (no access to data[3]).
//
// `bits` must be in the range [1, 32]; a value of 0 would shift a 32-bit
// value by 32, which is undefined.
inline uint32_t Hash3Bytes(const uint8_t *data, const int bits) {
  const uint32_t three_bytes = static_cast<uint32_t>(data[0]) |
                               (static_cast<uint32_t>(data[1]) << 8) |
                               (static_cast<uint32_t>(data[2]) << 16);
  const uint32_t h = three_bytes * kHashMul32;
  // The higher bits contain more mixture from the multiplication,
  // so we take our results from there.
  return h >> (32 - bits);
}
48
49// Usually, we always choose the longest backward reference. This function
50// allows for the exception of that rule.
51//
52// If we choose a backward reference that is further away, it will
53// usually be coded with more bits. We approximate this by assuming
54// log2(distance). If the distance can be expressed in terms of the
55// last four distances, we use some heuristic constants to estimate
56// the bits cost. For the first up to four literals we use the bit
57// cost of the literals from the literal cost model, after that we
58// use the average bit cost of the cost model.
59//
60// This function is used to sometimes discard a longer backward reference
61// when it is not much longer and the bit cost for encoding it is more
62// than the saved literals.
63inline double BackwardReferenceScore(double average_cost,
64 double start_cost4,
65 double start_cost3,
66 double start_cost2,
67 int copy_length,
68 int backward_reference_offset,
69 int last_distance1,
70 int last_distance2,
71 int last_distance3,
72 int last_distance4) {
73 double retval = 0;
74 switch (copy_length) {
75 case 2: retval = start_cost2; break;
76 case 3: retval = start_cost3; break;
77 default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
78 }
79 int diff_last1 = abs(backward_reference_offset - last_distance1);
80 int diff_last2 = abs(backward_reference_offset - last_distance2);
81 if (diff_last1 == 0) {
82 retval += 0.6;
83 } else if (diff_last1 < 4) {
84 retval -= 0.9 + 0.03 * diff_last1;
85 } else if (diff_last2 < 4) {
86 retval -= 0.95 + 0.1 * diff_last2;
87 } else if (backward_reference_offset == last_distance3) {
88 retval -= 1.17;
89 } else if (backward_reference_offset == last_distance4) {
90 retval -= 1.27;
91 } else {
92 retval -= 1.20 * Log2Floor(backward_reference_offset);
93 }
94 return retval;
95}
96
97// A (forgetful) hash table to the data seen by the compressor, to
98// help create backward references to previous data.
99//
100// This is a hash map of fixed size (kBucketSize) to a ring buffer of
101// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
102// index positions of the given hash key in the compressed data.
103template <int kBucketBits, int kBlockBits>
104class HashLongestMatch {
105 public:
106 HashLongestMatch()
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100107 : last_distance1_(4),
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200108 last_distance2_(11),
109 last_distance3_(15),
110 last_distance4_(16),
111 insert_length_(0),
112 average_cost_(5.4) {
113 Reset();
114 }
115 void Reset() {
116 std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
117 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200118
119 // Look at 3 bytes at data.
120 // Compute a hash from these, and store the value of ix at that position.
121 inline void Store(const uint8_t *data, const int ix) {
122 const uint32_t key = Hash3Bytes(data, kBucketBits);
123 const int minor_ix = num_[key] & kBlockMask;
124 buckets_[key][minor_ix] = ix;
125 ++num_[key];
126 }
127
128 // Store hashes for a range of data.
129 void StoreHashes(const uint8_t *data, size_t len, int startix, int mask) {
130 for (int p = 0; p < len; ++p) {
131 Store(&data[p & mask], startix + p);
132 }
133 }
134
135 // Find a longest backward match of &data[cur_ix] up to the length of
136 // max_length.
137 //
138 // Does not look for matches longer than max_length.
139 // Does not look for matches further away than max_backward.
140 // Writes the best found match length into best_len_out.
141 // Writes the index (&data[index]) offset from the start of the best match
142 // into best_distance_out.
143 // Write the score of the best match into best_score_out.
144 bool FindLongestMatch(const uint8_t * __restrict data,
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100145 const float * __restrict literal_cost,
146 const size_t ring_buffer_mask,
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200147 const uint32_t cur_ix,
148 uint32_t max_length,
149 const uint32_t max_backward,
150 size_t * __restrict best_len_out,
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800151 size_t * __restrict best_len_code_out,
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200152 size_t * __restrict best_distance_out,
153 double * __restrict best_score_out) {
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100154 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
155 const double start_cost4 = literal_cost == NULL ? 20 :
156 literal_cost[cur_ix_masked] +
157 literal_cost[(cur_ix + 1) & ring_buffer_mask] +
158 literal_cost[(cur_ix + 2) & ring_buffer_mask] +
159 literal_cost[(cur_ix + 3) & ring_buffer_mask];
160 const double start_cost3 = literal_cost == NULL ? 15 :
161 literal_cost[cur_ix_masked] +
162 literal_cost[(cur_ix + 1) & ring_buffer_mask] +
163 literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
164 double start_cost2 = literal_cost == NULL ? 10 :
165 literal_cost[cur_ix_masked] +
166 literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200167 bool match_found = false;
168 // Don't accept a short copy from far away.
169 double best_score = 8.25;
170 if (insert_length_ < 4) {
171 double cost_diff[4] = { 0.20, 0.09, 0.05, 0.03 };
172 best_score += cost_diff[insert_length_];
173 }
174 size_t best_len = *best_len_out;
175 *best_len_out = 0;
176 size_t best_ix = 1;
177 // Try last distance first.
178 for (int i = 0; i < 16; ++i) {
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100179 size_t prev_ix = cur_ix;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200180 switch(i) {
181 case 0: prev_ix -= last_distance1_; break;
182 case 1: prev_ix -= last_distance2_; break;
183 case 2: prev_ix -= last_distance3_; break;
184 case 3: prev_ix -= last_distance4_; break;
185
186 case 4: prev_ix -= last_distance1_ - 1; break;
187 case 5: prev_ix -= last_distance1_ + 1; break;
188 case 6: prev_ix -= last_distance1_ - 2; break;
189 case 7: prev_ix -= last_distance1_ + 2; break;
190 case 8: prev_ix -= last_distance1_ - 3; break;
191 case 9: prev_ix -= last_distance1_ + 3; break;
192
193 case 10: prev_ix -= last_distance2_ - 1; break;
194 case 11: prev_ix -= last_distance2_ + 1; break;
195 case 12: prev_ix -= last_distance2_ - 2; break;
196 case 13: prev_ix -= last_distance2_ + 2; break;
197 case 14: prev_ix -= last_distance2_ - 3; break;
198 case 15: prev_ix -= last_distance2_ + 3; break;
199 }
200 if (prev_ix >= cur_ix) {
201 continue;
202 }
203 const size_t backward = cur_ix - prev_ix;
204 if (PREDICT_FALSE(backward > max_backward)) {
205 continue;
206 }
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100207 prev_ix &= ring_buffer_mask;
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100208 if (cur_ix_masked + best_len > ring_buffer_mask ||
209 prev_ix + best_len > ring_buffer_mask ||
210 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200211 continue;
212 }
213 const size_t len =
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100214 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
215 max_length);
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200216 if (len >= 3 || (len == 2 && i < 2)) {
217 // Comparing for >= 2 does not change the semantics, but just saves for
218 // a few unnecessary binary logarithms in backward reference score,
219 // since we are not interested in such short matches.
220 const double score = BackwardReferenceScore(average_cost_,
221 start_cost4,
222 start_cost3,
223 start_cost2,
224 len, backward,
225 last_distance1_,
226 last_distance2_,
227 last_distance3_,
228 last_distance4_);
229 if (best_score < score) {
230 best_score = score;
231 best_len = len;
232 best_ix = backward;
233 *best_len_out = best_len;
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800234 *best_len_code_out = best_len;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200235 *best_distance_out = best_ix;
236 *best_score_out = best_score;
237 match_found = true;
238 }
239 }
240 }
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100241 const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits);
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800242 const int * __restrict const bucket = &buckets_[key][0];
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200243 const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
244 int stop = int(cur_ix) - 64;
245 if (stop < 0) { stop = 0; }
246
247 start_cost2 -= 1.0;
248 for (int i = cur_ix - 1; i > stop; --i) {
249 size_t prev_ix = i;
250 const size_t backward = cur_ix - prev_ix;
251 if (PREDICT_FALSE(backward > max_backward)) {
252 break;
253 }
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100254 prev_ix &= ring_buffer_mask;
255 if (data[cur_ix_masked] != data[prev_ix] ||
256 data[cur_ix_masked + 1] != data[prev_ix + 1]) {
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200257 continue;
258 }
259 int len = 2;
260 const double score = start_cost2 - 1.70 * Log2Floor(backward);
261
262 if (best_score < score) {
263 best_score = score;
264 best_len = len;
265 best_ix = backward;
266 *best_len_out = best_len;
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800267 *best_len_code_out = best_len;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200268 *best_distance_out = best_ix;
269 match_found = true;
270 }
271 }
272 for (int i = num_[key] - 1; i >= down; --i) {
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800273 int prev_ix = bucket[i & kBlockMask];
274 if (prev_ix < 0) {
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200275 continue;
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800276 } else {
277 const size_t backward = cur_ix - prev_ix;
278 if (PREDICT_FALSE(backward > max_backward)) {
279 break;
280 }
281 prev_ix &= ring_buffer_mask;
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100282 if (cur_ix_masked + best_len > ring_buffer_mask ||
283 prev_ix + best_len > ring_buffer_mask ||
284 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800285 continue;
286 }
287 const size_t len =
288 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
289 max_length);
290 if (len >= 3) {
291 // Comparing for >= 3 does not change the semantics, but just saves
292 // for a few unnecessary binary logarithms in backward reference
293 // score, since we are not interested in such short matches.
294 const double score = BackwardReferenceScore(average_cost_,
295 start_cost4,
296 start_cost3,
297 start_cost2,
298 len, backward,
299 last_distance1_,
300 last_distance2_,
301 last_distance3_,
302 last_distance4_);
303 if (best_score < score) {
304 best_score = score;
305 best_len = len;
306 best_ix = backward;
307 *best_len_out = best_len;
308 *best_len_code_out = best_len;
309 *best_distance_out = best_ix;
310 *best_score_out = best_score;
311 match_found = true;
312 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200313 }
314 }
315 }
316 return match_found;
317 }
318
319 void set_last_distance(int v) {
320 if (last_distance1_ != v) {
321 last_distance4_ = last_distance3_;
322 last_distance3_ = last_distance2_;
323 last_distance2_ = last_distance1_;
324 last_distance1_ = v;
325 }
326 }
327
328 int last_distance() const { return last_distance1_; }
329
330 void set_insert_length(int v) { insert_length_ = v; }
331
332 void set_average_cost(double v) { average_cost_ = v; }
333
334 private:
335 // Number of hash buckets.
336 static const uint32_t kBucketSize = 1 << kBucketBits;
337
338 // Only kBlockSize newest backward references are kept,
339 // and the older are forgotten.
340 static const uint32_t kBlockSize = 1 << kBlockBits;
341
342 // Mask for accessing entries in a block (in a ringbuffer manner).
343 static const uint32_t kBlockMask = (1 << kBlockBits) - 1;
344
345 // Number of entries in a particular bucket.
346 uint16_t num_[kBucketSize];
347
348 // Buckets containing kBlockSize of backward references.
Roderick Sheeter437bbad2013-11-19 14:32:56 -0800349 int buckets_[kBucketSize][kBlockSize];
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200350
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200351 int last_distance1_;
352 int last_distance2_;
353 int last_distance3_;
354 int last_distance4_;
355
356 // Cost adjustment for how many literals we are planning to insert
357 // anyway.
358 int insert_length_;
359
360 double average_cost_;
361};
362
Zoltan Szabadka1571db32013-11-15 19:02:17 +0100363typedef HashLongestMatch<13, 11> Hasher;
364
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200365} // namespace brotli
366
367#endif // BROTLI_ENC_HASH_H_