blob: a24d0fbcabd0b2cbebf93a45969c7f8dd5f2b03e [file] [log] [blame]
Zoltan Szabadka79e99af2013-10-23 13:06:13 +02001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Block split point selection utilities.
16
17#include "./block_splitter.h"
18
19#include <math.h>
20#include <stdio.h>
21#include <stdlib.h>
22#include <string.h>
23
24#include <algorithm>
25#include <map>
26
27#include "./cluster.h"
28#include "./command.h"
29#include "./fast_log.h"
30#include "./histogram.h"
31
32namespace brotli {
33
Zoltan Szabadkacbd5cb52014-02-14 15:04:23 +010034static const int kMaxLiteralHistograms = 100;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020035static const int kMaxCommandHistograms = 50;
36static const double kLiteralBlockSwitchCost = 26;
37static const double kCommandBlockSwitchCost = 13.5;
38static const double kDistanceBlockSwitchCost = 14.6;
39static const int kLiteralStrideLength = 70;
40static const int kCommandStrideLength = 40;
Zoltan Szabadkacbd5cb52014-02-14 15:04:23 +010041static const int kSymbolsPerLiteralHistogram = 544;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020042static const int kSymbolsPerCommandHistogram = 530;
Zoltan Szabadkacbd5cb52014-02-14 15:04:23 +010043static const int kSymbolsPerDistanceHistogram = 544;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020044static const int kMinLengthForBlockSplitting = 128;
45static const int kIterMulForRefining = 2;
46static const int kMinItersForRefining = 100;
47
48void CopyLiteralsToByteArray(const std::vector<Command>& cmds,
49 const uint8_t* data,
50 std::vector<uint8_t>* literals) {
51 // Count how many we have.
52 size_t total_length = 0;
53 for (int i = 0; i < cmds.size(); ++i) {
54 total_length += cmds[i].insert_length_;
55 }
56 if (total_length == 0) {
57 return;
58 }
59
60 // Allocate.
61 literals->resize(total_length);
62
63 // Loop again, and copy this time.
64 size_t pos = 0;
65 size_t from_pos = 0;
66 for (int i = 0; i < cmds.size() && pos < total_length; ++i) {
67 memcpy(&(*literals)[pos], data + from_pos, cmds[i].insert_length_);
68 pos += cmds[i].insert_length_;
69 from_pos += cmds[i].insert_length_ + cmds[i].copy_length_;
70 }
71}
72
73void CopyCommandsToByteArray(const std::vector<Command>& cmds,
74 std::vector<uint16_t>* insert_and_copy_codes,
75 std::vector<uint8_t>* distance_prefixes) {
76 for (int i = 0; i < cmds.size(); ++i) {
77 const Command& cmd = cmds[i];
78 insert_and_copy_codes->push_back(cmd.command_prefix_);
79 if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
80 distance_prefixes->push_back(cmd.distance_prefix_);
81 }
82 }
83}
84
Zoltan Szabadka79e99af2013-10-23 13:06:13 +020085template<typename HistogramType, typename DataType>
86void InitialEntropyCodes(const DataType* data, size_t length,
87 int literals_per_histogram,
88 int max_histograms,
89 size_t stride,
90 std::vector<HistogramType>* vec) {
91 int total_histograms = length / literals_per_histogram + 1;
92 if (total_histograms > max_histograms) {
93 total_histograms = max_histograms;
94 }
95 unsigned int seed = 7;
96 int block_length = length / total_histograms;
97 for (int i = 0; i < total_histograms; ++i) {
98 int pos = length * i / total_histograms;
99 if (i != 0) {
100 pos += rand_r(&seed) % block_length;
101 }
102 if (pos + stride >= length) {
103 pos = length - stride - 1;
104 }
105 HistogramType histo;
106 histo.Add(data + pos, stride);
107 vec->push_back(histo);
108 }
109}
110
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200111template<typename HistogramType, typename DataType>
112void RandomSample(unsigned int* seed,
113 const DataType* data,
114 size_t length,
115 size_t stride,
116 HistogramType* sample) {
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100117 size_t pos = 0;
118 if (stride >= length) {
119 pos = 0;
120 stride = length;
121 } else {
122 pos = rand_r(seed) % (length - stride + 1);
123 }
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200124 sample->Add(data + pos, stride);
125}
126
127template<typename HistogramType, typename DataType>
128void RefineEntropyCodes(const DataType* data, size_t length,
129 size_t stride,
130 std::vector<HistogramType>* vec) {
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100131 int iters =
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200132 kIterMulForRefining * length / stride + kMinItersForRefining;
133 unsigned int seed = 7;
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100134 iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200135 for (int iter = 0; iter < iters; ++iter) {
136 HistogramType sample;
137 RandomSample(&seed, data, length, stride, &sample);
Zoltan Szabadka40955ce2014-01-06 16:01:57 +0100138 int ix = iter % vec->size();
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200139 (*vec)[ix].AddHistogram(sample);
140 }
141}
142
143inline static float BitCost(int total, int count) {
144 return count == 0 ? FastLog2(total) + 2 : FastLog2(total) - FastLog2(count);
145}
146
147template<typename DataType, int kSize>
148void FindBlocks(const DataType* data, const size_t length,
149 const double block_switch_bitcost,
150 const std::vector<Histogram<kSize> > &vec,
151 uint8_t *block_id) {
152 if (vec.size() <= 1) {
153 for (int i = 0; i < length; ++i) {
154 block_id[i] = 0;
155 }
156 return;
157 }
158 int vecsize = vec.size();
159 double* insert_cost = new double[kSize * vecsize];
160 memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
161 for (int i = 0; i < kSize; ++i) {
162 for (int j = 0; j < vecsize; ++j) {
163 insert_cost[i * vecsize + j] =
164 BitCost(vec[j].total_count_, vec[j].data_[i]);
165 }
166 }
167 double *cost = new double[vecsize];
168 memset(cost, 0, sizeof(cost[0]) * vecsize);
169 bool* switch_signal = new bool[length * vecsize];
170 memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
171 // After each iteration of this loop, cost[k] will contain the difference
172 // between the minimum cost of arriving at the current byte position using
173 // entropy code k, and the minimum cost of arriving at the current byte
174 // position. This difference is capped at the block switch cost, and if it
175 // reaches block switch cost, it means that when we trace back from the last
176 // position, we need to switch here.
177 for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
178 int ix = byte_ix * vecsize;
179 int insert_cost_ix = data[byte_ix] * vecsize;
180 double min_cost = 1e99;
181 for (int k = 0; k < vecsize; ++k) {
182 // We are coding the symbol in data[byte_ix] with entropy code k.
183 cost[k] += insert_cost[insert_cost_ix + k];
184 if (cost[k] < min_cost) {
185 min_cost = cost[k];
186 block_id[byte_ix] = k;
187 }
188 }
189 double block_switch_cost = block_switch_bitcost;
190 // More blocks for the beginning.
191 if (byte_ix < 2000) {
192 block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
193 }
194 for (int k = 0; k < vecsize; ++k) {
195 cost[k] -= min_cost;
196 if (cost[k] >= block_switch_cost) {
197 cost[k] = block_switch_cost;
198 switch_signal[ix + k] = true;
199 }
200 }
201 }
202 // Now trace back from the last position and switch at the marked places.
203 int byte_ix = length - 1;
204 int ix = byte_ix * vecsize;
205 int cur_id = block_id[byte_ix];
206 while (byte_ix > 0) {
207 --byte_ix;
208 ix -= vecsize;
209 if (switch_signal[ix + cur_id]) {
210 cur_id = block_id[byte_ix];
211 }
212 block_id[byte_ix] = cur_id;
213 }
214 delete[] insert_cost;
215 delete[] cost;
216 delete[] switch_signal;
217}
218
219int RemapBlockIds(uint8_t* block_ids, const size_t length) {
220 std::map<uint8_t, uint8_t> new_id;
221 int next_id = 0;
222 for (int i = 0; i < length; ++i) {
223 if (new_id.find(block_ids[i]) == new_id.end()) {
224 new_id[block_ids[i]] = next_id;
225 ++next_id;
226 }
227 }
228 for (int i = 0; i < length; ++i) {
229 block_ids[i] = new_id[block_ids[i]];
230 }
231 return next_id;
232}
233
234template<typename HistogramType, typename DataType>
235void BuildBlockHistograms(const DataType* data, const size_t length,
236 uint8_t* block_ids,
237 std::vector<HistogramType>* histograms) {
238 int num_types = RemapBlockIds(block_ids, length);
239 histograms->clear();
240 histograms->resize(num_types);
241 for (int i = 0; i < length; ++i) {
242 (*histograms)[block_ids[i]].Add(data[i]);
243 }
244}
245
246template<typename HistogramType, typename DataType>
247void ClusterBlocks(const DataType* data, const size_t length,
248 uint8_t* block_ids) {
249 std::vector<HistogramType> histograms;
250 std::vector<int> block_index(length);
251 int cur_idx = 0;
252 HistogramType cur_histogram;
253 for (int i = 0; i < length; ++i) {
254 bool block_boundary = (i + 1 == length || block_ids[i] != block_ids[i + 1]);
255 block_index[i] = cur_idx;
256 cur_histogram.Add(data[i]);
257 if (block_boundary) {
258 histograms.push_back(cur_histogram);
259 cur_histogram.Clear();
260 ++cur_idx;
261 }
262 }
263 std::vector<HistogramType> clustered_histograms;
264 std::vector<int> histogram_symbols;
Zoltan Szabadkae7094912013-12-12 13:18:04 +0100265 // Block ids need to fit in one byte.
266 static const int kMaxNumberOfBlockTypes = 256;
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200267 ClusterHistograms(histograms, 1, histograms.size(),
268 kMaxNumberOfBlockTypes,
269 &clustered_histograms,
270 &histogram_symbols);
271 for (int i = 0; i < length; ++i) {
272 block_ids[i] = histogram_symbols[block_index[i]];
273 }
274}
275
276void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
277 int cur_id = block_ids[0];
278 int cur_length = 1;
279 split->num_types_ = -1;
280 for (int i = 1; i < block_ids.size(); ++i) {
281 if (block_ids[i] != cur_id) {
282 split->types_.push_back(cur_id);
283 split->lengths_.push_back(cur_length);
284 split->num_types_ = std::max(split->num_types_, cur_id);
285 cur_id = block_ids[i];
286 cur_length = 0;
287 }
288 ++cur_length;
289 }
290 split->types_.push_back(cur_id);
291 split->lengths_.push_back(cur_length);
292 split->num_types_ = std::max(split->num_types_, cur_id);
293 ++split->num_types_;
294}
295
296template<typename HistogramType, typename DataType>
297void SplitByteVector(const std::vector<DataType>& data,
298 const int literals_per_histogram,
299 const int max_histograms,
300 const int sampling_stride_length,
301 const double block_switch_cost,
302 BlockSplit* split) {
303 if (data.empty()) {
304 split->num_types_ = 0;
305 return;
306 } else if (data.size() < kMinLengthForBlockSplitting) {
307 split->num_types_ = 1;
308 split->types_.push_back(0);
309 split->lengths_.push_back(data.size());
310 return;
311 }
312 std::vector<HistogramType> histograms;
313 // Find good entropy codes.
314 InitialEntropyCodes(data.data(), data.size(),
315 literals_per_histogram,
316 max_histograms,
317 sampling_stride_length,
318 &histograms);
319 RefineEntropyCodes(data.data(), data.size(),
320 sampling_stride_length,
321 &histograms);
322 // Find a good path through literals with the good entropy codes.
323 std::vector<uint8_t> block_ids(data.size());
324 for (int i = 0; i < 10; ++i) {
325 FindBlocks(data.data(), data.size(),
326 block_switch_cost,
327 histograms,
328 &block_ids[0]);
329 BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
330 }
331 ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
332 BuildBlockSplit(block_ids, split);
333}
334
335void SplitBlock(const std::vector<Command>& cmds,
336 const uint8_t* data,
337 BlockSplit* literal_split,
338 BlockSplit* insert_and_copy_split,
339 BlockSplit* dist_split) {
340 // Create a continuous array of literals.
341 std::vector<uint8_t> literals;
342 CopyLiteralsToByteArray(cmds, data, &literals);
343
344 // Compute prefix codes for commands.
345 std::vector<uint16_t> insert_and_copy_codes;
346 std::vector<uint8_t> distance_prefixes;
347 CopyCommandsToByteArray(cmds,
348 &insert_and_copy_codes,
349 &distance_prefixes);
350
Zoltan Szabadka278b8942014-03-20 14:32:35 +0100351
Zoltan Szabadka79e99af2013-10-23 13:06:13 +0200352 SplitByteVector<HistogramLiteral>(
353 literals,
354 kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
355 kLiteralStrideLength, kLiteralBlockSwitchCost,
356 literal_split);
357 SplitByteVector<HistogramCommand>(
358 insert_and_copy_codes,
359 kSymbolsPerCommandHistogram, kMaxCommandHistograms,
360 kCommandStrideLength, kCommandBlockSwitchCost,
361 insert_and_copy_split);
362 SplitByteVector<HistogramDistance>(
363 distance_prefixes,
364 kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
365 kCommandStrideLength, kDistanceBlockSwitchCost,
366 dist_split);
367}
368
369void SplitBlockByTotalLength(const std::vector<Command>& all_commands,
370 int input_size,
371 int target_length,
372 std::vector<std::vector<Command> >* blocks) {
373 int num_blocks = input_size / target_length + 1;
374 int length_limit = input_size / num_blocks + 1;
375 int total_length = 0;
376 std::vector<Command> cur_block;
377 for (int i = 0; i < all_commands.size(); ++i) {
378 const Command& cmd = all_commands[i];
379 int cmd_length = cmd.insert_length_ + cmd.copy_length_;
380 if (total_length > length_limit) {
381 blocks->push_back(cur_block);
382 cur_block.clear();
383 total_length = 0;
384 }
385 cur_block.push_back(cmd);
386 total_length += cmd_length;
387 }
388 blocks->push_back(cur_block);
389}
390
391} // namespace brotli