Blame - brotli/enc/block_splitter.cc - fp2-dev/platform/external/chromium_org/third_party/brotli/src

blob: a24d0fbcabd0b2cbebf93a45969c7f8dd5f2b03e [file] [log] [blame]

Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	1	// Copyright 2013 Google Inc. All Rights Reserved.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14	//
				15	// Block split point selection utilities.
				16
				17	#include "./block_splitter.h"
				18
				19	#include <math.h>
				20	#include <stdio.h>
				21	#include <stdlib.h>
				22	#include <string.h>
				23
				24	#include <algorithm>
				25	#include <map>
				26
				27	#include "./cluster.h"
				28	#include "./command.h"
				29	#include "./fast_log.h"
				30	#include "./histogram.h"
				31
				32	namespace brotli {
				33
Zoltan Szabadka	cbd5cb5	2014-02-14 15:04:23 +0100	[diff] [blame]	34	static const int kMaxLiteralHistograms = 100;
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	35	static const int kMaxCommandHistograms = 50;
				36	static const double kLiteralBlockSwitchCost = 26;
				37	static const double kCommandBlockSwitchCost = 13.5;
				38	static const double kDistanceBlockSwitchCost = 14.6;
				39	static const int kLiteralStrideLength = 70;
				40	static const int kCommandStrideLength = 40;
Zoltan Szabadka	cbd5cb5	2014-02-14 15:04:23 +0100	[diff] [blame]	41	static const int kSymbolsPerLiteralHistogram = 544;
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	42	static const int kSymbolsPerCommandHistogram = 530;
Zoltan Szabadka	cbd5cb5	2014-02-14 15:04:23 +0100	[diff] [blame]	43	static const int kSymbolsPerDistanceHistogram = 544;
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	44	static const int kMinLengthForBlockSplitting = 128;
				45	static const int kIterMulForRefining = 2;
				46	static const int kMinItersForRefining = 100;
				47
				48	void CopyLiteralsToByteArray(const std::vector<Command>& cmds,
				49	const uint8_t* data,
				50	std::vector<uint8_t>* literals) {
				51	// Count how many we have.
				52	size_t total_length = 0;
				53	for (int i = 0; i < cmds.size(); ++i) {
				54	total_length += cmds[i].insert_length_;
				55	}
				56	if (total_length == 0) {
				57	return;
				58	}
				59
				60	// Allocate.
				61	literals->resize(total_length);
				62
				63	// Loop again, and copy this time.
				64	size_t pos = 0;
				65	size_t from_pos = 0;
				66	for (int i = 0; i < cmds.size() && pos < total_length; ++i) {
				67	memcpy(&(*literals)[pos], data + from_pos, cmds[i].insert_length_);
				68	pos += cmds[i].insert_length_;
				69	from_pos += cmds[i].insert_length_ + cmds[i].copy_length_;
				70	}
				71	}
				72
				73	void CopyCommandsToByteArray(const std::vector<Command>& cmds,
				74	std::vector<uint16_t>* insert_and_copy_codes,
				75	std::vector<uint8_t>* distance_prefixes) {
				76	for (int i = 0; i < cmds.size(); ++i) {
				77	const Command& cmd = cmds[i];
				78	insert_and_copy_codes->push_back(cmd.command_prefix_);
				79	if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
				80	distance_prefixes->push_back(cmd.distance_prefix_);
				81	}
				82	}
				83	}
				84
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	85	template<typename HistogramType, typename DataType>
				86	void InitialEntropyCodes(const DataType* data, size_t length,
				87	int literals_per_histogram,
				88	int max_histograms,
				89	size_t stride,
				90	std::vector<HistogramType>* vec) {
				91	int total_histograms = length / literals_per_histogram + 1;
				92	if (total_histograms > max_histograms) {
				93	total_histograms = max_histograms;
				94	}
				95	unsigned int seed = 7;
				96	int block_length = length / total_histograms;
				97	for (int i = 0; i < total_histograms; ++i) {
				98	int pos = length * i / total_histograms;
				99	if (i != 0) {
				100	pos += rand_r(&seed) % block_length;
				101	}
				102	if (pos + stride >= length) {
				103	pos = length - stride - 1;
				104	}
				105	HistogramType histo;
				106	histo.Add(data + pos, stride);
				107	vec->push_back(histo);
				108	}
				109	}
				110
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	111	template<typename HistogramType, typename DataType>
				112	void RandomSample(unsigned int* seed,
				113	const DataType* data,
				114	size_t length,
				115	size_t stride,
				116	HistogramType* sample) {
Zoltan Szabadka	40955ce	2014-01-06 16:01:57 +0100	[diff] [blame]	117	size_t pos = 0;
				118	if (stride >= length) {
				119	pos = 0;
				120	stride = length;
				121	} else {
				122	pos = rand_r(seed) % (length - stride + 1);
				123	}
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	124	sample->Add(data + pos, stride);
				125	}
				126
				127	template<typename HistogramType, typename DataType>
				128	void RefineEntropyCodes(const DataType* data, size_t length,
				129	size_t stride,
				130	std::vector<HistogramType>* vec) {
Zoltan Szabadka	40955ce	2014-01-06 16:01:57 +0100	[diff] [blame]	131	int iters =
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	132	kIterMulForRefining * length / stride + kMinItersForRefining;
				133	unsigned int seed = 7;
Zoltan Szabadka	40955ce	2014-01-06 16:01:57 +0100	[diff] [blame]	134	iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	135	for (int iter = 0; iter < iters; ++iter) {
				136	HistogramType sample;
				137	RandomSample(&seed, data, length, stride, &sample);
Zoltan Szabadka	40955ce	2014-01-06 16:01:57 +0100	[diff] [blame]	138	int ix = iter % vec->size();
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	139	(*vec)[ix].AddHistogram(sample);
				140	}
				141	}
				142
				143	inline static float BitCost(int total, int count) {
				144	return count == 0 ? FastLog2(total) + 2 : FastLog2(total) - FastLog2(count);
				145	}
				146
				147	template<typename DataType, int kSize>
				148	void FindBlocks(const DataType* data, const size_t length,
				149	const double block_switch_bitcost,
				150	const std::vector<Histogram<kSize> > &vec,
				151	uint8_t *block_id) {
				152	if (vec.size() <= 1) {
				153	for (int i = 0; i < length; ++i) {
				154	block_id[i] = 0;
				155	}
				156	return;
				157	}
				158	int vecsize = vec.size();
				159	double* insert_cost = new double[kSize * vecsize];
				160	memset(insert_cost, 0, sizeof(insert_cost[0]) * kSize * vecsize);
				161	for (int i = 0; i < kSize; ++i) {
				162	for (int j = 0; j < vecsize; ++j) {
				163	insert_cost[i * vecsize + j] =
				164	BitCost(vec[j].total_count_, vec[j].data_[i]);
				165	}
				166	}
				167	double *cost = new double[vecsize];
				168	memset(cost, 0, sizeof(cost[0]) * vecsize);
				169	bool* switch_signal = new bool[length * vecsize];
				170	memset(switch_signal, 0, sizeof(switch_signal[0]) * length * vecsize);
				171	// After each iteration of this loop, cost[k] will contain the difference
				172	// between the minimum cost of arriving at the current byte position using
				173	// entropy code k, and the minimum cost of arriving at the current byte
				174	// position. This difference is capped at the block switch cost, and if it
				175	// reaches block switch cost, it means that when we trace back from the last
				176	// position, we need to switch here.
				177	for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
				178	int ix = byte_ix * vecsize;
				179	int insert_cost_ix = data[byte_ix] * vecsize;
				180	double min_cost = 1e99;
				181	for (int k = 0; k < vecsize; ++k) {
				182	// We are coding the symbol in data[byte_ix] with entropy code k.
				183	cost[k] += insert_cost[insert_cost_ix + k];
				184	if (cost[k] < min_cost) {
				185	min_cost = cost[k];
				186	block_id[byte_ix] = k;
				187	}
				188	}
				189	double block_switch_cost = block_switch_bitcost;
				190	// More blocks for the beginning.
				191	if (byte_ix < 2000) {
				192	block_switch_cost = 0.77 + 0.07 byte_ix / 2000;
				193	}
				194	for (int k = 0; k < vecsize; ++k) {
				195	cost[k] -= min_cost;
				196	if (cost[k] >= block_switch_cost) {
				197	cost[k] = block_switch_cost;
				198	switch_signal[ix + k] = true;
				199	}
				200	}
				201	}
				202	// Now trace back from the last position and switch at the marked places.
				203	int byte_ix = length - 1;
				204	int ix = byte_ix * vecsize;
				205	int cur_id = block_id[byte_ix];
				206	while (byte_ix > 0) {
				207	--byte_ix;
				208	ix -= vecsize;
				209	if (switch_signal[ix + cur_id]) {
				210	cur_id = block_id[byte_ix];
				211	}
				212	block_id[byte_ix] = cur_id;
				213	}
				214	delete[] insert_cost;
				215	delete[] cost;
				216	delete[] switch_signal;
				217	}
				218
				219	int RemapBlockIds(uint8_t* block_ids, const size_t length) {
				220	std::map<uint8_t, uint8_t> new_id;
				221	int next_id = 0;
				222	for (int i = 0; i < length; ++i) {
				223	if (new_id.find(block_ids[i]) == new_id.end()) {
				224	new_id[block_ids[i]] = next_id;
				225	++next_id;
				226	}
				227	}
				228	for (int i = 0; i < length; ++i) {
				229	block_ids[i] = new_id[block_ids[i]];
				230	}
				231	return next_id;
				232	}
				233
				234	template<typename HistogramType, typename DataType>
				235	void BuildBlockHistograms(const DataType* data, const size_t length,
				236	uint8_t* block_ids,
				237	std::vector<HistogramType>* histograms) {
				238	int num_types = RemapBlockIds(block_ids, length);
				239	histograms->clear();
				240	histograms->resize(num_types);
				241	for (int i = 0; i < length; ++i) {
				242	(*histograms)[block_ids[i]].Add(data[i]);
				243	}
				244	}
				245
				246	template<typename HistogramType, typename DataType>
				247	void ClusterBlocks(const DataType* data, const size_t length,
				248	uint8_t* block_ids) {
				249	std::vector<HistogramType> histograms;
				250	std::vector<int> block_index(length);
				251	int cur_idx = 0;
				252	HistogramType cur_histogram;
				253	for (int i = 0; i < length; ++i) {
				254	bool block_boundary = (i + 1 == length \|\| block_ids[i] != block_ids[i + 1]);
				255	block_index[i] = cur_idx;
				256	cur_histogram.Add(data[i]);
				257	if (block_boundary) {
				258	histograms.push_back(cur_histogram);
				259	cur_histogram.Clear();
				260	++cur_idx;
				261	}
				262	}
				263	std::vector<HistogramType> clustered_histograms;
				264	std::vector<int> histogram_symbols;
Zoltan Szabadka	e709491	2013-12-12 13:18:04 +0100	[diff] [blame]	265	// Block ids need to fit in one byte.
				266	static const int kMaxNumberOfBlockTypes = 256;
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	267	ClusterHistograms(histograms, 1, histograms.size(),
				268	kMaxNumberOfBlockTypes,
				269	&clustered_histograms,
				270	&histogram_symbols);
				271	for (int i = 0; i < length; ++i) {
				272	block_ids[i] = histogram_symbols[block_index[i]];
				273	}
				274	}
				275
				276	void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
				277	int cur_id = block_ids[0];
				278	int cur_length = 1;
				279	split->num_types_ = -1;
				280	for (int i = 1; i < block_ids.size(); ++i) {
				281	if (block_ids[i] != cur_id) {
				282	split->types_.push_back(cur_id);
				283	split->lengths_.push_back(cur_length);
				284	split->num_types_ = std::max(split->num_types_, cur_id);
				285	cur_id = block_ids[i];
				286	cur_length = 0;
				287	}
				288	++cur_length;
				289	}
				290	split->types_.push_back(cur_id);
				291	split->lengths_.push_back(cur_length);
				292	split->num_types_ = std::max(split->num_types_, cur_id);
				293	++split->num_types_;
				294	}
				295
				296	template<typename HistogramType, typename DataType>
				297	void SplitByteVector(const std::vector<DataType>& data,
				298	const int literals_per_histogram,
				299	const int max_histograms,
				300	const int sampling_stride_length,
				301	const double block_switch_cost,
				302	BlockSplit* split) {
				303	if (data.empty()) {
				304	split->num_types_ = 0;
				305	return;
				306	} else if (data.size() < kMinLengthForBlockSplitting) {
				307	split->num_types_ = 1;
				308	split->types_.push_back(0);
				309	split->lengths_.push_back(data.size());
				310	return;
				311	}
				312	std::vector<HistogramType> histograms;
				313	// Find good entropy codes.
				314	InitialEntropyCodes(data.data(), data.size(),
				315	literals_per_histogram,
				316	max_histograms,
				317	sampling_stride_length,
				318	&histograms);
				319	RefineEntropyCodes(data.data(), data.size(),
				320	sampling_stride_length,
				321	&histograms);
				322	// Find a good path through literals with the good entropy codes.
				323	std::vector<uint8_t> block_ids(data.size());
				324	for (int i = 0; i < 10; ++i) {
				325	FindBlocks(data.data(), data.size(),
				326	block_switch_cost,
				327	histograms,
				328	&block_ids[0]);
				329	BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
				330	}
				331	ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
				332	BuildBlockSplit(block_ids, split);
				333	}
				334
				335	void SplitBlock(const std::vector<Command>& cmds,
				336	const uint8_t* data,
				337	BlockSplit* literal_split,
				338	BlockSplit* insert_and_copy_split,
				339	BlockSplit* dist_split) {
				340	// Create a continuous array of literals.
				341	std::vector<uint8_t> literals;
				342	CopyLiteralsToByteArray(cmds, data, &literals);
				343
				344	// Compute prefix codes for commands.
				345	std::vector<uint16_t> insert_and_copy_codes;
				346	std::vector<uint8_t> distance_prefixes;
				347	CopyCommandsToByteArray(cmds,
				348	&insert_and_copy_codes,
				349	&distance_prefixes);
				350
Zoltan Szabadka	278b894	2014-03-20 14:32:35 +0100	[diff] [blame]	351
Zoltan Szabadka	79e99af	2013-10-23 13:06:13 +0200	[diff] [blame]	352	SplitByteVector<HistogramLiteral>(
				353	literals,
				354	kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
				355	kLiteralStrideLength, kLiteralBlockSwitchCost,
				356	literal_split);
				357	SplitByteVector<HistogramCommand>(
				358	insert_and_copy_codes,
				359	kSymbolsPerCommandHistogram, kMaxCommandHistograms,
				360	kCommandStrideLength, kCommandBlockSwitchCost,
				361	insert_and_copy_split);
				362	SplitByteVector<HistogramDistance>(
				363	distance_prefixes,
				364	kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
				365	kCommandStrideLength, kDistanceBlockSwitchCost,
				366	dist_split);
				367	}
				368
				369	void SplitBlockByTotalLength(const std::vector<Command>& all_commands,
				370	int input_size,
				371	int target_length,
				372	std::vector<std::vector<Command> >* blocks) {
				373	int num_blocks = input_size / target_length + 1;
				374	int length_limit = input_size / num_blocks + 1;
				375	int total_length = 0;
				376	std::vector<Command> cur_block;
				377	for (int i = 0; i < all_commands.size(); ++i) {
				378	const Command& cmd = all_commands[i];
				379	int cmd_length = cmd.insert_length_ + cmd.copy_length_;
				380	if (total_length > length_limit) {
				381	blocks->push_back(cur_block);
				382	cur_block.clear();
				383	total_length = 0;
				384	}
				385	cur_block.push_back(cmd);
				386	total_length += cmd_length;
				387	}
				388	blocks->push_back(cur_block);
				389	}
				390
				391	} // namespace brotli