Blame - src/blockhash.h - fp2-dev/platform/external/chromium_org/sdch/open-vcdiff

blob: 8b7f948d713e315e3a5b8a7a5b859fe223a0d46c [file] [log] [blame]

openvcdiff	311c714	2008-08-26 19:29:25 +0000	[diff] [blame]	1	// Copyright 2006 Google Inc.
				2	// Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith
				3	//
				4	// Licensed under the Apache License, Version 2.0 (the "License");
				5	// you may not use this file except in compliance with the License.
				6	// You may obtain a copy of the License at
				7	//
				8	// http://www.apache.org/licenses/LICENSE-2.0
				9	//
				10	// Unless required by applicable law or agreed to in writing, software
				11	// distributed under the License is distributed on an "AS IS" BASIS,
				12	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	// See the License for the specific language governing permissions and
				14	// limitations under the License.
				15	//
				16	// Implementation of the Bentley/McIlroy algorithm for finding differences.
				17	// Bentley, McIlroy. DCC 1999. Data Compression Using Long Common Strings.
				18	// http://citeseer.ist.psu.edu/555557.html
				19
				20	#ifndef OPEN_VCDIFF_BLOCKHASH_H_
				21	#define OPEN_VCDIFF_BLOCKHASH_H_
				22
				23	#include <config.h>
openvcdiff	28db807	2008-10-10 23:29:11 +0000	[diff] [blame^]	24	#include <stddef.h> // size_t
openvcdiff	311c714	2008-08-26 19:29:25 +0000	[diff] [blame]	25	#include <stdint.h> // uint32_t
openvcdiff	311c714	2008-08-26 19:29:25 +0000	[diff] [blame]	26	#include <vector>
				27
				28	namespace open_vcdiff {
				29
				30	// A generic hash table which will be used to keep track of byte runs
				31	// of size kBlockSize in both the incrementally processed target data
				32	// and the preprocessed source dictionary.
				33	//
				34	// A custom hash table implementation is used instead of the standard
				35	// hash_map template because we know that there will be exactly one
				36	// entry in the BlockHash corresponding to each kBlockSize bytes
				37	// in the source data, which makes certain optimizations possible:
				38	// * The memory for the hash table and for all hash entries can be allocated
				39	// in one step rather than incrementally for each insert operation.
				40	// * A single integer can be used to represent both
				41	// the index of the next hash entry in the chain
				42	// and the position of the entry within the source data
				43	// (== kBlockSize * block_number). This greatly reduces the size
				44	// of a hash entry.
				45	//
				46	class BlockHash {
				47	public:
				48	// Block size as per Bentley/McIlroy; must be a power of two.
				49	//
				50	// Using (for example) kBlockSize = 4 guarantees that no match smaller
				51	// than size 4 will be identified, that some matches having sizes
				52	// 4, 5, or 6 may be identified, and that all matches
				53	// having size 7 or greater will be identified (because any string of
				54	// 7 bytes must contain a complete aligned block of 4 bytes.)
				55	//
				56	// Increasing kBlockSize by a factor of two will halve the amount of
				57	// memory needed for the next block table, and will halve the setup time
				58	// for a new BlockHash. However, it also doubles the minimum
				59	// match length that is guaranteed to be found in FindBestMatch(),
				60	// so that function will be less effective in finding matches.
				61	//
				62	// Computational effort in FindBestMatch (which is the inner loop of
				63	// the encoding algorithm) will be proportional to the number of
				64	// matches found, and a low value of kBlockSize will waste time
				65	// tracking down small matches. On the other hand, if this value
				66	// is set too high, no matches will be found at all.
				67	//
				68	// It is suggested that different values of kBlockSize be tried against
				69	// a representative data set to find the best tradeoff between
				70	// memory/CPU and the effectiveness of FindBestMatch().
				71	//
				72	// If you change kBlockSize to a smaller value, please increase
				73	// kMaxMatchesToCheck accordingly.
				74	static const int kBlockSize = 32;
				75
				76	// This class is used to store the best match found by FindBestMatch()
				77	// and return it to the caller.
				78	class Match {
				79	public:
				80	Match() : size_(0), source_offset_(-1), target_offset_(-1) { }
				81
				82	void ReplaceIfBetterMatch(size_t candidate_size,
				83	int candidate_source_offset,
				84	int candidate_target_offset) {
				85	if (candidate_size > size_) {
				86	size_ = candidate_size;
				87	source_offset_ = candidate_source_offset;
				88	target_offset_ = candidate_target_offset;
				89	}
				90	}
				91
				92	size_t size() const { return size_; }
				93	int source_offset() const { return source_offset_; }
				94	int target_offset() const { return target_offset_; }
				95
				96	private:
				97	// The size of the best (longest) match passed to ReplaceIfBetterMatch().
				98	size_t size_;
				99
				100	// The source offset of the match, including the starting_offset_
				101	// of the BlockHash for which the match was found.
				102	int source_offset_;
				103
				104	// The target offset of the match. An offset of 0 corresponds to the
				105	// data at target_start, which is an argument of FindBestMatch().
				106	int target_offset_;
				107
				108	// Making these private avoids implicit copy constructor
				109	// & assignment operator
				110	Match(const Match&); // NOLINT
				111	void operator=(const Match&);
				112	};
				113
				114	// A BlockHash is created using a buffer of source data. The hash table
				115	// will contain one entry for each kBlockSize-byte block in the
				116	// source data.
				117	//
				118	// See the comments for starting_offset_, below, for a description of
				119	// the starting_offset argument. For a hash of source (dictionary) data,
				120	// starting_offset_ will be zero; for a hash of previously encoded
				121	// target data, starting_offset_ will be equal to the dictionary size.
				122	//
				123	BlockHash(const char* source_data, size_t source_size, int starting_offset);
				124
				125	~BlockHash();
				126
				127	// Initializes the object before use.
				128	// This method must be called after constructing a BlockHash object,
				129	// and before any other method may be called. This is because
				130	// Init() dynamically allocates hash_table_ and next_block_table_.
				131	// Returns true if initialization succeeded, or false if an error occurred,
				132	// in which case no other method except the destructor may then be used
				133	// on the object.
				134	//
				135	// If populate_hash_table is true, then AddAllBlocks() will be called
				136	// to populate the hash table. If populate_hash_table is false, then
				137	// classes that inherit from BlockHash are expected to call AddBlock()
				138	// to incrementally populate individual blocks of data.
				139	//
				140	bool Init(bool populate_hash_table);
				141
				142	// In the context of the open-vcdiff encoder, BlockHash is used for two
				143	// purposes: to hash the source (dictionary) data, and to hash
				144	// the previously encoded target data. The main differences between
				145	// a dictionary BlockHash and a target BlockHash are as follows:
				146	//
				147	// 1. The best_match->source_offset() returned from FindBestMatch()
				148	// for a target BlockHash is computed in the following manner:
				149	// the starting offset of the first byte in the target data
				150	// is equal to the dictionary size. FindBestMatch() will add
				151	// starting_offset_ to any best_match->source_offset() value it returns,
				152	// in order to produce the correct offset value for a target BlockHash.
				153	// 2. For a dictionary BlockHash, the entire data set is hashed at once
				154	// when Init() is called with the parameter populate_hash_table = true.
				155	// For a target BlockHash, because the previously encoded target data
				156	// includes only the data seen up to the current encoding position,
				157	// the data blocks are hashed incrementally as the encoding position
				158	// advances, using AddOneIndexHash() and AddAllBlocksThroughIndex().
				159	//
				160	// The following two factory functions can be used to create BlockHash
				161	// objects for each of these two purposes. Each factory function calls
				162	// the object constructor and also calls Init(). If an error occurs,
				163	// NULL is returned; otherwise a valid BlockHash object is returned.
				164	// Since a dictionary BlockHash is not expected to be modified after
				165	// initialization, a const object is returned.
				166	// The caller is responsible for deleting the returned object
				167	// (using the C++ delete operator) once it is no longer needed.
				168	static const BlockHash* CreateDictionaryHash(const char* dictionary_data,
				169	size_t dictionary_size);
				170	static BlockHash* CreateTargetHash(const char* target_data,
				171	size_t target_size,
				172	size_t dictionary_size);
				173
				174	// This function will be called to add blocks incrementally to the target hash
				175	// as the encoding position advances through the target data. It will be
				176	// called for every kBlockSize-byte block in the target data, regardless
				177	// of whether the block is aligned evenly on a block boundary. The
				178	// BlockHash will only store hash entries for the evenly-aligned blocks.
				179	//
				180	void AddOneIndexHash(int index, uint32_t hash_value) {
				181	if (index == NextIndexToAdd()) {
				182	AddBlock(hash_value);
				183	}
				184	}
				185
				186	// Calls AddBlock() for each kBlockSize-byte block in the range
				187	// (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints.
				188	// If end_index <= the last index added (last_block_added_ * kBlockSize),
				189	// this function does nothing.
				190	//
				191	// A partial block beginning anywhere up to (end_index - 1) is also added,
				192	// unless it extends outside the end of the source data. Like AddAllBlocks(),
				193	// this function computes the hash value for each of the blocks in question
				194	// from scratch, so it is not a good option if the hash values have already
				195	// been computed for some other purpose.
				196	//
				197	// Example: assume kBlockSize = 4, last_block_added_ = 1, and there are
				198	// 14 bytes of source data.
				199	// If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock()
				200	// only for block number 2 (at index 8).
				201	// If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call
				202	// AddBlock() at all, because block 3 (beginning at index 12) would
				203	// fall outside the range of source data.
				204	//
				205	// VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to
				206	// add a whole range of data to a target hash when a COPY instruction
				207	// is generated.
				208	void AddAllBlocksThroughIndex(int end_index);
				209
				210	// FindBestMatch takes a position within the unencoded target data
				211	// (target_candidate_start) and the hash value of the kBlockSize bytes
				212	// beginning at that position (hash_value). It attempts to find a matching
				213	// set of bytes within the source (== dictionary) data, expanding
				214	// the match both below and above the target block. It cannot expand
				215	// the match outside the bounds of the source data, or below
				216	// target_start within the target data, or past
				217	// the end limit of (target_start + target_length).
				218	//
				219	// target_candidate_start is the start of the candidate block within the
				220	// target data for which a match will be sought, while
				221	// target_start (which is <= target_candidate_start)
				222	// is the start of the target data that has yet to be encoded.
				223	//
				224	// If a match is found whose size is greater than the size
				225	// of best_match, this function populates *best_match with the
				226	// size, source_offset, and target_offset of the match found.
				227	// best_match->source_offset() will contain the index of the start of the
				228	// matching source data, plus starting_offset_
				229	// (see description of starting_offset_ for details);
				230	// best_match->target_offset() will contain the offset of the match
				231	// beginning with target_start = offset 0, such that
				232	// 0 <= best_match->target_offset()
				233	// <= (target_candidate_start - target_start);
				234	// and best_match->size() will contain the size of the match.
				235	// If no such match is found, this function leaves *best_match unmodified.
				236	//
				237	// On calling FindBestMatch(), best_match must
				238	// point to a valid Match object, and cannot be NULL.
				239	// The same Match object can be passed
				240	// when calling FindBestMatch() on a different BlockHash object
				241	// for the same candidate data block, in order to find
				242	// the best match possible across both objects. For example:
				243	//
				244	// open_vcdiff::BlockHash::Match best_match;
				245	// uint32_t hash_value =
				246	// RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start);
				247	// bh1.FindBestMatch(hash_value,
				248	// target_candidate_start,
				249	// target_start,
				250	// target_length,
				251	// &best_match);
				252	// bh2.FindBestMatch(hash_value,
				253	// target_candidate_start,
				254	// target_start,
				255	// target_length,
				256	// &best_match);
				257	// if (best_size >= 0) {
				258	// // a match was found; its size, source offset, and target offset
				259	// // can be found in best_match
				260	// }
				261	//
				262	// hash_value is passed as a separate parameter from target_candidate_start,
				263	// (rather than calculated within FindBestMatch) in order to take
				264	// advantage of the rolling hash, which quickly calculates the hash value
				265	// of the block starting at target_candidate_start based on
				266	// the known hash value of the block starting at (target_candidate_start - 1).
				267	// See vcdiffengine.cc for more details.
				268	//
				269	// Example:
				270	// kBlockSize: 4
				271	// target text: "ANDREW LLOYD WEBBER"
				272	// 1^ 5^2^ 3^
				273	// dictionary: "INSURANCE : LLOYDS OF LONDON"
				274	// 4^
				275	// hashed dictionary blocks:
				276	// "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON"
				277	//
				278	// 1: target_start (beginning of unencoded data)
				279	// 2: target_candidate_start (for the block "LLOY")
				280	// 3: target_length (points one byte beyond the last byte of data.)
				281	// 4: best_match->source_offset() (after calling FindBestMatch)
				282	// 5: best_match->target_offset() (after calling FindBestMatch)
				283	//
				284	// Under these conditions, FindBestMatch will find a matching
				285	// hashed dictionary block for "LLOY", and will extend the beginning of
				286	// this match backwards by one byte, and the end of the match forwards
				287	// by one byte, finding that the best match is " LLOYD"
				288	// with best_match->source_offset() = 10
				289	// (offset of " LLOYD" in the source string),
				290	// best_match->target_offset() = 6
				291	// (offset of " LLOYD" in the target string),
				292	// and best_match->size() = 6.
				293	//
				294	void FindBestMatch(uint32_t hash_value,
				295	const char* target_candidate_start,
				296	const char* target_start,
				297	size_t target_size,
				298	Match* best_match) const;
				299
				300	protected:
				301	// FindBestMatch() will not process more than this number
				302	// of matching hash entries.
				303	//
				304	// It is necessary to have a limit on the maximum number of matches
				305	// that will be checked in order to avoid the worst-case performance
				306	// possible if, for example, all the blocks in the dictionary have
				307	// the same hash value. See the unit test SearchStringFindsTooManyMatches
				308	// for an example of such a case. The encoder uses a loop in
				309	// VCDiffEngine::Encode over each target byte, containing a loop in
				310	// BlockHash::FindBestMatch over the number of matches (up to a maximum
				311	// of the number of source blocks), containing two loops that extend
				312	// the match forwards and backwards up to the number of source bytes.
				313	// Total complexity in the worst case is
				314	// O([target size] * source_size_ * source_size_)
				315	// Placing a limit on the possible number of matches checked changes this to
				316	// O([target size] * source_size_ * kMaxMatchesToCheck)
				317	//
				318	// In empirical testing on real HTML text, using a block size of 4,
				319	// the number of true matches per call to FindBestMatch() did not exceed 78;
				320	// with a block size of 32, the number of matches did not exceed 3.
				321	//
				322	// The expected number of true matches scales super-linearly
				323	// with the inverse of kBlockSize, but here a linear scale is used
				324	// for block sizes smaller than 32.
				325	static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 8 :
				326	(8 * (32 / kBlockSize));
				327
				328	// Do not skip more than this number of non-matching hash collisions
				329	// to find the next matching entry in the hash chain.
				330	static const int kMaxProbes = 16;
				331
				332	// Internal routine which calculates a hash table size based on kBlockSize and
				333	// the dictionary_size. Will return a power of two if successful, or 0 if an
				334	// internal error occurs. Some calculations (such as GetHashTableIndex())
				335	// depend on the table size being a power of two.
				336	static const size_t CalcTableSize(const size_t dictionary_size);
				337
				338	const size_t GetNumberOfBlocks() const {
				339	return source_size_ / kBlockSize;
				340	}
				341
				342	// Use the lowest-order bits of the hash value
				343	// as the index into the hash table.
				344	uint32_t GetHashTableIndex(uint32_t hash_value) const {
				345	return hash_value & hash_table_mask_;
				346	}
				347
				348	// The index within source_data_ of the next block
				349	// for which AddBlock() should be called.
				350	int NextIndexToAdd() const {
				351	return (last_block_added_ + 1) * kBlockSize;
				352	}
				353
				354	static inline bool TooManyMatches(int* match_counter);
				355
				356	const char* const source_data() { return source_data_; }
				357	const size_t source_size() { return source_size_; }
				358
				359	// Adds an entry to the hash table for one block of source data of length
				360	// kBlockSize, starting at source_data_[block_number * kBlockSize],
				361	// where block_number is always (last_block_added_ + 1). That is,
				362	// AddBlock() must be called once for each block in source_data_
				363	// in increasing order.
				364	void AddBlock(uint32_t hash_value);
				365
				366	// Calls AddBlock() for each complete kBlockSize-byte block between
				367	// source_data_ and (source_data_ + source_size_). It is equivalent
				368	// to calling AddAllBlocksThroughIndex(source_data + source_size).
				369	// This function is called when Init(true) is invoked.
				370	void AddAllBlocks();
				371
				372	// Returns true if the contents of the kBlockSize-byte block
				373	// beginning at block1 are identical to the contents of
				374	// the block beginning at block2; false otherwise.
				375	static bool BlockContentsMatch(const char* block1, const char* block2);
				376
				377	// Compares each machine word of the two (possibly unaligned) blocks, rather
				378	// than each byte, thus reducing the number of test-and-branch instructions
				379	// executed. Returns a boolean (do the blocks match?) rather than
				380	// the signed byte difference returned by memcmp.
				381	//
				382	// BlockContentsMatch will use either this function or memcmp to do its work,
				383	// depending on which is faster for a particular architecture.
				384	//
				385	// For gcc on x86-based architectures, this function has been shown to run
				386	// about twice as fast as the library function memcmp(), and between five and
				387	// nine times faster than the assembly instructions (repz and cmpsb) that gcc
				388	// uses by default for builtin memcmp. On other architectures, or using
				389	// other compilers, this function has not shown to be faster than memcmp.
				390	static bool BlockCompareWords(const char* block1, const char* block2);
				391
				392	// Finds the first block number within the hashed data
				393	// that represents a match for the given hash value.
				394	// Returns -1 if no match was found.
				395	//
				396	// Init() must have been called and returned true before using
				397	// FirstMatchingBlock or NextMatchingBlock. No check is performed
				398	// for this condition; the code will crash if this condition is violated.
				399	//
				400	// The hash table is initially populated with -1 (not found) values,
				401	// so if this function is called before the hash table has been populated
				402	// using AddAllBlocks() or AddBlock(), it will simply return -1
				403	// for any value of hash_value.
				404	int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const;
				405
				406	// Given a block number returned by FirstMatchingBlock()
				407	// or by a previous call to NextMatchingBlock(), returns
				408	// the next block number that matches the same hash value.
				409	// Returns -1 if no match was found.
				410	int NextMatchingBlock(int block_number, const char* block_ptr) const;
				411
				412	// Inline version of FirstMatchingBlock. This saves the cost of a function
				413	// call when this routine is called from within the module. The external
				414	// (non-inlined) version is called only by unit tests.
				415	inline int FirstMatchingBlockInline(uint32_t hash_value,
				416	const char* block_ptr) const;
				417
				418	// Walk through the hash entry chain, skipping over any false matches
				419	// (for which the lowest bits of the fingerprints match,
				420	// but the actual block data does not.) Returns the block number of
				421	// the first true match found, or -1 if no true match was found.
				422	// If block_number is a matching block, the function will return block_number
				423	// without skipping to the next block.
				424	int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const;
				425
				426	// Returns the number of bytes to the left of source_match_start
				427	// that match the corresponding bytes to the left of target_match_start.
				428	// Will not examine more than max_bytes bytes, which is to say that
				429	// the return value will be in the range [0, max_bytes] inclusive.
				430	static int MatchingBytesToLeft(const char* source_match_start,
				431	const char* target_match_start,
				432	int max_bytes);
				433
				434	// Returns the number of bytes starting at source_match_end
				435	// that match the corresponding bytes starting at target_match_end.
				436	// Will not examine more than max_bytes bytes, which is to say that
				437	// the return value will be in the range [0, max_bytes] inclusive.
				438	static int MatchingBytesToRight(const char* source_match_end,
				439	const char* target_match_end,
				440	int max_bytes);
				441
				442	// The protected functions BlockContentsMatch, FirstMatchingBlock,
				443	// NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight
				444	// should be made accessible to unit tests.
				445	friend class BlockHashTest;
				446
				447	private:
				448	const char* const source_data_;
				449	const size_t source_size_;
				450
				451	// The size of this array is determined using CalcTableSize(). It has at
				452	// least one element for each kBlockSize-byte block in the source data.
				453	// GetHashTableIndex() returns an index into this table for a given hash
				454	// value. The value of each element of hash_table_ is the lowest block
				455	// number in the source data whose hash value would return the same value from
				456	// GetHashTableIndex(), or -1 if there is no matching block. This value can
				457	// then be used as an index into next_block_table_ to retrieve the entire set
				458	// of matching block numbers.
				459	std::vector<int> hash_table_;
				460
				461	// An array containing one element for each source block. Each element is
				462	// either -1 (== not found) or the index of the next block whose hash value
				463	// would produce a matching result from GetHashTableIndex().
				464	std::vector<int> next_block_table_;
				465
				466	// This vector has the same size as next_block_table_. For every block number
				467	// B that is referenced in hash_table_, last_block_table_[B] will contain
				468	// the maximum block number that has the same GetHashTableIndex() value
				469	// as block B. This number may be B itself. For a block number B' that
				470	// is not referenced in hash_table_, the value of last_block_table_[B'] is -1.
				471	// This table is used only while populating the hash table, not while looking
				472	// up hash values in the table. Keeping track of the last block number in the
				473	// chain allows us to construct the block chains as FIFO rather than LIFO
				474	// lists, so that the match with the lowest index is returned first. This
				475	// should result in a more compact encoding because the VCDIFF format favors
				476	// smaller index values and repeated index values.
				477	std::vector<int> last_block_table_;
				478
				479	// Performing a bitwise AND with hash_table_mask_ will produce a value ranging
				480	// from 0 to the number of elements in hash_table_.
				481	uint32_t hash_table_mask_;
				482
				483	// The offset of the first byte of source data (the data at source_data_[0]).
				484	// For the purpose of computing offsets, the source data and target data
				485	// are considered to be concatenated -- not literally in a single memory
				486	// buffer, but conceptually as described in the RFC.
				487	// The first byte of the previously encoded target data
				488	// has an offset that is equal to dictionary_size, i.e., just after
				489	// the last byte of source data.
				490	// For a hash of source (dictionary) data, starting_offset_ will be zero;
				491	// for a hash of previously encoded target data, starting_offset_ will be
				492	// equal to the dictionary size.
				493	const int starting_offset_;
				494
				495	// The last index added by AddBlock(). This determines the block number
				496	// for successive calls to AddBlock(), and is also
				497	// used to determine the starting block for AddAllBlocksThroughIndex().
				498	int last_block_added_;
				499
				500	// Making these private avoids implicit copy constructor & assignment operator
				501	BlockHash(const BlockHash&); // NOLINT
				502	void operator=(const BlockHash&);
				503	};
				504
				505	} // namespace open_vcdiff
				506
				507	#endif // OPEN_VCDIFF_BLOCKHASH_H_