blob: 8b7f948d713e315e3a5b8a7a5b859fe223a0d46c [file] [log] [blame]
openvcdiff311c7142008-08-26 19:29:25 +00001// Copyright 2006 Google Inc.
2// Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16// Implementation of the Bentley/McIlroy algorithm for finding differences.
17// Bentley, McIlroy. DCC 1999. Data Compression Using Long Common Strings.
18// http://citeseer.ist.psu.edu/555557.html
19
20#ifndef OPEN_VCDIFF_BLOCKHASH_H_
21#define OPEN_VCDIFF_BLOCKHASH_H_
22
23#include <config.h>
openvcdiff28db8072008-10-10 23:29:11 +000024#include <stddef.h> // size_t
openvcdiff311c7142008-08-26 19:29:25 +000025#include <stdint.h> // uint32_t
openvcdiff311c7142008-08-26 19:29:25 +000026#include <vector>
27
28namespace open_vcdiff {
29
30// A generic hash table which will be used to keep track of byte runs
31// of size kBlockSize in both the incrementally processed target data
32// and the preprocessed source dictionary.
33//
34// A custom hash table implementation is used instead of the standard
35// hash_map template because we know that there will be exactly one
36// entry in the BlockHash corresponding to each kBlockSize bytes
37// in the source data, which makes certain optimizations possible:
38// * The memory for the hash table and for all hash entries can be allocated
39// in one step rather than incrementally for each insert operation.
40// * A single integer can be used to represent both
41// the index of the next hash entry in the chain
42// and the position of the entry within the source data
43// (== kBlockSize * block_number). This greatly reduces the size
44// of a hash entry.
45//
46class BlockHash {
47 public:
48 // Block size as per Bentley/McIlroy; must be a power of two.
49 //
50 // Using (for example) kBlockSize = 4 guarantees that no match smaller
51 // than size 4 will be identified, that some matches having sizes
52 // 4, 5, or 6 may be identified, and that all matches
53 // having size 7 or greater will be identified (because any string of
54 // 7 bytes must contain a complete aligned block of 4 bytes.)
55 //
56 // Increasing kBlockSize by a factor of two will halve the amount of
57 // memory needed for the next block table, and will halve the setup time
58 // for a new BlockHash. However, it also doubles the minimum
59 // match length that is guaranteed to be found in FindBestMatch(),
60 // so that function will be less effective in finding matches.
61 //
62 // Computational effort in FindBestMatch (which is the inner loop of
63 // the encoding algorithm) will be proportional to the number of
64 // matches found, and a low value of kBlockSize will waste time
65 // tracking down small matches. On the other hand, if this value
66 // is set too high, no matches will be found at all.
67 //
68 // It is suggested that different values of kBlockSize be tried against
69 // a representative data set to find the best tradeoff between
70 // memory/CPU and the effectiveness of FindBestMatch().
71 //
72 // If you change kBlockSize to a smaller value, please increase
73 // kMaxMatchesToCheck accordingly.
74 static const int kBlockSize = 32;
75
76 // This class is used to store the best match found by FindBestMatch()
77 // and return it to the caller.
78 class Match {
79 public:
80 Match() : size_(0), source_offset_(-1), target_offset_(-1) { }
81
82 void ReplaceIfBetterMatch(size_t candidate_size,
83 int candidate_source_offset,
84 int candidate_target_offset) {
85 if (candidate_size > size_) {
86 size_ = candidate_size;
87 source_offset_ = candidate_source_offset;
88 target_offset_ = candidate_target_offset;
89 }
90 }
91
92 size_t size() const { return size_; }
93 int source_offset() const { return source_offset_; }
94 int target_offset() const { return target_offset_; }
95
96 private:
97 // The size of the best (longest) match passed to ReplaceIfBetterMatch().
98 size_t size_;
99
100 // The source offset of the match, including the starting_offset_
101 // of the BlockHash for which the match was found.
102 int source_offset_;
103
104 // The target offset of the match. An offset of 0 corresponds to the
105 // data at target_start, which is an argument of FindBestMatch().
106 int target_offset_;
107
108 // Making these private avoids implicit copy constructor
109 // & assignment operator
110 Match(const Match&); // NOLINT
111 void operator=(const Match&);
112 };
113
114 // A BlockHash is created using a buffer of source data. The hash table
115 // will contain one entry for each kBlockSize-byte block in the
116 // source data.
117 //
118 // See the comments for starting_offset_, below, for a description of
119 // the starting_offset argument. For a hash of source (dictionary) data,
120 // starting_offset_ will be zero; for a hash of previously encoded
121 // target data, starting_offset_ will be equal to the dictionary size.
122 //
123 BlockHash(const char* source_data, size_t source_size, int starting_offset);
124
125 ~BlockHash();
126
127 // Initializes the object before use.
128 // This method must be called after constructing a BlockHash object,
129 // and before any other method may be called. This is because
130 // Init() dynamically allocates hash_table_ and next_block_table_.
131 // Returns true if initialization succeeded, or false if an error occurred,
132 // in which case no other method except the destructor may then be used
133 // on the object.
134 //
135 // If populate_hash_table is true, then AddAllBlocks() will be called
136 // to populate the hash table. If populate_hash_table is false, then
137 // classes that inherit from BlockHash are expected to call AddBlock()
138 // to incrementally populate individual blocks of data.
139 //
140 bool Init(bool populate_hash_table);
141
142 // In the context of the open-vcdiff encoder, BlockHash is used for two
143 // purposes: to hash the source (dictionary) data, and to hash
144 // the previously encoded target data. The main differences between
145 // a dictionary BlockHash and a target BlockHash are as follows:
146 //
147 // 1. The best_match->source_offset() returned from FindBestMatch()
148 // for a target BlockHash is computed in the following manner:
149 // the starting offset of the first byte in the target data
150 // is equal to the dictionary size. FindBestMatch() will add
151 // starting_offset_ to any best_match->source_offset() value it returns,
152 // in order to produce the correct offset value for a target BlockHash.
153 // 2. For a dictionary BlockHash, the entire data set is hashed at once
154 // when Init() is called with the parameter populate_hash_table = true.
155 // For a target BlockHash, because the previously encoded target data
156 // includes only the data seen up to the current encoding position,
157 // the data blocks are hashed incrementally as the encoding position
158 // advances, using AddOneIndexHash() and AddAllBlocksThroughIndex().
159 //
160 // The following two factory functions can be used to create BlockHash
161 // objects for each of these two purposes. Each factory function calls
162 // the object constructor and also calls Init(). If an error occurs,
163 // NULL is returned; otherwise a valid BlockHash object is returned.
164 // Since a dictionary BlockHash is not expected to be modified after
165 // initialization, a const object is returned.
166 // The caller is responsible for deleting the returned object
167 // (using the C++ delete operator) once it is no longer needed.
168 static const BlockHash* CreateDictionaryHash(const char* dictionary_data,
169 size_t dictionary_size);
170 static BlockHash* CreateTargetHash(const char* target_data,
171 size_t target_size,
172 size_t dictionary_size);
173
174 // This function will be called to add blocks incrementally to the target hash
175 // as the encoding position advances through the target data. It will be
176 // called for every kBlockSize-byte block in the target data, regardless
177 // of whether the block is aligned evenly on a block boundary. The
178 // BlockHash will only store hash entries for the evenly-aligned blocks.
179 //
180 void AddOneIndexHash(int index, uint32_t hash_value) {
181 if (index == NextIndexToAdd()) {
182 AddBlock(hash_value);
183 }
184 }
185
186 // Calls AddBlock() for each kBlockSize-byte block in the range
187 // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints.
188 // If end_index <= the last index added (last_block_added_ * kBlockSize),
189 // this function does nothing.
190 //
191 // A partial block beginning anywhere up to (end_index - 1) is also added,
192 // unless it extends outside the end of the source data. Like AddAllBlocks(),
193 // this function computes the hash value for each of the blocks in question
194 // from scratch, so it is not a good option if the hash values have already
195 // been computed for some other purpose.
196 //
197 // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are
198 // 14 bytes of source data.
199 // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock()
200 // only for block number 2 (at index 8).
201 // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call
202 // AddBlock() at all, because block 3 (beginning at index 12) would
203 // fall outside the range of source data.
204 //
205 // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to
206 // add a whole range of data to a target hash when a COPY instruction
207 // is generated.
208 void AddAllBlocksThroughIndex(int end_index);
209
210 // FindBestMatch takes a position within the unencoded target data
211 // (target_candidate_start) and the hash value of the kBlockSize bytes
212 // beginning at that position (hash_value). It attempts to find a matching
213 // set of bytes within the source (== dictionary) data, expanding
214 // the match both below and above the target block. It cannot expand
215 // the match outside the bounds of the source data, or below
216 // target_start within the target data, or past
217 // the end limit of (target_start + target_length).
218 //
219 // target_candidate_start is the start of the candidate block within the
220 // target data for which a match will be sought, while
221 // target_start (which is <= target_candidate_start)
222 // is the start of the target data that has yet to be encoded.
223 //
224 // If a match is found whose size is greater than the size
225 // of best_match, this function populates *best_match with the
226 // size, source_offset, and target_offset of the match found.
227 // best_match->source_offset() will contain the index of the start of the
228 // matching source data, plus starting_offset_
229 // (see description of starting_offset_ for details);
230 // best_match->target_offset() will contain the offset of the match
231 // beginning with target_start = offset 0, such that
232 // 0 <= best_match->target_offset()
233 // <= (target_candidate_start - target_start);
234 // and best_match->size() will contain the size of the match.
235 // If no such match is found, this function leaves *best_match unmodified.
236 //
237 // On calling FindBestMatch(), best_match must
238 // point to a valid Match object, and cannot be NULL.
239 // The same Match object can be passed
240 // when calling FindBestMatch() on a different BlockHash object
241 // for the same candidate data block, in order to find
242 // the best match possible across both objects. For example:
243 //
244 // open_vcdiff::BlockHash::Match best_match;
245 // uint32_t hash_value =
246 // RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start);
247 // bh1.FindBestMatch(hash_value,
248 // target_candidate_start,
249 // target_start,
250 // target_length,
251 // &best_match);
252 // bh2.FindBestMatch(hash_value,
253 // target_candidate_start,
254 // target_start,
255 // target_length,
256 // &best_match);
257 // if (best_size >= 0) {
258 // // a match was found; its size, source offset, and target offset
259 // // can be found in best_match
260 // }
261 //
262 // hash_value is passed as a separate parameter from target_candidate_start,
263 // (rather than calculated within FindBestMatch) in order to take
264 // advantage of the rolling hash, which quickly calculates the hash value
265 // of the block starting at target_candidate_start based on
266 // the known hash value of the block starting at (target_candidate_start - 1).
267 // See vcdiffengine.cc for more details.
268 //
269 // Example:
270 // kBlockSize: 4
271 // target text: "ANDREW LLOYD WEBBER"
272 // 1^ 5^2^ 3^
273 // dictionary: "INSURANCE : LLOYDS OF LONDON"
274 // 4^
275 // hashed dictionary blocks:
276 // "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON"
277 //
278 // 1: target_start (beginning of unencoded data)
279 // 2: target_candidate_start (for the block "LLOY")
280 // 3: target_length (points one byte beyond the last byte of data.)
281 // 4: best_match->source_offset() (after calling FindBestMatch)
282 // 5: best_match->target_offset() (after calling FindBestMatch)
283 //
284 // Under these conditions, FindBestMatch will find a matching
285 // hashed dictionary block for "LLOY", and will extend the beginning of
286 // this match backwards by one byte, and the end of the match forwards
287 // by one byte, finding that the best match is " LLOYD"
288 // with best_match->source_offset() = 10
289 // (offset of " LLOYD" in the source string),
290 // best_match->target_offset() = 6
291 // (offset of " LLOYD" in the target string),
292 // and best_match->size() = 6.
293 //
294 void FindBestMatch(uint32_t hash_value,
295 const char* target_candidate_start,
296 const char* target_start,
297 size_t target_size,
298 Match* best_match) const;
299
300 protected:
301 // FindBestMatch() will not process more than this number
302 // of matching hash entries.
303 //
304 // It is necessary to have a limit on the maximum number of matches
305 // that will be checked in order to avoid the worst-case performance
306 // possible if, for example, all the blocks in the dictionary have
307 // the same hash value. See the unit test SearchStringFindsTooManyMatches
308 // for an example of such a case. The encoder uses a loop in
309 // VCDiffEngine::Encode over each target byte, containing a loop in
310 // BlockHash::FindBestMatch over the number of matches (up to a maximum
311 // of the number of source blocks), containing two loops that extend
312 // the match forwards and backwards up to the number of source bytes.
313 // Total complexity in the worst case is
314 // O([target size] * source_size_ * source_size_)
315 // Placing a limit on the possible number of matches checked changes this to
316 // O([target size] * source_size_ * kMaxMatchesToCheck)
317 //
318 // In empirical testing on real HTML text, using a block size of 4,
319 // the number of true matches per call to FindBestMatch() did not exceed 78;
320 // with a block size of 32, the number of matches did not exceed 3.
321 //
322 // The expected number of true matches scales super-linearly
323 // with the inverse of kBlockSize, but here a linear scale is used
324 // for block sizes smaller than 32.
325 static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 8 :
326 (8 * (32 / kBlockSize));
327
328 // Do not skip more than this number of non-matching hash collisions
329 // to find the next matching entry in the hash chain.
330 static const int kMaxProbes = 16;
331
332 // Internal routine which calculates a hash table size based on kBlockSize and
333 // the dictionary_size. Will return a power of two if successful, or 0 if an
334 // internal error occurs. Some calculations (such as GetHashTableIndex())
335 // depend on the table size being a power of two.
336 static const size_t CalcTableSize(const size_t dictionary_size);
337
338 const size_t GetNumberOfBlocks() const {
339 return source_size_ / kBlockSize;
340 }
341
342 // Use the lowest-order bits of the hash value
343 // as the index into the hash table.
344 uint32_t GetHashTableIndex(uint32_t hash_value) const {
345 return hash_value & hash_table_mask_;
346 }
347
348 // The index within source_data_ of the next block
349 // for which AddBlock() should be called.
350 int NextIndexToAdd() const {
351 return (last_block_added_ + 1) * kBlockSize;
352 }
353
354 static inline bool TooManyMatches(int* match_counter);
355
356 const char* const source_data() { return source_data_; }
357 const size_t source_size() { return source_size_; }
358
359 // Adds an entry to the hash table for one block of source data of length
360 // kBlockSize, starting at source_data_[block_number * kBlockSize],
361 // where block_number is always (last_block_added_ + 1). That is,
362 // AddBlock() must be called once for each block in source_data_
363 // in increasing order.
364 void AddBlock(uint32_t hash_value);
365
366 // Calls AddBlock() for each complete kBlockSize-byte block between
367 // source_data_ and (source_data_ + source_size_). It is equivalent
368 // to calling AddAllBlocksThroughIndex(source_data + source_size).
369 // This function is called when Init(true) is invoked.
370 void AddAllBlocks();
371
372 // Returns true if the contents of the kBlockSize-byte block
373 // beginning at block1 are identical to the contents of
374 // the block beginning at block2; false otherwise.
375 static bool BlockContentsMatch(const char* block1, const char* block2);
376
377 // Compares each machine word of the two (possibly unaligned) blocks, rather
378 // than each byte, thus reducing the number of test-and-branch instructions
379 // executed. Returns a boolean (do the blocks match?) rather than
380 // the signed byte difference returned by memcmp.
381 //
382 // BlockContentsMatch will use either this function or memcmp to do its work,
383 // depending on which is faster for a particular architecture.
384 //
385 // For gcc on x86-based architectures, this function has been shown to run
386 // about twice as fast as the library function memcmp(), and between five and
387 // nine times faster than the assembly instructions (repz and cmpsb) that gcc
388 // uses by default for builtin memcmp. On other architectures, or using
389 // other compilers, this function has not shown to be faster than memcmp.
390 static bool BlockCompareWords(const char* block1, const char* block2);
391
392 // Finds the first block number within the hashed data
393 // that represents a match for the given hash value.
394 // Returns -1 if no match was found.
395 //
396 // Init() must have been called and returned true before using
397 // FirstMatchingBlock or NextMatchingBlock. No check is performed
398 // for this condition; the code will crash if this condition is violated.
399 //
400 // The hash table is initially populated with -1 (not found) values,
401 // so if this function is called before the hash table has been populated
402 // using AddAllBlocks() or AddBlock(), it will simply return -1
403 // for any value of hash_value.
404 int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const;
405
406 // Given a block number returned by FirstMatchingBlock()
407 // or by a previous call to NextMatchingBlock(), returns
408 // the next block number that matches the same hash value.
409 // Returns -1 if no match was found.
410 int NextMatchingBlock(int block_number, const char* block_ptr) const;
411
412 // Inline version of FirstMatchingBlock. This saves the cost of a function
413 // call when this routine is called from within the module. The external
414 // (non-inlined) version is called only by unit tests.
415 inline int FirstMatchingBlockInline(uint32_t hash_value,
416 const char* block_ptr) const;
417
418 // Walk through the hash entry chain, skipping over any false matches
419 // (for which the lowest bits of the fingerprints match,
420 // but the actual block data does not.) Returns the block number of
421 // the first true match found, or -1 if no true match was found.
422 // If block_number is a matching block, the function will return block_number
423 // without skipping to the next block.
424 int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const;
425
426 // Returns the number of bytes to the left of source_match_start
427 // that match the corresponding bytes to the left of target_match_start.
428 // Will not examine more than max_bytes bytes, which is to say that
429 // the return value will be in the range [0, max_bytes] inclusive.
430 static int MatchingBytesToLeft(const char* source_match_start,
431 const char* target_match_start,
432 int max_bytes);
433
434 // Returns the number of bytes starting at source_match_end
435 // that match the corresponding bytes starting at target_match_end.
436 // Will not examine more than max_bytes bytes, which is to say that
437 // the return value will be in the range [0, max_bytes] inclusive.
438 static int MatchingBytesToRight(const char* source_match_end,
439 const char* target_match_end,
440 int max_bytes);
441
442 // The protected functions BlockContentsMatch, FirstMatchingBlock,
443 // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight
444 // should be made accessible to unit tests.
445 friend class BlockHashTest;
446
447 private:
448 const char* const source_data_;
449 const size_t source_size_;
450
451 // The size of this array is determined using CalcTableSize(). It has at
452 // least one element for each kBlockSize-byte block in the source data.
453 // GetHashTableIndex() returns an index into this table for a given hash
454 // value. The value of each element of hash_table_ is the lowest block
455 // number in the source data whose hash value would return the same value from
456 // GetHashTableIndex(), or -1 if there is no matching block. This value can
457 // then be used as an index into next_block_table_ to retrieve the entire set
458 // of matching block numbers.
459 std::vector<int> hash_table_;
460
461 // An array containing one element for each source block. Each element is
462 // either -1 (== not found) or the index of the next block whose hash value
463 // would produce a matching result from GetHashTableIndex().
464 std::vector<int> next_block_table_;
465
466 // This vector has the same size as next_block_table_. For every block number
467 // B that is referenced in hash_table_, last_block_table_[B] will contain
468 // the maximum block number that has the same GetHashTableIndex() value
469 // as block B. This number may be B itself. For a block number B' that
470 // is not referenced in hash_table_, the value of last_block_table_[B'] is -1.
471 // This table is used only while populating the hash table, not while looking
472 // up hash values in the table. Keeping track of the last block number in the
473 // chain allows us to construct the block chains as FIFO rather than LIFO
474 // lists, so that the match with the lowest index is returned first. This
475 // should result in a more compact encoding because the VCDIFF format favors
476 // smaller index values and repeated index values.
477 std::vector<int> last_block_table_;
478
479 // Performing a bitwise AND with hash_table_mask_ will produce a value ranging
480 // from 0 to the number of elements in hash_table_.
481 uint32_t hash_table_mask_;
482
483 // The offset of the first byte of source data (the data at source_data_[0]).
484 // For the purpose of computing offsets, the source data and target data
485 // are considered to be concatenated -- not literally in a single memory
486 // buffer, but conceptually as described in the RFC.
487 // The first byte of the previously encoded target data
488 // has an offset that is equal to dictionary_size, i.e., just after
489 // the last byte of source data.
490 // For a hash of source (dictionary) data, starting_offset_ will be zero;
491 // for a hash of previously encoded target data, starting_offset_ will be
492 // equal to the dictionary size.
493 const int starting_offset_;
494
495 // The last index added by AddBlock(). This determines the block number
496 // for successive calls to AddBlock(), and is also
497 // used to determine the starting block for AddAllBlocksThroughIndex().
498 int last_block_added_;
499
500 // Making these private avoids implicit copy constructor & assignment operator
501 BlockHash(const BlockHash&); // NOLINT
502 void operator=(const BlockHash&);
503};
504
505} // namespace open_vcdiff
506
507#endif // OPEN_VCDIFF_BLOCKHASH_H_