blob: 55d3f191b3940a48d4cb0d2f172f7e877f654f1f [file] [log] [blame]
openvcdiff311c7142008-08-26 19:29:25 +00001// Copyright 2007, 2008 Google Inc.
2// Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#ifndef OPEN_VCDIFF_ROLLING_HASH_H_
17#define OPEN_VCDIFF_ROLLING_HASH_H_
18
19#include <config.h>
20#include <stdint.h> // uint32_t
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +000021#include "compile_assert.h"
openvcdiff311c7142008-08-26 19:29:25 +000022#include "logging.h"
23
24namespace open_vcdiff {
25
26// Rabin-Karp hasher module -- this is a faster version with different
27// constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is
28// close enough for most applications.
29
30// Definitions common to all hash window sizes.
31class RollingHashUtil {
32 public:
33 // Multiplier for incremental hashing. The compiler should be smart enough to
34 // convert (val * kMult) into ((val << 8) + val).
35 static const uint32_t kMult = 257;
36
37 // All hashes are returned modulo "kBase". Current implementation requires
38 // kBase <= 2^32/kMult to avoid overflow. Also, kBase must be a power of two
39 // so that we can compute modulus efficiently.
40 static const uint32_t kBase = (1 << 23);
41
42 // Returns operand % kBase, assuming that kBase is a power of two.
43 static inline uint32_t ModBase(uint32_t operand) {
44 return operand & (kBase - 1);
45 }
46
47 // Given an unsigned integer "operand", returns an unsigned integer "result"
48 // such that
49 // result < kBase
50 // and
51 // ModBase(operand + result) == 0
52 static inline uint32_t FindModBaseInverse(uint32_t operand) {
53 // The subtraction (0 - operand) produces an unsigned underflow for any
54 // operand except 0. The underflow results in a (very large) unsigned
55 // number. Binary subtraction is used instead of unary negation because
56 // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned
57 // value is negated.
58 //
59 // The C++ mod operation (operand % kBase) may produce different results for
60 // different compilers if operand is negative. That is not a problem in
61 // this case, since all numbers used are unsigned, and ModBase does its work
62 // using bitwise arithmetic rather than the % operator.
63 return ModBase(uint32_t(0) - operand);
64 }
65
66 // Here's the heart of the hash algorithm. Start with a partial_hash value of
67 // 0, and run this HashStep once against each byte in the data window to be
68 // hashed. The result will be the hash value for the entire data window. The
69 // Hash() function, below, does exactly this, albeit with some refinements.
70 static inline uint32_t HashStep(uint32_t partial_hash,
71 unsigned char next_byte) {
72 return ModBase((partial_hash * kMult) + next_byte);
73 }
74
75 // Use this function to start computing a new hash value based on the first
76 // two bytes in the window. It is equivalent to calling
77 // HashStep(HashStep(0, ptr[0]), ptr[1])
78 // but takes advantage of the fact that the maximum value of
79 // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus
80 // avoiding an unnecessary ModBase operation.
81 static inline uint32_t HashFirstTwoBytes(const char* ptr) {
82 return (static_cast<unsigned char>(ptr[0]) * kMult)
83 + static_cast<unsigned char>(ptr[1]);
84 }
85 private:
86 // Making these private avoids copy constructor and assignment operator.
87 // No objects of this type should be constructed.
88 RollingHashUtil();
89 RollingHashUtil(const RollingHashUtil&); // NOLINT
90 void operator=(const RollingHashUtil&);
91};
92
93// window_size must be >= 2.
94template<int window_size>
95class RollingHash {
96 public:
97 // Perform global initialization that is required in order to instantiate a
98 // RollingHash. This function *must* be called (preferably on startup) by any
99 // program that uses a RollingHash. It is harmless to call this function more
100 // than once. It is not thread-safe, but calling it from two different
101 // threads at the same time can only cause a memory leak, not incorrect
102 // behavior. Make sure to call it before spawning any threads that could use
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +0000103 // RollingHash.
104 static void Init();
openvcdiff311c7142008-08-26 19:29:25 +0000105
106 // Initialize hasher to maintain a window of the specified size. You need an
107 // instance of this type to use UpdateHash(), but Hash() does not depend on
108 // remove_table_, so it is static.
109 RollingHash() {
110 if (!remove_table_) {
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +0000111 VCD_DFATAL << "RollingHash object instantiated"
112 " before calling RollingHash::Init()" << VCD_ENDL;
openvcdiff311c7142008-08-26 19:29:25 +0000113 }
114 }
115
116 // Compute a hash of the window "ptr[0, window_size - 1]".
117 static uint32_t Hash(const char* ptr) {
118 uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr);
119 for (int i = 2; i < window_size; ++i) {
120 h = RollingHashUtil::HashStep(h, ptr[i]);
121 }
122 return h;
123 }
124
125 // Update a hash by removing the oldest byte and adding a new byte.
126 //
127 // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1]
128 // along with the value of buffer[0] (the "old_first_byte" argument)
129 // and the value of buffer[window_size] (the "new_last_byte" argument).
130 // It quickly computes the hash value of buffer[1] ... buffer[window_size]
131 // without having to run Hash() on the entire window.
132 //
133 // The larger the window, the more advantage comes from using UpdateHash()
134 // (which runs in time independent of window_size) instead of Hash().
135 // Each time window_size doubles, the time to execute Hash() also doubles,
136 // while the time to execute UpdateHash() remains constant. Empirical tests
137 // have borne out this statement.
138 uint32_t UpdateHash(uint32_t old_hash,
139 const char old_first_byte,
140 const char new_last_byte) const {
141 uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte);
142 return RollingHashUtil::HashStep(partial_hash, new_last_byte);
143 }
144
145 protected:
146 // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the
147 // value of the first byte buffer[0], this function returns a *partial* hash
148 // value for buffer[1] ... buffer[window_size -1]. See the comments in
149 // Init(), below, for a description of how the contents of remove_table_ are
150 // computed.
151 static uint32_t RemoveFirstByteFromHash(uint32_t full_hash,
152 unsigned char first_byte) {
153 return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]);
154 }
155
156 private:
157 // We keep a table that maps from any byte "b" to
158 // (- b * pow(kMult, window_size - 1)) % kBase
159 static const uint32_t* remove_table_;
160};
161
162// For each window_size, fill a 256-entry table such that
163// the hash value of buffer[0] ... buffer[window_size - 1]
164// + remove_table_[buffer[0]]
165// == the hash value of buffer[1] ... buffer[window_size - 1]
166// See the comments in Init(), below, for a description of how the contents of
167// remove_table_ are computed.
168template<int window_size>
169const uint32_t* RollingHash<window_size>::remove_table_ = NULL;
170
171// Init() checks to make sure that the static object remove_table_ has been
172// initialized; if not, it does the considerable work of populating it. Once
173// it's ready, the table can be used for any number of RollingHash objects of
174// the same window_size.
175//
176template<int window_size>
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +0000177void RollingHash<window_size>::Init() {
178 VCD_COMPILE_ASSERT(window_size >= 2,
179 RollingHash_window_size_must_be_at_least_2);
openvcdiff311c7142008-08-26 19:29:25 +0000180 if (remove_table_ == NULL) {
181 // The new object is placed into a local pointer instead of directly into
182 // remove_table_, for two reasons:
183 // 1. remove_table_ is a pointer to const. The table is populated using
184 // the non-const local pointer and then assigned to the global const
185 // pointer once it's ready.
186 // 2. No other thread will ever see remove_table_ pointing to a
187 // partially-initialized table. If two threads happen to call Init()
188 // at the same time, two tables with the same contents may be created
189 // (causing a memory leak), but the results will be consistent
190 // no matter which of the two tables is used.
191 uint32_t* new_remove_table = new uint32_t[256];
192 // Compute multiplier. Concisely, it is:
193 // pow(kMult, (window_size - 1)) % kBase,
194 // but we compute the power in integer form.
195 uint32_t multiplier = 1;
196 for (int i = 0; i < window_size - 1; ++i) {
197 multiplier =
198 RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult);
199 }
200 // For each character removed_byte, compute
201 // remove_table_[removed_byte] ==
202 // (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase
203 // where the power operator "pow" is taken in integer form.
204 //
205 // If you take a hash value fp representing the hash of
206 // buffer[0] ... buffer[window_size - 1]
207 // and add the value of remove_table_[buffer[0]] to it, the result will be
208 // a partial hash value for
209 // buffer[1] ... buffer[window_size - 1]
210 // that is to say, it no longer includes buffer[0].
211 //
212 // The following byte at buffer[window_size] can then be merged with this
213 // partial hash value to arrive quickly at the hash value for a window that
214 // has advanced by one byte, to
215 // buffer[1] ... buffer[window_size]
216 // In fact, that is precisely what happens in UpdateHash, above.
217 uint32_t byte_times_multiplier = 0;
218 for (int removed_byte = 0; removed_byte < 256; ++removed_byte) {
219 new_remove_table[removed_byte] =
220 RollingHashUtil::FindModBaseInverse(byte_times_multiplier);
221 // Iteratively adding the multiplier in this loop is equivalent to
222 // computing (removed_byte * multiplier), and is faster
223 byte_times_multiplier =
224 RollingHashUtil::ModBase(byte_times_multiplier + multiplier);
225 }
226 remove_table_ = new_remove_table;
227 }
openvcdiff311c7142008-08-26 19:29:25 +0000228}
229
230} // namespace open_vcdiff
231
232#endif // OPEN_VCDIFF_ROLLING_HASH_H_