openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 1 | // Copyright 2007, 2008 Google Inc. |
| 2 | // Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | // you may not use this file except in compliance with the License. |
| 6 | // You may obtain a copy of the License at |
| 7 | // |
| 8 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | // |
| 10 | // Unless required by applicable law or agreed to in writing, software |
| 11 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | // See the License for the specific language governing permissions and |
| 14 | // limitations under the License. |
| 15 | |
| 16 | #ifndef OPEN_VCDIFF_ROLLING_HASH_H_ |
| 17 | #define OPEN_VCDIFF_ROLLING_HASH_H_ |
| 18 | |
| 19 | #include <config.h> |
| 20 | #include <stdint.h> // uint32_t |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 21 | #include "compile_assert.h" |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 22 | #include "logging.h" |
| 23 | |
| 24 | namespace open_vcdiff { |
| 25 | |
| 26 | // Rabin-Karp hasher module -- this is a faster version with different |
| 27 | // constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is |
| 28 | // close enough for most applications. |
| 29 | |
| 30 | // Definitions common to all hash window sizes. |
| 31 | class RollingHashUtil { |
| 32 | public: |
| 33 | // Multiplier for incremental hashing. The compiler should be smart enough to |
| 34 | // convert (val * kMult) into ((val << 8) + val). |
| 35 | static const uint32_t kMult = 257; |
| 36 | |
| 37 | // All hashes are returned modulo "kBase". Current implementation requires |
| 38 | // kBase <= 2^32/kMult to avoid overflow. Also, kBase must be a power of two |
| 39 | // so that we can compute modulus efficiently. |
| 40 | static const uint32_t kBase = (1 << 23); |
| 41 | |
| 42 | // Returns operand % kBase, assuming that kBase is a power of two. |
| 43 | static inline uint32_t ModBase(uint32_t operand) { |
| 44 | return operand & (kBase - 1); |
| 45 | } |
| 46 | |
| 47 | // Given an unsigned integer "operand", returns an unsigned integer "result" |
| 48 | // such that |
| 49 | // result < kBase |
| 50 | // and |
| 51 | // ModBase(operand + result) == 0 |
| 52 | static inline uint32_t FindModBaseInverse(uint32_t operand) { |
| 53 | // The subtraction (0 - operand) produces an unsigned underflow for any |
| 54 | // operand except 0. The underflow results in a (very large) unsigned |
| 55 | // number. Binary subtraction is used instead of unary negation because |
| 56 | // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned |
| 57 | // value is negated. |
| 58 | // |
| 59 | // The C++ mod operation (operand % kBase) may produce different results for |
| 60 | // different compilers if operand is negative. That is not a problem in |
| 61 | // this case, since all numbers used are unsigned, and ModBase does its work |
| 62 | // using bitwise arithmetic rather than the % operator. |
| 63 | return ModBase(uint32_t(0) - operand); |
| 64 | } |
| 65 | |
| 66 | // Here's the heart of the hash algorithm. Start with a partial_hash value of |
| 67 | // 0, and run this HashStep once against each byte in the data window to be |
| 68 | // hashed. The result will be the hash value for the entire data window. The |
| 69 | // Hash() function, below, does exactly this, albeit with some refinements. |
| 70 | static inline uint32_t HashStep(uint32_t partial_hash, |
| 71 | unsigned char next_byte) { |
| 72 | return ModBase((partial_hash * kMult) + next_byte); |
| 73 | } |
| 74 | |
| 75 | // Use this function to start computing a new hash value based on the first |
| 76 | // two bytes in the window. It is equivalent to calling |
| 77 | // HashStep(HashStep(0, ptr[0]), ptr[1]) |
| 78 | // but takes advantage of the fact that the maximum value of |
| 79 | // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus |
| 80 | // avoiding an unnecessary ModBase operation. |
| 81 | static inline uint32_t HashFirstTwoBytes(const char* ptr) { |
| 82 | return (static_cast<unsigned char>(ptr[0]) * kMult) |
| 83 | + static_cast<unsigned char>(ptr[1]); |
| 84 | } |
| 85 | private: |
| 86 | // Making these private avoids copy constructor and assignment operator. |
| 87 | // No objects of this type should be constructed. |
| 88 | RollingHashUtil(); |
| 89 | RollingHashUtil(const RollingHashUtil&); // NOLINT |
| 90 | void operator=(const RollingHashUtil&); |
| 91 | }; |
| 92 | |
| 93 | // window_size must be >= 2. |
| 94 | template<int window_size> |
| 95 | class RollingHash { |
| 96 | public: |
| 97 | // Perform global initialization that is required in order to instantiate a |
| 98 | // RollingHash. This function *must* be called (preferably on startup) by any |
| 99 | // program that uses a RollingHash. It is harmless to call this function more |
| 100 | // than once. It is not thread-safe, but calling it from two different |
| 101 | // threads at the same time can only cause a memory leak, not incorrect |
| 102 | // behavior. Make sure to call it before spawning any threads that could use |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 103 | // RollingHash. |
| 104 | static void Init(); |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 105 | |
| 106 | // Initialize hasher to maintain a window of the specified size. You need an |
| 107 | // instance of this type to use UpdateHash(), but Hash() does not depend on |
| 108 | // remove_table_, so it is static. |
| 109 | RollingHash() { |
| 110 | if (!remove_table_) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 111 | VCD_DFATAL << "RollingHash object instantiated" |
| 112 | " before calling RollingHash::Init()" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 113 | } |
| 114 | } |
| 115 | |
| 116 | // Compute a hash of the window "ptr[0, window_size - 1]". |
| 117 | static uint32_t Hash(const char* ptr) { |
| 118 | uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr); |
| 119 | for (int i = 2; i < window_size; ++i) { |
| 120 | h = RollingHashUtil::HashStep(h, ptr[i]); |
| 121 | } |
| 122 | return h; |
| 123 | } |
| 124 | |
| 125 | // Update a hash by removing the oldest byte and adding a new byte. |
| 126 | // |
| 127 | // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1] |
| 128 | // along with the value of buffer[0] (the "old_first_byte" argument) |
| 129 | // and the value of buffer[window_size] (the "new_last_byte" argument). |
| 130 | // It quickly computes the hash value of buffer[1] ... buffer[window_size] |
| 131 | // without having to run Hash() on the entire window. |
| 132 | // |
| 133 | // The larger the window, the more advantage comes from using UpdateHash() |
| 134 | // (which runs in time independent of window_size) instead of Hash(). |
| 135 | // Each time window_size doubles, the time to execute Hash() also doubles, |
| 136 | // while the time to execute UpdateHash() remains constant. Empirical tests |
| 137 | // have borne out this statement. |
| 138 | uint32_t UpdateHash(uint32_t old_hash, |
| 139 | const char old_first_byte, |
| 140 | const char new_last_byte) const { |
| 141 | uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte); |
| 142 | return RollingHashUtil::HashStep(partial_hash, new_last_byte); |
| 143 | } |
| 144 | |
| 145 | protected: |
| 146 | // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the |
| 147 | // value of the first byte buffer[0], this function returns a *partial* hash |
| 148 | // value for buffer[1] ... buffer[window_size -1]. See the comments in |
| 149 | // Init(), below, for a description of how the contents of remove_table_ are |
| 150 | // computed. |
| 151 | static uint32_t RemoveFirstByteFromHash(uint32_t full_hash, |
| 152 | unsigned char first_byte) { |
| 153 | return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]); |
| 154 | } |
| 155 | |
| 156 | private: |
| 157 | // We keep a table that maps from any byte "b" to |
| 158 | // (- b * pow(kMult, window_size - 1)) % kBase |
| 159 | static const uint32_t* remove_table_; |
| 160 | }; |
| 161 | |
| 162 | // For each window_size, fill a 256-entry table such that |
| 163 | // the hash value of buffer[0] ... buffer[window_size - 1] |
| 164 | // + remove_table_[buffer[0]] |
| 165 | // == the hash value of buffer[1] ... buffer[window_size - 1] |
| 166 | // See the comments in Init(), below, for a description of how the contents of |
| 167 | // remove_table_ are computed. |
| 168 | template<int window_size> |
| 169 | const uint32_t* RollingHash<window_size>::remove_table_ = NULL; |
| 170 | |
| 171 | // Init() checks to make sure that the static object remove_table_ has been |
| 172 | // initialized; if not, it does the considerable work of populating it. Once |
| 173 | // it's ready, the table can be used for any number of RollingHash objects of |
| 174 | // the same window_size. |
| 175 | // |
| 176 | template<int window_size> |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 177 | void RollingHash<window_size>::Init() { |
| 178 | VCD_COMPILE_ASSERT(window_size >= 2, |
| 179 | RollingHash_window_size_must_be_at_least_2); |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 180 | if (remove_table_ == NULL) { |
| 181 | // The new object is placed into a local pointer instead of directly into |
| 182 | // remove_table_, for two reasons: |
| 183 | // 1. remove_table_ is a pointer to const. The table is populated using |
| 184 | // the non-const local pointer and then assigned to the global const |
| 185 | // pointer once it's ready. |
| 186 | // 2. No other thread will ever see remove_table_ pointing to a |
| 187 | // partially-initialized table. If two threads happen to call Init() |
| 188 | // at the same time, two tables with the same contents may be created |
| 189 | // (causing a memory leak), but the results will be consistent |
| 190 | // no matter which of the two tables is used. |
| 191 | uint32_t* new_remove_table = new uint32_t[256]; |
| 192 | // Compute multiplier. Concisely, it is: |
| 193 | // pow(kMult, (window_size - 1)) % kBase, |
| 194 | // but we compute the power in integer form. |
| 195 | uint32_t multiplier = 1; |
| 196 | for (int i = 0; i < window_size - 1; ++i) { |
| 197 | multiplier = |
| 198 | RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult); |
| 199 | } |
| 200 | // For each character removed_byte, compute |
| 201 | // remove_table_[removed_byte] == |
| 202 | // (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase |
| 203 | // where the power operator "pow" is taken in integer form. |
| 204 | // |
| 205 | // If you take a hash value fp representing the hash of |
| 206 | // buffer[0] ... buffer[window_size - 1] |
| 207 | // and add the value of remove_table_[buffer[0]] to it, the result will be |
| 208 | // a partial hash value for |
| 209 | // buffer[1] ... buffer[window_size - 1] |
| 210 | // that is to say, it no longer includes buffer[0]. |
| 211 | // |
| 212 | // The following byte at buffer[window_size] can then be merged with this |
| 213 | // partial hash value to arrive quickly at the hash value for a window that |
| 214 | // has advanced by one byte, to |
| 215 | // buffer[1] ... buffer[window_size] |
| 216 | // In fact, that is precisely what happens in UpdateHash, above. |
| 217 | uint32_t byte_times_multiplier = 0; |
| 218 | for (int removed_byte = 0; removed_byte < 256; ++removed_byte) { |
| 219 | new_remove_table[removed_byte] = |
| 220 | RollingHashUtil::FindModBaseInverse(byte_times_multiplier); |
| 221 | // Iteratively adding the multiplier in this loop is equivalent to |
| 222 | // computing (removed_byte * multiplier), and is faster |
| 223 | byte_times_multiplier = |
| 224 | RollingHashUtil::ModBase(byte_times_multiplier + multiplier); |
| 225 | } |
| 226 | remove_table_ = new_remove_table; |
| 227 | } |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 228 | } |
| 229 | |
| 230 | } // namespace open_vcdiff |
| 231 | |
| 232 | #endif // OPEN_VCDIFF_ROLLING_HASH_H_ |