openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 1 | // Copyright 2006, 2008 Google Inc. |
| 2 | // Authors: Chandra Chereddi, Lincoln Smith |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | // you may not use this file except in compliance with the License. |
| 6 | // You may obtain a copy of the License at |
| 7 | // |
| 8 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | // |
| 10 | // Unless required by applicable law or agreed to in writing, software |
| 11 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | // See the License for the specific language governing permissions and |
| 14 | // limitations under the License. |
| 15 | |
| 16 | #include <config.h> |
| 17 | #include "vcdiffengine.h" |
| 18 | #include <stdint.h> // uint32_t |
openvcdiff | 83bbde0 | 2008-10-23 23:43:46 +0000 | [diff] [blame] | 19 | #include <string.h> // memcpy |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 20 | #include "blockhash.h" |
openvcdiff | d184578 | 2009-03-20 21:56:15 +0000 | [diff] [blame] | 21 | #include "codetablewriter_interface.h" |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 22 | #include "logging.h" |
| 23 | #include "rolling_hash.h" |
| 24 | |
| 25 | namespace open_vcdiff { |
| 26 | |
| 27 | VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size) |
| 28 | // If dictionary_size == 0, then dictionary could be NULL. Guard against |
| 29 | // using a NULL value. |
| 30 | : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""), |
| 31 | dictionary_size_(dictionary_size), |
| 32 | hashed_dictionary_(NULL) { |
| 33 | if (dictionary_size > 0) { |
| 34 | memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size); |
| 35 | } |
| 36 | } |
| 37 | |
| 38 | VCDiffEngine::~VCDiffEngine() { |
| 39 | delete hashed_dictionary_; |
| 40 | if (dictionary_size_ > 0) { |
| 41 | delete[] dictionary_; |
| 42 | } |
| 43 | } |
| 44 | |
| 45 | bool VCDiffEngine::Init() { |
| 46 | if (hashed_dictionary_) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 47 | VCD_DFATAL << "Init() called twice for same VCDiffEngine object" |
| 48 | << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 49 | return false; |
| 50 | } |
| 51 | hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_, |
| 52 | dictionary_size()); |
| 53 | if (!hashed_dictionary_) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 54 | VCD_DFATAL << "Creation of dictionary hash failed" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 55 | return false; |
| 56 | } |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 57 | RollingHash<BlockHash::kBlockSize>::Init(); |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 58 | return true; |
| 59 | } |
| 60 | |
| 61 | // This helper function tries to find an appropriate match within |
| 62 | // hashed_dictionary_ for the block starting at the current target position. |
| 63 | // If target_hash is not NULL, this function will also look for a match |
| 64 | // within the previously encoded target data. |
| 65 | // |
| 66 | // If a match is found, this function will generate an ADD instruction |
| 67 | // for all unencoded data that precedes the match, |
| 68 | // and a COPY instruction for the match itself; then it returns |
| 69 | // the number of bytes processed by both instructions, |
| 70 | // which is guaranteed to be > 0. |
| 71 | // If no appropriate match is found, the function returns 0. |
| 72 | // |
| 73 | // The first four parameters are input parameters which are passed |
| 74 | // directly to BlockHash::FindBestMatch; please see that function |
| 75 | // for a description of their allowable values. |
| 76 | template<bool look_for_target_matches> |
| 77 | inline size_t VCDiffEngine::EncodeCopyForBestMatch( |
| 78 | uint32_t hash_value, |
| 79 | const char* target_candidate_start, |
| 80 | const char* unencoded_target_start, |
| 81 | size_t unencoded_target_size, |
| 82 | const BlockHash* target_hash, |
openvcdiff | d184578 | 2009-03-20 21:56:15 +0000 | [diff] [blame] | 83 | CodeTableWriterInterface* coder) const { |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 84 | // When FindBestMatch() comes up with a match for a candidate block, |
| 85 | // it will populate best_match with the size, source offset, |
| 86 | // and target offset of the match. |
| 87 | BlockHash::Match best_match; |
| 88 | |
| 89 | // First look for a match in the dictionary. |
| 90 | hashed_dictionary_->FindBestMatch(hash_value, |
| 91 | target_candidate_start, |
| 92 | unencoded_target_start, |
| 93 | unencoded_target_size, |
| 94 | &best_match); |
| 95 | // If target matching is enabled, then see if there is a better match |
| 96 | // within the target data that has been encoded so far. |
| 97 | if (look_for_target_matches) { |
| 98 | target_hash->FindBestMatch(hash_value, |
| 99 | target_candidate_start, |
| 100 | unencoded_target_start, |
| 101 | unencoded_target_size, |
| 102 | &best_match); |
| 103 | } |
| 104 | if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) { |
| 105 | return 0; |
| 106 | } |
| 107 | if (best_match.target_offset() > 0) { |
| 108 | // Create an ADD instruction to encode all target bytes |
| 109 | // from the end of the last COPY match, if any, up to |
| 110 | // the beginning of this COPY match. |
| 111 | coder->Add(unencoded_target_start, best_match.target_offset()); |
| 112 | } |
| 113 | coder->Copy(best_match.source_offset(), best_match.size()); |
| 114 | return best_match.target_offset() // ADD size |
| 115 | + best_match.size(); // + COPY size |
| 116 | } |
| 117 | |
| 118 | // Once the encoder loop has finished checking for matches in the target data, |
| 119 | // this function creates an ADD instruction to encode all target bytes |
| 120 | // from the end of the last COPY match, if any, through the end of |
| 121 | // the target data. In the worst case, if no matches were found at all, |
| 122 | // this function will create one big ADD instruction |
| 123 | // for the entire buffer of target data. |
| 124 | inline void VCDiffEngine::AddUnmatchedRemainder( |
| 125 | const char* unencoded_target_start, |
| 126 | size_t unencoded_target_size, |
openvcdiff | d184578 | 2009-03-20 21:56:15 +0000 | [diff] [blame] | 127 | CodeTableWriterInterface* coder) const { |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 128 | if (unencoded_target_size > 0) { |
| 129 | coder->Add(unencoded_target_start, unencoded_target_size); |
| 130 | } |
| 131 | } |
| 132 | |
| 133 | // This helper function tells the coder to finish the encoding and write |
| 134 | // the results into the output string "diff". |
openvcdiff | d184578 | 2009-03-20 21:56:15 +0000 | [diff] [blame] | 135 | inline void VCDiffEngine::FinishEncoding( |
| 136 | size_t target_size, |
| 137 | OutputStringInterface* diff, |
| 138 | CodeTableWriterInterface* coder) const { |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 139 | if (target_size != static_cast<size_t>(coder->target_length())) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 140 | VCD_DFATAL << "Internal error in VCDiffEngine::Encode: " |
| 141 | "original target size (" << target_size |
| 142 | << ") does not match number of bytes processed (" |
| 143 | << coder->target_length() << ")" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 144 | } |
| 145 | coder->Output(diff); |
| 146 | } |
| 147 | |
| 148 | template<bool look_for_target_matches> |
| 149 | void VCDiffEngine::EncodeInternal(const char* target_data, |
| 150 | size_t target_size, |
| 151 | OutputStringInterface* diff, |
openvcdiff | d184578 | 2009-03-20 21:56:15 +0000 | [diff] [blame] | 152 | CodeTableWriterInterface* coder) const { |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 153 | if (!hashed_dictionary_) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 154 | VCD_DFATAL << "Internal error: VCDiffEngine::Encode() " |
| 155 | "called before VCDiffEngine::Init()" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 156 | return; |
| 157 | } |
| 158 | if (target_size == 0) { |
| 159 | return; // Do nothing for empty target |
| 160 | } |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 161 | // Special case for really small input |
| 162 | if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) { |
| 163 | AddUnmatchedRemainder(target_data, target_size, coder); |
| 164 | FinishEncoding(target_size, diff, coder); |
| 165 | return; |
| 166 | } |
| 167 | RollingHash<BlockHash::kBlockSize> hasher; |
| 168 | BlockHash* target_hash = NULL; |
| 169 | if (look_for_target_matches) { |
| 170 | // Check matches against previously encoded target data |
| 171 | // in this same target window, as well as against the dictionary |
| 172 | target_hash = BlockHash::CreateTargetHash(target_data, |
| 173 | target_size, |
| 174 | dictionary_size()); |
| 175 | if (!target_hash) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 176 | VCD_DFATAL << "Instantiation of target hash failed" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 177 | return; |
| 178 | } |
| 179 | } |
| 180 | const char* const target_end = target_data + target_size; |
| 181 | const char* const start_of_last_block = target_end - BlockHash::kBlockSize; |
| 182 | // Offset of next bytes in string to ADD if NOT copied (i.e., not found in |
| 183 | // dictionary) |
| 184 | const char* next_encode = target_data; |
| 185 | // candidate_pos points to the start of the kBlockSize-byte block that may |
| 186 | // begin a match with the dictionary or previously encoded target data. |
| 187 | const char* candidate_pos = target_data; |
| 188 | uint32_t hash_value = hasher.Hash(candidate_pos); |
| 189 | while (1) { |
| 190 | const size_t bytes_encoded = |
| 191 | EncodeCopyForBestMatch<look_for_target_matches>( |
| 192 | hash_value, |
| 193 | candidate_pos, |
| 194 | next_encode, |
| 195 | (target_end - next_encode), |
| 196 | target_hash, |
| 197 | coder); |
| 198 | if (bytes_encoded > 0) { |
| 199 | next_encode += bytes_encoded; // Advance past COPYed data |
| 200 | candidate_pos = next_encode; |
| 201 | if (candidate_pos > start_of_last_block) { |
| 202 | break; // Reached end of target data |
| 203 | } |
| 204 | // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash |
| 205 | // can't be used to calculate the hash value at its new position. |
| 206 | hash_value = hasher.Hash(candidate_pos); |
| 207 | if (look_for_target_matches) { |
| 208 | // Update the target hash for the ADDed and COPYed data |
| 209 | target_hash->AddAllBlocksThroughIndex( |
| 210 | static_cast<int>(next_encode - target_data)); |
| 211 | } |
| 212 | } else { |
| 213 | // No match, or match is too small to be worth a COPY instruction. |
| 214 | // Move to the next position in the target data. |
| 215 | if ((candidate_pos + 1) > start_of_last_block) { |
| 216 | break; // Reached end of target data |
| 217 | } |
| 218 | if (look_for_target_matches) { |
| 219 | target_hash->AddOneIndexHash( |
| 220 | static_cast<int>(candidate_pos - target_data), |
| 221 | hash_value); |
| 222 | } |
| 223 | hash_value = hasher.UpdateHash(hash_value, |
| 224 | candidate_pos[0], |
| 225 | candidate_pos[BlockHash::kBlockSize]); |
| 226 | ++candidate_pos; |
| 227 | } |
| 228 | } |
| 229 | AddUnmatchedRemainder(next_encode, target_end - next_encode, coder); |
| 230 | FinishEncoding(target_size, diff, coder); |
| 231 | delete target_hash; |
| 232 | } |
| 233 | |
| 234 | void VCDiffEngine::Encode(const char* target_data, |
| 235 | size_t target_size, |
| 236 | bool look_for_target_matches, |
| 237 | OutputStringInterface* diff, |
openvcdiff | d184578 | 2009-03-20 21:56:15 +0000 | [diff] [blame] | 238 | CodeTableWriterInterface* coder) const { |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 239 | if (look_for_target_matches) { |
| 240 | EncodeInternal<true>(target_data, target_size, diff, coder); |
| 241 | } else { |
| 242 | EncodeInternal<false>(target_data, target_size, diff, coder); |
| 243 | } |
| 244 | } |
| 245 | |
| 246 | } // namespace open_vcdiff |