blob: e9ee13c118c85f9afb5d1de58dbee3318e6ef604 [file] [log] [blame]
openvcdiff311c7142008-08-26 19:29:25 +00001// Copyright 2006, 2008 Google Inc.
2// Authors: Chandra Chereddi, Lincoln Smith
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include <config.h>
17#include "vcdiffengine.h"
18#include <stdint.h> // uint32_t
openvcdiff83bbde02008-10-23 23:43:46 +000019#include <string.h> // memcpy
openvcdiff311c7142008-08-26 19:29:25 +000020#include "blockhash.h"
openvcdiffd1845782009-03-20 21:56:15 +000021#include "codetablewriter_interface.h"
openvcdiff311c7142008-08-26 19:29:25 +000022#include "logging.h"
23#include "rolling_hash.h"
24
25namespace open_vcdiff {
26
27VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
28 // If dictionary_size == 0, then dictionary could be NULL. Guard against
29 // using a NULL value.
30 : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
31 dictionary_size_(dictionary_size),
32 hashed_dictionary_(NULL) {
33 if (dictionary_size > 0) {
34 memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
35 }
36}
37
38VCDiffEngine::~VCDiffEngine() {
39 delete hashed_dictionary_;
40 if (dictionary_size_ > 0) {
41 delete[] dictionary_;
42 }
43}
44
45bool VCDiffEngine::Init() {
46 if (hashed_dictionary_) {
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +000047 VCD_DFATAL << "Init() called twice for same VCDiffEngine object"
48 << VCD_ENDL;
openvcdiff311c7142008-08-26 19:29:25 +000049 return false;
50 }
51 hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
52 dictionary_size());
53 if (!hashed_dictionary_) {
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +000054 VCD_DFATAL << "Creation of dictionary hash failed" << VCD_ENDL;
openvcdiff311c7142008-08-26 19:29:25 +000055 return false;
56 }
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +000057 RollingHash<BlockHash::kBlockSize>::Init();
openvcdiff311c7142008-08-26 19:29:25 +000058 return true;
59}
60
61// This helper function tries to find an appropriate match within
62// hashed_dictionary_ for the block starting at the current target position.
63// If target_hash is not NULL, this function will also look for a match
64// within the previously encoded target data.
65//
66// If a match is found, this function will generate an ADD instruction
67// for all unencoded data that precedes the match,
68// and a COPY instruction for the match itself; then it returns
69// the number of bytes processed by both instructions,
70// which is guaranteed to be > 0.
71// If no appropriate match is found, the function returns 0.
72//
73// The first four parameters are input parameters which are passed
74// directly to BlockHash::FindBestMatch; please see that function
75// for a description of their allowable values.
76template<bool look_for_target_matches>
77inline size_t VCDiffEngine::EncodeCopyForBestMatch(
78 uint32_t hash_value,
79 const char* target_candidate_start,
80 const char* unencoded_target_start,
81 size_t unencoded_target_size,
82 const BlockHash* target_hash,
openvcdiffd1845782009-03-20 21:56:15 +000083 CodeTableWriterInterface* coder) const {
openvcdiff311c7142008-08-26 19:29:25 +000084 // When FindBestMatch() comes up with a match for a candidate block,
85 // it will populate best_match with the size, source offset,
86 // and target offset of the match.
87 BlockHash::Match best_match;
88
89 // First look for a match in the dictionary.
90 hashed_dictionary_->FindBestMatch(hash_value,
91 target_candidate_start,
92 unencoded_target_start,
93 unencoded_target_size,
94 &best_match);
95 // If target matching is enabled, then see if there is a better match
96 // within the target data that has been encoded so far.
97 if (look_for_target_matches) {
98 target_hash->FindBestMatch(hash_value,
99 target_candidate_start,
100 unencoded_target_start,
101 unencoded_target_size,
102 &best_match);
103 }
104 if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
105 return 0;
106 }
107 if (best_match.target_offset() > 0) {
108 // Create an ADD instruction to encode all target bytes
109 // from the end of the last COPY match, if any, up to
110 // the beginning of this COPY match.
111 coder->Add(unencoded_target_start, best_match.target_offset());
112 }
113 coder->Copy(best_match.source_offset(), best_match.size());
114 return best_match.target_offset() // ADD size
115 + best_match.size(); // + COPY size
116}
117
118// Once the encoder loop has finished checking for matches in the target data,
119// this function creates an ADD instruction to encode all target bytes
120// from the end of the last COPY match, if any, through the end of
121// the target data. In the worst case, if no matches were found at all,
122// this function will create one big ADD instruction
123// for the entire buffer of target data.
124inline void VCDiffEngine::AddUnmatchedRemainder(
125 const char* unencoded_target_start,
126 size_t unencoded_target_size,
openvcdiffd1845782009-03-20 21:56:15 +0000127 CodeTableWriterInterface* coder) const {
openvcdiff311c7142008-08-26 19:29:25 +0000128 if (unencoded_target_size > 0) {
129 coder->Add(unencoded_target_start, unencoded_target_size);
130 }
131}
132
133// This helper function tells the coder to finish the encoding and write
134// the results into the output string "diff".
openvcdiffd1845782009-03-20 21:56:15 +0000135inline void VCDiffEngine::FinishEncoding(
136 size_t target_size,
137 OutputStringInterface* diff,
138 CodeTableWriterInterface* coder) const {
openvcdiff311c7142008-08-26 19:29:25 +0000139 if (target_size != static_cast<size_t>(coder->target_length())) {
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +0000140 VCD_DFATAL << "Internal error in VCDiffEngine::Encode: "
141 "original target size (" << target_size
142 << ") does not match number of bytes processed ("
143 << coder->target_length() << ")" << VCD_ENDL;
openvcdiff311c7142008-08-26 19:29:25 +0000144 }
145 coder->Output(diff);
146}
147
148template<bool look_for_target_matches>
149void VCDiffEngine::EncodeInternal(const char* target_data,
150 size_t target_size,
151 OutputStringInterface* diff,
openvcdiffd1845782009-03-20 21:56:15 +0000152 CodeTableWriterInterface* coder) const {
openvcdiff311c7142008-08-26 19:29:25 +0000153 if (!hashed_dictionary_) {
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +0000154 VCD_DFATAL << "Internal error: VCDiffEngine::Encode() "
155 "called before VCDiffEngine::Init()" << VCD_ENDL;
openvcdiff311c7142008-08-26 19:29:25 +0000156 return;
157 }
158 if (target_size == 0) {
159 return; // Do nothing for empty target
160 }
openvcdiff311c7142008-08-26 19:29:25 +0000161 // Special case for really small input
162 if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
163 AddUnmatchedRemainder(target_data, target_size, coder);
164 FinishEncoding(target_size, diff, coder);
165 return;
166 }
167 RollingHash<BlockHash::kBlockSize> hasher;
168 BlockHash* target_hash = NULL;
169 if (look_for_target_matches) {
170 // Check matches against previously encoded target data
171 // in this same target window, as well as against the dictionary
172 target_hash = BlockHash::CreateTargetHash(target_data,
173 target_size,
174 dictionary_size());
175 if (!target_hash) {
openvcdiff@gmail.com732fff22010-08-04 18:00:00 +0000176 VCD_DFATAL << "Instantiation of target hash failed" << VCD_ENDL;
openvcdiff311c7142008-08-26 19:29:25 +0000177 return;
178 }
179 }
180 const char* const target_end = target_data + target_size;
181 const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
182 // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
183 // dictionary)
184 const char* next_encode = target_data;
185 // candidate_pos points to the start of the kBlockSize-byte block that may
186 // begin a match with the dictionary or previously encoded target data.
187 const char* candidate_pos = target_data;
188 uint32_t hash_value = hasher.Hash(candidate_pos);
189 while (1) {
190 const size_t bytes_encoded =
191 EncodeCopyForBestMatch<look_for_target_matches>(
192 hash_value,
193 candidate_pos,
194 next_encode,
195 (target_end - next_encode),
196 target_hash,
197 coder);
198 if (bytes_encoded > 0) {
199 next_encode += bytes_encoded; // Advance past COPYed data
200 candidate_pos = next_encode;
201 if (candidate_pos > start_of_last_block) {
202 break; // Reached end of target data
203 }
204 // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
205 // can't be used to calculate the hash value at its new position.
206 hash_value = hasher.Hash(candidate_pos);
207 if (look_for_target_matches) {
208 // Update the target hash for the ADDed and COPYed data
209 target_hash->AddAllBlocksThroughIndex(
210 static_cast<int>(next_encode - target_data));
211 }
212 } else {
213 // No match, or match is too small to be worth a COPY instruction.
214 // Move to the next position in the target data.
215 if ((candidate_pos + 1) > start_of_last_block) {
216 break; // Reached end of target data
217 }
218 if (look_for_target_matches) {
219 target_hash->AddOneIndexHash(
220 static_cast<int>(candidate_pos - target_data),
221 hash_value);
222 }
223 hash_value = hasher.UpdateHash(hash_value,
224 candidate_pos[0],
225 candidate_pos[BlockHash::kBlockSize]);
226 ++candidate_pos;
227 }
228 }
229 AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
230 FinishEncoding(target_size, diff, coder);
231 delete target_hash;
232}
233
234void VCDiffEngine::Encode(const char* target_data,
235 size_t target_size,
236 bool look_for_target_matches,
237 OutputStringInterface* diff,
openvcdiffd1845782009-03-20 21:56:15 +0000238 CodeTableWriterInterface* coder) const {
openvcdiff311c7142008-08-26 19:29:25 +0000239 if (look_for_target_matches) {
240 EncodeInternal<true>(target_data, target_size, diff, coder);
241 } else {
242 EncodeInternal<false>(target_data, target_size, diff, coder);
243 }
244}
245
246} // namespace open_vcdiff