openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 1 | // Copyright 2007 Google Inc. |
| 2 | // Author: Lincoln Smith |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | // you may not use this file except in compliance with the License. |
| 6 | // You may obtain a copy of the License at |
| 7 | // |
| 8 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | // |
| 10 | // Unless required by applicable law or agreed to in writing, software |
| 11 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | // See the License for the specific language governing permissions and |
| 14 | // limitations under the License. |
| 15 | // |
| 16 | // Implementation of the Address Cache and Address Encoding |
| 17 | // algorithms described in sections 5.1 - 5.4 of RFC 3284 - |
| 18 | // The VCDIFF Generic Differencing and Compression Data Format. |
| 19 | // The RFC text can be found at http://www.faqs.org/rfcs/rfc3284.html |
| 20 | // |
| 21 | // Assumptions: |
| 22 | // * The VCDAddress type is large enough to hold any offset within |
| 23 | // the source and target windows. The limit (for int32_t) is 2^31-1 bytes. |
| 24 | // The source (dictionary) should not approach this size limit; |
| 25 | // to compress a target file that is larger than |
| 26 | // INT_MAX - (dictionary size) bytes, the encoder must |
| 27 | // break it up into multiple target windows. |
| 28 | |
| 29 | #include <config.h> |
| 30 | #include "addrcache.h" |
| 31 | #include "logging.h" |
| 32 | #include "varint_bigendian.h" |
| 33 | #include "vcdiff_defs.h" // RESULT_ERROR |
| 34 | |
| 35 | namespace open_vcdiff { |
| 36 | |
| 37 | // The constructor does not initialize near_addresses_ and same_addresses_. |
| 38 | // Therefore, Init() must be called before any other method can be used. |
| 39 | // |
| 40 | // Arguments: |
| 41 | // near_cache_size: Size of the NEAR cache (number of 4-byte integers) |
| 42 | // same_cache_size: Size of the SAME cache (number of blocks of |
| 43 | // 256 4-byte integers per block) |
| 44 | // Because the mode is expressed as a byte value, |
| 45 | // near_cache_size + same_cache_size should not exceed 254. |
| 46 | // |
| 47 | VCDiffAddressCache::VCDiffAddressCache(int near_cache_size, |
| 48 | int same_cache_size) |
| 49 | : near_cache_size_(near_cache_size), |
| 50 | same_cache_size_(same_cache_size), |
| 51 | next_slot_(0) { } |
| 52 | |
| 53 | VCDiffAddressCache::VCDiffAddressCache() |
| 54 | : near_cache_size_(kDefaultNearCacheSize), |
| 55 | same_cache_size_(kDefaultSameCacheSize), |
| 56 | next_slot_(0) { } |
| 57 | |
| 58 | // Sets up data structures needed to call other methods. Operations that may |
| 59 | // fail at runtime (for example, validating the provided near_cache_size_ and |
| 60 | // same_cache_size_ parameters against their maximum allowed values) are |
| 61 | // confined to this routine in order to guarantee that the class constructor |
| 62 | // will never fail. Other methods (except the destructor) cannot be invoked |
| 63 | // until this method has been called successfully. After the object has been |
| 64 | // initialized and used, Init() can be called again to reset it to its initial |
| 65 | // state. |
| 66 | // |
| 67 | // Return value: "true" if initialization succeeded, "false" if it failed. |
| 68 | // No other method except the destructor may be invoked if this function |
| 69 | // returns false. The caller is responsible for checking the return value |
| 70 | // and providing an exit path in case of error. |
| 71 | // |
| 72 | bool VCDiffAddressCache::Init() { |
| 73 | // The mode is expressed as a byte value, so there is only room for 256 modes, |
| 74 | // including the two non-cached modes (SELF and HERE). Do not allow a larger |
| 75 | // number of modes to be defined. We do a separate sanity check for |
| 76 | // near_cache_size_ and same_cache_size_ because adding them together can |
| 77 | // cause an integer overflow if each is set to, say, INT_MAX. |
| 78 | if ((near_cache_size_ > (VCD_MAX_MODES - 2)) || (near_cache_size_ < 0)) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 79 | VCD_ERROR << "Near cache size " << near_cache_size_ << " is invalid" |
| 80 | << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 81 | return false; |
| 82 | } |
| 83 | if ((same_cache_size_ > (VCD_MAX_MODES - 2)) || (same_cache_size_ < 0)) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 84 | VCD_ERROR << "Same cache size " << same_cache_size_ << " is invalid" |
| 85 | << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 86 | return false; |
| 87 | } |
| 88 | if ((near_cache_size_ + same_cache_size_) > VCD_MAX_MODES - 2) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 89 | VCD_ERROR << "Using near cache size " << near_cache_size_ |
| 90 | << " and same cache size " << same_cache_size_ |
| 91 | << " would exceed maximum number of COPY modes (" |
| 92 | << VCD_MAX_MODES << ")" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 93 | return false; |
| 94 | } |
| 95 | if (near_cache_size_ > 0) { |
| 96 | near_addresses_.assign(near_cache_size_, 0); |
| 97 | } |
| 98 | if (same_cache_size_ > 0) { |
| 99 | same_addresses_.assign(same_cache_size_ * 256, 0); |
| 100 | } |
| 101 | next_slot_ = 0; // in case Init() is called a second time to reinit |
| 102 | return true; |
| 103 | } |
| 104 | |
| 105 | // This method will be called whenever an address is calculated for an |
| 106 | // encoded or decoded COPY instruction, and will update the contents |
| 107 | // of the SAME and NEAR caches. It is vital that the use of |
| 108 | // UpdateCache (called cache_update in the RFC examples) exactly match |
| 109 | // the RFC standard, and that the same caching logic be used in the |
| 110 | // decoder as in the encoder, in order for the decoded addresses to |
| 111 | // match. |
| 112 | // |
| 113 | // Argument: |
| 114 | // address: This must be a valid address between 0 and |
| 115 | // (source window size + target window size). It is assumed that |
| 116 | // these bounds have been checked before calling UpdateCache. |
| 117 | // |
| 118 | void VCDiffAddressCache::UpdateCache(VCDAddress address) { |
| 119 | if (near_cache_size_ > 0) { |
| 120 | near_addresses_[next_slot_] = address; |
| 121 | next_slot_ = (next_slot_ + 1) % near_cache_size_; |
| 122 | } |
| 123 | if (same_cache_size_ > 0) { |
| 124 | same_addresses_[address % (same_cache_size_ * 256)] = address; |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | // Determines the address mode that yields the most compact encoding |
| 129 | // of the given address value, writes the encoded address into the |
| 130 | // address stream, and returns the mode used. The most compact encoding |
| 131 | // is found by looking for the numerically lowest encoded address. |
| 132 | // The Init() function must already have been called. |
| 133 | // |
| 134 | // Arguments: |
| 135 | // address: The address to be encoded. Must be a non-negative integer |
| 136 | // between 0 and (here_address - 1). |
| 137 | // here_address: The current location in the target data (i.e., the |
| 138 | // position just after the last encoded value.) Must be non-negative. |
| 139 | // encoded_addr: Points to an VCDAddress that will be replaced |
| 140 | // with the encoded representation of address. |
| 141 | // If WriteAddressAsVarintForMode returns true when passed |
| 142 | // the return value, then encoded_addr should be written |
| 143 | // into the delta file as a variable-length integer (Varint); |
| 144 | // otherwise, it should be written as a byte (unsigned char). |
| 145 | // |
| 146 | // Return value: A mode value between 0 and 255. The mode will tell |
| 147 | // how to interpret the next value in the address stream. |
| 148 | // The values 0 and 1 correspond to SELF and HERE addressing. |
| 149 | // |
| 150 | // The function is guaranteed to succeed unless the conditions on the arguments |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 151 | // have not been met, in which case a VCD_DFATAL message will be produced, |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 152 | // 0 will be returned, and *encoded_addr will be replaced with 0. |
| 153 | // |
| 154 | unsigned char VCDiffAddressCache::EncodeAddress(VCDAddress address, |
| 155 | VCDAddress here_address, |
| 156 | VCDAddress* encoded_addr) { |
| 157 | if (address < 0) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 158 | VCD_DFATAL << "EncodeAddress was passed a negative address: " |
| 159 | << address << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 160 | *encoded_addr = 0; |
| 161 | return 0; |
| 162 | } |
| 163 | if (address >= here_address) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 164 | VCD_DFATAL << "EncodeAddress was called with address (" << address |
| 165 | << ") < here_address (" << here_address << ")" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 166 | *encoded_addr = 0; |
| 167 | return 0; |
| 168 | } |
| 169 | // Try using the SAME cache. This method, if available, always |
| 170 | // results in the smallest encoding and takes priority over other modes. |
| 171 | if (same_cache_size() > 0) { |
| 172 | const VCDAddress same_cache_pos = |
| 173 | address % (same_cache_size() * 256); |
| 174 | if (SameAddress(same_cache_pos) == address) { |
| 175 | // This is the only mode for which an single byte will be written |
| 176 | // to the address stream instead of a variable-length integer. |
| 177 | UpdateCache(address); |
| 178 | *encoded_addr = same_cache_pos % 256; |
| 179 | return FirstSameMode() + (same_cache_pos / 256); // SAME mode |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | // Try SELF mode |
| 184 | unsigned char best_mode = VCD_SELF_MODE; |
| 185 | VCDAddress best_encoded_address = address; |
| 186 | |
| 187 | // Try HERE mode |
| 188 | { |
| 189 | const VCDAddress here_encoded_address = here_address - address; |
| 190 | if (here_encoded_address < best_encoded_address) { |
| 191 | best_mode = VCD_HERE_MODE; |
| 192 | best_encoded_address = here_encoded_address; |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | // Try using the NEAR cache |
| 197 | for (int i = 0; i < near_cache_size(); ++i) { |
| 198 | const VCDAddress near_encoded_address = address - NearAddress(i); |
| 199 | if ((near_encoded_address >= 0) && |
| 200 | (near_encoded_address < best_encoded_address)) { |
| 201 | best_mode = FirstNearMode() + i; |
| 202 | best_encoded_address = near_encoded_address; |
| 203 | } |
| 204 | } |
| 205 | |
| 206 | UpdateCache(address); |
| 207 | *encoded_addr = best_encoded_address; |
| 208 | return best_mode; |
| 209 | } |
| 210 | |
| 211 | // Increments *byte_pointer and returns the byte it pointed to before the |
| 212 | // increment. The caller must check bounds to ensure that *byte_pointer |
| 213 | // points to a valid address in memory. |
| 214 | static unsigned char ParseByte(const char** byte_pointer) { |
| 215 | unsigned char byte_value = static_cast<unsigned char>(**byte_pointer); |
| 216 | ++(*byte_pointer); |
| 217 | return byte_value; |
| 218 | } |
| 219 | |
| 220 | // Checks the given decoded address for validity. Returns true if the |
| 221 | // address is valid; otherwise, prints an error message to the log and |
| 222 | // returns false. |
| 223 | static bool IsDecodedAddressValid(VCDAddress decoded_address, |
| 224 | VCDAddress here_address) { |
| 225 | if (decoded_address < 0) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 226 | VCD_ERROR << "Decoded address " << decoded_address << " is invalid" |
| 227 | << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 228 | return false; |
| 229 | } else if (decoded_address >= here_address) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 230 | VCD_ERROR << "Decoded address (" << decoded_address |
| 231 | << ") is beyond location in target file (" << here_address |
| 232 | << ")" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 233 | return false; |
| 234 | } |
| 235 | return true; |
| 236 | } |
| 237 | |
| 238 | // Interprets the next value in the address_stream using the provided mode, |
| 239 | // which may need to access the SAME or NEAR address cache. Returns the |
| 240 | // decoded address. |
| 241 | // The Init() function must already have been called. |
| 242 | // |
| 243 | // Arguments: |
| 244 | // here_address: The current location in the source + target data (i.e., the |
| 245 | // location into which the COPY instruction will copy.) By definition, |
| 246 | // all addresses between 0 and (here_address - 1) are valid, and |
| 247 | // any other address is invalid. |
| 248 | // mode: A byte value between 0 and (near_cache_size_ + same_cache_size_ + 1) |
| 249 | // which tells how to interpret the next value in the address stream. |
| 250 | // The values 0 and 1 correspond to SELF and HERE addressing. |
| 251 | // The validity of "mode" should already have been checked before |
| 252 | // calling this function. |
| 253 | // address_stream: Points to a pointer holding the position |
| 254 | // in the "Addresses section for COPYs" part of the input data. |
| 255 | // That section must already have been uncompressed |
| 256 | // using a secondary decompressor (if necessary.) |
| 257 | // This is an IN/OUT argument; the value of *address_stream will be |
| 258 | // incremented by the size of an integer, or (if the SAME cache |
| 259 | // was used) by the size of a byte (1). |
| 260 | // address_stream_end: Points to the position just after the end of |
| 261 | // the address stream buffer. All addresses between *address_stream |
| 262 | // and address_stream_end should contain valid address data. |
| 263 | // |
| 264 | // Return value: If the input conditions were met, and the address section |
| 265 | // of the input data contains properly encoded addresses that match |
| 266 | // the instructions section, then an integer between 0 and here_address - 1 |
| 267 | // will be returned, representing the address from which data should |
| 268 | // be copied from the source or target window into the output stream. |
| 269 | // If an invalid address value is found in address_stream, then |
| 270 | // RESULT_ERROR will be returned. If the limit address_stream_end |
| 271 | // is reached before the address can be decoded, then |
| 272 | // RESULT_END_OF_DATA will be returned. If more streamed data |
| 273 | // is expected, this means that the consumer should block and wait |
| 274 | // for more data before continuing to decode. If no more data is expected, |
| 275 | // this return value signals an error condition. |
| 276 | // |
| 277 | VCDAddress VCDiffAddressCache::DecodeAddress(VCDAddress here_address, |
| 278 | unsigned char mode, |
| 279 | const char** address_stream, |
| 280 | const char* address_stream_end) { |
| 281 | if (here_address < 0) { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 282 | VCD_DFATAL << "DecodeAddress was passed a negative value" |
| 283 | " for here_address: " << here_address << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 284 | return RESULT_ERROR; |
| 285 | } |
| 286 | const char* new_address_pos = *address_stream; |
| 287 | if (new_address_pos >= address_stream_end) { |
| 288 | return RESULT_END_OF_DATA; |
| 289 | } |
| 290 | VCDAddress decoded_address; |
| 291 | if (IsSameMode(mode)) { |
| 292 | // SAME mode expects a byte value as the encoded address |
| 293 | unsigned char encoded_address = ParseByte(&new_address_pos); |
| 294 | decoded_address = DecodeSameAddress(mode, encoded_address); |
| 295 | } else { |
| 296 | // All modes except SAME mode expect a VarintBE as the encoded address |
| 297 | int32_t encoded_address = VarintBE<int32_t>::Parse(address_stream_end, |
| 298 | &new_address_pos); |
| 299 | switch (encoded_address) { |
| 300 | case RESULT_ERROR: |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 301 | VCD_ERROR << "Found invalid variable-length integer " |
| 302 | "as encoded address value" << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 303 | return RESULT_ERROR; |
| 304 | case RESULT_END_OF_DATA: |
| 305 | return RESULT_END_OF_DATA; |
| 306 | default: |
| 307 | break; |
| 308 | } |
| 309 | if (IsSelfMode(mode)) { |
| 310 | decoded_address = DecodeSelfAddress(encoded_address); |
| 311 | } else if (IsHereMode(mode)) { |
| 312 | decoded_address = DecodeHereAddress(encoded_address, here_address); |
| 313 | } else if (IsNearMode(mode)) { |
| 314 | decoded_address = DecodeNearAddress(mode, encoded_address); |
| 315 | } else { |
openvcdiff@gmail.com | 732fff2 | 2010-08-04 18:00:00 +0000 | [diff] [blame] | 316 | VCD_DFATAL << "Invalid mode value (" << static_cast<int>(mode) |
| 317 | << ") passed to DecodeAddress; maximum mode value = " |
| 318 | << static_cast<int>(LastMode()) << VCD_ENDL; |
openvcdiff | 311c714 | 2008-08-26 19:29:25 +0000 | [diff] [blame] | 319 | return RESULT_ERROR; |
| 320 | } |
| 321 | } |
| 322 | // Check for an out-of-bounds address (corrupt/malicious data) |
| 323 | if (!IsDecodedAddressValid(decoded_address, here_address)) { |
| 324 | return RESULT_ERROR; |
| 325 | } |
| 326 | *address_stream = new_address_pos; |
| 327 | UpdateCache(decoded_address); |
| 328 | return decoded_address; |
| 329 | } |
| 330 | |
| 331 | } // namespace open_vcdiff |