Scott Anderson | b0114cb | 2012-04-09 14:08:22 -0700 | [diff] [blame] | 1 | // Copyright 2008 Google Inc. All Rights Reserved. |
| 2 | |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | #include "adler32memcpy.h" |
| 16 | |
| 17 | // We are using (a modified form of) adler-32 checksum algorithm instead |
| 18 | // of CRC since adler-32 is faster than CRC. |
| 19 | // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/) |
| 20 | // This form of adler is bit modified, instead of treating the data in |
| 21 | // units of bytes, 32-bit data is taken as a unit and two 64-bit |
| 22 | // checksums are done (we could have one checksum but two checksums |
| 23 | // make the code run faster). |
| 24 | |
| 25 | // Adler-32 implementation: |
| 26 | // Data is treated as 1-byte numbers and, |
| 27 | // there are two 16-bit numbers a and b |
| 28 | // Initialize a with 1 and b with 0. |
| 29 | // for each data unit 'd' |
| 30 | // a += d |
| 31 | // b += a |
| 32 | // checksum = a<<16 + b |
| 33 | // This sum should never overflow. |
| 34 | // |
| 35 | // Adler-64+64 implementation: |
| 36 | // (applied in this code) |
| 37 | // Data is treated as 32-bit numbers and whole data is separated into two |
| 38 | // streams, and hence the two checksums a1, a2, b1 and b2. |
| 39 | // Initialize a1 and a2 with 1, b1 and b2 with 0 |
| 40 | // add first dataunit to a1 |
| 41 | // add a1 to b1 |
| 42 | // add second dataunit to a1 |
| 43 | // add a1 to b1 |
| 44 | // add third dataunit to a2 |
| 45 | // add a2 to b2 |
| 46 | // add fourth dataunit to a2 |
| 47 | // add a2 to b2 |
| 48 | // ... |
| 49 | // repeat the sequence back for next 4 dataunits |
| 50 | // |
| 51 | // variable A = XMM6 and variable B = XMM7. |
| 52 | // (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7) |
| 53 | |
| 54 | // Assumptions |
| 55 | // 1. size_in_bytes is a multiple of 16. |
| 56 | // 2. srcmem and dstmem are 16 byte aligned. |
| 57 | // 3. size_in_bytes is less than 2^19 bytes. |
| 58 | |
| 59 | // Assumption 3 ensures that there is no overflow when numbers are being |
| 60 | // added (we can remove this assumption by doing modulus with a prime |
| 61 | // number when it is just about to overflow but that would be a very costly |
| 62 | // exercise) |
| 63 | |
| 64 | // Returns true if the checksums are equal. |
| 65 | bool AdlerChecksum::Equals(const AdlerChecksum &other) const { |
| 66 | return ( (a1_ == other.a1_) && (a2_ == other.a2_) && |
| 67 | (b1_ == other.b1_) && (b2_ == other.b2_) ); |
| 68 | } |
| 69 | |
| 70 | // Returns string representation of the Adler checksum. |
| 71 | string AdlerChecksum::ToHexString() const { |
| 72 | char buffer[128]; |
| 73 | snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_); |
| 74 | return string(buffer); |
| 75 | } |
| 76 | |
| 77 | // Sets components of the Adler checksum. |
| 78 | void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) { |
| 79 | a1_ = a1; |
| 80 | a2_ = a2; |
| 81 | b1_ = b1; |
| 82 | b2_ = b2; |
| 83 | } |
| 84 | |
| 85 | // Calculates Adler checksum for supplied data. |
| 86 | bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes, |
| 87 | AdlerChecksum *checksum) { |
| 88 | // Use this data wrapper to access memory with 64bit read/write. |
| 89 | datacast_t data; |
| 90 | unsigned int count = size_in_bytes / sizeof(data); |
| 91 | |
| 92 | if (count > (1U) << 19) { |
| 93 | // Size is too large, must be strictly less than 512 KB. |
| 94 | return false; |
| 95 | } |
| 96 | |
| 97 | uint64 a1 = 1; |
| 98 | uint64 a2 = 1; |
| 99 | uint64 b1 = 0; |
| 100 | uint64 b2 = 0; |
| 101 | |
| 102 | unsigned int i = 0; |
| 103 | while (i < count) { |
| 104 | // Process 64 bits at a time. |
| 105 | data.l64 = data64[i]; |
| 106 | a1 = a1 + data.l32.l; |
| 107 | b1 = b1 + a1; |
| 108 | a1 = a1 + data.l32.h; |
| 109 | b1 = b1 + a1; |
| 110 | i++; |
| 111 | |
| 112 | data.l64 = data64[i]; |
| 113 | a2 = a2 + data.l32.l; |
| 114 | b2 = b2 + a2; |
| 115 | a2 = a2 + data.l32.h; |
| 116 | b2 = b2 + a2; |
| 117 | i++; |
| 118 | } |
| 119 | checksum->Set(a1, a2, b1, b2); |
| 120 | return true; |
| 121 | } |
| 122 | |
| 123 | // C implementation of Adler memory copy. |
| 124 | bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64, |
| 125 | unsigned int size_in_bytes, AdlerChecksum *checksum) { |
| 126 | // Use this data wrapper to access memory with 64bit read/write. |
| 127 | datacast_t data; |
| 128 | unsigned int count = size_in_bytes / sizeof(data); |
| 129 | |
| 130 | if (count > ((1U) << 19)) { |
| 131 | // Size is too large, must be strictly less than 512 KB. |
| 132 | return false; |
| 133 | } |
| 134 | |
| 135 | uint64 a1 = 1; |
| 136 | uint64 a2 = 1; |
| 137 | uint64 b1 = 0; |
| 138 | uint64 b2 = 0; |
| 139 | |
| 140 | unsigned int i = 0; |
| 141 | while (i < count) { |
| 142 | // Process 64 bits at a time. |
| 143 | data.l64 = srcmem64[i]; |
| 144 | a1 = a1 + data.l32.l; |
| 145 | b1 = b1 + a1; |
| 146 | a1 = a1 + data.l32.h; |
| 147 | b1 = b1 + a1; |
| 148 | dstmem64[i] = data.l64; |
| 149 | i++; |
| 150 | |
| 151 | data.l64 = srcmem64[i]; |
| 152 | a2 = a2 + data.l32.l; |
| 153 | b2 = b2 + a2; |
| 154 | a2 = a2 + data.l32.h; |
| 155 | b2 = b2 + a2; |
| 156 | dstmem64[i] = data.l64; |
| 157 | i++; |
| 158 | } |
| 159 | checksum->Set(a1, a2, b1, b2); |
| 160 | return true; |
| 161 | } |
| 162 | |
| 163 | // C implementation of Adler memory copy with some float point ops, |
| 164 | // attempting to warm up the CPU. |
| 165 | bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64, |
| 166 | unsigned int size_in_bytes, AdlerChecksum *checksum) { |
| 167 | // Use this data wrapper to access memory with 64bit read/write. |
| 168 | datacast_t data; |
| 169 | unsigned int count = size_in_bytes / sizeof(data); |
| 170 | |
| 171 | if (count > ((1U) << 19)) { |
| 172 | // Size is too large, must be strictly less than 512 KB. |
| 173 | return false; |
| 174 | } |
| 175 | |
| 176 | uint64 a1 = 1; |
| 177 | uint64 a2 = 1; |
| 178 | uint64 b1 = 0; |
| 179 | uint64 b2 = 0; |
| 180 | |
| 181 | double a = 2.0 * static_cast<double>(srcmem64[0]); |
| 182 | double b = 5.0 * static_cast<double>(srcmem64[0]); |
| 183 | double c = 7.0 * static_cast<double>(srcmem64[0]); |
| 184 | double d = 9.0 * static_cast<double>(srcmem64[0]); |
| 185 | |
| 186 | unsigned int i = 0; |
| 187 | while (i < count) { |
| 188 | // Process 64 bits at a time. |
| 189 | data.l64 = srcmem64[i]; |
| 190 | a1 = a1 + data.l32.l; |
| 191 | b1 = b1 + a1; |
| 192 | a1 = a1 + data.l32.h; |
| 193 | b1 = b1 + a1; |
| 194 | dstmem64[i] = data.l64; |
| 195 | i++; |
| 196 | |
| 197 | // Warm cpu up. |
| 198 | a = a * b; |
| 199 | b = b + c; |
| 200 | |
| 201 | data.l64 = srcmem64[i]; |
| 202 | a2 = a2 + data.l32.l; |
| 203 | b2 = b2 + a2; |
| 204 | a2 = a2 + data.l32.h; |
| 205 | b2 = b2 + a2; |
| 206 | dstmem64[i] = data.l64; |
| 207 | i++; |
| 208 | |
| 209 | // Warm cpu up. |
| 210 | c = c * d; |
| 211 | d = d + d; |
| 212 | } |
| 213 | |
| 214 | // Warm cpu up. |
| 215 | d = a + b + c + d; |
| 216 | if (d == 1.0) { |
| 217 | // Reference the result so that it can't be discarded by the compiler. |
| 218 | printf("Log: This will probably never happen.\n"); |
| 219 | } |
| 220 | |
| 221 | checksum->Set(a1, a2, b1, b2); |
| 222 | return true; |
| 223 | } |
| 224 | |
| 225 | // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy. |
| 226 | bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, |
| 227 | unsigned int size_in_bytes, AdlerChecksum *checksum) { |
| 228 | // Use assembly implementation where supported. |
| 229 | #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) |
| 230 | |
| 231 | // Pull a bit of tricky preprocessing to make the inline asm both |
| 232 | // 32 bit and 64 bit. |
| 233 | #ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86... |
| 234 | #define rAX "%%eax" |
| 235 | #define rCX "%%ecx" |
| 236 | #define rDX "%%edx" |
| 237 | #define rBX "%%ebx" |
| 238 | #define rSP "%%esp" |
| 239 | #define rBP "%%ebp" |
| 240 | #define rSI "%%esi" |
| 241 | #define rDI "%%edi" |
| 242 | #endif |
| 243 | |
| 244 | #ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros. |
| 245 | #define rAX "%%rax" |
| 246 | #define rCX "%%rcx" |
| 247 | #define rDX "%%rdx" |
| 248 | #define rBX "%%rbx" |
| 249 | #define rSP "%%rsp" |
| 250 | #define rBP "%%rbp" |
| 251 | #define rSI "%%rsi" |
| 252 | #define rDI "%%rdi" |
| 253 | #endif |
| 254 | |
| 255 | // Elements 0 to 3 are used for holding checksum terms a1, a2, |
| 256 | // b1, b2 respectively. These elements are filled by asm code. |
| 257 | // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing |
| 258 | // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are |
| 259 | // setting word index 0 and word index 2 to zero). |
| 260 | // Element 6 and 7 are used for setting a1 and a2 to 1. |
| 261 | volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = |
| 262 | {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1}; |
| 263 | |
| 264 | if ((size_in_bytes >> 19) > 0) { |
| 265 | // Size is too large. Must be less than 2^19 bytes = 512 KB. |
| 266 | return false; |
| 267 | } |
| 268 | |
| 269 | // Number of 32-bit words which are not added to a1/a2 in the main loop. |
| 270 | uint32 remaining_words = (size_in_bytes % 48) / 4; |
| 271 | |
| 272 | // Since we are moving 48 bytes at a time number of iterations = total size/48 |
| 273 | // is value of counter. |
| 274 | uint32 num_of_48_byte_units = size_in_bytes / 48; |
| 275 | |
| 276 | asm volatile ( |
| 277 | // Source address is in ESI (extended source index) |
| 278 | // destination is in EDI (extended destination index) |
| 279 | // and counter is already in ECX (extended counter |
| 280 | // index). |
| 281 | "cmp $0, " rCX ";" // Compare counter to zero. |
| 282 | "jz END;" |
| 283 | |
| 284 | // XMM6 is initialized with 1 and XMM7 with 0. |
| 285 | "prefetchnta 0(" rSI ");" |
| 286 | "prefetchnta 64(" rSI ");" |
| 287 | "movdqu 48(" rAX "), %%xmm6;" |
| 288 | "xorps %%xmm7, %%xmm7;" |
| 289 | |
| 290 | // Start of the loop which copies 48 bytes from source to dst each time. |
| 291 | "TOP:\n" |
| 292 | |
| 293 | // Make 6 moves each of 16 bytes from srcmem to XMM registers. |
| 294 | // We are using 2 words out of 4 words in each XMM register, |
| 295 | // word index 0 and word index 2 |
| 296 | "movdqa 0(" rSI "), %%xmm0;" |
| 297 | "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here. |
| 298 | "movdqa 16(" rSI "), %%xmm2;" |
| 299 | "movdqu 20(" rSI "), %%xmm3;" |
| 300 | "movdqa 32(" rSI "), %%xmm4;" |
| 301 | "movdqu 36(" rSI "), %%xmm5;" |
| 302 | |
| 303 | // Move 3 * 16 bytes from XMM registers to dstmem. |
| 304 | // Note: this copy must be performed before pinsrw instructions since |
| 305 | // they will modify the XMM registers. |
| 306 | "movntdq %%xmm0, 0(" rDI ");" |
| 307 | "movntdq %%xmm2, 16(" rDI ");" |
| 308 | "movntdq %%xmm4, 32(" rDI ");" |
| 309 | |
| 310 | // Sets the word[1] and word[3] of XMM0 to XMM5 to zero. |
| 311 | "andps 32(" rAX "), %%xmm0;" |
| 312 | "andps 32(" rAX "), %%xmm1;" |
| 313 | "andps 32(" rAX "), %%xmm2;" |
| 314 | "andps 32(" rAX "), %%xmm3;" |
| 315 | "andps 32(" rAX "), %%xmm4;" |
| 316 | "andps 32(" rAX "), %%xmm5;" |
| 317 | |
| 318 | // Add XMM0 to XMM6 and then add XMM6 to XMM7. |
| 319 | // Repeat this for XMM1, ..., XMM5. |
| 320 | // Overflow(for XMM7) can occur only if there are more |
| 321 | // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so |
| 322 | // if size_in_bytes > 2^19 than overflow occurs. |
| 323 | "paddq %%xmm0, %%xmm6;" |
| 324 | "paddq %%xmm6, %%xmm7;" |
| 325 | "paddq %%xmm1, %%xmm6;" |
| 326 | "paddq %%xmm6, %%xmm7;" |
| 327 | "paddq %%xmm2, %%xmm6;" |
| 328 | "paddq %%xmm6, %%xmm7;" |
| 329 | "paddq %%xmm3, %%xmm6;" |
| 330 | "paddq %%xmm6, %%xmm7;" |
| 331 | "paddq %%xmm4, %%xmm6;" |
| 332 | "paddq %%xmm6, %%xmm7;" |
| 333 | "paddq %%xmm5, %%xmm6;" |
| 334 | "paddq %%xmm6, %%xmm7;" |
| 335 | |
| 336 | // Increment ESI and EDI by 48 bytes and decrement counter by 1. |
| 337 | "add $48, " rSI ";" |
| 338 | "add $48, " rDI ";" |
| 339 | "prefetchnta 0(" rSI ");" |
| 340 | "prefetchnta 64(" rSI ");" |
| 341 | "dec " rCX ";" |
| 342 | "jnz TOP;" |
| 343 | |
| 344 | // Now only remaining_words 32-bit words are left. |
| 345 | // make a loop, add first two words to a1 and next two to a2 (just like |
| 346 | // above loop, the only extra thing we are doing is rechecking |
| 347 | // rDX (=remaining_words) everytime we add a number to a1/a2. |
| 348 | "REM_IS_STILL_NOT_ZERO:\n" |
| 349 | // Unless remaining_words becomes less than 4 words(16 bytes) |
| 350 | // there is not much issue and remaining_words will always |
| 351 | // be a multiple of four by assumption. |
| 352 | "cmp $4, " rDX ";" |
| 353 | // In case for some weird reasons if remaining_words becomes |
| 354 | // less than 4 but not zero then also break the code and go off to END. |
| 355 | "jl END;" |
| 356 | // Otherwise just go on and copy data in chunks of 4-words at a time till |
| 357 | // whole data (<48 bytes) is copied. |
| 358 | "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1. |
| 359 | |
| 360 | "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without |
| 361 | "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary. |
| 362 | |
| 363 | "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination. |
| 364 | "andps 32(" rAX "), %%xmm0;" |
| 365 | "andps 32(" rAX "), %%xmm1;" |
| 366 | "paddq %%xmm0, %%xmm6;" |
| 367 | "paddq %%xmm6, %%xmm7;" |
| 368 | "paddq %%xmm1, %%xmm6;" |
| 369 | "paddq %%xmm6, %%xmm7;" |
| 370 | "add $16, " rSI ";" |
| 371 | "add $16, " rDI ";" |
| 372 | "sub $4, " rDX ";" |
| 373 | // Decrement %rDX by 4 since %rDX is number of 32-bit |
| 374 | // words left after considering all 48-byte units. |
| 375 | "jmp REM_IS_STILL_NOT_ZERO;" |
| 376 | |
| 377 | "END:\n" |
| 378 | // Report checksum values A and B (both right now are two concatenated |
| 379 | // 64 bit numbers and have to be converted to 64 bit numbers) |
| 380 | // seems like Adler128 (since size of each part is 4 byte rather than |
| 381 | // 1 byte). |
| 382 | "movdqa %%xmm6, 0(" rAX ");" |
| 383 | "movdqa %%xmm7, 16(" rAX ");" |
| 384 | "sfence;" |
| 385 | |
| 386 | // No output registers. |
| 387 | : |
| 388 | // Input registers. |
| 389 | : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr), |
| 390 | "c" (num_of_48_byte_units), "d" (remaining_words) |
| 391 | ); // asm. |
| 392 | |
| 393 | if (checksum != NULL) { |
| 394 | checksum->Set(checksum_arr[0], checksum_arr[1], |
| 395 | checksum_arr[2], checksum_arr[3]); |
| 396 | } |
| 397 | |
| 398 | // Everything went fine, so return true (this does not mean |
| 399 | // that there is no problem with memory this just mean that data was copied |
| 400 | // from src to dst and checksum was calculated successfully). |
| 401 | return true; |
| 402 | #else |
| 403 | // Fall back to C implementation for anything else. |
| 404 | return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum); |
| 405 | #endif |
| 406 | } |