Blame - src/adler32memcpy.cc - fp2-dev/platform/external/stressapptest

blob: 69324f7b9a104baef20bb842f5921245aca79489 [file] [log] [blame]

Scott Anderson	b0114cb	2012-04-09 14:08:22 -0700	[diff] [blame]	1	// Copyright 2008 Google Inc. All Rights Reserved.
				2
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	#include "adler32memcpy.h"
				16
				17	// We are using (a modified form of) adler-32 checksum algorithm instead
				18	// of CRC since adler-32 is faster than CRC.
				19	// (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
				20	// This form of adler is bit modified, instead of treating the data in
				21	// units of bytes, 32-bit data is taken as a unit and two 64-bit
				22	// checksums are done (we could have one checksum but two checksums
				23	// make the code run faster).
				24
				25	// Adler-32 implementation:
				26	// Data is treated as 1-byte numbers and,
				27	// there are two 16-bit numbers a and b
				28	// Initialize a with 1 and b with 0.
				29	// for each data unit 'd'
				30	// a += d
				31	// b += a
				32	// checksum = a<<16 + b
				33	// This sum should never overflow.
				34	//
				35	// Adler-64+64 implementation:
				36	// (applied in this code)
				37	// Data is treated as 32-bit numbers and whole data is separated into two
				38	// streams, and hence the two checksums a1, a2, b1 and b2.
				39	// Initialize a1 and a2 with 1, b1 and b2 with 0
				40	// add first dataunit to a1
				41	// add a1 to b1
				42	// add second dataunit to a1
				43	// add a1 to b1
				44	// add third dataunit to a2
				45	// add a2 to b2
				46	// add fourth dataunit to a2
				47	// add a2 to b2
				48	// ...
				49	// repeat the sequence back for next 4 dataunits
				50	//
				51	// variable A = XMM6 and variable B = XMM7.
				52	// (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
				53
				54	// Assumptions
				55	// 1. size_in_bytes is a multiple of 16.
				56	// 2. srcmem and dstmem are 16 byte aligned.
				57	// 3. size_in_bytes is less than 2^19 bytes.
				58
				59	// Assumption 3 ensures that there is no overflow when numbers are being
				60	// added (we can remove this assumption by doing modulus with a prime
				61	// number when it is just about to overflow but that would be a very costly
				62	// exercise)
				63
				64	// Returns true if the checksums are equal.
				65	bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
				66	return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
				67	(b1_ == other.b1_) && (b2_ == other.b2_) );
				68	}
				69
				70	// Returns string representation of the Adler checksum.
				71	string AdlerChecksum::ToHexString() const {
				72	char buffer[128];
				73	snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_);
				74	return string(buffer);
				75	}
				76
				77	// Sets components of the Adler checksum.
				78	void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
				79	a1_ = a1;
				80	a2_ = a2;
				81	b1_ = b1;
				82	b2_ = b2;
				83	}
				84
				85	// Calculates Adler checksum for supplied data.
				86	bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
				87	AdlerChecksum *checksum) {
				88	// Use this data wrapper to access memory with 64bit read/write.
				89	datacast_t data;
				90	unsigned int count = size_in_bytes / sizeof(data);
				91
				92	if (count > (1U) << 19) {
				93	// Size is too large, must be strictly less than 512 KB.
				94	return false;
				95	}
				96
				97	uint64 a1 = 1;
				98	uint64 a2 = 1;
				99	uint64 b1 = 0;
				100	uint64 b2 = 0;
				101
				102	unsigned int i = 0;
				103	while (i < count) {
				104	// Process 64 bits at a time.
				105	data.l64 = data64[i];
				106	a1 = a1 + data.l32.l;
				107	b1 = b1 + a1;
				108	a1 = a1 + data.l32.h;
				109	b1 = b1 + a1;
				110	i++;
				111
				112	data.l64 = data64[i];
				113	a2 = a2 + data.l32.l;
				114	b2 = b2 + a2;
				115	a2 = a2 + data.l32.h;
				116	b2 = b2 + a2;
				117	i++;
				118	}
				119	checksum->Set(a1, a2, b1, b2);
				120	return true;
				121	}
				122
				123	// C implementation of Adler memory copy.
				124	bool AdlerMemcpyC(uint64 dstmem64, uint64 srcmem64,
				125	unsigned int size_in_bytes, AdlerChecksum *checksum) {
				126	// Use this data wrapper to access memory with 64bit read/write.
				127	datacast_t data;
				128	unsigned int count = size_in_bytes / sizeof(data);
				129
				130	if (count > ((1U) << 19)) {
				131	// Size is too large, must be strictly less than 512 KB.
				132	return false;
				133	}
				134
				135	uint64 a1 = 1;
				136	uint64 a2 = 1;
				137	uint64 b1 = 0;
				138	uint64 b2 = 0;
				139
				140	unsigned int i = 0;
				141	while (i < count) {
				142	// Process 64 bits at a time.
				143	data.l64 = srcmem64[i];
				144	a1 = a1 + data.l32.l;
				145	b1 = b1 + a1;
				146	a1 = a1 + data.l32.h;
				147	b1 = b1 + a1;
				148	dstmem64[i] = data.l64;
				149	i++;
				150
				151	data.l64 = srcmem64[i];
				152	a2 = a2 + data.l32.l;
				153	b2 = b2 + a2;
				154	a2 = a2 + data.l32.h;
				155	b2 = b2 + a2;
				156	dstmem64[i] = data.l64;
				157	i++;
				158	}
				159	checksum->Set(a1, a2, b1, b2);
				160	return true;
				161	}
				162
				163	// C implementation of Adler memory copy with some float point ops,
				164	// attempting to warm up the CPU.
				165	bool AdlerMemcpyWarmC(uint64 dstmem64, uint64 srcmem64,
				166	unsigned int size_in_bytes, AdlerChecksum *checksum) {
				167	// Use this data wrapper to access memory with 64bit read/write.
				168	datacast_t data;
				169	unsigned int count = size_in_bytes / sizeof(data);
				170
				171	if (count > ((1U) << 19)) {
				172	// Size is too large, must be strictly less than 512 KB.
				173	return false;
				174	}
				175
				176	uint64 a1 = 1;
				177	uint64 a2 = 1;
				178	uint64 b1 = 0;
				179	uint64 b2 = 0;
				180
				181	double a = 2.0 * static_cast<double>(srcmem64[0]);
				182	double b = 5.0 * static_cast<double>(srcmem64[0]);
				183	double c = 7.0 * static_cast<double>(srcmem64[0]);
				184	double d = 9.0 * static_cast<double>(srcmem64[0]);
				185
				186	unsigned int i = 0;
				187	while (i < count) {
				188	// Process 64 bits at a time.
				189	data.l64 = srcmem64[i];
				190	a1 = a1 + data.l32.l;
				191	b1 = b1 + a1;
				192	a1 = a1 + data.l32.h;
				193	b1 = b1 + a1;
				194	dstmem64[i] = data.l64;
				195	i++;
				196
				197	// Warm cpu up.
				198	a = a * b;
				199	b = b + c;
				200
				201	data.l64 = srcmem64[i];
				202	a2 = a2 + data.l32.l;
				203	b2 = b2 + a2;
				204	a2 = a2 + data.l32.h;
				205	b2 = b2 + a2;
				206	dstmem64[i] = data.l64;
				207	i++;
				208
				209	// Warm cpu up.
				210	c = c * d;
				211	d = d + d;
				212	}
				213
				214	// Warm cpu up.
				215	d = a + b + c + d;
				216	if (d == 1.0) {
				217	// Reference the result so that it can't be discarded by the compiler.
				218	printf("Log: This will probably never happen.\n");
				219	}
				220
				221	checksum->Set(a1, a2, b1, b2);
				222	return true;
				223	}
				224
				225	// x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
				226	bool AdlerMemcpyAsm(uint64 dstmem64, uint64 srcmem64,
				227	unsigned int size_in_bytes, AdlerChecksum *checksum) {
				228	// Use assembly implementation where supported.
				229	#if defined(STRESSAPPTEST_CPU_X86_64) \|\| defined(STRESSAPPTEST_CPU_I686)
				230
				231	// Pull a bit of tricky preprocessing to make the inline asm both
				232	// 32 bit and 64 bit.
				233	#ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86...
				234	#define rAX "%%eax"
				235	#define rCX "%%ecx"
				236	#define rDX "%%edx"
				237	#define rBX "%%ebx"
				238	#define rSP "%%esp"
				239	#define rBP "%%ebp"
				240	#define rSI "%%esi"
				241	#define rDI "%%edi"
				242	#endif
				243
				244	#ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros.
				245	#define rAX "%%rax"
				246	#define rCX "%%rcx"
				247	#define rDX "%%rdx"
				248	#define rBX "%%rbx"
				249	#define rSP "%%rsp"
				250	#define rBP "%%rbp"
				251	#define rSI "%%rsi"
				252	#define rDI "%%rdi"
				253	#endif
				254
				255	// Elements 0 to 3 are used for holding checksum terms a1, a2,
				256	// b1, b2 respectively. These elements are filled by asm code.
				257	// Elements 4 and 5 are used by asm code to for ANDing MMX data and removing
				258	// 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are
				259	// setting word index 0 and word index 2 to zero).
				260	// Element 6 and 7 are used for setting a1 and a2 to 1.
				261	volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
				262	{0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
				263
				264	if ((size_in_bytes >> 19) > 0) {
				265	// Size is too large. Must be less than 2^19 bytes = 512 KB.
				266	return false;
				267	}
				268
				269	// Number of 32-bit words which are not added to a1/a2 in the main loop.
				270	uint32 remaining_words = (size_in_bytes % 48) / 4;
				271
				272	// Since we are moving 48 bytes at a time number of iterations = total size/48
				273	// is value of counter.
				274	uint32 num_of_48_byte_units = size_in_bytes / 48;
				275
				276	asm volatile (
				277	// Source address is in ESI (extended source index)
				278	// destination is in EDI (extended destination index)
				279	// and counter is already in ECX (extended counter
				280	// index).
				281	"cmp $0, " rCX ";" // Compare counter to zero.
				282	"jz END;"
				283
				284	// XMM6 is initialized with 1 and XMM7 with 0.
				285	"prefetchnta 0(" rSI ");"
				286	"prefetchnta 64(" rSI ");"
				287	"movdqu 48(" rAX "), %%xmm6;"
				288	"xorps %%xmm7, %%xmm7;"
				289
				290	// Start of the loop which copies 48 bytes from source to dst each time.
				291	"TOP:\n"
				292
				293	// Make 6 moves each of 16 bytes from srcmem to XMM registers.
				294	// We are using 2 words out of 4 words in each XMM register,
				295	// word index 0 and word index 2
				296	"movdqa 0(" rSI "), %%xmm0;"
				297	"movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here.
				298	"movdqa 16(" rSI "), %%xmm2;"
				299	"movdqu 20(" rSI "), %%xmm3;"
				300	"movdqa 32(" rSI "), %%xmm4;"
				301	"movdqu 36(" rSI "), %%xmm5;"
				302
				303	// Move 3 * 16 bytes from XMM registers to dstmem.
				304	// Note: this copy must be performed before pinsrw instructions since
				305	// they will modify the XMM registers.
				306	"movntdq %%xmm0, 0(" rDI ");"
				307	"movntdq %%xmm2, 16(" rDI ");"
				308	"movntdq %%xmm4, 32(" rDI ");"
				309
				310	// Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
				311	"andps 32(" rAX "), %%xmm0;"
				312	"andps 32(" rAX "), %%xmm1;"
				313	"andps 32(" rAX "), %%xmm2;"
				314	"andps 32(" rAX "), %%xmm3;"
				315	"andps 32(" rAX "), %%xmm4;"
				316	"andps 32(" rAX "), %%xmm5;"
				317
				318	// Add XMM0 to XMM6 and then add XMM6 to XMM7.
				319	// Repeat this for XMM1, ..., XMM5.
				320	// Overflow(for XMM7) can occur only if there are more
				321	// than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
				322	// if size_in_bytes > 2^19 than overflow occurs.
				323	"paddq %%xmm0, %%xmm6;"
				324	"paddq %%xmm6, %%xmm7;"
				325	"paddq %%xmm1, %%xmm6;"
				326	"paddq %%xmm6, %%xmm7;"
				327	"paddq %%xmm2, %%xmm6;"
				328	"paddq %%xmm6, %%xmm7;"
				329	"paddq %%xmm3, %%xmm6;"
				330	"paddq %%xmm6, %%xmm7;"
				331	"paddq %%xmm4, %%xmm6;"
				332	"paddq %%xmm6, %%xmm7;"
				333	"paddq %%xmm5, %%xmm6;"
				334	"paddq %%xmm6, %%xmm7;"
				335
				336	// Increment ESI and EDI by 48 bytes and decrement counter by 1.
				337	"add $48, " rSI ";"
				338	"add $48, " rDI ";"
				339	"prefetchnta 0(" rSI ");"
				340	"prefetchnta 64(" rSI ");"
				341	"dec " rCX ";"
				342	"jnz TOP;"
				343
				344	// Now only remaining_words 32-bit words are left.
				345	// make a loop, add first two words to a1 and next two to a2 (just like
				346	// above loop, the only extra thing we are doing is rechecking
				347	// rDX (=remaining_words) everytime we add a number to a1/a2.
				348	"REM_IS_STILL_NOT_ZERO:\n"
				349	// Unless remaining_words becomes less than 4 words(16 bytes)
				350	// there is not much issue and remaining_words will always
				351	// be a multiple of four by assumption.
				352	"cmp $4, " rDX ";"
				353	// In case for some weird reasons if remaining_words becomes
				354	// less than 4 but not zero then also break the code and go off to END.
				355	"jl END;"
				356	// Otherwise just go on and copy data in chunks of 4-words at a time till
				357	// whole data (<48 bytes) is copied.
				358	"movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1.
				359
				360	"movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without
				361	"pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary.
				362
				363	"movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination.
				364	"andps 32(" rAX "), %%xmm0;"
				365	"andps 32(" rAX "), %%xmm1;"
				366	"paddq %%xmm0, %%xmm6;"
				367	"paddq %%xmm6, %%xmm7;"
				368	"paddq %%xmm1, %%xmm6;"
				369	"paddq %%xmm6, %%xmm7;"
				370	"add $16, " rSI ";"
				371	"add $16, " rDI ";"
				372	"sub $4, " rDX ";"
				373	// Decrement %rDX by 4 since %rDX is number of 32-bit
				374	// words left after considering all 48-byte units.
				375	"jmp REM_IS_STILL_NOT_ZERO;"
				376
				377	"END:\n"
				378	// Report checksum values A and B (both right now are two concatenated
				379	// 64 bit numbers and have to be converted to 64 bit numbers)
				380	// seems like Adler128 (since size of each part is 4 byte rather than
				381	// 1 byte).
				382	"movdqa %%xmm6, 0(" rAX ");"
				383	"movdqa %%xmm7, 16(" rAX ");"
				384	"sfence;"
				385
				386	// No output registers.
				387	:
				388	// Input registers.
				389	: "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
				390	"c" (num_of_48_byte_units), "d" (remaining_words)
				391	); // asm.
				392
				393	if (checksum != NULL) {
				394	checksum->Set(checksum_arr[0], checksum_arr[1],
				395	checksum_arr[2], checksum_arr[3]);
				396	}
				397
				398	// Everything went fine, so return true (this does not mean
				399	// that there is no problem with memory this just mean that data was copied
				400	// from src to dst and checksum was calculated successfully).
				401	return true;
				402	#else
				403	// Fall back to C implementation for anything else.
				404	return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
				405	#endif
				406	}