| /* |
| * Copyright 2014 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "Benchmark.h" |
| #include "SkRandom.h" |
| #include "SkTemplates.h" |
| #include "SkUtils.h" |
| |
| template <typename Memcpy32> |
| class Memcpy32Bench : public Benchmark { |
| public: |
| explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name) |
| : fCount(count) |
| , fMemcpy32(memcpy32) |
| , fName(SkStringPrintf("%s_%d", name, count)) {} |
| |
| virtual const char* onGetName() SK_OVERRIDE { |
| return fName.c_str(); |
| } |
| |
| virtual bool isSuitableFor(Backend backend) SK_OVERRIDE { |
| return backend == kNonRendering_Backend; |
| } |
| |
| virtual void onPreDraw() SK_OVERRIDE { |
| fDst.reset(fCount); |
| fSrc.reset(fCount); |
| |
| SkRandom rand; |
| for (int i = 0; i < fCount; i++) { |
| fSrc[i] = rand.nextU(); |
| } |
| } |
| |
| virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE { |
| for (int i = 0; i < loops; i++) { |
| fMemcpy32(fDst, fSrc, fCount); |
| } |
| } |
| |
| private: |
| SkAutoTMalloc<uint32_t> fDst, fSrc; |
| |
| int fCount; |
| Memcpy32 fMemcpy32; |
| const SkString fName; |
| }; |
| |
| template <typename Memcpy32> |
| static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) { |
| return new Memcpy32Bench<Memcpy32>(count, memcpy32, name); |
| } |
// Defines one benchmark that runs `fn` over `n` elements. The stringified
// function name is passed to Bench() as the benchmark's label.
#define BENCH(fn, n) DEF_BENCH(return Bench(n, fn, #fn); )
| |
| |
// Let the libc developers do what they think is best.
//
// Copies `count` 32-bit words from src to dst using memcpy. A non-positive
// count copies nothing, which matches how the hand-written variants in this
// file behave; without the guard a negative count would convert to an
// enormous size_t and memcpy would run far out of bounds.
static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
    if (count <= 0) {
        return;
    }
    memcpy(dst, src, sizeof(uint32_t) * static_cast<size_t>(count));
}
// Benchmark the libc memcpy variant at five buffer sizes: 10, 100, 1000,
// 10000 and 100000 elements. Each line goes through the BENCH macro, which
// labels the benchmark with the function name.
// NOTE(review): DEF_BENCH is defined outside this file and may key on the
// source line, so keep each BENCH invocation on its own line — confirm.
| BENCH(memcpy32_memcpy, 10) |
| BENCH(memcpy32_memcpy, 100) |
| BENCH(memcpy32_memcpy, 1000) |
| BENCH(memcpy32_memcpy, 10000) |
| BENCH(memcpy32_memcpy, 100000) |
| |
// Plain element-by-element copy of `count` 32-bit words, written as a simple
// loop so the compiler's autovectorizer can do what it thinks is best.
// A non-positive count copies nothing.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}
// Benchmark the plain-loop variant at five buffer sizes: 10, 100, 1000,
// 10000 and 100000 elements.
| BENCH(memcpy32_autovectorize, 10) |
| BENCH(memcpy32_autovectorize, 100) |
| BENCH(memcpy32_autovectorize, 1000) |
| BENCH(memcpy32_autovectorize, 10000) |
| BENCH(memcpy32_autovectorize, 100000) |
| |
| #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| |
// Align dst to 16 bytes, then use aligned stores. src isn't aligned, so use unaligned loads.
//
// Inputs shorter than 16 words skip the SSE2 path entirely and are copied one
// word at a time. Otherwise: a scalar prologue advances dst to a 16-byte
// boundary, full 16-word (64-byte) blocks go through four SSE2 registers, and
// the remaining words (fewer than 16) are copied one at a time.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    const int kWordsPerBlock = 16;  // 4 x __m128i, each holding 4 uint32_t.

    if (count >= kWordsPerBlock) {
        // Scalar prologue: copy single words until dst is 16-byte aligned.
        for (; uintptr_t(dst) & 0xF; --count) {
            *dst++ = *src++;
        }

        const __m128i* in  = reinterpret_cast<const __m128i*>(src);
        __m128i*       out = reinterpret_cast<__m128i*>(dst);

        // Step the scalar pointers past the region the vector loop handles,
        // leaving only the tail in `count`.
        const int blocks = count / kWordsPerBlock;
        dst   += kWordsPerBlock * blocks;
        src   += kWordsPerBlock * blocks;
        count -= kWordsPerBlock * blocks;

        for (int b = 0; b < blocks; ++b, in += 4, out += 4) {
            // All four loads are issued before any store.
            const __m128i a = _mm_loadu_si128(in + 0);
            const __m128i b0 = _mm_loadu_si128(in + 1);
            const __m128i c = _mm_loadu_si128(in + 2);
            const __m128i d = _mm_loadu_si128(in + 3);

            _mm_store_si128(out + 0, a);
            _mm_store_si128(out + 1, b0);
            _mm_store_si128(out + 2, c);
            _mm_store_si128(out + 3, d);
        }
    }

    // Scalar tail (or the whole copy when count started below 16).
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}
// Benchmark the aligned-store SSE2 variant at five buffer sizes. The 10-element
// case is below the 16-word threshold in memcpy32_sse2_align, so it only
// exercises that function's scalar loop.
| BENCH(memcpy32_sse2_align, 10) |
| BENCH(memcpy32_sse2_align, 100) |
| BENCH(memcpy32_sse2_align, 1000) |
| BENCH(memcpy32_sse2_align, 10000) |
| BENCH(memcpy32_sse2_align, 100000) |
| |
// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads
// for src.
//
// Full 16-word (64-byte) blocks go through four SSE2 registers; the remaining
// words (fewer than 16) are copied one at a time.
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    const int kWordsPerBlock = 16;  // 4 x __m128i, each holding 4 uint32_t.
    const int blocks = count / kWordsPerBlock;

    const __m128i* in  = reinterpret_cast<const __m128i*>(src);
    __m128i*       out = reinterpret_cast<__m128i*>(dst);

    // Step the scalar pointers past the region the vector loop handles,
    // leaving only the tail in `count`.
    dst   += kWordsPerBlock * blocks;
    src   += kWordsPerBlock * blocks;
    count -= kWordsPerBlock * blocks;

    for (int b = 0; b < blocks; ++b, in += 4, out += 4) {
        // All four loads are issued before any store.
        const __m128i a = _mm_loadu_si128(in + 0);
        const __m128i b0 = _mm_loadu_si128(in + 1);
        const __m128i c = _mm_loadu_si128(in + 2);
        const __m128i d = _mm_loadu_si128(in + 3);

        _mm_storeu_si128(out + 0, a);
        _mm_storeu_si128(out + 1, b0);
        _mm_storeu_si128(out + 2, c);
        _mm_storeu_si128(out + 3, d);
    }

    // Scalar tail.
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}
// Benchmark the unaligned-store SSE2 variant at five buffer sizes. The
// 10-element case is below one 16-word block, so it only exercises the scalar
// tail loop of memcpy32_sse2_unalign.
| BENCH(memcpy32_sse2_unalign, 10) |
| BENCH(memcpy32_sse2_unalign, 100) |
| BENCH(memcpy32_sse2_unalign, 1000) |
| BENCH(memcpy32_sse2_unalign, 10000) |
| BENCH(memcpy32_sse2_unalign, 100000) |
| |
| // Test our chosen best, from SkUtils.h |
// Uses the same five buffer sizes as the BENCH runs for the other variants in
// this file, so the results line up for comparison.
// NOTE(review): these invocations sit inside the SSE2 #if block, so
// sk_memcpy32 is only benchmarked on SSE2 builds — confirm that is intended.
| BENCH(sk_memcpy32, 10) |
| BENCH(sk_memcpy32, 100) |
| BENCH(sk_memcpy32, 1000) |
| BENCH(sk_memcpy32, 10000) |
| BENCH(sk_memcpy32, 100000) |
| |
| #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| |
| #undef BENCH |