/*
 * Copyright 2014 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
8#include "SkBenchmark.h"
9#include "SkRandom.h"
10#include "SkTemplates.h"
commit-bot@chromium.orgdcba9932014-05-28 22:47:26 +000011#include "SkUtils.h"
commit-bot@chromium.orgc4e416c2014-05-20 14:54:04 +000012
13template <typename Memcpy32>
14class Memcpy32Bench : public SkBenchmark {
15public:
16 explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
17 : fCount(count)
18 , fMemcpy32(memcpy32)
19 , fName(SkStringPrintf("%s_%d", name, count)) {}
20
21 virtual const char* onGetName() SK_OVERRIDE {
22 return fName.c_str();
23 }
24
25 virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
26 return backend == kNonRendering_Backend;
27 }
28
29 virtual void onPreDraw() SK_OVERRIDE {
30 fDst.reset(fCount);
31 fSrc.reset(fCount);
32
33 SkRandom rand;
34 for (int i = 0; i < fCount; i++) {
35 fSrc[i] = rand.nextU();
36 }
37 }
38
39 virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
40 for (int i = 0; i < loops; i++) {
41 fMemcpy32(fDst, fSrc, fCount);
42 }
43 }
44
45private:
46 SkAutoTMalloc<uint32_t> fDst, fSrc;
47
48 int fCount;
49 Memcpy32 fMemcpy32;
50 const SkString fName;
51};
52
53template <typename Memcpy32>
54static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
55 return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
56}
57#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )
58
59
// Baseline: hand the whole copy to libc's memcpy and let its authors pick the strategy.
// count is the number of uint32_t values to copy, not a byte count.
static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
    const size_t byteCount = count * sizeof(*dst);
    memcpy(dst, src, byteCount);
}
64BENCH(memcpy32_memcpy, 10)
65BENCH(memcpy32_memcpy, 100)
66BENCH(memcpy32_memcpy, 1000)
67BENCH(memcpy32_memcpy, 10000)
68BENCH(memcpy32_memcpy, 100000)
69
// Plain element-by-element loop: leave any vectorization entirely to the compiler.
// A count of zero or less copies nothing.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
76BENCH(memcpy32_autovectorize, 10)
77BENCH(memcpy32_autovectorize, 100)
78BENCH(memcpy32_autovectorize, 1000)
79BENCH(memcpy32_autovectorize, 10000)
80BENCH(memcpy32_autovectorize, 100000)
81
82#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
83
// Align dst to 16 bytes, then use aligned stores.  src isn't aligned, so use unaligned loads.
// Copies shorter than 16 words skip the SSE2 path and go straight to the scalar loop.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    if (count >= 16) {
        // Copy single words until dst lands on a 16-byte boundary (at most three words
        // when dst is 4-byte aligned).
        for (; (reinterpret_cast<uintptr_t>(dst) & 0xF) != 0; count--) {
            *dst++ = *src++;
        }

        __m128i*       wideDst = reinterpret_cast<__m128i*>(dst);
        const __m128i* wideSrc = reinterpret_cast<const __m128i*>(src);

        // Each pass moves four 128-bit vectors, i.e. 16 uint32_t values.
        for (int blocks = count / 16; blocks > 0; blocks--) {
            const __m128i v0 = _mm_loadu_si128(wideSrc + 0);
            const __m128i v1 = _mm_loadu_si128(wideSrc + 1);
            const __m128i v2 = _mm_loadu_si128(wideSrc + 2);
            const __m128i v3 = _mm_loadu_si128(wideSrc + 3);
            wideSrc += 4;

            _mm_store_si128(wideDst + 0, v0);
            _mm_store_si128(wideDst + 1, v1);
            _mm_store_si128(wideDst + 2, v2);
            _mm_store_si128(wideDst + 3, v3);
            wideDst += 4;
        }

        // Resume scalar copying right where the vector loop stopped.
        count %= 16;
        dst = reinterpret_cast<uint32_t*>(wideDst);
        src = reinterpret_cast<const uint32_t*>(wideSrc);
    }

    // Leftover words (or the whole copy when count started below 16).
    for (; count > 0; count--) {
        *dst++ = *src++;
    }
}
115BENCH(memcpy32_sse2_align, 10)
116BENCH(memcpy32_sse2_align, 100)
117BENCH(memcpy32_sse2_align, 1000)
118BENCH(memcpy32_sse2_align, 10000)
119BENCH(memcpy32_sse2_align, 100000)
120
// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads
// for src.
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    __m128i*       wideDst = reinterpret_cast<__m128i*>(dst);
    const __m128i* wideSrc = reinterpret_cast<const __m128i*>(src);

    // Each pass moves four 128-bit vectors, i.e. 16 uint32_t values.
    for (int blocks = count / 16; blocks > 0; blocks--) {
        const __m128i v0 = _mm_loadu_si128(wideSrc + 0);
        const __m128i v1 = _mm_loadu_si128(wideSrc + 1);
        const __m128i v2 = _mm_loadu_si128(wideSrc + 2);
        const __m128i v3 = _mm_loadu_si128(wideSrc + 3);
        wideSrc += 4;

        _mm_storeu_si128(wideDst + 0, v0);
        _mm_storeu_si128(wideDst + 1, v1);
        _mm_storeu_si128(wideDst + 2, v2);
        _mm_storeu_si128(wideDst + 3, v3);
        wideDst += 4;
    }

    // Finish the fewer-than-16 leftover words one at a time, starting where the
    // vector loop stopped.
    dst = reinterpret_cast<uint32_t*>(wideDst);
    src = reinterpret_cast<const uint32_t*>(wideSrc);
    for (int leftover = count % 16; leftover > 0; leftover--) {
        *dst++ = *src++;
    }
}
commit-bot@chromium.org0be2d832014-05-22 18:24:42 +0000145BENCH(memcpy32_sse2_unalign, 10)
commit-bot@chromium.orgc4e416c2014-05-20 14:54:04 +0000146BENCH(memcpy32_sse2_unalign, 100)
147BENCH(memcpy32_sse2_unalign, 1000)
148BENCH(memcpy32_sse2_unalign, 10000)
149BENCH(memcpy32_sse2_unalign, 100000)
150
commit-bot@chromium.orgdcba9932014-05-28 22:47:26 +0000151// Test our chosen best, from SkUtils.h
152BENCH(sk_memcpy32, 10)
153BENCH(sk_memcpy32, 100)
154BENCH(sk_memcpy32, 1000)
155BENCH(sk_memcpy32, 10000)
156BENCH(sk_memcpy32, 100000)
157
commit-bot@chromium.orgc4e416c2014-05-20 14:54:04 +0000158#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
159
160#undef BENCH