mtklein | 036e183 | 2016-07-15 07:45:53 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2016 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "Benchmark.h" |
| 9 | #include "SkTypes.h" |
| 10 | |
| 11 | /** |
| 12 | * There's a good variety of ways to pack from int down to uint16_t with SSE, |
| 13 | * depending on the specific instructions available. |
| 14 | * |
| 15 | * SSE2 offers an int -> int16_t pack instruction. We can use this in two ways: |
| 16 | * - subtract off 32768, int -> int16_t, add 32768 back (sse2_a) |
| 17 | * - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b) |
| 18 | * SSSE3 adds a byte shuffle, so we just put the bytes where we want them. (ssse3) |
| 19 | * SSE41 added an int -> uint16_t pack instruction. (sse41) |
| 20 | * |
| 21 | * Findings so far: |
 *    - sse41 < ssse3 <<< sse2_b < sse2_a  (fastest on the left)
| 23 | * - the ssse3 version is only slightly slower than the sse41 version, maybe not at all |
| 24 | * - the sse2_a is only slightly slower than the sse2_b version |
| 25 | * - the ssse3 and sse41 versions are about 3x faster than either sse2 version |
| 26 | * - the sse41 version seems to cause some code generation trouble. |
| 27 | */ |
| 28 | |
| 29 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 30 | |
| 31 | #include <immintrin.h> |
| 32 | |
| 33 | template <__m128i (kernel)(__m128i)> |
| 34 | class pack_int_uint16_t_Bench : public Benchmark { |
| 35 | public: |
| 36 | pack_int_uint16_t_Bench(const char* impl) { |
| 37 | fName.append("pack_int_uint16_t_"); |
| 38 | fName.append(impl); |
| 39 | } |
| 40 | |
| 41 | bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } |
| 42 | const char* onGetName() override { return fName.c_str(); } |
| 43 | |
| 44 | void onDraw(int loops, SkCanvas*) override { |
| 45 | __m128i x = _mm_set1_epi32(0x42424242); |
| 46 | while (loops --> 0) { |
| 47 | x = kernel(x); |
| 48 | } |
| 49 | |
| 50 | volatile int blackhole = 0; |
| 51 | blackhole ^= _mm_cvtsi128_si32(x); |
| 52 | } |
| 53 | |
| 54 | SkString fName; |
| 55 | }; |
| 56 | |
namespace {
    // SSE2 variant A: bias the (positive) 32-bit values down into int16
    // range, pack with signed saturation, then bias the packed halves back
    // up into uint16 range.
    __m128i sse2_a(__m128i x) {
        const __m128i bias32 = _mm_set1_epi32(0x8000);
        const __m128i bias16 = _mm_set1_epi16((short)0x8000);
        const __m128i biased = _mm_sub_epi32(x, bias32);
        const __m128i packed = _mm_packs_epi32(biased, biased);
        return _mm_add_epi16(packed, bias16);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )  // register the sse2_a variant
| 64 | |
namespace {
    // SSE2 variant B: artificially sign-extend the low 16 bits of each lane
    // (shift up, then arithmetic shift back down) so the signed-saturating
    // pack preserves the value exactly.
    __m128i sse2_b(__m128i x) {
        const __m128i shifted  = _mm_slli_epi32(x, 16);
        const __m128i extended = _mm_srai_epi32(shifted, 16);
        return _mm_packs_epi32(extended, extended);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )  // register the sse2_b variant
| 72 | |
| 73 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 74 | namespace { |
| 75 | __m128i ssse3(__m128i x) { |
| 76 | // TODO: Can we force the bench to load the mask inside the loop? Would be more realistic. |
| 77 | const int _ = ~0; |
| 78 | return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); |
| 79 | } |
| 80 | } |
| 81 | DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); ) |
| 82 | #endif |
| 83 | |
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
namespace {
    // SSE4.1 variant: the direct int -> uint16_t pack with unsigned
    // saturation, so the whole kernel is a single instruction.
    __m128i sse41(__m128i x) {
        return _mm_packus_epi32(x,x);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )  // register the sse41 variant
#endif
| 92 | |
| 93 | #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |