mtklein | 036e183 | 2016-07-15 07:45:53 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2016 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "Benchmark.h" |
| 9 | #include "SkTypes.h" |
| 10 | |
| 11 | /** |
| 12 | * There's a good variety of ways to pack from int down to uint16_t with SSE, |
| 13 | * depending on the specific instructions available. |
| 14 | * |
| 15 | * SSE2 offers an int -> int16_t pack instruction. We can use this in two ways: |
| 16 | * - subtract off 32768, int -> int16_t, add 32768 back (sse2_a) |
| 17 | * - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b) |
| 18 | * SSSE3 adds a byte shuffle, so we just put the bytes where we want them. (ssse3) |
| 19 | * SSE41 added an int -> uint16_t pack instruction. (sse41) |
| 20 | * |
| 21 | * Findings so far: |
 *    - sse41 < ssse3 <<< sse2_b < sse2_a  (fastest on the left)
| 23 | * - the ssse3 version is only slightly slower than the sse41 version, maybe not at all |
| 24 | * - the sse2_a is only slightly slower than the sse2_b version |
| 25 | * - the ssse3 and sse41 versions are about 3x faster than either sse2 version |
| 26 | * - the sse41 version seems to cause some code generation trouble. |
| 27 | */ |
| 28 | |
| 29 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 30 | |
| 31 | #include <immintrin.h> |
| 32 | |
| 33 | template <__m128i (kernel)(__m128i)> |
| 34 | class pack_int_uint16_t_Bench : public Benchmark { |
| 35 | public: |
| 36 | pack_int_uint16_t_Bench(const char* impl) { |
| 37 | fName.append("pack_int_uint16_t_"); |
| 38 | fName.append(impl); |
| 39 | } |
| 40 | |
| 41 | bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } |
| 42 | const char* onGetName() override { return fName.c_str(); } |
| 43 | |
| 44 | void onDraw(int loops, SkCanvas*) override { |
| 45 | __m128i x = _mm_set1_epi32(0x42424242); |
| 46 | while (loops --> 0) { |
| 47 | x = kernel(x); |
| 48 | } |
| 49 | |
| 50 | volatile int blackhole = 0; |
| 51 | blackhole ^= _mm_cvtsi128_si32(x); |
| 52 | } |
| 53 | |
| 54 | SkString fName; |
| 55 | }; |
| 56 | |
namespace {
    // SSE2 variant A: bias the (positive) 32-bit values down into int16
    // range, pack with signed saturation, then bias the packed halves back
    // up into uint16 range.
    __m128i sse2_a(__m128i x) {
        const __m128i bias32 = _mm_set1_epi32(0x8000);
        const __m128i bias16 = _mm_set1_epi16((short)0x8000);
        const __m128i biased = _mm_sub_epi32(x, bias32);
        const __m128i packed = _mm_packs_epi32(biased, biased);
        return _mm_add_epi16(packed, bias16);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )  // register the sse2_a variant
| 64 | |
namespace {
    // SSE2 variant B: artificially sign-extend the low 16 bits of each lane
    // (shift up, then arithmetic shift back down) so the signed-saturating
    // pack preserves the value exactly.
    __m128i sse2_b(__m128i x) {
        const __m128i shifted  = _mm_slli_epi32(x, 16);
        const __m128i extended = _mm_srai_epi32(shifted, 16);
        return _mm_packs_epi32(extended, extended);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )  // register the sse2_b variant
| 72 | |
| 73 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 74 | namespace { |
| 75 | __m128i ssse3(__m128i x) { |
| 76 | // TODO: Can we force the bench to load the mask inside the loop? Would be more realistic. |
| 77 | const int _ = ~0; |
| 78 | return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); |
| 79 | } |
| 80 | } |
| 81 | DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); ) |
| 82 | #endif |
| 83 | |
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
namespace {
    // SSE4.1 variant: the direct int -> uint16_t pack with unsigned
    // saturation, so the whole kernel is a single instruction.
    __m128i sse41(__m128i x) {
        return _mm_packus_epi32(x,x);
    }
}
DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )  // register the sse41 variant
#endif
| 92 | |
| 93 | #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |