/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_sse_DEFINED
#define SkNx_sse_DEFINED

// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.

#define SKNX_IS_FAST

namespace { // See SkNx.h


template <>
class SkNx<2, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec(_mm_set1_ps(val)) {}
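    // Load/store touch only 8 bytes (two floats); Load zeroes the upper lanes.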
    static SkNx Load(const void* ptr) {
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
    }
    SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

    void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx  sqrt() const { return _mm_sqrt_ps (fVec); }
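    // rsqrt0() is the raw hardware reciprocal-sqrt estimate (roughly 12 bits of precision);
    // rsqrt1() and rsqrt2() are hooks for progressively refined estimates, but this
    // implementation does no extra refinement.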
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 2);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&1];
    }

    bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }

    __m128 fVec;
};

template <>
class SkNx<2, double> {
public:
    SkNx(const __m128d& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(double val) : fVec(_mm_set1_pd(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_pd((const double*)ptr); }
    SkNx(double a, double b) : fVec(_mm_setr_pd(a,b)) {}

    void store(void* ptr) const { _mm_storeu_pd((double*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_pd(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_pd(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_pd(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_pd(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_pd (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_pd(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_pd (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_pd (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_pd (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_pd (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_pd(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_pd(l.fVec, r.fVec); }

    SkNx sqrt() const { return _mm_sqrt_pd(fVec); }

    template <int k> double kth() const {
        SkASSERT(0 <= k && k < 2);
        union { __m128d v; double fs[2]; } pun = {fVec};
        return pun.fs[k&1];
    }

    bool allTrue() const { return 0x3 == _mm_movemask_pd(fVec); }
    bool anyTrue() const { return 0x0 != _mm_movemask_pd(fVec); }

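    // Lane-wise select: where this vector's bits are set (e.g. an all-1s comparison result),
    // take t; where they are clear, take e.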
    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_pd(_mm_and_pd (fVec, t.fVec),
                         _mm_andnot_pd(fVec, e.fVec));
    }

    __m128d fVec;
};

template <>
class SkNx<4, int> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(int val) : fVec(_mm_set1_epi32(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
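    // SSE2 has no 32-bit lane-wise multiply (_mm_mullo_epi32 is SSE4.1), so multiply the
    // even and odd lanes separately with _mm_mul_epu32 (32x32 -> 64), then interleave the
    // low 32 bits of each product back into place.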
    SkNx operator * (const SkNx& o) const {
        __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
                mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                                  _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
    }

    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }

    template <int k> int kth() const {
        SkASSERT(0 <= k && k < 4);
        switch (k) {
            case 0: return _mm_cvtsi128_si32(fVec);
            case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  4));
            case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  8));
            case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12));
            default: SkASSERT(false); return 0;
        }
    }

    __m128i fVec;
};

template <>
class SkNx<4, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }

    SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }

    SkNx  sqrt() const { return _mm_sqrt_ps (fVec); }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }

    bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_ps(_mm_and_ps (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
    }

    __m128 fVec;
};

template <>
class SkNx<4, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}

    void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    template <int k> uint16_t kth() const {
        SkASSERT(0 <= k && k < 4);
        return _mm_extract_epi16(fVec, k);
    }

    __m128i fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.
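        // (The byte-wise _mm_add_epi8/_mm_sub_epi8 below are safe for this 16-bit bias: the
        //  low byte of 0x8000 is zero, so no carry or borrow ever crosses a byte boundary.)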
        const uint16_t top = 0x8000; // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16(top);
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    template <int k> uint16_t kth() const {
        SkASSERT(0 <= k && k < 8);
        return _mm_extract_epi16(fVec, k);
    }

    __m128i fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
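    // Load/store move exactly 4 bytes through the low 32 bits of the register.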
    static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
    void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<8, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    template <int k> uint8_t kth() const {
        SkASSERT(0 <= k && k < 16);
        // SSE4.1 would just `return _mm_extract_epi8(fVec, k)`. We have to read 16-bits instead.
        int pair = _mm_extract_epi16(fVec, k/2);
        return k % 2 == 0 ? pair : (pair >> 8);
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};

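// The float -> int cast truncates toward zero, like a C-style (int)float conversion.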
template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) {
    return _mm_cvttps_epi32(src.fVec);
}

template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
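    // With SSSE3, one shuffle grabs the low byte of each 32-bit lane; the SSE2 fallback
    // narrows the values into the low four bytes with two saturating packs instead.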
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
#else
    auto _16 = _mm_packus_epi16(_32, _32);
    return _mm_packus_epi16(_16, _16);
#endif
}

template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) {
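    // Zero-extend each byte into its own 32-bit lane, then convert to float.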
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()),
         _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128());
#endif
    return _mm_cvtepi32_ps(_32);
}

static inline void Sk4f_ToBytes(uint8_t bytes[16],
                                const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
    _mm_storeu_si128((__m128i*)bytes,
                     _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                                       _mm_cvttps_epi32(b.fVec)),
                                      _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                                       _mm_cvttps_epi32(d.fVec))));
}

template<> inline Sk4h SkNx_cast<uint16_t, uint8_t, 4>(const Sk4b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> inline Sk4b SkNx_cast<uint8_t, uint16_t, 4>(const Sk4h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}


} // namespace

#endif//SkNx_sse_DEFINED