/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkJumper_vectors_DEFINED
#define SkJumper_vectors_DEFINED

#include "SkJumper.h"
#include "SkJumper_misc.h"

// This file contains vector types that SkJumper_stages.cpp uses to define stages.

// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).

#if !defined(JUMPER)
    // This path should lead to portable code that can be compiled directly into Skia.
    // (All other paths are compiled offline by Clang into SkJumper_generated.S.)
    #include <math.h>

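    // In this portable path each "vector" type below is a single lane, so stages run one value at a time.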
    using F   = float   ;
    using I32 =  int32_t;
    using U64 = uint64_t;
    using U32 = uint32_t;
    using U16 = uint16_t;
    using U8  = uint8_t ;

    SI F   mad(F f, F m, F a)   { return f*m+a; }
    SI F   min(F a, F b)        { return fminf(a,b); }
    SI F   max(F a, F b)        { return fmaxf(a,b); }
    SI F   abs_  (F v)          { return fabsf(v); }
    SI F   floor_(F v)          { return floorf(v); }
    SI F   rcp   (F v)          { return 1.0f / v; }
    SI F   rsqrt (F v)          { return 1.0f / sqrtf(v); }
    SI F   sqrt_ (F v)          { return sqrtf(v); }
    SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
    SI U16 pack(U32 v)          { return (U16)v; }
    SI U8  pack(U16 v)          { return (U8)v; }

    SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }

    template <typename T>
    SI T gather(const T* p, U32 ix) { return p[ix]; }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

#elif defined(__aarch64__)
    #include <arm_neon.h>

    // Since we know we're using Clang, we can use its vector extensions.
    template <typename T> using V = T __attribute__((ext_vector_type(4)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
    SI F   mad(F f, F m, F a)  { return vfmaq_f32(a,f,m); }
    SI F   min(F a, F b)       { return vminq_f32(a,b); }
    SI F   max(F a, F b)       { return vmaxq_f32(a,b); }
    SI F   abs_  (F v)         { return vabsq_f32(v); }
    SI F   floor_(F v)         { return vrndmq_f32(v); }
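    // vrecpeq/vrsqrteq give only rough estimates; the vrecpsq/vrsqrtsq factor below is one Newton-Raphson refinement step.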
    SI F   rcp   (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
    SI F   rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
    SI F   sqrt_ (F v) { return vsqrtq_f32(v); }
    SI U32 round (F v, F scale) { return vcvtnq_u32_f32(v*scale); }
    SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v) { return __builtin_convertvector(v,  U8); }

    SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        uint16x4x3_t rgb = vld3_u16(ptr);
        *r = rgb.val[0];
        *g = rgb.val[1];
        *b = rgb.val[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        uint16x4x4_t rgba = vld4_u16(ptr);
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        float32x4x4_t rgba = vld4q_f32(ptr);
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
    }

#elif defined(__arm__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    // We can pass {s0-s15} as arguments under AAPCS-VFP.  We'll slice that as 8 d-registers.
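    // With 2-lane vectors, F, I32, and U32 each fit in a single 64-bit d-register.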
    template <typename T> using V = T __attribute__((ext_vector_type(2)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F   mad(F f, F m, F a)  { return vfma_f32(a,f,m); }
    SI F   min(F a, F b)       { return vmin_f32(a,b); }
    SI F   max(F a, F b)       { return vmax_f32(a,b); }
    SI F   abs_ (F v)          { return vabs_f32(v); }
    SI F   rcp  (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
    SI F   rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
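    // vcvt_u32_f32() truncates, so add 0.5 first to round to nearest.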
    SI U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
    SI U16 pack(U32 v)         { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)         { return __builtin_convertvector(v,  U8); }

    SI F sqrt_(F v) {
        auto e = vrsqrte_f32(v);  // Estimate and two refinement steps for e = rsqrt(v).
        e *= vrsqrts_f32(v,e*e);
        e *= vrsqrts_f32(v,e*e);
        return v*e;               // sqrt(v) == v*rsqrt(v).
    }

    SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }

    SI F floor_(F v) {
        F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
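        // Truncation rounds toward zero, so step down by 1 whenever the roundtrip overshot (negative non-integers).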
        return roundtrip - if_then_else(roundtrip > v, 1, 0);
    }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]]};
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        uint16x4x3_t rgb;
        rgb = vld3_lane_u16(ptr + 0, rgb, 0);
        rgb = vld3_lane_u16(ptr + 3, rgb, 1);
        *r = unaligned_load<U16>(rgb.val+0);
        *g = unaligned_load<U16>(rgb.val+1);
        *b = unaligned_load<U16>(rgb.val+2);
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        uint16x4x4_t rgba;
        rgba = vld4_lane_u16(ptr + 0, rgba, 0);
        rgba = vld4_lane_u16(ptr + 4, rgba, 1);
        *r = unaligned_load<U16>(rgba.val+0);
        *g = unaligned_load<U16>(rgba.val+1);
        *b = unaligned_load<U16>(rgba.val+2);
        *a = unaligned_load<U16>(rgba.val+3);
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        uint16x4x4_t rgba = {{
            widen_cast<uint16x4_t>(r),
            widen_cast<uint16x4_t>(g),
            widen_cast<uint16x4_t>(b),
            widen_cast<uint16x4_t>(a),
        }};
        vst4_lane_u16(ptr + 0, rgba, 0);
        vst4_lane_u16(ptr + 4, rgba, 1);
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        float32x2x4_t rgba = vld4_f32(ptr);
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
    }


#elif defined(__AVX__)
    #include <immintrin.h>

    // These are __m256 and __m256i, but friendlier and strongly-typed.
    template <typename T> using V = T __attribute__((ext_vector_type(8)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F mad(F f, F m, F a) {
    #if defined(__FMA__)
        return _mm256_fmadd_ps(f,m,a);
    #else
        return f*m+a;
    #endif
    }

    SI F min(F a, F b)        { return _mm256_min_ps(a,b);    }
    SI F max(F a, F b)        { return _mm256_max_ps(a,b);    }
    SI F abs_  (F v)          { return _mm256_and_ps(v, 0-v); }
    SI F floor_(F v)          { return _mm256_floor_ps(v);    }
    SI F rcp   (F v)          { return _mm256_rcp_ps  (v);    }
    SI F rsqrt (F v)          { return _mm256_rsqrt_ps(v);    }
    SI F sqrt_ (F v)          { return _mm256_sqrt_ps (v);    }
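    // _mm256_cvtps_epi32() uses the current MXCSR rounding mode, round-to-nearest-even by default.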
    SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
        return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
                                _mm256_extractf128_si256(v, 1));
    }
    SI U8 pack(U16 v) {
        auto r = _mm_packus_epi16(v,v);
        return unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
    }
    #if defined(__AVX2__)
        SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
        SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
        SI U64 gather(const uint64_t* p, U32 ix) {
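            // 64-bit gathers take only 4 indices at a time, so gather the low and high halves of ix separately.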
            __m256i parts[] = {
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
            };
            return bit_cast<U64>(parts);
        }
    #endif

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        __m128i _0,_1,_2,_3,_4,_5,_6,_7;
        if (__builtin_expect(tail,0)) {
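            // load_rgb() reads one pixel's 48 bits without touching the next pixel: 32 bits, then the third 16-bit lane.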
            auto load_rgb = [](const uint16_t* src) {
                auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
                return _mm_insert_epi16(v, src[2], 2);
            };
            if (tail > 0) { _0 = load_rgb(ptr +  0); }
            if (tail > 1) { _1 = load_rgb(ptr +  3); }
            if (tail > 2) { _2 = load_rgb(ptr +  6); }
            if (tail > 3) { _3 = load_rgb(ptr +  9); }
            if (tail > 4) { _4 = load_rgb(ptr + 12); }
            if (tail > 5) { _5 = load_rgb(ptr + 15); }
            if (tail > 6) { _6 = load_rgb(ptr + 18); }
        } else {
            // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
            auto _01 =                _mm_loadu_si128((const __m128i*)(ptr +  0))    ;
            auto _23 =                _mm_loadu_si128((const __m128i*)(ptr +  6))    ;
            auto _45 =                _mm_loadu_si128((const __m128i*)(ptr + 12))    ;
            auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
            _0 = _01; _1 = _mm_srli_si128(_01, 6),
            _2 = _23; _3 = _mm_srli_si128(_23, 6),
            _4 = _45; _5 = _mm_srli_si128(_45, 6),
            _6 = _67; _7 = _mm_srli_si128(_67, 6);
        }

        auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
             _13 = _mm_unpacklo_epi16(_1, _3),
             _46 = _mm_unpacklo_epi16(_4, _6),
             _57 = _mm_unpacklo_epi16(_5, _7);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             bx0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 xx xx xx xx
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             bx4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(bx0123, bx4567);
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        __m128i _01, _23, _45, _67;
        if (__builtin_expect(tail,0)) {
            auto src = (const double*)ptr;
            _01 = _23 = _45 = _67 = _mm_setzero_si128();
            if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
            if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
            if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
            if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
            if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
            if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
            if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
            _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
            _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
            _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
        }

        auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
             _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
             _46 = _mm_unpacklo_epi16(_45, _67),
             _57 = _mm_unpackhi_epi16(_45, _67);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             ba4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(ba0123, ba4567);
        *a = _mm_unpackhi_epi64(ba0123, ba4567);
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
             rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
             ba0123 = _mm_unpacklo_epi16(b, a),
             ba4567 = _mm_unpackhi_epi16(b, a);

        auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
             _23 = _mm_unpackhi_epi32(rg0123, ba0123),
             _45 = _mm_unpacklo_epi32(rg4567, ba4567),
             _67 = _mm_unpackhi_epi32(rg4567, ba4567);

        if (__builtin_expect(tail,0)) {
            auto dst = (double*)ptr;
            if (tail > 0) { _mm_storel_pd(dst+0, _01); }
            if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
            if (tail > 2) { _mm_storel_pd(dst+2, _23); }
            if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
            if (tail > 4) { _mm_storel_pd(dst+4, _45); }
            if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
            if (tail > 6) { _mm_storel_pd(dst+6, _67); }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _01);
            _mm_storeu_si128((__m128i*)ptr + 1, _23);
            _mm_storeu_si128((__m128i*)ptr + 2, _45);
            _mm_storeu_si128((__m128i*)ptr + 3, _67);
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        F _04, _15, _26, _37;

385 case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
386 case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
387 case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
388 case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
389 case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
390 case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
391 case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
392 case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
393 }
394
395 F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5
396 ba0145 = _mm256_unpackhi_ps(_04,_15),
397 rg2367 = _mm256_unpacklo_ps(_26,_37),
398 ba2367 = _mm256_unpackhi_ps(_26,_37);
399
400 *r = _mm256_unpacklo_pd(rg0145, rg2367);
401 *g = _mm256_unpackhi_pd(rg0145, rg2367);
402 *b = _mm256_unpacklo_pd(ba0145, ba2367);
403 *a = _mm256_unpackhi_pd(ba0145, ba2367);
404 }
Mike Kleinfa6eb912017-04-05 10:18:27 -0400405 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
406 F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
407 rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
408 ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
409 ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
410
411 F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
412 _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
413 _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
414 _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
415
416 if (__builtin_expect(tail, 0)) {
417 if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
418 if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
419 if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
420 if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
421 if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
422 if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
423 if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
424 } else {
425 F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
426 _23 = _mm256_permute2f128_ps(_26, _37, 32),
427 _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
428 _67 = _mm256_permute2f128_ps(_26, _37, 49);
429 _mm256_storeu_ps(ptr+ 0, _01);
430 _mm256_storeu_ps(ptr+ 8, _23);
431 _mm256_storeu_ps(ptr+16, _45);
432 _mm256_storeu_ps(ptr+24, _67);
433 }
434 }
Mike Klein114e6b32017-04-03 22:21:15 -0400435
Mike Kleinb9c4a6f2017-04-03 13:54:55 -0400436#elif defined(__SSE2__)
437 #include <immintrin.h>
438
Mike Klein21bd3e42017-04-06 16:32:29 -0400439 template <typename T> using V = T __attribute__((ext_vector_type(4)));
440 using F = V<float >;
441 using I32 = V< int32_t>;
Mike Klein5f055f02017-04-06 20:02:11 -0400442 using U64 = V<uint64_t>;
Mike Klein21bd3e42017-04-06 16:32:29 -0400443 using U32 = V<uint32_t>;
444 using U16 = V<uint16_t>;
445 using U8 = V<uint8_t >;
Mike Kleinb9c4a6f2017-04-03 13:54:55 -0400446
447 SI F mad(F f, F m, F a) { return f*m+a; }
448 SI F min(F a, F b) { return _mm_min_ps(a,b); }
449 SI F max(F a, F b) { return _mm_max_ps(a,b); }
450 SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
Mike Kleinfd35c742017-05-15 15:55:54 -0400451 SI F rcp (F v) { return _mm_rcp_ps (v); }
452 SI F rsqrt (F v) { return _mm_rsqrt_ps(v); }
453 SI F sqrt_(F v) { return _mm_sqrt_ps (v); }
Mike Kleinb9c4a6f2017-04-03 13:54:55 -0400454 SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
455
456 SI U16 pack(U32 v) {
457 #if defined(__SSE4_1__)
458 auto p = _mm_packus_epi32(v,v);
459 #else
460 // Sign extend so that _mm_packs_epi32() does the pack we want.
461 auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
462 p = _mm_packs_epi32(p,p);
463 #endif
464 return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
465 }
466 SI U8 pack(U16 v) {
Mike Klein95f53be2017-04-04 10:24:56 -0400467 auto r = widen_cast<__m128i>(v);
Mike Kleinb9c4a6f2017-04-03 13:54:55 -0400468 r = _mm_packus_epi16(r,r);
469 return unaligned_load<U8>(&r);
470 }
471
472 SI F if_then_else(I32 c, F t, F e) {
473 return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
474 }
475
476 SI F floor_(F v) {
477 #if defined(__SSE4_1__)
478 return _mm_floor_ps(v);
479 #else
480 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
Mike Kleinb4bbc642017-04-27 08:59:55 -0400481 return roundtrip - if_then_else(roundtrip > v, 1, 0);
Mike Kleinb9c4a6f2017-04-03 13:54:55 -0400482 #endif
483 }
484
Mike Klein21bd3e42017-04-06 16:32:29 -0400485 template <typename T>
486 SI V<T> gather(const T* p, U32 ix) {
487 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
488 }
Mike Klein114e6b32017-04-03 22:21:15 -0400489
Mike Kleinb3821732017-04-17 10:58:05 -0400490 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
491 // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
492 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
493 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
494
495 // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
496 auto _0 = _01, _1 = _mm_srli_si128(_01, 6),
497 _2 = _23, _3 = _mm_srli_si128(_23, 6);
498
499 // De-interlace to R,G,B.
500 auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
501 _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx
502
503 auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
504 G = _mm_srli_si128(R, 8),
505 B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx
506
507 *r = unaligned_load<U16>(&R);
508 *g = unaligned_load<U16>(&G);
509 *b = unaligned_load<U16>(&B);
510 }
Mike Kleinfa6eb912017-04-05 10:18:27 -0400511 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
Mike Klein114e6b32017-04-03 22:21:15 -0400512 auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
513 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
514
515 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
516 _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
517
518 auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
519 ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
520
521 *r = unaligned_load<U16>((uint16_t*)&rg + 0);
522 *g = unaligned_load<U16>((uint16_t*)&rg + 4);
523 *b = unaligned_load<U16>((uint16_t*)&ba + 0);
524 *a = unaligned_load<U16>((uint16_t*)&ba + 4);
525 }
Mike Kleinfa6eb912017-04-05 10:18:27 -0400526 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
Mike Klein95f53be2017-04-04 10:24:56 -0400527 auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
528 ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
529 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
530 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
531 }
Mike Klein14987eb2017-04-06 10:22:26 -0400532
533 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
534 auto _0 = _mm_loadu_ps(ptr+ 0),
535 _1 = _mm_loadu_ps(ptr+ 4),
536 _2 = _mm_loadu_ps(ptr+ 8),
537 _3 = _mm_loadu_ps(ptr+12);
538 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
539 *r = _0;
540 *g = _1;
541 *b = _2;
542 *a = _3;
543 }
Mike Kleinfa6eb912017-04-05 10:18:27 -0400544 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
545 _MM_TRANSPOSE4_PS(r,g,b,a);
546 _mm_storeu_ps(ptr+ 0, r);
547 _mm_storeu_ps(ptr+ 4, g);
548 _mm_storeu_ps(ptr+ 8, b);
549 _mm_storeu_ps(ptr+12, a);
550 }
Mike Kleinb9c4a6f2017-04-03 13:54:55 -0400551#endif
552
// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER)
    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
#else
    SI F   cast  (U32 v) { return   (F)v; }
    SI U32 trunc_(F   v) { return (U32)v; }
    SI U32 expand(U16 v) { return (U32)v; }
    SI U32 expand(U8  v) { return (U32)v; }
#endif

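// Select lanes of any same-sized vector type by bouncing its bits through F's if_then_else().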
template <typename V>
SI V if_then_else(I32 c, V t, V e) {
    return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
}

SI U16 bswap(U16 x) {
#if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
    // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
    // when generating code for SSE2 and SSE4.1.  We'll do it manually...
    auto v = widen_cast<__m128i>(x);
    v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
    return unaligned_load<U16>(&v);
#else
    return (x<<8) | (x>>8);
#endif
}

SI F fract(F v) { return v - floor_(v); }

// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
SI F approx_log2(F x) {
    // e - 127 is a fair approximation of log2(x) in its own right...
    F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23));

    // ... but using the mantissa to refine its error is _much_ better.
    F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
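    // (The -127 exponent bias and the log2(m) correction are both folded into the fitted constants below.)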
    return e
         - 124.225514990f
         -   1.498030302f * m
         -   1.725879990f / (0.3520887068f + m);
}
SI F approx_pow2(F x) {
    F f = fract(x);
    return bit_cast<F>(round(1.0f * (1<<23),
                             x + 121.274057500f
                               -   1.490129070f * f
                               +  27.728023300f / (4.84252568f - f)));
}

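// x^y == 2^(y * log2(x)), so this approximation is only meaningful for x > 0.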
SI F approx_powf(F x, F y) {
    return approx_pow2(approx_log2(x) * y);
}

SI F from_half(U16 h) {
#if defined(JUMPER) && defined(__aarch64__)
    return vcvt_f32_f16(h);

#elif defined(JUMPER) && defined(__arm__)
    auto v = widen_cast<uint16x4_t>(h);
    return vget_low_f32(vcvt_f32_f16(v));

#elif defined(JUMPER) && defined(__AVX2__)
    return _mm256_cvtph_ps(h);

#else
    // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
    U32 sem = expand(h),
        s   = sem & 0x8000,
        em  = sem ^ s;

    // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x0400;  // I32 comparison is often quicker, and always safe here.
    return if_then_else(denorm, F(0)
                              , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
#endif
}

SI U16 to_half(F f) {
#if defined(JUMPER) && defined(__aarch64__)
    return vcvt_f16_f32(f);

#elif defined(JUMPER) && defined(__arm__)
    auto v = widen_cast<float32x4_t>(f);
    uint16x4_t h = vcvt_f16_f32(v);
    return unaligned_load<U16>(&h);

#elif defined(JUMPER) && defined(__AVX2__)
    return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

#else
    // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
    U32 sem = bit_cast<U32>(f),
        s   = sem & 0x80000000,
        em  = sem ^ s;

    // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x38800000;  // I32 comparison is often quicker, and always safe here.
    return pack(if_then_else(denorm, U32(0)
                                   , (s>>16) + (em>>13) - ((127-15)<<10)));
#endif
}


#endif//SkJumper_vectors_DEFINED