blob: 92c189eeb8d125e36e272b60f142777021403f31 [file] [log] [blame]
// Copyright 2020 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
Mike Kleinec370972020-03-05 10:15:35 -06003
4#ifndef SkVM_opts_DEFINED
5#define SkVM_opts_DEFINED
6
7#include "include/private/SkVx.h"
8#include "src/core/SkVM.h"
9
Mike Klein6d94b652020-09-16 11:37:03 -050010// Ideally this is (x*y + 0x2000)>>14,
11// but to let use vpmulhrsw we'll approximate that as ((x*y + 0x4000)>>15)<<1.
12template <int N>
13static inline skvx::Vec<N,int16_t> mul_q14(const skvx::Vec<N,int16_t>& x,
14 const skvx::Vec<N,int16_t>& y) {
15#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
16 if constexpr (N == 16) {
17 return skvx::bit_pun<skvx::Vec<N,int16_t>>(_mm256_mulhrs_epi16(skvx::bit_pun<__m256i>(x),
18 skvx::bit_pun<__m256i>(y)))
19 << 1;
20 }
21#endif
22#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
23 if constexpr (N == 8) {
24 return skvx::bit_pun<skvx::Vec<N,int16_t>>(_mm_mulhrs_epi16(skvx::bit_pun<__m128i>(x),
25 skvx::bit_pun<__m128i>(y)))
26 << 1;
27 }
28#endif
29 // TODO: NEON specialization with vqrdmulh.s16?
30
31 // Try to recurse onto the specializations above.
32 if constexpr (N > 8) {
33 return join(mul_q14(x.lo, y.lo),
34 mul_q14(x.hi, y.hi));
35 }
36 return skvx::cast<int16_t>((skvx::cast<int>(x) *
37 skvx::cast<int>(y) + 0x4000)>>15 ) <<1;
38}
39
Mike Klein2e69a132020-09-18 08:02:46 -050040template <int N>
41static inline skvx::Vec<N,int> gather32(const int* ptr, const skvx::Vec<N,int>& ix) {
42#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
43 if constexpr (N == 8) {
44 return skvx::bit_pun<skvx::Vec<N,int>>(
45 _mm256_i32gather_epi32(ptr, skvx::bit_pun<__m256i>(ix), 4));
46 }
47#endif
48 // Try to recurse on specializations, falling back on standard scalar map()-based impl.
49 if constexpr (N > 8) {
50 return join(gather32(ptr, ix.lo),
51 gather32(ptr, ix.hi));
52 }
53 return map(ix, [&](int i) { return ptr[i]; });
54}
55
Mike Kleinec370972020-03-05 10:15:35 -060056namespace SK_OPTS_NS {
57
58 inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
59 const int nregs, const int loop,
60 const int strides[], const int nargs,
61 int n, void* args[]) {
62 using namespace skvm;
63
64 // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
Mike Klein51d35ed2020-04-24 08:16:22 -050065 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
Mike Klein394a6d52020-09-18 14:04:19 -050066 constexpr int K = 32; // 1024-bit: 4 ymm or 2 zmm at a time
Mike Kleinec370972020-03-05 10:15:35 -060067 #else
Mike Klein394a6d52020-09-18 14:04:19 -050068 constexpr int K = 8; // 256-bit: 2 xmm, 2 v-registers, etc.
Mike Kleinec370972020-03-05 10:15:35 -060069 #endif
70 using I32 = skvx::Vec<K, int>;
71 using F32 = skvx::Vec<K, float>;
Mike Klein6732da02020-07-16 13:03:18 -050072 using U64 = skvx::Vec<K, uint64_t>;
Mike Kleinec370972020-03-05 10:15:35 -060073 using U32 = skvx::Vec<K, uint32_t>;
74 using U16 = skvx::Vec<K, uint16_t>;
75 using U8 = skvx::Vec<K, uint8_t>;
76
Mike Klein9791e502020-09-15 12:43:38 -050077 using I16x2 = skvx::Vec<2*K, int16_t>;
78 using U16x2 = skvx::Vec<2*K, uint16_t>;
79
Mike Kleinec370972020-03-05 10:15:35 -060080 union Slot {
81 F32 f32;
82 I32 i32;
83 U32 u32;
Mike Klein9791e502020-09-15 12:43:38 -050084 I16x2 i16x2;
85 U16x2 u16x2;
Mike Kleinec370972020-03-05 10:15:35 -060086 };
87
88 Slot few_regs[16];
89 std::unique_ptr<char[]> many_regs;
90
Mike Klein4284f752020-07-10 15:16:17 -050091 Slot* r = few_regs;
Mike Kleinec370972020-03-05 10:15:35 -060092
93 if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
94 // Annoyingly we can't trust that malloc() or new will work with Slot because
95 // the skvx::Vec types may have alignment greater than what they provide.
96 // We'll overallocate one extra register so we can align manually.
97 many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
98
99 uintptr_t addr = (uintptr_t)many_regs.get();
100 addr += alignof(Slot) -
101 (addr & (alignof(Slot) - 1));
102 SkASSERT((addr & (alignof(Slot) - 1)) == 0);
Mike Klein4284f752020-07-10 15:16:17 -0500103 r = (Slot*)addr;
Mike Kleinec370972020-03-05 10:15:35 -0600104 }
105
106
Mike Kleinec370972020-03-05 10:15:35 -0600107 // Step each argument pointer ahead by its stride a number of times.
108 auto step_args = [&](int times) {
109 for (int i = 0; i < nargs; i++) {
110 args[i] = (void*)( (char*)args[i] + times * strides[i] );
111 }
112 };
113
114 int start = 0,
115 stride;
116 for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
117 stride = n >= K ? K : 1;
118
119 for (int i = start; i < ninsts; i++) {
120 InterpreterInstruction inst = insts[i];
121
122 // d = op(x,y/imm,z/imm)
123 Reg d = inst.d,
124 x = inst.x,
125 y = inst.y,
126 z = inst.z;
127 int immy = inst.immy,
128 immz = inst.immz;
129
130 // Ops that interact with memory need to know whether we're stride=1 or K,
131 // but all non-memory ops can run the same code no matter the stride.
132 switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
133 default: SkUNREACHABLE;
134
135 #define STRIDE_1(op) case 2*(int)op
136 #define STRIDE_K(op) case 2*(int)op + 1
Mike Klein4284f752020-07-10 15:16:17 -0500137 STRIDE_1(Op::store8 ): memcpy(args[immy], &r[x].i32, 1); break;
138 STRIDE_1(Op::store16): memcpy(args[immy], &r[x].i32, 2); break;
139 STRIDE_1(Op::store32): memcpy(args[immy], &r[x].i32, 4); break;
Mike Klein6732da02020-07-16 13:03:18 -0500140 STRIDE_1(Op::store64): memcpy((char*)args[immz]+0, &r[x].i32, 4);
141 memcpy((char*)args[immz]+4, &r[y].i32, 4); break;
Mike Kleinec370972020-03-05 10:15:35 -0600142
Mike Klein4284f752020-07-10 15:16:17 -0500143 STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r[x].i32).store(args[immy]); break;
144 STRIDE_K(Op::store16): skvx::cast<uint16_t>(r[x].i32).store(args[immy]); break;
145 STRIDE_K(Op::store32): (r[x].i32).store(args[immy]); break;
Mike Klein6732da02020-07-16 13:03:18 -0500146 STRIDE_K(Op::store64): (skvx::cast<uint64_t>(r[x].u32) << 0 |
147 skvx::cast<uint64_t>(r[y].u32) << 32).store(args[immz]);
148 break;
Mike Kleinec370972020-03-05 10:15:35 -0600149
Mike Klein4284f752020-07-10 15:16:17 -0500150 STRIDE_1(Op::load8 ): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 1); break;
151 STRIDE_1(Op::load16): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 2); break;
152 STRIDE_1(Op::load32): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 4); break;
Mike Klein31367892020-07-30 08:19:12 -0500153 STRIDE_1(Op::load64):
154 r[d].i32 = 0; memcpy(&r[d].i32, (char*)args[immy] + 4*immz, 4); break;
Mike Kleinec370972020-03-05 10:15:35 -0600155
Mike Klein4284f752020-07-10 15:16:17 -0500156 STRIDE_K(Op::load8 ): r[d].i32= skvx::cast<int>(U8 ::Load(args[immy])); break;
157 STRIDE_K(Op::load16): r[d].i32= skvx::cast<int>(U16::Load(args[immy])); break;
158 STRIDE_K(Op::load32): r[d].i32= I32::Load(args[immy]) ; break;
Mike Klein31367892020-07-30 08:19:12 -0500159 STRIDE_K(Op::load64):
160 // Low 32 bits if immz=0, or high 32 bits if immz=1.
161 r[d].i32 = skvx::cast<int>(U64::Load(args[immy]) >> (32*immz)); break;
Mike Kleinec370972020-03-05 10:15:35 -0600162
163 // The pointer we base our gather on is loaded indirectly from a uniform:
Mike Klein4284f752020-07-10 15:16:17 -0500164 // - args[immy] is the uniform holding our gather base pointer somewhere;
165 // - (const uint8_t*)args[immy] + immz points to the gather base pointer;
Mike Kleinec370972020-03-05 10:15:35 -0600166 // - memcpy() loads the gather base and into a pointer of the right type.
167 // After all that we have an ordinary (uniform) pointer `ptr` to load from,
Mike Klein4284f752020-07-10 15:16:17 -0500168 // and we then gather from it using the varying indices in r[x].
Mike Kleinfeb4d102020-09-17 08:54:08 -0500169 STRIDE_1(Op::gather8): {
170 const uint8_t* ptr;
171 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
172 r[d].i32 = ptr[ r[x].i32[0] ];
173 } break;
174 STRIDE_1(Op::gather16): {
175 const uint16_t* ptr;
176 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
177 r[d].i32 = ptr[ r[x].i32[0] ];
178 } break;
179 STRIDE_1(Op::gather32): {
180 const int* ptr;
181 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
182 r[d].i32 = ptr[ r[x].i32[0] ];
183 } break;
Mike Kleinec370972020-03-05 10:15:35 -0600184
Mike Kleinfeb4d102020-09-17 08:54:08 -0500185 STRIDE_K(Op::gather8): {
186 const uint8_t* ptr;
187 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
188 r[d].i32 = map(r[x].i32, [&](int ix) { return (int)ptr[ix]; });
189 } break;
190 STRIDE_K(Op::gather16): {
191 const uint16_t* ptr;
192 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
193 r[d].i32 = map(r[x].i32, [&](int ix) { return (int)ptr[ix]; });
194 } break;
195 STRIDE_K(Op::gather32): {
196 const int* ptr;
197 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
Mike Klein2e69a132020-09-18 08:02:46 -0500198 r[d].i32 = gather32(ptr, r[x].i32);
Mike Kleinfeb4d102020-09-17 08:54:08 -0500199 } break;
Mike Kleinec370972020-03-05 10:15:35 -0600200
201 #undef STRIDE_1
202 #undef STRIDE_K
203
204 // Ops that don't interact with memory should never care about the stride.
205 #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
206
Mike Klein89b3c1f2020-07-29 16:45:05 -0500207 // These 128-bit ops are implemented serially for simplicity.
Mike Klein31367892020-07-30 08:19:12 -0500208 CASE(Op::store128): {
209 int ptr = immz>>1,
210 lane = immz&1;
Mike Klein89b3c1f2020-07-29 16:45:05 -0500211 U64 src = (skvx::cast<uint64_t>(r[x].u32) << 0 |
212 skvx::cast<uint64_t>(r[y].u32) << 32);
213 for (int i = 0; i < stride; i++) {
Mike Klein31367892020-07-30 08:19:12 -0500214 memcpy((char*)args[ptr] + 16*i + 8*lane, &src[i], 8);
Mike Klein89b3c1f2020-07-29 16:45:05 -0500215 }
216 } break;
217
Mike Klein31367892020-07-30 08:19:12 -0500218 CASE(Op::load128):
Mike Klein89b3c1f2020-07-29 16:45:05 -0500219 r[d].i32 = 0;
220 for (int i = 0; i < stride; i++) {
221 memcpy(&r[d].i32[i], (const char*)args[immy] + 16*i+ 4*immz, 4);
222 } break;
223
Mike Kleinec370972020-03-05 10:15:35 -0600224 CASE(Op::assert_true):
225 #ifdef SK_DEBUG
Mike Klein4284f752020-07-10 15:16:17 -0500226 if (!all(r[x].i32)) {
Mike Kleinec370972020-03-05 10:15:35 -0600227 SkDebugf("inst %d, register %d\n", i, y);
228 for (int i = 0; i < K; i++) {
Mike Klein4284f752020-07-10 15:16:17 -0500229 SkDebugf("\t%2d: %08x (%g)\n", i, r[y].i32[i], r[y].f32[i]);
Mike Kleinec370972020-03-05 10:15:35 -0600230 }
Mike Klein51a7f952020-09-16 16:00:33 -0500231 SkASSERT(false);
Mike Kleinec370972020-03-05 10:15:35 -0600232 }
Mike Kleinec370972020-03-05 10:15:35 -0600233 #endif
234 break;
235
236 CASE(Op::index): {
237 const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
Mike Klein394a6d52020-09-18 14:04:19 -0500238 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
239 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
240 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 };
Mike Kleinec370972020-03-05 10:15:35 -0600241 static_assert(K <= SK_ARRAY_COUNT(iota), "");
242
Mike Klein4284f752020-07-10 15:16:17 -0500243 r[d].i32 = n - I32::Load(iota);
Mike Kleinec370972020-03-05 10:15:35 -0600244 } break;
245
246 CASE(Op::uniform8):
Mike Klein4284f752020-07-10 15:16:17 -0500247 r[d].i32 = *(const uint8_t* )( (const char*)args[immy] + immz );
Mike Kleinec370972020-03-05 10:15:35 -0600248 break;
249 CASE(Op::uniform16):
Mike Klein4284f752020-07-10 15:16:17 -0500250 r[d].i32 = *(const uint16_t*)( (const char*)args[immy] + immz );
Mike Kleinec370972020-03-05 10:15:35 -0600251 break;
252 CASE(Op::uniform32):
Mike Klein4284f752020-07-10 15:16:17 -0500253 r[d].i32 = *(const int* )( (const char*)args[immy] + immz );
Mike Kleinec370972020-03-05 10:15:35 -0600254 break;
255
Mike Klein4284f752020-07-10 15:16:17 -0500256 CASE(Op::splat): r[d].i32 = immy; break;
Mike Kleinec370972020-03-05 10:15:35 -0600257
Mike Klein4284f752020-07-10 15:16:17 -0500258 CASE(Op::add_f32): r[d].f32 = r[x].f32 + r[y].f32; break;
259 CASE(Op::sub_f32): r[d].f32 = r[x].f32 - r[y].f32; break;
260 CASE(Op::mul_f32): r[d].f32 = r[x].f32 * r[y].f32; break;
261 CASE(Op::div_f32): r[d].f32 = r[x].f32 / r[y].f32; break;
262 CASE(Op::min_f32): r[d].f32 = min(r[x].f32, r[y].f32); break;
263 CASE(Op::max_f32): r[d].f32 = max(r[x].f32, r[y].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600264
Mike Klein4284f752020-07-10 15:16:17 -0500265 CASE(Op::fma_f32): r[d].f32 = fma( r[x].f32, r[y].f32, r[z].f32); break;
266 CASE(Op::fms_f32): r[d].f32 = fma( r[x].f32, r[y].f32, -r[z].f32); break;
267 CASE(Op::fnma_f32): r[d].f32 = fma(-r[x].f32, r[y].f32, r[z].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600268
Mike Klein4284f752020-07-10 15:16:17 -0500269 CASE(Op::sqrt_f32): r[d].f32 = sqrt(r[x].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600270
Mike Klein4284f752020-07-10 15:16:17 -0500271 CASE(Op::add_i32): r[d].i32 = r[x].i32 + r[y].i32; break;
272 CASE(Op::sub_i32): r[d].i32 = r[x].i32 - r[y].i32; break;
273 CASE(Op::mul_i32): r[d].i32 = r[x].i32 * r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600274
Mike Klein4284f752020-07-10 15:16:17 -0500275 CASE(Op::shl_i32): r[d].i32 = r[x].i32 << immy; break;
276 CASE(Op::sra_i32): r[d].i32 = r[x].i32 >> immy; break;
277 CASE(Op::shr_i32): r[d].u32 = r[x].u32 >> immy; break;
Mike Kleinec370972020-03-05 10:15:35 -0600278
Mike Klein4284f752020-07-10 15:16:17 -0500279 CASE(Op:: eq_f32): r[d].i32 = r[x].f32 == r[y].f32; break;
280 CASE(Op::neq_f32): r[d].i32 = r[x].f32 != r[y].f32; break;
281 CASE(Op:: gt_f32): r[d].i32 = r[x].f32 > r[y].f32; break;
282 CASE(Op::gte_f32): r[d].i32 = r[x].f32 >= r[y].f32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600283
Mike Klein4284f752020-07-10 15:16:17 -0500284 CASE(Op:: eq_i32): r[d].i32 = r[x].i32 == r[y].i32; break;
285 CASE(Op:: gt_i32): r[d].i32 = r[x].i32 > r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600286
Mike Klein4284f752020-07-10 15:16:17 -0500287 CASE(Op::bit_and ): r[d].i32 = r[x].i32 & r[y].i32; break;
288 CASE(Op::bit_or ): r[d].i32 = r[x].i32 | r[y].i32; break;
289 CASE(Op::bit_xor ): r[d].i32 = r[x].i32 ^ r[y].i32; break;
290 CASE(Op::bit_clear): r[d].i32 = r[x].i32 & ~r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600291
Mike Klein4284f752020-07-10 15:16:17 -0500292 CASE(Op::select): r[d].i32 = skvx::if_then_else(r[x].i32, r[y].i32, r[z].i32);
Mike Kleinec370972020-03-05 10:15:35 -0600293 break;
294
Mike Klein4284f752020-07-10 15:16:17 -0500295 CASE(Op::pack): r[d].u32 = r[x].u32 | (r[y].u32 << immz); break;
Mike Kleinec370972020-03-05 10:15:35 -0600296
Mike Klein4284f752020-07-10 15:16:17 -0500297 CASE(Op::ceil): r[d].f32 = skvx::ceil(r[x].f32) ; break;
298 CASE(Op::floor): r[d].f32 = skvx::floor(r[x].f32) ; break;
299 CASE(Op::to_f32): r[d].f32 = skvx::cast<float>( r[x].i32 ); break;
300 CASE(Op::trunc): r[d].i32 = skvx::cast<int> ( r[x].f32 ); break;
301 CASE(Op::round): r[d].i32 = skvx::cast<int> (skvx::lrint(r[x].f32)); break;
Mike Klein4d680cd2020-07-15 09:58:51 -0500302
303 CASE(Op::to_half):
304 r[d].i32 = skvx::cast<int>(skvx::to_half(r[x].f32));
305 break;
306 CASE(Op::from_half):
307 r[d].f32 = skvx::from_half(skvx::cast<uint16_t>(r[x].i32));
308 break;
Mike Klein98c512c2020-09-15 10:00:27 -0500309
Mike Klein9791e502020-09-15 12:43:38 -0500310 CASE(Op::add_q14x2): r[d].i16x2 = r[x].i16x2 + r[y].i16x2; break;
311 CASE(Op::sub_q14x2): r[d].i16x2 = r[x].i16x2 - r[y].i16x2; break;
Mike Klein6d94b652020-09-16 11:37:03 -0500312 CASE(Op::mul_q14x2): r[d].i16x2 = mul_q14(r[x].i16x2, r[y].i16x2); break;
Mike Klein9791e502020-09-15 12:43:38 -0500313
314 CASE(Op::shl_q14x2): r[d].i16x2 = r[x].i16x2 << immy; break;
315 CASE(Op::sra_q14x2): r[d].i16x2 = r[x].i16x2 >> immy; break;
316 CASE(Op::shr_q14x2): r[d].u16x2 = r[x].u16x2 >> immy; break;
317
318 CASE(Op::eq_q14x2): r[d].i16x2 = r[x].i16x2 == r[y].i16x2; break;
319 CASE(Op::gt_q14x2): r[d].i16x2 = r[x].i16x2 > r[y].i16x2; break;
320
321 CASE(Op:: min_q14x2): r[d].i16x2 = min(r[x].i16x2, r[y].i16x2); break;
322 CASE(Op:: max_q14x2): r[d].i16x2 = max(r[x].i16x2, r[y].i16x2); break;
323 CASE(Op::umin_q14x2): r[d].u16x2 = min(r[x].u16x2, r[y].u16x2); break;
324
Mike Klein7b1620f2020-09-16 10:18:47 -0500325 // Happily, Clang can see through this one and generates perfect code
326 // using vpavgw without any help from us!
Mike Klein98c512c2020-09-15 10:00:27 -0500327 CASE(Op::uavg_q14x2):
Mike Klein9791e502020-09-15 12:43:38 -0500328 r[d].u16x2 = skvx::cast<uint16_t>( (skvx::cast<int>(r[x].u16x2) +
329 skvx::cast<int>(r[y].u16x2) + 1)>>1 );
330 break;
Mike Kleinec370972020-03-05 10:15:35 -0600331 #undef CASE
332 }
333 }
334 }
335 }
336
John Stilesa6841be2020-08-06 14:11:56 -0400337} // namespace SK_OPTS_NS
Mike Kleinec370972020-03-05 10:15:35 -0600338
339#endif//SkVM_opts_DEFINED