// Copyright 2020 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.

4#ifndef SkVM_opts_DEFINED
5#define SkVM_opts_DEFINED
6
7#include "include/private/SkVx.h"
8#include "src/core/SkVM.h"
9
10namespace SK_OPTS_NS {
11
12 inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
13 const int nregs, const int loop,
14 const int strides[], const int nargs,
15 int n, void* args[]) {
16 using namespace skvm;
17
18 // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
19 // We noticed quad-pumping is slower than single-pumping and both were slower than double.
Mike Klein51d35ed2020-04-24 08:16:22 -050020 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
Mike Kleinec370972020-03-05 10:15:35 -060021 constexpr int K = 16;
22 #else
23 constexpr int K = 8;
24 #endif
25 using I32 = skvx::Vec<K, int>;
26 using F32 = skvx::Vec<K, float>;
Mike Klein6732da02020-07-16 13:03:18 -050027 using U64 = skvx::Vec<K, uint64_t>;
Mike Kleinec370972020-03-05 10:15:35 -060028 using U32 = skvx::Vec<K, uint32_t>;
29 using U16 = skvx::Vec<K, uint16_t>;
30 using U8 = skvx::Vec<K, uint8_t>;
31
Mike Kleinec370972020-03-05 10:15:35 -060032 union Slot {
33 F32 f32;
34 I32 i32;
35 U32 u32;
Mike Kleinec370972020-03-05 10:15:35 -060036 };
37
38 Slot few_regs[16];
39 std::unique_ptr<char[]> many_regs;
40
Mike Klein4284f752020-07-10 15:16:17 -050041 Slot* r = few_regs;
Mike Kleinec370972020-03-05 10:15:35 -060042
43 if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
44 // Annoyingly we can't trust that malloc() or new will work with Slot because
45 // the skvx::Vec types may have alignment greater than what they provide.
46 // We'll overallocate one extra register so we can align manually.
47 many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
48
49 uintptr_t addr = (uintptr_t)many_regs.get();
50 addr += alignof(Slot) -
51 (addr & (alignof(Slot) - 1));
52 SkASSERT((addr & (alignof(Slot) - 1)) == 0);
Mike Klein4284f752020-07-10 15:16:17 -050053 r = (Slot*)addr;
Mike Kleinec370972020-03-05 10:15:35 -060054 }
55
56
Mike Kleinec370972020-03-05 10:15:35 -060057 // Step each argument pointer ahead by its stride a number of times.
58 auto step_args = [&](int times) {
59 for (int i = 0; i < nargs; i++) {
60 args[i] = (void*)( (char*)args[i] + times * strides[i] );
61 }
62 };
63
64 int start = 0,
65 stride;
66 for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
67 stride = n >= K ? K : 1;
68
69 for (int i = start; i < ninsts; i++) {
70 InterpreterInstruction inst = insts[i];
71
72 // d = op(x,y/imm,z/imm)
73 Reg d = inst.d,
74 x = inst.x,
75 y = inst.y,
76 z = inst.z;
77 int immy = inst.immy,
78 immz = inst.immz;
79
80 // Ops that interact with memory need to know whether we're stride=1 or K,
81 // but all non-memory ops can run the same code no matter the stride.
82 switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
83 default: SkUNREACHABLE;
84
85 #define STRIDE_1(op) case 2*(int)op
86 #define STRIDE_K(op) case 2*(int)op + 1
Mike Klein4284f752020-07-10 15:16:17 -050087 STRIDE_1(Op::store8 ): memcpy(args[immy], &r[x].i32, 1); break;
88 STRIDE_1(Op::store16): memcpy(args[immy], &r[x].i32, 2); break;
89 STRIDE_1(Op::store32): memcpy(args[immy], &r[x].i32, 4); break;
Mike Klein6732da02020-07-16 13:03:18 -050090 STRIDE_1(Op::store64): memcpy((char*)args[immz]+0, &r[x].i32, 4);
91 memcpy((char*)args[immz]+4, &r[y].i32, 4); break;
Mike Kleinec370972020-03-05 10:15:35 -060092
Mike Klein4284f752020-07-10 15:16:17 -050093 STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r[x].i32).store(args[immy]); break;
94 STRIDE_K(Op::store16): skvx::cast<uint16_t>(r[x].i32).store(args[immy]); break;
95 STRIDE_K(Op::store32): (r[x].i32).store(args[immy]); break;
Mike Klein6732da02020-07-16 13:03:18 -050096 STRIDE_K(Op::store64): (skvx::cast<uint64_t>(r[x].u32) << 0 |
97 skvx::cast<uint64_t>(r[y].u32) << 32).store(args[immz]);
98 break;
Mike Kleinec370972020-03-05 10:15:35 -060099
Mike Klein4284f752020-07-10 15:16:17 -0500100 STRIDE_1(Op::load8 ): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 1); break;
101 STRIDE_1(Op::load16): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 2); break;
102 STRIDE_1(Op::load32): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 4); break;
Mike Klein6732da02020-07-16 13:03:18 -0500103 STRIDE_1(Op::load64_lo):
104 r[d].i32 = 0; memcpy(&r[d].i32, (char*)args[immy] + 0, 4); break;
105 STRIDE_1(Op::load64_hi):
106 r[d].i32 = 0; memcpy(&r[d].i32, (char*)args[immy] + 4, 4); break;
Mike Kleinec370972020-03-05 10:15:35 -0600107
Mike Klein4284f752020-07-10 15:16:17 -0500108 STRIDE_K(Op::load8 ): r[d].i32= skvx::cast<int>(U8 ::Load(args[immy])); break;
109 STRIDE_K(Op::load16): r[d].i32= skvx::cast<int>(U16::Load(args[immy])); break;
110 STRIDE_K(Op::load32): r[d].i32= I32::Load(args[immy]) ; break;
Mike Klein6732da02020-07-16 13:03:18 -0500111 STRIDE_K(Op::load64_lo):
112 r[d].i32 = skvx::cast<int>(U64::Load(args[immy]) & 0xffff'ffff); break;
113 STRIDE_K(Op::load64_hi):
114 r[d].i32 = skvx::cast<int>(U64::Load(args[immy]) >> 32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600115
116 // The pointer we base our gather on is loaded indirectly from a uniform:
Mike Klein4284f752020-07-10 15:16:17 -0500117 // - args[immy] is the uniform holding our gather base pointer somewhere;
118 // - (const uint8_t*)args[immy] + immz points to the gather base pointer;
Mike Kleinec370972020-03-05 10:15:35 -0600119 // - memcpy() loads the gather base and into a pointer of the right type.
120 // After all that we have an ordinary (uniform) pointer `ptr` to load from,
Mike Klein4284f752020-07-10 15:16:17 -0500121 // and we then gather from it using the varying indices in r[x].
Mike Kleinec370972020-03-05 10:15:35 -0600122 STRIDE_1(Op::gather8):
123 for (int i = 0; i < K; i++) {
124 const uint8_t* ptr;
Mike Klein4284f752020-07-10 15:16:17 -0500125 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
126 r[d].i32[i] = (i==0) ? ptr[ r[x].i32[i] ] : 0;
Mike Kleinec370972020-03-05 10:15:35 -0600127 } break;
128 STRIDE_1(Op::gather16):
129 for (int i = 0; i < K; i++) {
130 const uint16_t* ptr;
Mike Klein4284f752020-07-10 15:16:17 -0500131 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
132 r[d].i32[i] = (i==0) ? ptr[ r[x].i32[i] ] : 0;
Mike Kleinec370972020-03-05 10:15:35 -0600133 } break;
134 STRIDE_1(Op::gather32):
135 for (int i = 0; i < K; i++) {
136 const int* ptr;
Mike Klein4284f752020-07-10 15:16:17 -0500137 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
138 r[d].i32[i] = (i==0) ? ptr[ r[x].i32[i] ] : 0;
Mike Kleinec370972020-03-05 10:15:35 -0600139 } break;
140
141 STRIDE_K(Op::gather8):
142 for (int i = 0; i < K; i++) {
143 const uint8_t* ptr;
Mike Klein4284f752020-07-10 15:16:17 -0500144 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
145 r[d].i32[i] = ptr[ r[x].i32[i] ];
Mike Kleinec370972020-03-05 10:15:35 -0600146 } break;
147 STRIDE_K(Op::gather16):
148 for (int i = 0; i < K; i++) {
149 const uint16_t* ptr;
Mike Klein4284f752020-07-10 15:16:17 -0500150 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
151 r[d].i32[i] = ptr[ r[x].i32[i] ];
Mike Kleinec370972020-03-05 10:15:35 -0600152 } break;
153 STRIDE_K(Op::gather32):
154 for (int i = 0; i < K; i++) {
155 const int* ptr;
Mike Klein4284f752020-07-10 15:16:17 -0500156 memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
157 r[d].i32[i] = ptr[ r[x].i32[i] ];
Mike Kleinec370972020-03-05 10:15:35 -0600158 } break;
159
160 #undef STRIDE_1
161 #undef STRIDE_K
162
163 // Ops that don't interact with memory should never care about the stride.
164 #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
165
166 CASE(Op::assert_true):
167 #ifdef SK_DEBUG
Mike Klein4284f752020-07-10 15:16:17 -0500168 if (!all(r[x].i32)) {
Mike Kleinec370972020-03-05 10:15:35 -0600169 SkDebugf("inst %d, register %d\n", i, y);
170 for (int i = 0; i < K; i++) {
Mike Klein4284f752020-07-10 15:16:17 -0500171 SkDebugf("\t%2d: %08x (%g)\n", i, r[y].i32[i], r[y].f32[i]);
Mike Kleinec370972020-03-05 10:15:35 -0600172 }
173 }
Mike Klein4284f752020-07-10 15:16:17 -0500174 SkASSERT(all(r[x].i32));
Mike Kleinec370972020-03-05 10:15:35 -0600175 #endif
176 break;
177
178 CASE(Op::index): {
179 const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
180 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
181 static_assert(K <= SK_ARRAY_COUNT(iota), "");
182
Mike Klein4284f752020-07-10 15:16:17 -0500183 r[d].i32 = n - I32::Load(iota);
Mike Kleinec370972020-03-05 10:15:35 -0600184 } break;
185
186 CASE(Op::uniform8):
Mike Klein4284f752020-07-10 15:16:17 -0500187 r[d].i32 = *(const uint8_t* )( (const char*)args[immy] + immz );
Mike Kleinec370972020-03-05 10:15:35 -0600188 break;
189 CASE(Op::uniform16):
Mike Klein4284f752020-07-10 15:16:17 -0500190 r[d].i32 = *(const uint16_t*)( (const char*)args[immy] + immz );
Mike Kleinec370972020-03-05 10:15:35 -0600191 break;
192 CASE(Op::uniform32):
Mike Klein4284f752020-07-10 15:16:17 -0500193 r[d].i32 = *(const int* )( (const char*)args[immy] + immz );
Mike Kleinec370972020-03-05 10:15:35 -0600194 break;
195
Mike Klein4284f752020-07-10 15:16:17 -0500196 CASE(Op::splat): r[d].i32 = immy; break;
Mike Kleinec370972020-03-05 10:15:35 -0600197
Mike Klein4284f752020-07-10 15:16:17 -0500198 CASE(Op::add_f32): r[d].f32 = r[x].f32 + r[y].f32; break;
199 CASE(Op::sub_f32): r[d].f32 = r[x].f32 - r[y].f32; break;
200 CASE(Op::mul_f32): r[d].f32 = r[x].f32 * r[y].f32; break;
201 CASE(Op::div_f32): r[d].f32 = r[x].f32 / r[y].f32; break;
202 CASE(Op::min_f32): r[d].f32 = min(r[x].f32, r[y].f32); break;
203 CASE(Op::max_f32): r[d].f32 = max(r[x].f32, r[y].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600204
Mike Klein4284f752020-07-10 15:16:17 -0500205 CASE(Op::fma_f32): r[d].f32 = fma( r[x].f32, r[y].f32, r[z].f32); break;
206 CASE(Op::fms_f32): r[d].f32 = fma( r[x].f32, r[y].f32, -r[z].f32); break;
207 CASE(Op::fnma_f32): r[d].f32 = fma(-r[x].f32, r[y].f32, r[z].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600208
Mike Klein4284f752020-07-10 15:16:17 -0500209 CASE(Op::sqrt_f32): r[d].f32 = sqrt(r[x].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600210
Mike Klein4284f752020-07-10 15:16:17 -0500211 CASE(Op::add_i32): r[d].i32 = r[x].i32 + r[y].i32; break;
212 CASE(Op::sub_i32): r[d].i32 = r[x].i32 - r[y].i32; break;
213 CASE(Op::mul_i32): r[d].i32 = r[x].i32 * r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600214
Mike Klein4284f752020-07-10 15:16:17 -0500215 CASE(Op::shl_i32): r[d].i32 = r[x].i32 << immy; break;
216 CASE(Op::sra_i32): r[d].i32 = r[x].i32 >> immy; break;
217 CASE(Op::shr_i32): r[d].u32 = r[x].u32 >> immy; break;
Mike Kleinec370972020-03-05 10:15:35 -0600218
Mike Klein4284f752020-07-10 15:16:17 -0500219 CASE(Op:: eq_f32): r[d].i32 = r[x].f32 == r[y].f32; break;
220 CASE(Op::neq_f32): r[d].i32 = r[x].f32 != r[y].f32; break;
221 CASE(Op:: gt_f32): r[d].i32 = r[x].f32 > r[y].f32; break;
222 CASE(Op::gte_f32): r[d].i32 = r[x].f32 >= r[y].f32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600223
Mike Klein4284f752020-07-10 15:16:17 -0500224 CASE(Op:: eq_i32): r[d].i32 = r[x].i32 == r[y].i32; break;
225 CASE(Op:: gt_i32): r[d].i32 = r[x].i32 > r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600226
Mike Klein4284f752020-07-10 15:16:17 -0500227 CASE(Op::bit_and ): r[d].i32 = r[x].i32 & r[y].i32; break;
228 CASE(Op::bit_or ): r[d].i32 = r[x].i32 | r[y].i32; break;
229 CASE(Op::bit_xor ): r[d].i32 = r[x].i32 ^ r[y].i32; break;
230 CASE(Op::bit_clear): r[d].i32 = r[x].i32 & ~r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600231
Mike Klein4284f752020-07-10 15:16:17 -0500232 CASE(Op::select): r[d].i32 = skvx::if_then_else(r[x].i32, r[y].i32, r[z].i32);
Mike Kleinec370972020-03-05 10:15:35 -0600233 break;
234
Mike Klein4284f752020-07-10 15:16:17 -0500235 CASE(Op::pack): r[d].u32 = r[x].u32 | (r[y].u32 << immz); break;
Mike Kleinec370972020-03-05 10:15:35 -0600236
Mike Klein4284f752020-07-10 15:16:17 -0500237 CASE(Op::ceil): r[d].f32 = skvx::ceil(r[x].f32) ; break;
238 CASE(Op::floor): r[d].f32 = skvx::floor(r[x].f32) ; break;
239 CASE(Op::to_f32): r[d].f32 = skvx::cast<float>( r[x].i32 ); break;
240 CASE(Op::trunc): r[d].i32 = skvx::cast<int> ( r[x].f32 ); break;
241 CASE(Op::round): r[d].i32 = skvx::cast<int> (skvx::lrint(r[x].f32)); break;
Mike Klein4d680cd2020-07-15 09:58:51 -0500242
243 CASE(Op::to_half):
244 r[d].i32 = skvx::cast<int>(skvx::to_half(r[x].f32));
245 break;
246 CASE(Op::from_half):
247 r[d].f32 = skvx::from_half(skvx::cast<uint16_t>(r[x].i32));
248 break;
Mike Kleinec370972020-03-05 10:15:35 -0600249 #undef CASE
250 }
251 }
252 }
253 }
254
255}
256
257#endif//SkVM_opts_DEFINED