blob: ef3e4464132e1b51e086f9642cf463d667148aa9 [file] [log] [blame]
// Copyright 2020 Google LLC.
2
3#ifndef SkVM_opts_DEFINED
4#define SkVM_opts_DEFINED
5
6#include "include/private/SkVx.h"
7#include "src/core/SkVM.h"
8
9namespace SK_OPTS_NS {
10
11 inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
12 const int nregs, const int loop,
13 const int strides[], const int nargs,
14 int n, void* args[]) {
15 using namespace skvm;
16
17 // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
18 // We noticed quad-pumping is slower than single-pumping and both were slower than double.
19 #if defined(__AVX2__)
20 constexpr int K = 16;
21 #else
22 constexpr int K = 8;
23 #endif
24 using I32 = skvx::Vec<K, int>;
25 using F32 = skvx::Vec<K, float>;
26 using U32 = skvx::Vec<K, uint32_t>;
27 using U16 = skvx::Vec<K, uint16_t>;
28 using U8 = skvx::Vec<K, uint8_t>;
29
30 using I16x2 = skvx::Vec<2*K, int16_t>;
31 using U16x2 = skvx::Vec<2*K, uint16_t>;
32
33 union Slot {
34 F32 f32;
35 I32 i32;
36 U32 u32;
37 I16x2 i16x2;
38 U16x2 u16x2;
39 };
40
41 Slot few_regs[16];
42 std::unique_ptr<char[]> many_regs;
43
44 Slot* regs = few_regs;
45
46 if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
47 // Annoyingly we can't trust that malloc() or new will work with Slot because
48 // the skvx::Vec types may have alignment greater than what they provide.
49 // We'll overallocate one extra register so we can align manually.
50 many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
51
52 uintptr_t addr = (uintptr_t)many_regs.get();
53 addr += alignof(Slot) -
54 (addr & (alignof(Slot) - 1));
55 SkASSERT((addr & (alignof(Slot) - 1)) == 0);
56 regs = (Slot*)addr;
57 }
58
59
60 auto r = [&](Reg id) -> Slot& {
61 SkASSERT(0 <= id && id < nregs);
62 return regs[id];
63 };
64 auto arg = [&](int ix) {
65 SkASSERT(0 <= ix && ix < nargs);
66 return args[ix];
67 };
68
69 // Step each argument pointer ahead by its stride a number of times.
70 auto step_args = [&](int times) {
71 for (int i = 0; i < nargs; i++) {
72 args[i] = (void*)( (char*)args[i] + times * strides[i] );
73 }
74 };
75
76 int start = 0,
77 stride;
78 for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
79 stride = n >= K ? K : 1;
80
81 for (int i = start; i < ninsts; i++) {
82 InterpreterInstruction inst = insts[i];
83
84 // d = op(x,y/imm,z/imm)
85 Reg d = inst.d,
86 x = inst.x,
87 y = inst.y,
88 z = inst.z;
89 int immy = inst.immy,
90 immz = inst.immz;
91
92 // Ops that interact with memory need to know whether we're stride=1 or K,
93 // but all non-memory ops can run the same code no matter the stride.
94 switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
95 default: SkUNREACHABLE;
96
97 #define STRIDE_1(op) case 2*(int)op
98 #define STRIDE_K(op) case 2*(int)op + 1
99 STRIDE_1(Op::store8 ): memcpy(arg(immy), &r(x).i32, 1); break;
100 STRIDE_1(Op::store16): memcpy(arg(immy), &r(x).i32, 2); break;
101 STRIDE_1(Op::store32): memcpy(arg(immy), &r(x).i32, 4); break;
102
103 STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(immy)); break;
104 STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(immy)); break;
105 STRIDE_K(Op::store32): (r(x).i32).store(arg(immy)); break;
106
107 STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 1); break;
108 STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 2); break;
109 STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(immy), 4); break;
110
111 STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(immy))); break;
112 STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(immy))); break;
113 STRIDE_K(Op::load32): r(d).i32= I32::Load(arg(immy)) ; break;
114
115 // The pointer we base our gather on is loaded indirectly from a uniform:
116 // - arg(immy) is the uniform holding our gather base pointer somewhere;
117 // - (const uint8_t*)arg(immy) + immz points to the gather base pointer;
118 // - memcpy() loads the gather base and into a pointer of the right type.
119 // After all that we have an ordinary (uniform) pointer `ptr` to load from,
120 // and we then gather from it using the varying indices in r(x).
121 STRIDE_1(Op::gather8):
122 for (int i = 0; i < K; i++) {
123 const uint8_t* ptr;
124 memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
125 r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
126 } break;
127 STRIDE_1(Op::gather16):
128 for (int i = 0; i < K; i++) {
129 const uint16_t* ptr;
130 memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
131 r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
132 } break;
133 STRIDE_1(Op::gather32):
134 for (int i = 0; i < K; i++) {
135 const int* ptr;
136 memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
137 r(d).i32[i] = (i==0) ? ptr[ r(x).i32[i] ] : 0;
138 } break;
139
140 STRIDE_K(Op::gather8):
141 for (int i = 0; i < K; i++) {
142 const uint8_t* ptr;
143 memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
144 r(d).i32[i] = ptr[ r(x).i32[i] ];
145 } break;
146 STRIDE_K(Op::gather16):
147 for (int i = 0; i < K; i++) {
148 const uint16_t* ptr;
149 memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
150 r(d).i32[i] = ptr[ r(x).i32[i] ];
151 } break;
152 STRIDE_K(Op::gather32):
153 for (int i = 0; i < K; i++) {
154 const int* ptr;
155 memcpy(&ptr, (const uint8_t*)arg(immy) + immz, sizeof(ptr));
156 r(d).i32[i] = ptr[ r(x).i32[i] ];
157 } break;
158
159 #undef STRIDE_1
160 #undef STRIDE_K
161
162 // Ops that don't interact with memory should never care about the stride.
163 #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
164
165 CASE(Op::assert_true):
166 #ifdef SK_DEBUG
167 if (!all(r(x).i32)) {
168 SkDebugf("inst %d, register %d\n", i, y);
169 for (int i = 0; i < K; i++) {
170 SkDebugf("\t%2d: %08x (%g)\n", i, r(y).i32[i], r(y).f32[i]);
171 }
172 }
173 SkASSERT(all(r(x).i32));
174 #endif
175 break;
176
177 CASE(Op::index): {
178 const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
179 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
180 static_assert(K <= SK_ARRAY_COUNT(iota), "");
181
182 r(d).i32 = n - I32::Load(iota);
183 } break;
184
185 CASE(Op::uniform8):
186 r(d).i32 = *(const uint8_t* )( (const char*)arg(immy) + immz );
187 break;
188 CASE(Op::uniform16):
189 r(d).i32 = *(const uint16_t*)( (const char*)arg(immy) + immz );
190 break;
191 CASE(Op::uniform32):
192 r(d).i32 = *(const int* )( (const char*)arg(immy) + immz );
193 break;
194
195 CASE(Op::splat): r(d).i32 = immy; break;
196
197 CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
198 CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
199 CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
200 CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
201 CASE(Op::min_f32): r(d).f32 = min(r(x).f32, r(y).f32); break;
202 CASE(Op::max_f32): r(d).f32 = max(r(x).f32, r(y).f32); break;
203
204 // These _imm instructions are all x86/JIT only.
205 CASE(Op::add_f32_imm):
206 CASE(Op::sub_f32_imm):
207 CASE(Op::mul_f32_imm):
208 CASE(Op::min_f32_imm):
209 CASE(Op::max_f32_imm):
210 CASE(Op::bit_and_imm):
211 CASE(Op::bit_or_imm ):
212 CASE(Op::bit_xor_imm): SkUNREACHABLE; break;
213
214 CASE(Op::fma_f32): r(d).f32 = fma(r(x).f32, r(y).f32, r(z).f32); break;
215
216 CASE(Op::sqrt_f32): r(d).f32 = sqrt(r(x).f32); break;
217
218 CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
219 CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
220 CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
221
222 CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
223 CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
224 CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;
225
226 CASE(Op::shl_i32): r(d).i32 = r(x).i32 << immy; break;
227 CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> immy; break;
228 CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> immy; break;
229
230 CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << immy; break;
231 CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> immy; break;
232 CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> immy; break;
233
234 CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
235 CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
236 CASE(Op:: gt_f32): r(d).i32 = r(x).f32 > r(y).f32; break;
237 CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;
238
239 CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
240 CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
241 CASE(Op:: gt_i32): r(d).i32 = r(x).i32 > r(y).i32; break;
242 CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;
243
244 CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
245 CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
246 CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 > r(y).i16x2; break;
247 CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;
248
249 CASE(Op::bit_and ): r(d).i32 = r(x).i32 & r(y).i32; break;
250 CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
251 CASE(Op::bit_xor ): r(d).i32 = r(x).i32 ^ r(y).i32; break;
252 CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
253
254 CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
255 break;
256
257 CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << immz); break;
258
259 CASE(Op::bytes): {
260 const U32 table[] = {
261 0,
262 (r(x).u32 ) & 0xff,
263 (r(x).u32 >> 8) & 0xff,
264 (r(x).u32 >> 16) & 0xff,
265 (r(x).u32 >> 24) & 0xff,
266 };
267 r(d).u32 = table[(immy >> 0) & 0xf] << 0
268 | table[(immy >> 4) & 0xf] << 8
269 | table[(immy >> 8) & 0xf] << 16
270 | table[(immy >> 12) & 0xf] << 24;
271 } break;
272
273 CASE(Op::floor): r(d).f32 = skvx::floor(r(x).f32); break;
274 CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
275 CASE(Op::trunc): r(d).i32 = skvx::cast<int> (r(x).f32); break;
276 #undef CASE
277 }
278 }
279 }
280 }
281
282}
283
284#endif//SkVM_opts_DEFINED