// Copyright 2020 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.

#ifndef SkVM_opts_DEFINED
#define SkVM_opts_DEFINED

#include "include/private/SkVx.h"
#include "src/core/SkVM.h"

// Gather: returns a vector whose lane i is ptr[ix[i]].
// `ptr` is a uniform base pointer; `ix` carries one index per lane.
template <int N>
static inline skvx::Vec<N,int> gather32(const int* ptr, const skvx::Vec<N,int>& ix) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    // 8 lanes map directly onto the AVX2 hardware gather (scale=4 for 4-byte ints).
    if constexpr (N == 8) {
        return skvx::bit_pun<skvx::Vec<N,int>>(
                _mm256_i32gather_epi32(ptr, skvx::bit_pun<__m256i>(ix), 4));
    }
#endif
    // Try to recurse on specializations, falling back on standard scalar map()-based impl.
    // Splitting into lo/hi halves eventually reaches the N==8 case above when AVX2 is on.
    if constexpr (N > 8) {
        return join(gather32(ptr, ix.lo),
                    gather32(ptr, ix.hi));
    }
    return map([&](int i) { return ptr[i]; }, ix);
}
25
Mike Kleinec370972020-03-05 10:15:35 -060026namespace SK_OPTS_NS {
27
28 inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
29 const int nregs, const int loop,
30 const int strides[], const int nargs,
31 int n, void* args[]) {
32 using namespace skvm;
33
34 // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
Mike Klein51d35ed2020-04-24 08:16:22 -050035 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
Mike Klein394a6d52020-09-18 14:04:19 -050036 constexpr int K = 32; // 1024-bit: 4 ymm or 2 zmm at a time
Mike Kleinec370972020-03-05 10:15:35 -060037 #else
Mike Klein394a6d52020-09-18 14:04:19 -050038 constexpr int K = 8; // 256-bit: 2 xmm, 2 v-registers, etc.
Mike Kleinec370972020-03-05 10:15:35 -060039 #endif
40 using I32 = skvx::Vec<K, int>;
Mike Klein6b72d3a2020-09-24 11:17:22 -050041 using I16 = skvx::Vec<K, int16_t>;
Mike Kleinec370972020-03-05 10:15:35 -060042 using F32 = skvx::Vec<K, float>;
Mike Klein6732da02020-07-16 13:03:18 -050043 using U64 = skvx::Vec<K, uint64_t>;
Mike Kleinec370972020-03-05 10:15:35 -060044 using U32 = skvx::Vec<K, uint32_t>;
45 using U16 = skvx::Vec<K, uint16_t>;
46 using U8 = skvx::Vec<K, uint8_t>;
Mike Kleinec370972020-03-05 10:15:35 -060047 union Slot {
48 F32 f32;
49 I32 i32;
50 U32 u32;
Mike Klein6b72d3a2020-09-24 11:17:22 -050051 I16 i16;
52 U16 u16;
Mike Kleinec370972020-03-05 10:15:35 -060053 };
54
55 Slot few_regs[16];
56 std::unique_ptr<char[]> many_regs;
57
Mike Klein4284f752020-07-10 15:16:17 -050058 Slot* r = few_regs;
Mike Kleinec370972020-03-05 10:15:35 -060059
60 if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
61 // Annoyingly we can't trust that malloc() or new will work with Slot because
62 // the skvx::Vec types may have alignment greater than what they provide.
63 // We'll overallocate one extra register so we can align manually.
64 many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
65
66 uintptr_t addr = (uintptr_t)many_regs.get();
67 addr += alignof(Slot) -
68 (addr & (alignof(Slot) - 1));
69 SkASSERT((addr & (alignof(Slot) - 1)) == 0);
Mike Klein4284f752020-07-10 15:16:17 -050070 r = (Slot*)addr;
Mike Kleinec370972020-03-05 10:15:35 -060071 }
72
73
Mike Kleinec370972020-03-05 10:15:35 -060074 // Step each argument pointer ahead by its stride a number of times.
75 auto step_args = [&](int times) {
76 for (int i = 0; i < nargs; i++) {
77 args[i] = (void*)( (char*)args[i] + times * strides[i] );
78 }
79 };
80
81 int start = 0,
82 stride;
83 for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
84 stride = n >= K ? K : 1;
85
86 for (int i = start; i < ninsts; i++) {
87 InterpreterInstruction inst = insts[i];
88
Mike Kleinf3087d82021-01-19 11:36:25 -060089 // d = op(x,y,z,w, immA,immB)
Mike Kleinec370972020-03-05 10:15:35 -060090 Reg d = inst.d,
91 x = inst.x,
92 y = inst.y,
Mike Kleinf3087d82021-01-19 11:36:25 -060093 z = inst.z,
94 w = inst.w;
95 (void)w; // TODO: use in store128
Mike Kleinaad97192021-01-19 11:04:37 -060096 int immA = inst.immA,
97 immB = inst.immB;
Mike Kleinec370972020-03-05 10:15:35 -060098
99 // Ops that interact with memory need to know whether we're stride=1 or K,
100 // but all non-memory ops can run the same code no matter the stride.
101 switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
102 default: SkUNREACHABLE;
103
104 #define STRIDE_1(op) case 2*(int)op
105 #define STRIDE_K(op) case 2*(int)op + 1
Mike Kleinaad97192021-01-19 11:04:37 -0600106 STRIDE_1(Op::store8 ): memcpy(args[immA], &r[x].i32, 1); break;
107 STRIDE_1(Op::store16): memcpy(args[immA], &r[x].i32, 2); break;
108 STRIDE_1(Op::store32): memcpy(args[immA], &r[x].i32, 4); break;
109 STRIDE_1(Op::store64): memcpy((char*)args[immA]+0, &r[x].i32, 4);
110 memcpy((char*)args[immA]+4, &r[y].i32, 4); break;
Mike Kleinec370972020-03-05 10:15:35 -0600111
Mike Kleinaad97192021-01-19 11:04:37 -0600112 STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r[x].i32).store(args[immA]); break;
113 STRIDE_K(Op::store16): skvx::cast<uint16_t>(r[x].i32).store(args[immA]); break;
114 STRIDE_K(Op::store32): (r[x].i32).store(args[immA]); break;
Mike Klein6732da02020-07-16 13:03:18 -0500115 STRIDE_K(Op::store64): (skvx::cast<uint64_t>(r[x].u32) << 0 |
Mike Kleinaad97192021-01-19 11:04:37 -0600116 skvx::cast<uint64_t>(r[y].u32) << 32).store(args[immA]);
Mike Klein6732da02020-07-16 13:03:18 -0500117 break;
Mike Kleinec370972020-03-05 10:15:35 -0600118
Mike Kleinaad97192021-01-19 11:04:37 -0600119 STRIDE_1(Op::load8 ): r[d].i32 = 0; memcpy(&r[d].i32, args[immA], 1); break;
120 STRIDE_1(Op::load16): r[d].i32 = 0; memcpy(&r[d].i32, args[immA], 2); break;
121 STRIDE_1(Op::load32): r[d].i32 = 0; memcpy(&r[d].i32, args[immA], 4); break;
Mike Klein31367892020-07-30 08:19:12 -0500122 STRIDE_1(Op::load64):
Mike Kleinaad97192021-01-19 11:04:37 -0600123 r[d].i32 = 0; memcpy(&r[d].i32, (char*)args[immA] + 4*immB, 4); break;
Mike Kleinec370972020-03-05 10:15:35 -0600124
Mike Kleinaad97192021-01-19 11:04:37 -0600125 STRIDE_K(Op::load8 ): r[d].i32= skvx::cast<int>(U8 ::Load(args[immA])); break;
126 STRIDE_K(Op::load16): r[d].i32= skvx::cast<int>(U16::Load(args[immA])); break;
127 STRIDE_K(Op::load32): r[d].i32= I32::Load(args[immA]) ; break;
Mike Klein31367892020-07-30 08:19:12 -0500128 STRIDE_K(Op::load64):
Mike Kleinaad97192021-01-19 11:04:37 -0600129 // Low 32 bits if immB=0, or high 32 bits if immB=1.
130 r[d].i32 = skvx::cast<int>(U64::Load(args[immA]) >> (32*immB)); break;
Mike Kleinec370972020-03-05 10:15:35 -0600131
132 // The pointer we base our gather on is loaded indirectly from a uniform:
Mike Kleinaad97192021-01-19 11:04:37 -0600133 // - args[immA] is the uniform holding our gather base pointer somewhere;
134 // - (const uint8_t*)args[immA] + immB points to the gather base pointer;
Mike Kleinec370972020-03-05 10:15:35 -0600135 // - memcpy() loads the gather base and into a pointer of the right type.
136 // After all that we have an ordinary (uniform) pointer `ptr` to load from,
Mike Klein4284f752020-07-10 15:16:17 -0500137 // and we then gather from it using the varying indices in r[x].
Mike Kleinfeb4d102020-09-17 08:54:08 -0500138 STRIDE_1(Op::gather8): {
139 const uint8_t* ptr;
Mike Kleinaad97192021-01-19 11:04:37 -0600140 memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
Mike Kleinfeb4d102020-09-17 08:54:08 -0500141 r[d].i32 = ptr[ r[x].i32[0] ];
142 } break;
143 STRIDE_1(Op::gather16): {
144 const uint16_t* ptr;
Mike Kleinaad97192021-01-19 11:04:37 -0600145 memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
Mike Kleinfeb4d102020-09-17 08:54:08 -0500146 r[d].i32 = ptr[ r[x].i32[0] ];
147 } break;
148 STRIDE_1(Op::gather32): {
149 const int* ptr;
Mike Kleinaad97192021-01-19 11:04:37 -0600150 memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
Mike Kleinfeb4d102020-09-17 08:54:08 -0500151 r[d].i32 = ptr[ r[x].i32[0] ];
152 } break;
Mike Kleinec370972020-03-05 10:15:35 -0600153
Mike Kleinfeb4d102020-09-17 08:54:08 -0500154 STRIDE_K(Op::gather8): {
155 const uint8_t* ptr;
Mike Kleinaad97192021-01-19 11:04:37 -0600156 memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
Mike Klein840e8ea2020-10-12 12:38:10 -0500157 r[d].i32 = map([&](int ix) { return (int)ptr[ix]; }, r[x].i32);
Mike Kleinfeb4d102020-09-17 08:54:08 -0500158 } break;
159 STRIDE_K(Op::gather16): {
160 const uint16_t* ptr;
Mike Kleinaad97192021-01-19 11:04:37 -0600161 memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
Mike Klein840e8ea2020-10-12 12:38:10 -0500162 r[d].i32 = map([&](int ix) { return (int)ptr[ix]; }, r[x].i32);
Mike Kleinfeb4d102020-09-17 08:54:08 -0500163 } break;
164 STRIDE_K(Op::gather32): {
165 const int* ptr;
Mike Kleinaad97192021-01-19 11:04:37 -0600166 memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
Mike Klein2e69a132020-09-18 08:02:46 -0500167 r[d].i32 = gather32(ptr, r[x].i32);
Mike Kleinfeb4d102020-09-17 08:54:08 -0500168 } break;
Mike Kleinec370972020-03-05 10:15:35 -0600169
170 #undef STRIDE_1
171 #undef STRIDE_K
172
173 // Ops that don't interact with memory should never care about the stride.
174 #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
175
Mike Klein89b3c1f2020-07-29 16:45:05 -0500176 // These 128-bit ops are implemented serially for simplicity.
Mike Klein31367892020-07-30 08:19:12 -0500177 CASE(Op::store128): {
Mike Kleinaad97192021-01-19 11:04:37 -0600178 int ptr = immA,
179 lane = immB;
Mike Klein89b3c1f2020-07-29 16:45:05 -0500180 U64 src = (skvx::cast<uint64_t>(r[x].u32) << 0 |
181 skvx::cast<uint64_t>(r[y].u32) << 32);
182 for (int i = 0; i < stride; i++) {
Mike Klein31367892020-07-30 08:19:12 -0500183 memcpy((char*)args[ptr] + 16*i + 8*lane, &src[i], 8);
Mike Klein89b3c1f2020-07-29 16:45:05 -0500184 }
185 } break;
186
Mike Klein31367892020-07-30 08:19:12 -0500187 CASE(Op::load128):
Mike Klein89b3c1f2020-07-29 16:45:05 -0500188 r[d].i32 = 0;
189 for (int i = 0; i < stride; i++) {
Mike Kleinaad97192021-01-19 11:04:37 -0600190 memcpy(&r[d].i32[i], (const char*)args[immA] + 16*i+ 4*immB, 4);
Mike Klein89b3c1f2020-07-29 16:45:05 -0500191 } break;
192
Mike Kleinec370972020-03-05 10:15:35 -0600193 CASE(Op::assert_true):
194 #ifdef SK_DEBUG
Mike Klein4284f752020-07-10 15:16:17 -0500195 if (!all(r[x].i32)) {
Mike Kleinec370972020-03-05 10:15:35 -0600196 SkDebugf("inst %d, register %d\n", i, y);
197 for (int i = 0; i < K; i++) {
Mike Klein4284f752020-07-10 15:16:17 -0500198 SkDebugf("\t%2d: %08x (%g)\n", i, r[y].i32[i], r[y].f32[i]);
Mike Kleinec370972020-03-05 10:15:35 -0600199 }
Mike Klein51a7f952020-09-16 16:00:33 -0500200 SkASSERT(false);
Mike Kleinec370972020-03-05 10:15:35 -0600201 }
Mike Kleinec370972020-03-05 10:15:35 -0600202 #endif
203 break;
204
205 CASE(Op::index): {
206 const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
Mike Klein394a6d52020-09-18 14:04:19 -0500207 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
208 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
209 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63 };
Mike Kleinec370972020-03-05 10:15:35 -0600210 static_assert(K <= SK_ARRAY_COUNT(iota), "");
211
Mike Klein4284f752020-07-10 15:16:17 -0500212 r[d].i32 = n - I32::Load(iota);
Mike Kleinec370972020-03-05 10:15:35 -0600213 } break;
214
Mike Kleinec370972020-03-05 10:15:35 -0600215 CASE(Op::uniform32):
Mike Kleinaad97192021-01-19 11:04:37 -0600216 r[d].i32 = *(const int*)( (const char*)args[immA] + immB );
Mike Kleinec370972020-03-05 10:15:35 -0600217 break;
218
Mike Kleinaad97192021-01-19 11:04:37 -0600219 CASE(Op::splat): r[d].i32 = immA; break;
Mike Kleinec370972020-03-05 10:15:35 -0600220
Mike Klein4284f752020-07-10 15:16:17 -0500221 CASE(Op::add_f32): r[d].f32 = r[x].f32 + r[y].f32; break;
222 CASE(Op::sub_f32): r[d].f32 = r[x].f32 - r[y].f32; break;
223 CASE(Op::mul_f32): r[d].f32 = r[x].f32 * r[y].f32; break;
224 CASE(Op::div_f32): r[d].f32 = r[x].f32 / r[y].f32; break;
225 CASE(Op::min_f32): r[d].f32 = min(r[x].f32, r[y].f32); break;
226 CASE(Op::max_f32): r[d].f32 = max(r[x].f32, r[y].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600227
Mike Klein4284f752020-07-10 15:16:17 -0500228 CASE(Op::fma_f32): r[d].f32 = fma( r[x].f32, r[y].f32, r[z].f32); break;
229 CASE(Op::fms_f32): r[d].f32 = fma( r[x].f32, r[y].f32, -r[z].f32); break;
230 CASE(Op::fnma_f32): r[d].f32 = fma(-r[x].f32, r[y].f32, r[z].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600231
Mike Klein4284f752020-07-10 15:16:17 -0500232 CASE(Op::sqrt_f32): r[d].f32 = sqrt(r[x].f32); break;
Mike Kleinec370972020-03-05 10:15:35 -0600233
Mike Klein4284f752020-07-10 15:16:17 -0500234 CASE(Op::add_i32): r[d].i32 = r[x].i32 + r[y].i32; break;
235 CASE(Op::sub_i32): r[d].i32 = r[x].i32 - r[y].i32; break;
236 CASE(Op::mul_i32): r[d].i32 = r[x].i32 * r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600237
Mike Kleinaad97192021-01-19 11:04:37 -0600238 CASE(Op::shl_i32): r[d].i32 = r[x].i32 << immA; break;
239 CASE(Op::sra_i32): r[d].i32 = r[x].i32 >> immA; break;
240 CASE(Op::shr_i32): r[d].u32 = r[x].u32 >> immA; break;
Mike Kleinec370972020-03-05 10:15:35 -0600241
Mike Klein4284f752020-07-10 15:16:17 -0500242 CASE(Op:: eq_f32): r[d].i32 = r[x].f32 == r[y].f32; break;
243 CASE(Op::neq_f32): r[d].i32 = r[x].f32 != r[y].f32; break;
244 CASE(Op:: gt_f32): r[d].i32 = r[x].f32 > r[y].f32; break;
245 CASE(Op::gte_f32): r[d].i32 = r[x].f32 >= r[y].f32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600246
Mike Klein4284f752020-07-10 15:16:17 -0500247 CASE(Op:: eq_i32): r[d].i32 = r[x].i32 == r[y].i32; break;
248 CASE(Op:: gt_i32): r[d].i32 = r[x].i32 > r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600249
Mike Klein4284f752020-07-10 15:16:17 -0500250 CASE(Op::bit_and ): r[d].i32 = r[x].i32 & r[y].i32; break;
251 CASE(Op::bit_or ): r[d].i32 = r[x].i32 | r[y].i32; break;
252 CASE(Op::bit_xor ): r[d].i32 = r[x].i32 ^ r[y].i32; break;
253 CASE(Op::bit_clear): r[d].i32 = r[x].i32 & ~r[y].i32; break;
Mike Kleinec370972020-03-05 10:15:35 -0600254
Mike Klein4284f752020-07-10 15:16:17 -0500255 CASE(Op::select): r[d].i32 = skvx::if_then_else(r[x].i32, r[y].i32, r[z].i32);
Mike Kleinec370972020-03-05 10:15:35 -0600256 break;
257
Mike Klein4284f752020-07-10 15:16:17 -0500258 CASE(Op::ceil): r[d].f32 = skvx::ceil(r[x].f32) ; break;
259 CASE(Op::floor): r[d].f32 = skvx::floor(r[x].f32) ; break;
260 CASE(Op::to_f32): r[d].f32 = skvx::cast<float>( r[x].i32 ); break;
261 CASE(Op::trunc): r[d].i32 = skvx::cast<int> ( r[x].f32 ); break;
262 CASE(Op::round): r[d].i32 = skvx::cast<int> (skvx::lrint(r[x].f32)); break;
Mike Klein4d680cd2020-07-15 09:58:51 -0500263
Mike Klein42d67a62020-12-01 10:14:55 -0600264 CASE(Op::to_fp16):
Mike Klein4d680cd2020-07-15 09:58:51 -0500265 r[d].i32 = skvx::cast<int>(skvx::to_half(r[x].f32));
266 break;
Mike Klein42d67a62020-12-01 10:14:55 -0600267 CASE(Op::from_fp16):
Mike Klein4d680cd2020-07-15 09:58:51 -0500268 r[d].f32 = skvx::from_half(skvx::cast<uint16_t>(r[x].i32));
269 break;
Mike Klein98c512c2020-09-15 10:00:27 -0500270
Mike Kleinec370972020-03-05 10:15:35 -0600271 #undef CASE
272 }
273 }
274 }
275 }
276
John Stilesa6841be2020-08-06 14:11:56 -0400277} // namespace SK_OPTS_NS

#endif//SkVM_opts_DEFINED