blob: b289871576b300ec30639b7867f7dee393fcbb36 [file] [log] [blame]
Mike Kleina7080262017-01-09 10:20:13 -05001/*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkCpu.h"
9#include "SkOpts.h"
10#include "SkRasterPipeline.h"
11#include "SkStream.h"
Mike Klein09326e72017-01-11 13:41:30 -050012#if defined(_MSC_VER)
13 #include <windows.h>
14#else
15 #include <sys/mman.h>
16#endif
Mike Kleina7080262017-01-09 10:20:13 -050017
18#include "SkSplicer_generated.h"
Mike Kleinf7200982017-01-15 18:14:07 -050019#include "SkSplicer_generated_lowp.h"
Mike Kleina7080262017-01-09 10:20:13 -050020#include "SkSplicer_shared.h"
21
// Uncomment one of the lines below to dump the JIT'd pipeline to a file.
//#define DUMP "/tmp/dump.bin"
//#define DUMP "/data/local/tmp/dump.bin"
//
// On x86, we'll include IACA markers too.
//   https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
// Running IACA will disassemble, and more.
//   $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.bin | less
//
// To disassemble an aarch64 dump,
//   $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m aarch64 | less
//
// To disassemble an armv7 dump,
//   $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m arm | less
Mike Kleina7080262017-01-09 10:20:13 -050036
37namespace {
38
    // Stages expect these constants to be set to these values.
    // It's fine to rearrange and add new ones if you update SkSplicer_constants.
    //
    // NOTE(review): the field order here must presumably stay in sync with the
    // SkSplicer_constants struct in SkSplicer_shared.h, since the generated
    // stages address these by layout — confirm when editing.
    static const SkSplicer_constants kConstants = {
        0x000000ff, 1.0f, 255.0f, 1/255.0f,
        0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,       // from_srgb
        12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   // to_srgb
    };
    // Constants used by the lowp variants of the stages.
    // NOTE(review): presumably 0x0001 and 0x8000 are the lowp fixed-point
    // values the generated stages expect — confirm against
    // SkSplicer_constants_lowp in SkSplicer_shared.h.
    static const SkSplicer_constants_lowp kConstants_lowp = {
        0x0001, 0x8000,
    };
Mike Kleina7080262017-01-09 10:20:13 -050049
Mike Kleina7080262017-01-09 10:20:13 -050050 // We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
51 template <typename T>
Mike Klein8e619a22017-01-09 17:21:32 -050052 static void splice(SkWStream* buf, const T& val) {
Mike Kleinf7200982017-01-15 18:14:07 -050053 // This null check makes determining whether we can drop to lowp easier.
54 // It's always known at compile time..
55 if (buf) {
56 buf->write(&val, sizeof(val));
57 }
Mike Kleina7080262017-01-09 10:20:13 -050058 }
59
#if defined(__aarch64__)
    // Per-architecture helpers that emit the glue around the spliced stage
    // bodies: loading the ctx argument, the loop back-edge, and the return.
    // The operator() below advances x by kStride (x2 in lowp) per iteration.
    static constexpr int kStride = 4;

    // Load the 64-bit ctx pointer into x2 (Stage argument 3, "ctx"),
    // 16 bits at a time.
    static void set_ctx(SkWStream* buf, void* ctx) {
        uint16_t parts[4];
        memcpy(parts, &ctx, 8);
        splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x2);  // move 16-bit intermediate << 48 into x2
        splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x2);  // merge 16-bit intermediate << 32 into x2
        splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x2);  // merge 16-bit intermediate << 16 into x2
        splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2);  // merge 16-bit intermediate <<  0 into x2
    }

    // Emit "if (x0 < x1) goto loop_start;", i.e. loop while x < limit.
    static void loop(SkWStream* buf, int loop_start) {
        splice(buf, 0xeb01001f);        // cmp x0, x1
        int off = loop_start - (int)buf->bytesWritten();
        off /= 4;                       // bytes -> instructions, still signed
        off = (off & 0x7ffff) << 5;     // 19 bit maximum range (+- 256K instructions)
        splice(buf, 0x54000003 | off);  // b.cc loop_start (cc == "carry clear", unsigned less than)
    }

    // Return to the caller.
    static void ret(SkWStream* buf) {
        splice(buf, 0xd65f03c0);  // ret
    }
#elif defined(__ARM_NEON__)
    static constexpr int kStride = 2;

    // Load the 32-bit ctx pointer into r2 via a mov/movt pair.
    static void set_ctx(SkWStream* buf, void* ctx) {
        uint16_t parts[2];
        // mov/movt split their 16-bit immediate across two fields of the encoding.
        auto encode = [](uint16_t part) -> uint32_t {
            return (part & 0xf000) << 4 | (part & 0xfff);
        };
        memcpy(parts, &ctx, 4);
        splice(buf, 0xe3002000 | encode(parts[0]));  // mov  r2, <bottom 16 bits>
        splice(buf, 0xe3402000 | encode(parts[1]));  // movt r2, <top 16 bits>
    }

    // Emit "if (r0 < r1) goto loop_start;".
    static void loop(SkWStream* buf, int loop_start) {
        splice(buf, 0xe1500001);  // cmp r0, r1
        // The +8 accounts for how the ARM PC reads ahead of the current instruction.
        int off = loop_start - ((int)buf->bytesWritten() + 8 /*ARM is weird*/);
        off /= 4;   // bytes -> instructions, still signed
        off = (off & 0x00ffffff);
        splice(buf, 0x3a000000 | off);  // bcc loop_start
    }

    // Return to the caller.
    static void ret(SkWStream* buf) {
        splice(buf, 0xe12fff1e);  // bx lr
    }
#else
    static constexpr int kStride = 8;

    // Load the 64-bit ctx pointer into %rdx (System V argument 3, "ctx").
    static void set_ctx(SkWStream* buf, void* ctx) {
        static const uint8_t movabsq_rdx[] = { 0x48, 0xba };
        splice(buf, movabsq_rdx);  // movabsq <next 8 bytes>, %rdx
        splice(buf, ctx);
    }

    // Emit "if (%rdi < %rsi) goto loop_start;".
    static void loop(SkWStream* buf, int loop_start) {
        static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
        static const uint8_t jb_near[]     = { 0x0f, 0x8c };
        splice(buf, cmp_rsi_rdi);  // cmp %rsi, %rdi
        splice(buf, jb_near);      // jb <next 4 bytes> (b == "before", unsigned less than)
        // The branch offset is relative to the end of the jb instruction,
        // which is 4 more bytes (the offset itself) past bytesWritten().
        splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
    }

    // vzeroupper before returning, then ret.
    static void ret(SkWStream* buf) {
        static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
        static const uint8_t ret[]        = { 0xc3 };
        splice(buf, vzeroupper);
        splice(buf, ret);
    }
#endif
122
#if defined(_MSC_VER)
    // Adapt from MS ABI to System V ABI used by stages.
    // The prologue below saves xmm6-xmm15 (callee-saved under the MS ABI, as
    // the saves/restores here show) and shuffles the four integer arguments
    // from rcx/rdx/r8/r9 into rdi/rsi/rdx/rcx.
    static void before_loop(SkWStream* buf) {
        static const uint8_t ms_to_system_v[] = {
            0x56,                                         // push   %rsi
            0x57,                                         // push   %rdi
            0x48,0x81,0xec,0xa8,0x00,0x00,0x00,           // sub    $0xa8,%rsp
            0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps %xmm15,0x90(%rsp)
            0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps %xmm14,0x80(%rsp)
            0xc5,0x78,0x29,0x6c,0x24,0x70,                // vmovaps %xmm13,0x70(%rsp)
            0xc5,0x78,0x29,0x64,0x24,0x60,                // vmovaps %xmm12,0x60(%rsp)
            0xc5,0x78,0x29,0x5c,0x24,0x50,                // vmovaps %xmm11,0x50(%rsp)
            0xc5,0x78,0x29,0x54,0x24,0x40,                // vmovaps %xmm10,0x40(%rsp)
            0xc5,0x78,0x29,0x4c,0x24,0x30,                // vmovaps %xmm9,0x30(%rsp)
            0xc5,0x78,0x29,0x44,0x24,0x20,                // vmovaps %xmm8,0x20(%rsp)
            0xc5,0xf8,0x29,0x7c,0x24,0x10,                // vmovaps %xmm7,0x10(%rsp)
            0xc5,0xf8,0x29,0x34,0x24,                     // vmovaps %xmm6,(%rsp)
            0x48,0x89,0xcf,                               // mov    %rcx,%rdi
            0x48,0x89,0xd6,                               // mov    %rdx,%rsi
            0x4c,0x89,0xc2,                               // mov    %r8,%rdx
            0x4c,0x89,0xc9,                               // mov    %r9,%rcx
        };
        splice(buf, ms_to_system_v);
    }
    // Mirror image of before_loop(): restore xmm6-15 and rsi/rdi, unwind the
    // stack frame.  Restores must stay in the reverse order of the saves.
    static void after_loop(SkWStream* buf) {
        static const uint8_t system_v_to_ms[] = {
            0xc5,0xf8,0x28,0x34,0x24,                     // vmovaps (%rsp),%xmm6
            0xc5,0xf8,0x28,0x7c,0x24,0x10,                // vmovaps 0x10(%rsp),%xmm7
            0xc5,0x78,0x28,0x44,0x24,0x20,                // vmovaps 0x20(%rsp),%xmm8
            0xc5,0x78,0x28,0x4c,0x24,0x30,                // vmovaps 0x30(%rsp),%xmm9
            0xc5,0x78,0x28,0x54,0x24,0x40,                // vmovaps 0x40(%rsp),%xmm10
            0xc5,0x78,0x28,0x5c,0x24,0x50,                // vmovaps 0x50(%rsp),%xmm11
            0xc5,0x78,0x28,0x64,0x24,0x60,                // vmovaps 0x60(%rsp),%xmm12
            0xc5,0x78,0x28,0x6c,0x24,0x70,                // vmovaps 0x70(%rsp),%xmm13
            0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps 0x80(%rsp),%xmm14
            0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps 0x90(%rsp),%xmm15
            0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,           // add    $0xa8,%rsp
            0x5f,                                         // pop    %rdi
            0x5e,                                         // pop    %rsi
        };
        splice(buf, system_v_to_ms);
    }
#elif !defined(__aarch64__) && !defined(__ARM_NEON__) && defined(DUMP)
    // IACA start and end markers.
    static const uint8_t ud2[]      = { 0x0f, 0x0b };        // undefined... crashes when run
    static const uint8_t nop3[]     = { 0x64, 0x67, 0x90 };  // 3 byte no-op
    static const uint8_t movl_ebx[] = { 0xbb };              // move next 4 bytes into ebx

    // 111 / 222 are the marker IDs IACA looks for to bracket the analyzed region.
    static void before_loop(SkWStream* buf) {
        splice(buf, ud2);
        splice(buf, movl_ebx);
        splice(buf, 111);
        splice(buf, nop3);
    }
    static void after_loop(SkWStream* buf) {
        splice(buf, movl_ebx);
        splice(buf, 222);
        splice(buf, nop3);
        splice(buf, ud2);
    }
#else
    // No ABI adaptation or markers needed: emit nothing.
    static void before_loop(SkWStream*) {}
    static void after_loop (SkWStream*) {}
#endif
187
Mike Klein09326e72017-01-11 13:41:30 -0500188 // We can only mprotect / VirtualProtect at 4K page granularity.
189 static size_t round_up_to_full_pages(size_t len) {
190 size_t size = 0;
191 while (size < len) {
192 size += 4096;
Mike Kleina7080262017-01-09 10:20:13 -0500193 }
Mike Klein09326e72017-01-11 13:41:30 -0500194 return size;
195 }
196
197#if defined(_MSC_VER)
198 // Copy len bytes from src to memory that's executable. cleanup with cleanup_executable_mem().
199 static void* copy_to_executable_mem(const void* src, size_t* len) {
200 if (!src || !*len) {
201 return nullptr;
202 }
203
204 size_t alloc = round_up_to_full_pages(*len);
205
206 auto fn = VirtualAlloc(nullptr, alloc, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
207 memcpy(fn, src, *len);
208
209 DWORD dont_care;
210 VirtualProtect(fn, alloc, PAGE_EXECUTE_READ, &dont_care);
211
212 *len = alloc;
213 return fn;
214 }
215 static void cleanup_executable_mem(void* fn, size_t len) {
216 if (fn) {
217 VirtualFree(fn, 0, MEM_RELEASE);
218 }
219 }
220#else
221 static void* copy_to_executable_mem(const void* src, size_t* len) {
222 if (!src || !*len) {
223 return nullptr;
224 }
225
226 size_t alloc = round_up_to_full_pages(*len);
227
228 auto fn = mmap(nullptr, alloc, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
229 memcpy(fn, src, *len);
230
231 mprotect(fn, alloc, PROT_READ|PROT_EXEC);
232 __builtin___clear_cache((char*)fn, (char*)fn + *len); // Essential on ARM; no-op on x86.
233
234 *len = alloc;
235 return fn;
Mike Kleina7080262017-01-09 10:20:13 -0500236 }
237 static void cleanup_executable_mem(void* fn, size_t len) {
238 if (fn) {
239 munmap(fn, len);
240 }
241 }
Mike Klein09326e72017-01-11 13:41:30 -0500242#endif
Mike Kleina7080262017-01-09 10:20:13 -0500243
    // Append the lowp implementation of stage st to buf, returning false if we
    // have no lowp version of that stage.  Passing buf == nullptr just queries
    // support without writing anything (splice() ignores a null stream).
    static bool splice_lowp(SkWStream* buf, SkRasterPipeline::StockStage st) {
        switch (st) {
            default: return false;
            // break with no splice: clamp_0 is supported but emits no code,
            // because lowp values can't go below 0.
            case SkRasterPipeline::clamp_0: break;  // lowp can't go below 0.
        #define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st##_lowp); break
            CASE(clear);
            CASE(plus_);
            CASE(srcover);
            CASE(dstover);
            CASE(clamp_1);
            CASE(clamp_a);
            CASE(swap);
            CASE(move_src_dst);
            CASE(move_dst_src);
            CASE(premul);
            CASE(load_8888);
            CASE(store_8888);
        #undef CASE
        }
        return true;
    }
265
    // Append the full-precision (float) implementation of stage st to buf,
    // returning false if SkSplicer_generated.h has no code for that stage.
    static bool splice_highp(SkWStream* buf, SkRasterPipeline::StockStage st) {
        switch (st) {
            default: return false;
        #define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st); break
            CASE(clear);
            CASE(plus_);
            CASE(srcover);
            CASE(dstover);
            CASE(clamp_0);
            CASE(clamp_1);
            CASE(clamp_a);
            CASE(swap);
            CASE(move_src_dst);
            CASE(move_dst_src);
            CASE(premul);
            CASE(unpremul);
            CASE(from_srgb);
            CASE(to_srgb);
            CASE(scale_u8);
            CASE(load_tables);
            CASE(load_8888);
            CASE(store_8888);
            CASE(load_f16);
            CASE(store_f16);
            CASE(matrix_3x4);
        #undef CASE
        }
        return true;
    }
295
    // A copyable, callable pipeline: JIT-compiled machine code when we can
    // build it (fSpliced), falling back to the interpreted pipeline (fBackup)
    // otherwise and for the n < stride tail pixels.
    struct Spliced {

        Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
            // We always create a backup interpreter pipeline,
            //   - to handle any program we can't, and
            //   - to handle the n < stride tails.
            fBackup     = SkOpts::compile_pipeline(stages, nstages);
            fSplicedLen = 0;
            fSpliced    = nullptr;
            fLowp       = false;
            // If we return early anywhere in here, !fSpliced means we'll use fBackup instead.

        // CPU feature gates: aarch64 is assumed capable; ARMv7 and x86 must
        // pass a runtime check or we keep the interpreter.
        #if defined(__aarch64__)
        #elif defined(__ARM_NEON__)
            // Late generation ARMv7, e.g. Cortex A15 or Krait.
            if (!SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
                return;
            }
        #else
            // To keep things simple, only one x86 target supported: Haswell+ x86-64.
            if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
                return;
            }
        #endif

            // See if all the stages can run in lowp mode.  If so, we can run at ~2x speed.
            // (splice_lowp with a null stream is a pure support query.)
            bool lowp = true;
            for (int i = 0; i < nstages; i++) {
                if (!splice_lowp(nullptr, stages[i].stage)) {
                    //SkDebugf("SkSplicer can't yet handle stage %d in lowp.\n", stages[i].stage);
                    lowp = false;
                    break;
                }
            }
            fLowp = lowp;

            SkDynamicMemoryWStream buf;

            // Our loop is the equivalent of this C++ code:
            //    do {
            //        ... run spliced stages...
            //        x += stride;
            //    } while(x < limit);
            before_loop(&buf);
            auto loop_start = buf.bytesWritten();   // Think of this like a label, loop_start:

            for (int i = 0; i < nstages; i++) {
                // If a stage has a context pointer, load it into rdx/x2, Stage argument 3 "ctx".
                if (stages[i].ctx) {
                    set_ctx(&buf, stages[i].ctx);
                }

                // Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
                if (lowp) {
                    // Cannot fail: the query loop above already confirmed support.
                    SkAssertResult(splice_lowp(&buf, stages[i].stage));
                    continue;
                }
                if (!splice_highp(&buf, stages[i].stage)) {
                    //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
                    return;   // fSpliced stays null, so operator() uses fBackup.
                }
            }

            lowp ? splice(&buf, kSplice_inc_x_lowp)
                 : splice(&buf, kSplice_inc_x);
            loop(&buf, loop_start);  // Loop back to handle more pixels if not done.
            after_loop(&buf);
            ret(&buf);  // We're done.

            auto data = buf.detachAsData();
            fSplicedLen = data->size();
            fSpliced    = copy_to_executable_mem(data->data(), &fSplicedLen);

        #if defined(DUMP)
            SkFILEWStream(DUMP).write(data->data(), data->size());
        #endif
        }

        // Spliced is stored in a std::function, so it needs to be copyable.
        // Note: fSplicedLen is declared (and so initialized) before fSpliced,
        // which makes passing &fSplicedLen here safe.
        Spliced(const Spliced& o) : fBackup    (o.fBackup)
                                  , fSplicedLen(o.fSplicedLen)
                                  , fSpliced   (copy_to_executable_mem(o.fSpliced, &fSplicedLen))
                                  , fLowp      (o.fLowp) {}

        ~Spliced() {
            // Safe on the nullptr / interpreter-only path too.
            cleanup_executable_mem(fSpliced, fSplicedLen);
        }

        // Here's where we call fSpliced if we created it, fBackup if not.
        void operator()(size_t x, size_t y, size_t n) const {
            // lowp stages process twice as many pixels per loop iteration.
            size_t stride = fLowp ? kStride*2
                                  : kStride;
            size_t body = n/stride*stride;   // Largest multiple of stride (2, 4, 8, or 16) <= n.
            if (fSpliced && body) {          // Can we run fSpliced for at least one stride?
                // TODO: At some point we will want to pass in y...
                using Fn = void(size_t x, size_t limit, void* ctx, const void* k);
                auto k = fLowp ? (const void*)&kConstants_lowp
                               : (const void*)&kConstants;
                ((Fn*)fSpliced)(x, x+body, nullptr, k);

                // Fall through to fBackup for any n<stride last pixels.
                x += body;
                n -= body;
            }
            fBackup(x,y,n);
        }

        std::function<void(size_t, size_t, size_t)> fBackup;       // Interpreted fallback pipeline.
        size_t                                      fSplicedLen;   // Page-rounded size of fSpliced.
        void*                                       fSpliced;      // Executable JIT'd code, or null.
        bool                                        fLowp;         // True if the lowp variant was spliced.
    };
408
409}
410
411std::function<void(size_t, size_t, size_t)> SkRasterPipeline::jit() const {
412 return Spliced(fStages.data(), SkToInt(fStages.size()));
413}