/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkCpu.h"
#include "SkOpts.h"
#include "SkRasterPipeline.h"
#include "SkStream.h"
#if defined(_MSC_VER)
    #include <windows.h>
#else
    #include <sys/mman.h>
#endif

#include "SkSplicer_generated.h"
#include "SkSplicer_generated_lowp.h"
#include "SkSplicer_shared.h"

// Uncomment one of these to dump the JIT'd pipeline to a file.
//#define DUMP "/tmp/dump.bin"             // Linux, Mac
//#define DUMP "/data/local/tmp/dump.bin"  // Android
//
// On x86, we'll include IACA markers too.
//   https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
// Running IACA disassembles the dump and analyzes its expected performance, e.g.
//   $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.bin | less
//
// To disassemble an aarch64 dump,
//   $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m aarch64 | less
//
// To disassemble an armv7 dump,
//   $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m arm | less
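//
// An x86-64 dump should disassemble the same way with plain objdump, e.g.
//   $ objdump -b binary -D dump.bin -m i386:x86-64 | less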

namespace {

// Stages expect these constants to be set to these values.
// It's fine to rearrange or add new ones if you update the SkSplicer_constants struct to match.
static const SkSplicer_constants kConstants = {
    1.0f, 255.0f, 1/255.0f, 0x000000ff,
    0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,      // from_srgb
    12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,  // to_srgb
};
static const SkSplicer_constants_lowp kConstants_lowp = {
    0x8000, 0x8081,
};

// We do this a lot, so it's nice to infer the correct size.  Works fine with arrays.
template <typename T>
static void splice(SkWStream* buf, const T& val) {
    // This null check makes it easy to probe whether we can drop to lowp
    // (see splice_lowp() below).  It's always known at compile time.
    if (buf) {
        buf->write(&val, sizeof(val));
    }
}
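
// For example, splice(buf, 0xd65f03c0) writes those four bytes (the aarch64 ret
// below), and splice(buf, kSplice_clear) writes the whole generated array.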

#if defined(__aarch64__)
static constexpr int kStride = 4;
static void set_ctx(SkWStream* buf, void* ctx) {
    uint16_t parts[4];
    memcpy(parts, &ctx, 8);
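    // E.g. on a little-endian build, ctx == (void*)0x0123456789abcdef emits
    //     movz x2, #0x0123, lsl #48
    //     movk x2, #0x4567, lsl #32
    //     movk x2, #0x89ab, lsl #16
    //     movk x2, #0xcdef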
    splice(buf, 0xd2e00000 | (parts[3] << 5) | 0x2);  // movz: move  16-bit intermediate << 48 into x2
    splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x2);  // movk: merge 16-bit intermediate << 32 into x2
    splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x2);  // movk: merge 16-bit intermediate << 16 into x2
    splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2);  // movk: merge 16-bit intermediate <<  0 into x2
}
static void loop(SkWStream* buf, int loop_start) {
    splice(buf, 0xeb01001f);     // cmp x0, x1
    int off = loop_start - (int)buf->bytesWritten();
    off /= 4;                    // bytes -> instructions, still signed
    off = (off & 0x7ffff) << 5;  // 19 bit maximum range (+- 256K instructions)
    splice(buf, 0x54000003 | off);  // b.cc loop_start  (cc == "carry clear", unsigned less than)
}
static void ret(SkWStream* buf) {
    splice(buf, 0xd65f03c0);  // ret
}
#elif defined(__ARM_NEON__)
static constexpr int kStride = 2;
static void set_ctx(SkWStream* buf, void* ctx) {
    uint16_t parts[2];
    auto encode = [](uint16_t part) -> uint32_t {
        return (part & 0xf000) << 4 | (part & 0xfff);
    };
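    // encode() splits a 16-bit immediate into A32's imm4:imm12 fields,
    // e.g. 0x1234 -> imm4 = 0x1 (bits 19:16), imm12 = 0x234 (bits 11:0).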
    memcpy(parts, &ctx, 4);
    splice(buf, 0xe3002000 | encode(parts[0]));  // mov  r2, <bottom 16 bits>
    splice(buf, 0xe3402000 | encode(parts[1]));  // movt r2, <top 16 bits>
}
static void loop(SkWStream* buf, int loop_start) {
    splice(buf, 0xe1500001);  // cmp r0, r1
    int off = loop_start - ((int)buf->bytesWritten() + 8 /*in ARM state the PC reads 2 instructions ahead*/);
    off /= 4;                 // bytes -> instructions, still signed
    off = (off & 0x00ffffff);
    splice(buf, 0x3a000000 | off);  // bcc loop_start
}
static void ret(SkWStream* buf) {
    splice(buf, 0xe12fff1e);  // bx lr
}
#else
static constexpr int kStride = 8;
static void set_ctx(SkWStream* buf, void* ctx) {
    static const uint8_t movabsq_rdx[] = { 0x48, 0xba };
    splice(buf, movabsq_rdx);  // movabsq <next 8 bytes>, %rdx
    splice(buf, ctx);
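    // E.g. ctx == (void*)0xdeadbeef emits 48 ba ef be ad de 00 00 00 00,
    // the 8 immediate bytes stored little-endian.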
}
static void loop(SkWStream* buf, int loop_start) {
    static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
    static const uint8_t jb_near[]     = { 0x0f, 0x82 };  // 0f 82 is unsigned jb; 0f 8c would be signed jl.
    splice(buf, cmp_rsi_rdi);  // cmp %rsi, %rdi
    splice(buf, jb_near);      // jb <next 4 bytes>  (b == "before", unsigned less than)
    splice(buf, loop_start - (int)(buf->bytesWritten() + 4));  // Offset is relative to the next instruction.
}
static void ret(SkWStream* buf) {
    static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
    static const uint8_t ret[]        = { 0xc3 };
    splice(buf, vzeroupper);
    splice(buf, ret);
}
#endif
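
// The spliced function is called as fn(x, limit, ctx, constants); those four
// arguments arrive in the native argument registers rdi/rsi/rdx/rcx (System V
// x86-64), x0-x3 (aarch64), or r0-r3 (ARMv7).  That's why set_ctx() targets
// rdx/x2/r2 and loop() compares the first two argument registers.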

#if defined(_MSC_VER)
// Adapt from the MS ABI to the System V ABI used by stages: save xmm6-xmm15
// (callee-saved under MS, scratch under System V), then shuffle the argument
// registers (rcx,rdx,r8,r9) -> (rdi,rsi,rdx,rcx).
static void before_loop(SkWStream* buf) {
    static const uint8_t ms_to_system_v[] = {
        0x56,                                          // push %rsi
        0x57,                                          // push %rdi
        0x48,0x81,0xec,0xa8,0x00,0x00,0x00,            // sub $0xa8,%rsp
        0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00,  // vmovaps %xmm15,0x90(%rsp)
        0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00,  // vmovaps %xmm14,0x80(%rsp)
        0xc5,0x78,0x29,0x6c,0x24,0x70,                 // vmovaps %xmm13,0x70(%rsp)
        0xc5,0x78,0x29,0x64,0x24,0x60,                 // vmovaps %xmm12,0x60(%rsp)
        0xc5,0x78,0x29,0x5c,0x24,0x50,                 // vmovaps %xmm11,0x50(%rsp)
        0xc5,0x78,0x29,0x54,0x24,0x40,                 // vmovaps %xmm10,0x40(%rsp)
        0xc5,0x78,0x29,0x4c,0x24,0x30,                 // vmovaps %xmm9,0x30(%rsp)
        0xc5,0x78,0x29,0x44,0x24,0x20,                 // vmovaps %xmm8,0x20(%rsp)
        0xc5,0xf8,0x29,0x7c,0x24,0x10,                 // vmovaps %xmm7,0x10(%rsp)
        0xc5,0xf8,0x29,0x34,0x24,                      // vmovaps %xmm6,(%rsp)
        0x48,0x89,0xcf,                                // mov %rcx,%rdi
        0x48,0x89,0xd6,                                // mov %rdx,%rsi
        0x4c,0x89,0xc2,                                // mov %r8,%rdx
        0x4c,0x89,0xc9,                                // mov %r9,%rcx
    };
    splice(buf, ms_to_system_v);
}
static void after_loop(SkWStream* buf) {
    static const uint8_t system_v_to_ms[] = {
        0xc5,0xf8,0x28,0x34,0x24,                      // vmovaps (%rsp),%xmm6
        0xc5,0xf8,0x28,0x7c,0x24,0x10,                 // vmovaps 0x10(%rsp),%xmm7
        0xc5,0x78,0x28,0x44,0x24,0x20,                 // vmovaps 0x20(%rsp),%xmm8
        0xc5,0x78,0x28,0x4c,0x24,0x30,                 // vmovaps 0x30(%rsp),%xmm9
        0xc5,0x78,0x28,0x54,0x24,0x40,                 // vmovaps 0x40(%rsp),%xmm10
        0xc5,0x78,0x28,0x5c,0x24,0x50,                 // vmovaps 0x50(%rsp),%xmm11
        0xc5,0x78,0x28,0x64,0x24,0x60,                 // vmovaps 0x60(%rsp),%xmm12
        0xc5,0x78,0x28,0x6c,0x24,0x70,                 // vmovaps 0x70(%rsp),%xmm13
        0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00,  // vmovaps 0x80(%rsp),%xmm14
        0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00,  // vmovaps 0x90(%rsp),%xmm15
        0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,            // add $0xa8,%rsp
        0x5f,                                          // pop %rdi
        0x5e,                                          // pop %rsi
    };
    splice(buf, system_v_to_ms);
}
#elif !defined(__aarch64__) && !defined(__ARM_NEON__) && defined(DUMP)
// IACA start and end markers.
static const uint8_t ud2[]      = { 0x0f, 0x0b };        // undefined... crashes when run
static const uint8_t nop3[]     = { 0x64, 0x67, 0x90 };  // 3 byte no-op
static const uint8_t movl_ebx[] = { 0xbb };              // move next 4 bytes into ebx

static void before_loop(SkWStream* buf) {
    splice(buf, ud2);
    splice(buf, movl_ebx);
    splice(buf, 111);   // 111 is IACA's magic start-of-analysis marker value...
    splice(buf, nop3);
}
static void after_loop(SkWStream* buf) {
    splice(buf, movl_ebx);
    splice(buf, 222);   // ...and 222 its end-of-analysis marker value.
    splice(buf, nop3);
    splice(buf, ud2);
}
#else
static void before_loop(SkWStream*) {}
static void after_loop (SkWStream*) {}
#endif

// mprotect() and VirtualProtect() work at page granularity, assumed here to be 4K.
static size_t round_up_to_full_pages(size_t len) {
    size_t size = 0;
    while (size < len) {
        size += 4096;
    }
    return size;
}
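// E.g. round_up_to_full_pages(1) == 4096, round_up_to_full_pages(4096) == 4096,
// and round_up_to_full_pages(4097) == 8192.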

#if defined(_MSC_VER)
// Copy len bytes from src to memory that's executable.  Clean up with cleanup_executable_mem().
// On success *len is updated to the page-rounded allocation size.
static void* copy_to_executable_mem(const void* src, size_t* len) {
    if (!src || !*len) {
        return nullptr;
    }

    size_t alloc = round_up_to_full_pages(*len);

    auto fn = VirtualAlloc(nullptr, alloc, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
    if (!fn) {
        return nullptr;   // We'll fall back to the interpreted pipeline.
    }
    memcpy(fn, src, *len);

    DWORD dont_care;
    VirtualProtect(fn, alloc, PAGE_EXECUTE_READ, &dont_care);

    *len = alloc;
    return fn;
}
static void cleanup_executable_mem(void* fn, size_t len) {
    if (fn) {
        VirtualFree(fn, 0, MEM_RELEASE);
    }
}
#else
static void* copy_to_executable_mem(const void* src, size_t* len) {
    if (!src || !*len) {
        return nullptr;
    }

    size_t alloc = round_up_to_full_pages(*len);

    auto fn = mmap(nullptr, alloc, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
    if (fn == MAP_FAILED) {   // mmap() signals failure with MAP_FAILED, not nullptr.
        return nullptr;
    }
    memcpy(fn, src, *len);

    mprotect(fn, alloc, PROT_READ|PROT_EXEC);
    __builtin___clear_cache((char*)fn, (char*)fn + *len);  // Essential on ARM; no-op on x86.

    *len = alloc;
    return fn;
}
static void cleanup_executable_mem(void* fn, size_t len) {
    if (fn) {
        munmap(fn, len);
    }
}
#endif

static bool splice_lowp(SkWStream* buf, SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return false;
        case SkRasterPipeline::clamp_0: break;  // lowp can't go below 0.
    #define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st##_lowp); break
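        // E.g. CASE(clear) expands to
        //     case SkRasterPipeline::clear: splice(buf, kSplice_clear_lowp); break;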
        CASE(clear);
        CASE(plus_);
        CASE(srcover);
        CASE(dstover);
        CASE(clamp_1);
        CASE(clamp_a);
        CASE(swap);
        CASE(move_src_dst);
        CASE(move_dst_src);
        CASE(premul);
        CASE(scale_u8);
        CASE(load_8888);
        CASE(store_8888);
    #undef CASE
    }
    return true;
}

static bool splice_highp(SkWStream* buf, SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return false;
    #define CASE(st) case SkRasterPipeline::st: splice(buf, kSplice_##st); break
        CASE(clear);
        CASE(plus_);
        CASE(srcover);
        CASE(dstover);
        CASE(clamp_0);
        CASE(clamp_1);
        CASE(clamp_a);
        CASE(swap);
        CASE(move_src_dst);
        CASE(move_dst_src);
        CASE(premul);
        CASE(unpremul);
        CASE(from_srgb);
        CASE(to_srgb);
        CASE(scale_u8);
        CASE(load_tables);
        CASE(load_8888);
        CASE(store_8888);
        CASE(load_f16);
        CASE(store_f16);
        CASE(matrix_3x4);
    #undef CASE
    }
    return true;
}

struct Spliced {

    Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
        // We always create a backup interpreter pipeline,
        //   - to handle any program we can't, and
        //   - to handle the n < stride tails.
        fBackup     = SkOpts::compile_pipeline(stages, nstages);
        fSplicedLen = 0;
        fSpliced    = nullptr;
        fLowp       = false;
        // If we return early anywhere in here, !fSpliced means we'll use fBackup instead.

    #if defined(__aarch64__)
        // Nothing to check: every aarch64 CPU has the NEON support our stages need.
    #elif defined(__ARM_NEON__)
        // We need a fairly late-generation ARMv7, e.g. Cortex-A15 or Krait.
        if (!SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
            return;
        }
    #else
        // To keep things simple, only one x86 target is supported: Haswell+ x86-64.
        if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
            return;
        }
    #endif

        // See if all the stages can run in lowp mode.  If so, we can run at ~2x speed.
        bool lowp = true;
        for (int i = 0; i < nstages; i++) {
            if (!splice_lowp(nullptr, stages[i].stage)) {
                //SkDebugf("SkSplicer can't yet handle stage %d in lowp.\n", stages[i].stage);
                lowp = false;
                break;
            }
        }
        fLowp = lowp;

        SkDynamicMemoryWStream buf;

        // Our loop is the equivalent of this C++ code:
        //    do {
        //        ... run spliced stages...
        //        x += stride;
        //    } while (x < limit);
        before_loop(&buf);
        auto loop_start = buf.bytesWritten();  // Think of this like a label, loop_start:

        for (int i = 0; i < nstages; i++) {
            // If a stage has a context pointer, load it into rdx/x2/r2, Stage argument 3, "ctx".
            if (stages[i].ctx) {
                set_ctx(&buf, stages[i].ctx);
            }

            // Splice in the code for each Stage, generated offline into
            // SkSplicer_generated.h and SkSplicer_generated_lowp.h.
            if (lowp) {
                SkAssertResult(splice_lowp(&buf, stages[i].stage));
                continue;
            }
            if (!splice_highp(&buf, stages[i].stage)) {
                //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
                return;
            }
        }

        lowp ? splice(&buf, kSplice_inc_x_lowp)
             : splice(&buf, kSplice_inc_x);
        loop(&buf, loop_start);  // Loop back to handle more pixels if not done.
        after_loop(&buf);
        ret(&buf);  // We're done.

        auto data = buf.detachAsData();
        fSplicedLen = data->size();
        fSpliced    = copy_to_executable_mem(data->data(), &fSplicedLen);

    #if defined(DUMP)
        SkFILEWStream(DUMP).write(data->data(), data->size());
    #endif
    }

    // Spliced is stored in a std::function, so it needs to be copyable.
    Spliced(const Spliced& o) : fBackup    (o.fBackup)
                              , fSplicedLen(o.fSplicedLen)
                              , fSpliced   (copy_to_executable_mem(o.fSpliced, &fSplicedLen))
                              , fLowp      (o.fLowp) {}

    ~Spliced() {
        cleanup_executable_mem(fSpliced, fSplicedLen);
    }

    // Here's where we call fSpliced if we created it, fBackup if not.
    void operator()(size_t x, size_t n) const {
        size_t stride = fLowp ? kStride*2
                              : kStride;
        size_t body = n/stride*stride;  // Largest multiple of stride (2, 4, 8, or 16) <= n.
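        // E.g. highp on x86-64 has stride == 8, so n == 30 gives body == 24:
        // fSpliced handles those 24 pixels and fBackup the last 6.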
        if (fSpliced && body) {  // Can we run fSpliced for at least one stride?
            using Fn = void(size_t x, size_t limit, void* ctx, const void* k);
            auto k = fLowp ? (const void*)&kConstants_lowp
                           : (const void*)&kConstants;
            ((Fn*)fSpliced)(x, x+body, nullptr, k);

            // Fall through to fBackup for the n % stride leftover pixels.
            x += body;
            n -= body;
        }
        fBackup(x, n);
    }

    std::function<void(size_t, size_t)> fBackup;
    size_t                              fSplicedLen;
    void*                               fSpliced;
    bool                                fLowp;
};

}  // namespace

std::function<void(size_t, size_t)> SkRasterPipeline::jit() const {
    return Spliced(fStages.data(), SkToInt(fStages.size()));
}
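
// A rough usage sketch (the contexts here are hypothetical):
//
//     SkRasterPipeline p;
//     p.append(SkRasterPipeline::load_8888,  &src);   // src is a const uint32_t*
//     p.append(SkRasterPipeline::srcover);
//     p.append(SkRasterPipeline::store_8888, &dst);   // dst is a uint32_t*
//
//     auto fn = p.jit();
//     fn(0, n);   // Composite n pixels starting at x == 0.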