| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "SkCpu.h" |
| #include "SkOpts.h" |
| #include "SkRasterPipeline.h" |
| #include "SkStream.h" |
| #include <sys/mman.h> |
| |
| #include "SkSplicer_generated.h" |
| #include "SkSplicer_shared.h" |
| |
| // Uncomment to dump output with IACA markers. |
| // #define IACA_DUMP "/tmp/dump.o" |
| // https://software.intel.com/en-us/articles/intel-architecture-code-analyzer |
| // $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.o | less |
| |
| namespace { |
| |
| // Stages expect these constants to be set to these values. |
| // It's fine to rearrange and add new ones if you update SkSplicer_constants. |
| static const SkSplicer_constants kConstants = { |
| 0x000000ff, 1.0f, 255.0f, 1/255.0f, |
| 0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb |
| 12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb |
| }; |
| |
// Short x86-64 instruction sequences that we'll use as glue to splice together Stages.
// Each array is raw machine code.  Where an instruction takes an immediate operand
// (movabsq, jb), the array holds only the opcode bytes and the caller splices the
// immediate right after it.
static const uint8_t vzeroupper[]   = { 0xc5, 0xf8, 0x77 };        // clear top half of all ymm
static const uint8_t ret[]          = { 0xc3 };                    // return
static const uint8_t movabsq_rcx[]  = { 0x48, 0xb9 };              // move next 8 bytes into rcx
static const uint8_t movabsq_rdx[]  = { 0x48, 0xba };              // move next 8 bytes into rdx
static const uint8_t addq_8_rdi[]   = { 0x48, 0x83, 0xc7, 0x08 };  // rdi += 8
static const uint8_t cmpq_rsi_rdi[] = { 0x48, 0x39, 0xf7 };        // set flags from rdi - rsi

// Jump relative by the signed 32-bit displacement in the next 4 bytes if the preceding cmp
// found rdi < rsi as unsigned values (carry flag set).
// The opcode for that is 0x0f 0x82 (jb).  0x0f 0x8c is jl, the signed comparison, which is
// the wrong test for the size_t values kept in rdi and rsi.
static const uint8_t jb_near[]      = { 0x0f, 0x82 };
| |
| // We do this a lot, so it's nice to infer the correct size. Works fine with arrays. |
| template <typename T> |
| void splice(SkWStream* stream, const T& val) { |
| stream->write(&val, sizeof(val)); |
| } |
| |
| #ifdef IACA_DUMP |
| static const uint8_t ud2[] = { 0x0f, 0x0b }; // undefined... crashes when run |
| static const uint8_t nop3[] = { 0x64, 0x67, 0x90 }; // 3 byte no-op |
| static const uint8_t movl_ebx[] = { 0xbb }; // move next 4 bytes into ebx |
| |
| static void iaca_start(SkWStream* stream) { |
| splice(stream, ud2); |
| splice(stream, movl_ebx); |
| splice(stream, 111); |
| splice(stream, nop3); |
| } |
| static void iaca_end(SkWStream* stream) { |
| splice(stream, movl_ebx); |
| splice(stream, 222); |
| splice(stream, nop3); |
| splice(stream, ud2); |
| } |
| #else |
| static void iaca_start(SkWStream*) {} |
| static void iaca_end (SkWStream*) {} |
| #endif |
| |
// Copy len bytes from src into freshly mapped memory that's executable.
// Returns the new mapping, or nullptr if src is null, len is 0, or the mapping could not
// be created.  Clean up a non-null result with cleanup_executable_mem(), using the same len.
static void* copy_to_executable_mem(const void* src, size_t len) {
    if (!src || !len) {
        return nullptr;
    }

    // TODO: w^x
    // PROT_READ is requested explicitly because copying a Spliced reads this mapping back
    // with memcpy, and readability is not guaranteed for a mapping created without it.
    void* fn = mmap(nullptr, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0);
    if (fn == MAP_FAILED) {
        // mmap reports failure with MAP_FAILED, not nullptr.  Writing through it would
        // crash, so report "no executable copy" and let the caller fall back.
        return nullptr;
    }
    return memcpy(fn, src, len);
}
// Unmap the len-byte region at fn.  A null fn means there is nothing to release.
static void cleanup_executable_mem(void* fn, size_t len) {
    if (fn == nullptr) {
        return;
    }
    munmap(fn, len);
}
| |
| struct Spliced { |
| |
| Spliced(const SkRasterPipeline::Stage* stages, int nstages) { |
| // We always create a backup interpreter pipeline, |
| // - to handle any program we can't, and |
| // - to handle the n < 8 tails. |
| fBackup = SkOpts::compile_pipeline(stages, nstages); |
| fSplicedLen = 0; |
| fSpliced = nullptr; |
| // If we return early anywhere in here, !fSpliced means we'll use fBackup instead. |
| |
| // To keep things simple, only one target supported: Haswell+ x86-64. |
| if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) { |
| return; |
| } |
| |
| SkDynamicMemoryWStream buf; |
| |
| // Put the address of kConstants in rcx, Stage argument 4 "k". |
| splice(&buf, movabsq_rcx); |
| splice(&buf, &kConstants); |
| |
| // We'll loop back to here as long as x<n after x+=8. |
| iaca_start(&buf); |
| auto loop_start = buf.bytesWritten(); // Think of this like a label, loop_start: |
| |
| for (int i = 0; i < nstages; i++) { |
| // If a stage has a context pointer, load it into rdx, Stage argument 3 "ctx". |
| if (stages[i].ctx) { |
| splice(&buf, movabsq_rdx); |
| splice(&buf, stages[i].ctx); |
| } |
| |
| // Splice in the code for the Stages, generated offline into SkSplicer_generated.h. |
| switch(stages[i].stage) { |
| case SkRasterPipeline::clear: splice(&buf, kSplice_clear ); break; |
| case SkRasterPipeline::plus_: splice(&buf, kSplice_plus ); break; |
| case SkRasterPipeline::srcover: splice(&buf, kSplice_srcover ); break; |
| case SkRasterPipeline::dstover: splice(&buf, kSplice_dstover ); break; |
| case SkRasterPipeline::clamp_0: splice(&buf, kSplice_clamp_0 ); break; |
| case SkRasterPipeline::clamp_1: splice(&buf, kSplice_clamp_1 ); break; |
| case SkRasterPipeline::clamp_a: splice(&buf, kSplice_clamp_a ); break; |
| case SkRasterPipeline::swap: splice(&buf, kSplice_swap ); break; |
| case SkRasterPipeline::move_src_dst: splice(&buf, kSplice_move_src_dst); break; |
| case SkRasterPipeline::move_dst_src: splice(&buf, kSplice_move_dst_src); break; |
| case SkRasterPipeline::premul: splice(&buf, kSplice_premul ); break; |
| case SkRasterPipeline::unpremul: splice(&buf, kSplice_unpremul ); break; |
| case SkRasterPipeline::from_srgb: splice(&buf, kSplice_from_srgb ); break; |
| case SkRasterPipeline::to_srgb: splice(&buf, kSplice_to_srgb ); break; |
| case SkRasterPipeline::scale_u8: splice(&buf, kSplice_scale_u8 ); break; |
| case SkRasterPipeline::load_8888: splice(&buf, kSplice_load_8888 ); break; |
| case SkRasterPipeline::store_8888: splice(&buf, kSplice_store_8888 ); break; |
| case SkRasterPipeline::load_f16: splice(&buf, kSplice_load_f16 ); break; |
| case SkRasterPipeline::store_f16: splice(&buf, kSplice_store_f16 ); break; |
| |
| // No joy (probably just not yet implemented). |
| default: |
| //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage); |
| return; |
| } |
| } |
| |
| // See if we should loop back to handle more pixels. |
| splice(&buf, addq_8_rdi); // x += 8 |
| splice(&buf, cmpq_rsi_rdi); // if (x < n) |
| splice(&buf, jb_near); // goto loop_start; |
| splice(&buf, (int)loop_start - (int)(buf.bytesWritten() + 4)); |
| iaca_end(&buf); |
| |
| // Nope! We're done. |
| splice(&buf, vzeroupper); |
| splice(&buf, ret); |
| |
| auto data = buf.detachAsData(); |
| fSplicedLen = data->size(); |
| fSpliced = copy_to_executable_mem(data->data(), fSplicedLen); |
| |
| #ifdef IACA_DUMP |
| SkFILEWStream(IACA_DUMP).write(data->data(), data->size()); |
| #endif |
| } |
| |
| // Spliced is stored in a std::function, so it needs to be copyable. |
| Spliced(const Spliced& o) : fBackup (o.fBackup) |
| , fSplicedLen(o.fSplicedLen) |
| , fSpliced (copy_to_executable_mem(o.fSpliced, fSplicedLen)) {} |
| |
| ~Spliced() { |
| cleanup_executable_mem(fSpliced, fSplicedLen); |
| } |
| |
| // Here's where we call fSpliced if we created it, fBackup if not. |
| void operator()(size_t x, size_t y, size_t n) const { |
| // TODO: The looping logic is probably not correct for handling n<8 tails. |
| if (fSpliced) { |
| // TODO: At some point we will want to pass in y... |
| using Fn = void(size_t x, size_t n); |
| ((Fn*)fSpliced)(x,n); |
| |
| // Fall through to fBackup for any n<8 last pixels. |
| size_t body = n/8*8; |
| x += body; |
| n -= body; |
| } |
| fBackup(x,y,n); |
| } |
| |
| std::function<void(size_t, size_t, size_t)> fBackup; |
| size_t fSplicedLen; |
| void* fSpliced; |
| }; |
| |
| } |
| |
| std::function<void(size_t, size_t, size_t)> SkRasterPipeline::jit() const { |
| return Spliced(fStages.data(), SkToInt(fStages.size())); |
| } |