/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkCpu.h"
#include "SkJumper.h"
#include "SkJumper_generated.h"
#include "SkRasterPipeline.h"
#include "SkTemplates.h"

// Stages expect these constants to be set to these values.
// It's fine to rearrange and add new ones if you update SkJumper_constants.
static const SkJumper_constants kConstants = {
    1.0f, 0.5f, 255.0f, 1/255.0f, 0x000000ff,
    {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
    0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,      // from_srgb
    12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,  // to_srgb
    0x77800000, 0x07800000,                           // fp16 <-> fp32
};
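// A pointer to this single copy is passed to every stage as its third argument,
// whichever instruction-set flavor ends up running (see the calls to start() in
// run_with_jumper() below).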

using JumperStage = void(size_t, void**, const SkJumper_constants*);

// Jumper stages actually pass around 8 floating-point vectors too.
// They're designed to work when those vectors start uninitialized,
// so we don't need to mention them here.
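
// STAGES is an X-macro: each use below defines M to expand a stage name into a
// declaration or a switch case, then invokes STAGES(M) to stamp that out for every stage.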
#define STAGES(M) \
    M(seed_shader) \
    M(constant_color) \
    M(clear) \
    M(plus_) \
    M(srcover) \
    M(dstover) \
    M(clamp_0) \
    M(clamp_1) \
    M(clamp_a) \
    M(swap) \
    M(move_src_dst) \
    M(move_dst_src) \
    M(premul) \
    M(unpremul) \
    M(from_srgb) \
    M(to_srgb) \
    M(scale_u8) \
    M(load_tables) \
    M(load_8888) \
    M(store_8888) \
    M(load_f16) \
    M(store_f16) \
    M(matrix_2x3) \
    M(matrix_3x4) \
    M(clamp_x) \
    M(clamp_y) \
    M(linear_gradient_2stops)

// Declare the portable, single-pixel stages that are linked into Skia from SkJumper_stages.o.
extern "C" {
    JumperStage sk_just_return;
    #define M(st) JumperStage sk_##st;
    STAGES(M)
    #undef M
}
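// For example, M(load_8888) above declares `JumperStage sk_load_8888;`.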

// Translate SkRasterPipeline's enum to pointers to our portable, single-pixel stages.
static void* portable_lookup(SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return (void*)sk_##st;
        STAGES(M)
        #undef M
    }
}
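// Here M(srcover), for example, expands to
//     case SkRasterPipeline::srcover: return (void*)sk_srcover;
// Stages without a Jumper implementation hit the default case and return nullptr,
// which run_with_jumper() below treats as "can't run this pipeline with Jumper".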

// The non-portable options are pre-compiled static data arrays pulled in from SkJumper_generated.h.
#if defined(__aarch64__)
static void* aarch64_lookup(SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return (void*)aarch64_sk_##st;
        STAGES(M)
        #undef M
    }
}
#elif defined(__ARM_NEON__)
static void* armv7_lookup(SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return (void*)armv7_sk_##st;
        STAGES(M)
        #undef M
    }
}
#elif defined(__x86_64__) || defined(_M_X64)
static void* sse2_lookup(SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return (void*)sse2_sk_##st;
        STAGES(M)
        #undef M
    }
}
static void* sse41_lookup(SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return (void*)sse41_sk_##st;
        STAGES(M)
        #undef M
    }
}
static void* hsw_lookup(SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return (void*)hsw_sk_##st;
        STAGES(M)
        #undef M
    }
}
#endif
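// Each flavor above also comes with its own <flavor>_sk_just_return, used below to
// terminate the program.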

bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
    // We'll look for the best vector instruction set and stride we can use.
    size_t stride = 0;
    void* (*lookup)(SkRasterPipeline::StockStage) = nullptr;
    void* just_return = nullptr;
#if defined(__aarch64__)
    stride = 4;
    lookup = aarch64_lookup;
    just_return = (void*)aarch64_sk_just_return;
#elif defined(__ARM_NEON__)
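    // The armv7 stages are only used when NEON, NEON FMA, and FP16 conversions are all
    // available; otherwise stride stays 0 and we rely on the portable code below.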
    if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
        stride = 2;
        lookup = armv7_lookup;
        just_return = (void*)armv7_sk_just_return;
    }
#elif defined(__x86_64__) || defined(_M_X64)
    stride = 4;
    lookup = sse2_lookup;
    just_return = (void*)sse2_sk_just_return;
    if (SkCpu::Supports(SkCpu::SSE41)) {
        stride = 4;
        lookup = sse41_lookup;
        just_return = (void*)sse41_sk_just_return;
    }
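    // Haswell's 256-bit AVX2 registers hold 8 floats, so the hsw stages cover
    // 8 pixels per call instead of 4.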
    if (SkCpu::Supports(SkCpu::HSW)) {
        stride = 8;
        lookup = hsw_lookup;
        just_return = (void*)hsw_sk_just_return;
    }
#endif
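    // If no vector flavor qualified above, stride is still 0 and everything is handled
    // by the portable stride=1 loop at the bottom.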
    SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
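    // Both loops below fill program with interleaved (stage function, context) pointer
    // pairs and terminate it with a just_return pointer: 2*fStages.size() + 1 slots in all.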
    // If possible, build and run a program at full vector stride.
    const size_t limit = x+n;
    if (stride) {
        void** ip = program.get();
        for (auto&& st : fStages) {
            auto fn = lookup(st.stage);
            if (!fn) {
                return false;
            }
            *ip++ = fn;
            *ip++ = st.ctx;
        }
        *ip = (void*)just_return;
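        // Pull off the first stage's function pointer; it's called with ip pointing at its own
        // context slot, and each stage is expected to chain on to the next entry in the program.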
        ip = program.get();
        auto start = (JumperStage*)*ip++;
        while (x + stride <= limit) {
            start(x, ip, &kConstants);
            x += stride;
        }
    }

    // If there's any leftover, build and run stride=1 portable code.
    if (x < limit) {
        stride = 1;
        void** ip = program.get();
        for (auto&& st : fStages) {
            auto fn = portable_lookup(st.stage);
            if (!fn) {
                return false;
            }
            *ip++ = fn;
            *ip++ = st.ctx;
        }
        *ip = (void*)sk_just_return;
        ip = program.get();
        auto start = (JumperStage*)*ip++;
        while (x + stride <= limit) {
            start(x, ip, &kConstants);
            x += stride;
        }
    }
    return true;
}