/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include "SkSplicer_shared.h"
#include <immintrin.h>
#include <string.h>
#if !defined(__clang__) || !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error This file is not like the rest of Skia.
#error It must be compiled with clang and with -mavx2 -mfma -mf16c -fomit-frame-pointer.
#endif
// We have very specific inlining requirements. It helps to just take total control.
#define AI __attribute__((always_inline)) inline
// We'll be compiling this file to an object file, then extracting parts of it into
// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
#define C extern "C"
// Since we know we're using Clang, we can use its vector extensions.
// These are __m256 and __m256i, but friendlier and strongly-typed.
using F   = float    __attribute__((ext_vector_type(8)));
using I32 = int32_t  __attribute__((ext_vector_type(8)));
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U8  = uint8_t  __attribute__((ext_vector_type(8)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
AI static F   cast  (U32 v)         { return __builtin_convertvector((I32)v, F); }
AI static U32 round (F v)           { return _mm256_cvtps_epi32(v); }
AI static U32 expand(U8 v)          { return __builtin_convertvector(v, U32); }
AI static F   rcp   (F v)           { return _mm256_rcp_ps  (v); }
AI static F   rsqrt (F v)           { return _mm256_rsqrt_ps(v); }
AI static F   min   (F a, F b)      { return _mm256_min_ps(a,b); }
AI static F   max   (F a, F b)      { return _mm256_max_ps(a,b); }
AI static F   fma   (F f, F m, F a) { return _mm256_fmadd_ps(f,m,a); }
AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
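// Note: if_then_else() works because _mm256_blendv_ps selects on the top (sign)
// bit of each 32-bit lane, and comparisons on F produce all-0 or all-1 lanes.
// unpremul below uses exactly this pattern as a lane-wise guard:
//
//     auto scale = if_then_else(a == 0, 0, k->_1 / a);   // per-lane select, no branches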
// Stages all fit a common interface that allows SkSplicer to splice them together.
using K = const SkSplicer_constants;
using Stage = void(size_t x, size_t n, void* ctx, K* constants, F,F,F,F, F,F,F,F);
// Stage's arguments act as the working set of registers within the final spliced function.
// Here's a little primer on the ABI:
//   x:         rdi        x and n drive the loop, like for (; x < n; x += 8)
//   n:         rsi
//   ctx:       rdx        Look for movabsq_rdx in SkSplicer.cpp to see how this works.
//   constants: rcx        Look for movabsq_rcx in SkSplicer.cpp to see how this works.
//   vectors:   ymm0-ymm7
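//
// That is, the System V AMD64 ABI puts the four integer arguments in
// rdi/rsi/rdx/rcx and the eight F vectors in ymm0-ymm7, so the color values
// stay in ymm registers across every spliced Stage with no spills.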
// done() is the key to this entire splicing strategy.
//
// It matches the signature of Stage, so all the registers are kept live.
// Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(),
// which marks the point where we can splice one Stage onto the next.
//
// The lovely bit is that we don't have to define done(), just declare it.
C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
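// So each compiled Stage looks schematically like this (a sketch, not exact
// disassembly):
//
//     name:
//         ...fully inlined body: no call, no ret, no stack traffic...
//         jmp done
//
// SkSplicer copies a Stage's bytes up to that trailing jmp, then lays the next
// Stage's bytes down exactly where the jmp would have gone.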
// This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
// It's just a convenience to make a valid, spliceable Stage, nothing magic.
#define STAGE(name)                                                              \
    AI static void name##_k(size_t x, size_t n, void* ctx, K* k,                 \
                            F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
    C void name(size_t x, size_t n, void* ctx, K* k,                             \
                F r, F g, F b, F a, F dr, F dg, F db, F da) {                     \
        name##_k(x,n,ctx,k, r,g,b,a, dr,dg,db,da);                               \
        done    (x,n,ctx,k, r,g,b,a, dr,dg,db,da);                               \
    }                                                                            \
    AI static void name##_k(size_t x, size_t n, void* ctx, K* k,                 \
                            F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
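// For example, STAGE(clear) (defined below) expands to roughly:
//
//     AI static void clear_k(size_t x, size_t n, void* ctx, K* k,
//                            F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);
//     C void clear(size_t x, size_t n, void* ctx, K* k,
//                  F r, F g, F b, F a, F dr, F dg, F db, F da) {
//         clear_k(x,n,ctx,k, r,g,b,a, dr,dg,db,da);
//         done   (x,n,ctx,k, r,g,b,a, dr,dg,db,da);
//     }
//     AI static void clear_k(...) { r = g = b = a = 0; }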
// We can now define Stages!
// Some things to keep in mind while writing Stages:
// - do not branch; (i.e. avoid jmp)
// - do not call functions that don't inline; (i.e. avoid call, ret, stack use)
// - do not use constant literals other than 0 and 0.0f. (i.e. avoid rip relative addressing)
//
// Some things that should work fine:
// - 0 and 0.0f;
// - arithmetic;
// - functions of F and U32 that we've defined above;
// - temporary values;
// - lambdas;
// - memcpy() with a compile-time constant size argument.
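//
// Why these rules work: literals other than 0 and 0.0f would be materialized
// from a constant pool via RIP-relative loads, which break once the code bytes
// are copied somewhere else, so every real constant arrives through K* instead.
// And Clang lowers memcpy() with a compile-time constant size to plain loads
// and stores, so it costs no call.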
STAGE(clear) {
    r = g = b = a = 0;
}
STAGE(plus) {
    r = r + dr;
    g = g + dg;
    b = b + db;
    a = a + da;
}
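// Porter-Duff src-over: out = src + dst*(1 - src_alpha), one fma per channel.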
STAGE(srcover) {
    auto A = k->_1 - a;
    r = fma(dr, A, r);
    g = fma(dg, A, g);
    b = fma(db, A, b);
    a = fma(da, A, a);
}
STAGE(dstover) { srcover_k(x,n,ctx,k, dr,dg,db,da, r,g,b,a); }
STAGE(clamp_0) {
    r = max(r, 0);
    g = max(g, 0);
    b = max(b, 0);
    a = max(a, 0);
}
STAGE(clamp_1) {
    r = min(r, k->_1);
    g = min(g, k->_1);
    b = min(b, k->_1);
    a = min(a, k->_1);
}
STAGE(clamp_a) {
    a = min(a, k->_1);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}
STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}
STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}
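// Guarding a == 0 keeps the reciprocal from putting +inf or NaN into any lane.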
STAGE(unpremul) {
    auto scale = if_then_else(a == 0, 0, k->_1 / a);
    r = r * scale;
    g = g * scale;
    b = b * scale;
}
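// An approximation of the sRGB transfer function: the linear piece s * _1_1292
// (i.e. s/12.92) below the crossover, and a degree-2 polynomial in s standing
// in for the ((s+0.055)/1.055)^2.4 power segment above it.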
STAGE(from_srgb) {
    auto fn = [&](F s) {
        auto lo = s * k->_1_1292;
        auto hi = fma(s*s, fma(s, k->_03000, k->_06975), k->_00025);
        return if_then_else(s < k->_0055, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
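// The inverse: l^(1/2.4) is approximated from sqrt(l) = rcp(rsqrt(l)) and the
// fourth root ftrt = rsqrt(rsqrt(l)), so no real sqrt(), pow(), or branching.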
STAGE(to_srgb) {
    auto fn = [&](F l) {
        F sqrt = rcp  (rsqrt(l)),
          ftrt = rsqrt(rsqrt(l));
        auto lo = l * k->_1246;
        auto hi = min(k->_1, fma(k->_0411192, ftrt,
                             fma(k->_0689206, sqrt,
                                              k->n_00988)));
        return if_then_else(l < k->_00043, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
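// scale_u8 reads 8 single-byte coverage values, widens them to float, and
// scales all four channels by coverage * 1/255.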
STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;
    U8 scales;
    memcpy(&scales, ptr, sizeof(scales));
    auto c = cast(expand(scales)) * k->_1_255;
    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
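// 8888 pixels are four packed bytes per 32-bit word; shifting and masking peel
// out each channel, and * 1/255 maps [0,255] onto [0,1].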
STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;
    U32 px;
    memcpy(&px, ptr, sizeof(px));
    r = cast((px      ) & k->_0x000000ff) * k->_1_255;
    g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
    b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
    a = cast((px >> 24)                 ) * k->_1_255;
}
STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;
    U32 px = round(r * k->_255)
           | round(g * k->_255) <<  8
           | round(b * k->_255) << 16
           | round(a * k->_255) << 24;
    memcpy(ptr, &px, sizeof(px));
}
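// f16 pixels are four interleaved halfs each, so 8 pixels fill four xmm loads.
// The unpacks below are a transpose, gathering the 8 halfs of each channel
// into one xmm register that vcvtph2ps widens to an F.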
STAGE(load_f16) {
    auto ptr = *(const uint64_t**)ctx + x;
    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
         _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
         _46 = _mm_unpacklo_epi16(_45, _67),
         _57 = _mm_unpackhi_epi16(_45, _67);
    auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
         ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
         rg4567 = _mm_unpacklo_epi16(_46, _57),
         ba4567 = _mm_unpackhi_epi16(_46, _57);
    r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567));
    g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
    b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
    a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
}
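// store_f16 is the same transpose run in reverse: narrow each channel to 8
// halfs with vcvtps2ph, re-interleave with unpacks, and write four xmm stores.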
STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;
    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
         ba0123 = _mm_unpacklo_epi16(B, A),
         ba4567 = _mm_unpackhi_epi16(B, A);
    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
}