SkRasterPipeline refactor
- Give body and tail functions separate types (see the sketch below). Body functions no longer carry the unused tail count, which frees a register; that matters most on Windows, where fewer argument registers are available.
- Fill out default, SSE4.1, and HSW versions of all functions. This means we no longer have to mess around with SkNf_abi: every function comes from the same compilation unit, where SkNf is a single consistent type.
- Move Stage::next() into SkRasterPipeline_opts.h as a static inline function.
- Remove Stage::ctx() entirely; reading fCtx directly is the same thing.
This is a step toward building the entire pipeline in src/opts, removing the need for every stage to be a function living in SkOpts.
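A minimal standalone sketch of the new shape, showing how the two function-pointer types and the free next() helper fit together. This is a simplification, not the patch itself: plain float stands in for SkNf, SK_VECTORCALL is omitted, and example_body/example_tail/just_return_* are hypothetical stand-ins for real stages.

    #include <cstddef>
    #include <utility>

    using V = float;   // the real code uses SkNf, a 4- or 8-wide SkNx<N,float>

    struct Stage {
        void (*fNext)();   // type-erased pointer to the next stage's function
        void*  fCtx;       // per-stage context, read directly (no ctx() accessor)
    };

    // Body stages drop the tail count, freeing one argument register...
    using Body = void (*)(Stage*, size_t,         V,V,V,V, V,V,V,V);
    // ...while tail stages keep it to know how many lanes are still active.
    using Tail = void (*)(Stage*, size_t, size_t, V,V,V,V, V,V,V,V);

    // next() as a free inline helper instead of a Stage method: stages are
    // contiguous in an array, so the next stage is just the next element.
    template <typename Fn, typename... Args>
    static inline void next(Stage* st, Args&&... args) {
        ((Fn)st->fNext)(st + 1, std::forward<Args>(args)...);
    }

    // Terminal stages: do nothing and never dereference their Stage pointer.
    static void just_return_body(Stage*, size_t,         V,V,V,V, V,V,V,V) {}
    static void just_return_tail(Stage*, size_t, size_t, V,V,V,V, V,V,V,V) {}

    // A pass-through body stage forwards without a tail count; its tail
    // twin forwards with one.
    static void example_body(Stage* st, size_t x,
                             V r, V g, V b, V a, V dr, V dg, V db, V da) {
        next<Body>(st, x,       r,g,b,a, dr,dg,db,da);
    }
    static void example_tail(Stage* st, size_t x, size_t tail,
                             V r, V g, V b, V a, V dr, V dg, V db, V da) {
        next<Tail>(st, x, tail, r,g,b,a, dr,dg,db,da);
    }

    int main() {
        V v = 0;
        // One-stage pipelines whose only job is to chain into just_return.
        Stage body[] = { { (void (*)())just_return_body, nullptr } };
        Stage tail[] = { { (void (*)())just_return_tail, nullptr } };
        example_body(body, 0,    v,v,v,v, v,v,v,v);
        example_tail(tail, 0, 2, v,v,v,v, v,v,v,v);
        return 0;
    }

In the actual patch the same split falls out of the STAGE, RGBA_XFERMODE, and RGB_XFERMODE macros, which now emit both a body function and a _tail variant for every stage.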
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3680
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug-ASAN-Trybot
Change-Id: I7de78ffebc15b9bad4eda187c9f50369cd7e5e42
Reviewed-on: https://skia-review.googlesource.com/3680
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 627f0e3..1638948 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -144,8 +144,8 @@
static_assert(SK_ARRAY_COUNT(body) == SkRasterPipeline::kNumStockStages, "");
SkOpts::VoidFn tail[] = {
- (SkOpts::VoidFn)SK_OPTS_NS::just_return,
- (SkOpts::VoidFn)SK_OPTS_NS::swap_src_dst,
+ (SkOpts::VoidFn)SK_OPTS_NS::just_return_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::swap_src_dst_tail,
(SkOpts::VoidFn)SK_OPTS_NS::store_565_tail,
(SkOpts::VoidFn)SK_OPTS_NS::store_srgb_tail,
@@ -163,34 +163,34 @@
(SkOpts::VoidFn)SK_OPTS_NS::lerp_u8_tail,
(SkOpts::VoidFn)SK_OPTS_NS::lerp_565_tail,
- (SkOpts::VoidFn)SK_OPTS_NS::lerp_constant_float,
+ (SkOpts::VoidFn)SK_OPTS_NS::lerp_constant_float_tail,
- (SkOpts::VoidFn)SK_OPTS_NS::constant_color,
+ (SkOpts::VoidFn)SK_OPTS_NS::constant_color_tail,
- (SkOpts::VoidFn)SK_OPTS_NS::dst,
- (SkOpts::VoidFn)SK_OPTS_NS::dstatop,
- (SkOpts::VoidFn)SK_OPTS_NS::dstin,
- (SkOpts::VoidFn)SK_OPTS_NS::dstout,
- (SkOpts::VoidFn)SK_OPTS_NS::dstover,
- (SkOpts::VoidFn)SK_OPTS_NS::srcatop,
- (SkOpts::VoidFn)SK_OPTS_NS::srcin,
- (SkOpts::VoidFn)SK_OPTS_NS::srcout,
- (SkOpts::VoidFn)SK_OPTS_NS::srcover,
- (SkOpts::VoidFn)SK_OPTS_NS::clear,
- (SkOpts::VoidFn)SK_OPTS_NS::modulate,
- (SkOpts::VoidFn)SK_OPTS_NS::multiply,
- (SkOpts::VoidFn)SK_OPTS_NS::plus_,
- (SkOpts::VoidFn)SK_OPTS_NS::screen,
- (SkOpts::VoidFn)SK_OPTS_NS::xor_,
- (SkOpts::VoidFn)SK_OPTS_NS::colorburn,
- (SkOpts::VoidFn)SK_OPTS_NS::colordodge,
- (SkOpts::VoidFn)SK_OPTS_NS::darken,
- (SkOpts::VoidFn)SK_OPTS_NS::difference,
- (SkOpts::VoidFn)SK_OPTS_NS::exclusion,
- (SkOpts::VoidFn)SK_OPTS_NS::hardlight,
- (SkOpts::VoidFn)SK_OPTS_NS::lighten,
- (SkOpts::VoidFn)SK_OPTS_NS::overlay,
- (SkOpts::VoidFn)SK_OPTS_NS::softlight,
+ (SkOpts::VoidFn)SK_OPTS_NS::dst_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::dstatop_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::dstin_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::dstout_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::dstover_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::srcatop_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::srcin_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::srcout_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::srcover_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::clear_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::modulate_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::multiply_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::plus__tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::screen_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::xor__tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::colorburn_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::colordodge_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::darken_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::difference_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::exclusion_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::hardlight_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::lighten_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::overlay_tail,
+ (SkOpts::VoidFn)SK_OPTS_NS::softlight_tail,
};
static_assert(SK_ARRAY_COUNT(tail) == SkRasterPipeline::kNumStockStages, "");
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index c6b85ad..525a8db 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -55,33 +55,13 @@
class SkRasterPipeline {
public:
- struct Stage;
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
- using V = SkNx_abi<8,float>;
-#else
- using V = SkNx_abi<4,float>;
-#endif
- using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, V,V,V,V,
- V,V,V,V);
-
struct Stage {
- template <typename T>
- T ctx() { return static_cast<T>(fCtx); }
-
- void SK_VECTORCALL next(size_t x, size_t tail, V v0, V v1, V v2, V v3,
- V v4, V v5, V v6, V v7) {
- // Stages are logically a pipeline, and physically are contiguous in an array.
- // To get to the next stage, we just increment our pointer to the next array element.
- ((Fn)fNext)(this+1, x,tail, v0,v1,v2,v3, v4,v5,v6,v7);
- }
-
// It makes next() a good bit cheaper if we hold the next function to call here,
// rather than logically simpler choice of the function implementing this stage.
void (*fNext)();
void* fCtx;
};
-
SkRasterPipeline();
// Run the pipeline constructed with append(), walking x through [x,x+n),
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp
index ecbb3e7..3c5d19d 100644
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@@ -35,11 +35,6 @@
STAGE(lerp_u8);
STAGE(lerp_565);
- #undef STAGE
-
- #define STAGE(stage) \
- body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
- tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage
STAGE(just_return);
STAGE(swap_src_dst);
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index d55978c..4e277aa 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -21,7 +21,7 @@
srcover_srgb_srgb = sse41::srcover_srgb_srgb;
blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;
- #define STAGE(stage) \
+ #define STAGE(stage) \
body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage##_tail
@@ -41,40 +41,34 @@
STAGE(lerp_u8);
STAGE(lerp_565);
- #undef STAGE
- #define STAGE(stage) \
- body[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage; \
- tail[SkRasterPipeline::stage] = (SkOpts::VoidFn)SK_OPTS_NS::stage
+ STAGE(just_return);
+ STAGE(swap_src_dst);
+ STAGE(lerp_constant_float);
+ STAGE(constant_color);
- // The commented-out stages don't actually benefit from SSE 4.1.
- // To cut down on code bloat we skip them here, using the identical SSE2 defaults.
-
- //STAGE(lerp_constant_float);
- //STAGE(constant_color);
-
- //STAGE(dst);
- //STAGE(dstatop);
- //STAGE(dstin);
- //STAGE(dstout);
- //STAGE(dstover);
- //STAGE(srcatop);
- //STAGE(srcin);
- //STAGE(srcout);
- //STAGE(srcover);
- //STAGE(clear);
- //STAGE(modulate);
- //STAGE(multiply);
- //STAGE(plus_);
- //STAGE(screen);
- //STAGE(xor_);
+ STAGE(dst);
+ STAGE(dstatop);
+ STAGE(dstin);
+ STAGE(dstout);
+ STAGE(dstover);
+ STAGE(srcatop);
+ STAGE(srcin);
+ STAGE(srcout);
+ STAGE(srcover);
+ STAGE(clear);
+ STAGE(modulate);
+ STAGE(multiply);
+ STAGE(plus_);
+ STAGE(screen);
+ STAGE(xor_);
STAGE(colorburn);
STAGE(colordodge);
- //STAGE(darken);
- //STAGE(difference);
- //STAGE(exclusion);
+ STAGE(darken);
+ STAGE(difference);
+ STAGE(exclusion);
STAGE(hardlight);
- //STAGE(lighten);
+ STAGE(lighten);
STAGE(overlay);
STAGE(softlight);
#undef STAGE
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 3266554..7e884c7 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -12,36 +12,53 @@
#include "SkPM4f.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
+#include <utility>
-using SkNf_abi = SkRasterPipeline::V;
-static constexpr auto N = sizeof(SkNf_abi) / sizeof(float);
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+ static constexpr int N = 8;
+#else
+ static constexpr int N = 4;
+#endif
+
using SkNf = SkNx<N, float>;
using SkNi = SkNx<N, int>;
using SkNh = SkNx<N, uint16_t>;
+using Body = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t,
+ SkNf,SkNf,SkNf,SkNf,
+ SkNf,SkNf,SkNf,SkNf);
+using Tail = void(SK_VECTORCALL *)(SkRasterPipeline::Stage*, size_t, size_t,
+ SkNf,SkNf,SkNf,SkNf,
+ SkNf,SkNf,SkNf,SkNf);
+
#define SI static inline
+template <typename Fn, typename... Args>
+SI void next(SkRasterPipeline::Stage* st, Args&&... args) {
+ // Stages are logically a pipeline, and physically are contiguous in an array.
+ // To get to the next stage, we just increment our pointer to the next array element.
+ ((Fn)st->fNext)(st+1, std::forward<Args>(args)...);
+}
+
#define STAGE(name, kCallNext) \
template <bool kIsTail> \
static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \
SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \
- SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
- SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
- SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
- SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
- name##_kernel<false>(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da); \
+ SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
+ name##_kernel<false>(st->fCtx, x,0, r,g,b,a, dr,dg,db,da); \
if (kCallNext) { \
- st->next(x,tail, r,g,b,a, dr,dg,db,da); \
+ next<Body>(st, x, r,g,b,a, dr,dg,db,da); \
} \
} \
SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
- SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
- SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
- SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
- name##_kernel<true>(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da); \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
+ name##_kernel<true>(st->fCtx, x,tail, r,g,b,a, dr,dg,db,da); \
if (kCallNext) { \
- st->next(x,tail, r,g,b,a, dr,dg,db,da); \
+ next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da); \
} \
} \
template <bool kIsTail> \
@@ -54,15 +71,23 @@
#define RGBA_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
- SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
- SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
- SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
- SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
+ SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = name##_kernel(a,a,da,da); \
- st->next(x,tail, r,g,b,a, dr,dg,db,da); \
+ next<Body>(st, x, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
+ r = name##_kernel(r,a,dr,da); \
+ g = name##_kernel(g,a,dg,da); \
+ b = name##_kernel(b,a,db,da); \
+ a = name##_kernel(a,a,da,da); \
+ next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
@@ -71,15 +96,23 @@
#define RGB_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
- SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
- SkNf_abi R, SkNf_abi G, SkNf_abi B, SkNf_abi A, \
- SkNf_abi DR, SkNf_abi DG, SkNf_abi DB, SkNf_abi DA) { \
- SkNf r=R,g=G,b=B,a=A, dr=DR,dg=DG,db=DB,da=DA; \
+ SI void SK_VECTORCALL name(SkRasterPipeline::Stage* st, size_t x, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = a + (da * (1.0f-a)); \
- st->next(x,tail, r,g,b,a, dr,dg,db,da); \
+ next<Body>(st, x, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void SK_VECTORCALL name##_tail(SkRasterPipeline::Stage* st, size_t x, size_t tail, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
+ r = name##_kernel(r,a,dr,da); \
+ g = name##_kernel(g,a,dg,da); \
+ b = name##_kernel(b,a,db,da); \
+ a = a + (da * (1.0f-a)); \
+ next<Tail>(st, x,tail, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
@@ -90,11 +123,11 @@
SI void run_pipeline(size_t x, size_t n,
void (*vBodyStart)(), SkRasterPipeline::Stage* body,
void (*vTailStart)(), SkRasterPipeline::Stage* tail) {
- auto bodyStart = (SkRasterPipeline::Fn)vBodyStart,
- tailStart = (SkRasterPipeline::Fn)vTailStart;
- SkNf v{0}; // TODO: uninitialized would be a bit faster, but some compilers are whiny.
+ auto bodyStart = (Body)vBodyStart;
+ auto tailStart = (Tail)vTailStart;
+ SkNf v; // Fastest to start uninitialized.
while (n >= N) {
- bodyStart(body, x,0, v,v,v,v, v,v,v,v);
+ bodyStart(body, x, v,v,v,v, v,v,v,v);
x += N;
n -= N;
}