SkRasterPipeline preliminaries
Re-uploading to see if I can get a CL number < 2^31.
patch from issue 2147533002 at patchset 240001 (http://crrev.com/2147533002#ps240001)
Already reviewed at the other crrev link.
TBR=
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2147533002
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/2144573004
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
new file mode 100644
index 0000000..b3b8798
--- /dev/null
+++ b/bench/SkRasterPipelineBench.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Benchmark.h"
+#include "SkRasterPipeline.h"
+#include "SkSRGB.h"
+
+static const int N = 1023;  // Not a multiple of 4, so run(N) exercises both body and tail stages.
+
+static uint32_t dst[N],  // Pixel buffers, one 32-bit value per pixel; dst is both read and written.
+ src[N];
+static uint8_t mask[N];  // One 8-bit scale factor per pixel, consumed by scale_u8.
+
+// We'll build up a somewhat realistic, useful pipeline:
+// - load srgb src
+// - scale src by 8-bit mask
+// - load srgb dst
+// - src = srcover(dst, src)
+// - store src back as srgb
+// Every stage except for srcover interacts with memory, and so will need _tail variants.
+
+static void SK_VECTORCALL load_s_srgb(SkRasterPipeline::Stage* st, size_t x,  // Body stage: 4 pixels.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<const uint32_t*>() + x;  // ctx is the buffer base; ptr[0..3] are pixels x..x+3.
+
+ r = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],  // Bits 0-7 of each pixel index the table -> r.
+ sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] };
+
+ g = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff],  // Bits 8-15 -> g.
+ sk_linear_from_srgb[(ptr[1] >> 8) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> 8) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] };
+
+ b = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],  // Bits 16-23 -> b.
+ sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
+
+ a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);  // Bits 24-31, no table: just scaled by 1/255.
+
+ st->next(x, r,g,b,a, dr,dg,db,da);  // Incoming r,g,b,a are discarded; dr..da pass through unchanged.
+}
+
+static void SK_VECTORCALL load_s_srgb_tail(SkRasterPipeline::Stage* st, size_t x,  // Tail stage: 1 pixel.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<const uint32_t*>() + x;  // Only *ptr is read, so this is safe at the end of the buffer.
+
+ r = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 };  // Lane 0 holds the pixel; lanes 1-3 are zeroed.
+ g = Sk4f{ sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 };
+ b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
+ a = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 };  // No mask needed: shifting a uint32_t by 24 leaves only the top byte.
+
+ st->next(x, r,g,b,a, dr,dg,db,da);  // dr..da pass through unchanged.
+}
+
+static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,  // Body stage: 4 pixels into dr..da.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<const uint32_t*>() + x;  // ctx is the buffer base; ptr[0..3] are pixels x..x+3.
+
+ dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],  // Bits 0-7 of each pixel index the table -> dr.
+ sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] };
+
+ dg = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff],  // Bits 8-15 -> dg.
+ sk_linear_from_srgb[(ptr[1] >> 8) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> 8) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] };
+
+ db = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],  // Bits 16-23 -> db.
+ sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
+
+ da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);  // Bits 24-31, no table: just scaled by 1/255.
+
+ st->next(x, r,g,b,a, dr,dg,db,da);  // r,g,b,a pass through unchanged; only dr..da were replaced.
+}
+
+static void SK_VECTORCALL load_d_srgb_tail(SkRasterPipeline::Stage* st, size_t x,  // Tail stage: 1 pixel into dr..da.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<const uint32_t*>() + x;  // Only *ptr is read.
+
+ dr = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 };  // Lane 0 holds the pixel; lanes 1-3 are zeroed.
+ dg = Sk4f{ sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 };
+ db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
+ da = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 };  // Shifting a uint32_t by 24 leaves only the top byte.
+
+ st->next(x, r,g,b,a, dr,dg,db,da);  // r,g,b,a pass through unchanged.
+}
+
+static void SK_VECTORCALL scale_u8(SkRasterPipeline::Stage* st, size_t x,  // Body stage: scales 4 pixels.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<const uint8_t*>() + x;  // ctx is the 8-bit buffer base; 4 bytes are read from x.
+
+ auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);  // One scale factor per lane, byte value times 1/255.
+ r *= c;  // All four channels, alpha included, get the same per-pixel factor.
+ g *= c;
+ b *= c;
+ a *= c;
+
+ st->next(x, r,g,b,a, dr,dg,db,da);  // dr..da pass through unchanged.
+}
+
+static void SK_VECTORCALL scale_u8_tail(SkRasterPipeline::Stage* st, size_t x,  // Tail stage: scales 1 pixel.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<const uint8_t*>() + x;  // Only *ptr is read.
+
+ auto c = *ptr * (1/255.0f);  // A single scalar float, unlike the per-lane vector in scale_u8.
+ r *= c;  // The same scalar multiplies every channel.
+ g *= c;
+ b *= c;
+ a *= c;
+
+ st->next(x, r,g,b,a, dr,dg,db,da);  // dr..da pass through unchanged.
+}
+
+static void SK_VECTORCALL srcover(SkRasterPipeline::Stage* st, size_t x,  // Registers only, so no _tail variant.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto A = 1.0f - a;  // Computed before a is updated below, so all four channels use the original alpha.
+ r += dr * A;  // Each channel becomes src + dst * (1 - src alpha).
+ g += dg * A;
+ b += db * A;
+ a += da * A;
+
+ st->next(x, r,g,b,a, dr,dg,db,da);  // The blended result travels on in r,g,b,a; dr..da are unchanged.
+}
+
+static Sk4f clamp(const Sk4f& x) {  // Limits every lane of x to the range [0, 255].
+ return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);  // Raise to at least 0 first, then cap at 255.
+}
+
+static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,  // Body stage: writes 4 pixels.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<uint32_t*>() + x;  // ctx is the output buffer base.
+
+ r = clamp(sk_linear_to_srgb(r));  // Color channels go through sk_linear_to_srgb, then clamp to [0, 255].
+ g = clamp(sk_linear_to_srgb(g));
+ b = clamp(sk_linear_to_srgb(b));
+ a = clamp( 255.0f * a );  // Alpha skips sk_linear_to_srgb and is only scaled by 255.
+
+ ( SkNx_cast<int>(r)  // Pack per pixel: r in bits 0-7, g in 8-15, b in 16-23, a in 24-31.
+ | SkNx_cast<int>(g) << 8
+ | SkNx_cast<int>(b) << 16
+ | SkNx_cast<int>(a) << 24 ).store(ptr);
+}  // No st->next() call: returning here ends the pipeline for these pixels.
+
+static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x,  // Tail stage: writes 1 pixel.
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<uint32_t*>() + x;  // ctx is the output buffer base.
+
+ auto rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});  // Gather lane 0 of each color channel into one vector.
+ rgba = {rgba[0], rgba[1], rgba[2], 255.0f*a[0]};  // Alpha replaces the placeholder lane and skips sk_linear_to_srgb.
+ rgba = clamp(rgba);
+
+ SkNx_cast<uint8_t>(rgba).store(ptr);  // NOTE(review): bytes land in r,g,b,a memory order; equals store_srgb's packing only on little-endian - confirm.
+}  // No st->next() call: returning here ends the pipeline for this pixel.
+
+class SkRasterPipelineBench : public Benchmark {  // Times one 5-stage pipeline over N pixels.
+public:
+ SkRasterPipelineBench() {}
+
+ bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }  // The SkCanvas passed to onDraw is unused.
+ const char* onGetName() override { return "SkRasterPipelineBench"; }
+
+ void onDraw(int loops, SkCanvas*) override {
+ SkRasterPipeline p;  // Built once per onDraw call, then run loops times.
+ p.append(load_s_srgb, load_s_srgb_tail, src);  // Stage order matches the list in the comment near the top of this file.
+ p.append( scale_u8, scale_u8_tail, mask);
+ p.append(load_d_srgb, load_d_srgb_tail, dst);
+ p.append(srcover);  // Single-function overload: the same function serves as body and tail.
+ p.append( store_srgb, store_srgb_tail, dst);  // Writes back into dst, the buffer load_d_srgb reads.
+
+ while (loops --> 0) {  // Parsed as (loops--) > 0: runs exactly loops times for positive loops.
+ p.run(N);
+ }
+ }
+};
+
+DEF_BENCH( return new SkRasterPipelineBench; )  // Hands a new instance of this bench to the DEF_BENCH macro.
diff --git a/gyp/core.gypi b/gyp/core.gypi
index 97e38c2..f494463 100644
--- a/gyp/core.gypi
+++ b/gyp/core.gypi
@@ -231,6 +231,7 @@
'<(skia_src_path)/core/SkQuadClipper.cpp',
'<(skia_src_path)/core/SkQuadClipper.h',
'<(skia_src_path)/core/SkRasterClip.cpp',
+ '<(skia_src_path)/core/SkRasterPipeline.cpp',
'<(skia_src_path)/core/SkRasterizer.cpp',
'<(skia_src_path)/core/SkReadBuffer.h',
'<(skia_src_path)/core/SkReadBuffer.cpp',
diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp
new file mode 100644
index 0000000..6a8f109
--- /dev/null
+++ b/src/core/SkRasterPipeline.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkRasterPipeline.h"
+
+SkRasterPipeline::SkRasterPipeline() {}  // Nothing to set up here; fReadyToRun is initialized to false in the class declaration.
+
+void SkRasterPipeline::append(SkRasterPipeline::Fn body, const void* body_ctx,  // Adds one stage to both the body and tail pipelines.
+ SkRasterPipeline::Fn tail, const void* tail_ctx) {
+ // We can't add more stages after being rewired to run().
+ SkASSERT(!fReadyToRun);
+
+ // For now, just stash the stage's function in its own fNext slot.
+ // We'll rewire our stages before running the pipeline so fNext makes sense.
+ fBody.push_back({ body, const_cast<void*>(body_ctx) });  // const is cast away because Stage::fCtx is a plain void*.
+ fTail.push_back({ tail, const_cast<void*>(tail_ctx) });  // fBody and fTail always grow together, so they stay the same length.
+}
+
+void SkRasterPipeline::run(size_t n) {  // Runs the appended stages over x in [0, n).
+ if (fBody.empty() || fTail.empty()) {  // No stages appended: nothing to do.
+ return;
+ }
+
+ if (!fReadyToRun) {  // Rewire only on the first run() that has stages; later calls reuse the result.
+ auto rewire = [](Stages* stages) {
+ SkASSERT(!stages->empty());
+
+ // Rotate the fNext pointers so they point to the next function to
+ // call, not the function we're currently calling as set by append().
+ auto start = stages->front().fNext;
+ for (int i = 0; i < stages->count() - 1; i++) {
+ (*stages)[i].fNext = (*stages)[i+1].fNext;
+ }
+ stages->back().fNext = start; // The last stage has no successor, so its slot stashes the entry function.
+ };
+ rewire(&fBody);
+ rewire(&fTail);
+ fReadyToRun = true;  // From here on append() will assert.
+ }
+
+ // It's fastest to start uninitialized if the compilers all let us. If not, next fastest is 0.
+ Sk4f v;  // Deliberately left uninitialized (see above); passed as all eight initial vector arguments.
+
+ auto start_body = fBody.back().fNext, // See rewire().
+ start_tail = fTail.back().fNext;
+
+ auto body = fBody.begin(),  // First Stage of each array; it is handed to the entry function as its Stage*.
+ tail = fTail.begin();
+
+ size_t x = 0;
+ while (n >= 4) {  // Body pipeline: 4 pixels per call for as long as 4 remain.
+ start_body(body, x, v,v,v,v, v,v,v,v);
+ x += 4;
+ n -= 4;
+ }
+ while (n > 0) {  // Tail pipeline: the remaining 0-3 pixels, one per call.
+ start_tail(tail, x, v,v,v,v, v,v,v,v);
+ x += 1;
+ n -= 1;
+ }
+}
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
new file mode 100644
index 0000000..8ae7bb1
--- /dev/null
+++ b/src/core/SkRasterPipeline.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkRasterPipeline_DEFINED
+#define SkRasterPipeline_DEFINED
+
+#include "SkNx.h"
+#include "SkTArray.h"
+#include "SkTypes.h"
+
+/**
+ * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
+ *
+ * It's particularly designed for situations where the potential pipeline is extremely
+ * combinatoric: {N dst formats} x {M source formats} x {K mask formats} x {C transfer modes} ...
+ * No one wants to write specialized routines for all those combinations, and if we did, we'd
+ * end up bloating our code size dramatically. SkRasterPipeline stages can be chained together
+ * at runtime, so we can scale this problem linearly rather than combinatorically.
+ *
+ * Each stage is represented by a function conforming to a common interface, SkRasterPipeline::Fn,
+ * and by an arbitrary context pointer. Fn's arguments, and sometimes custom calling convention,
+ * are designed to maximize the amount of data we can pass along the pipeline cheaply.
+ * On many machines all arguments stay in registers the entire time.
+ *
+ * The meanings of the arguments to Fn are sometimes fixed...
+ * - The Stage* always represents the current stage, mainly providing access to ctx().
+ * - The size_t is always the destination x coordinate. If you need y, put it in your context.
+ * - By the time the shader's done, the first four vectors should hold source red,
+ * green, blue, and alpha, up to 4 pixels' worth each.
+ *
+ * ...and sometimes flexible:
+ * - In the shader, the first four vectors can be used for anything, e.g. sample coordinates.
+ * - The last four vectors are scratch registers that can be used to communicate between
+ * stages; transfer modes use these to hold the original destination pixel components.
+ *
+ * On some platforms the last four vectors are slower to work with than the other arguments.
+ *
+ * When done mutating its arguments and/or context, a stage can either:
+ * 1) call st->next() with its mutated arguments, chaining to the next stage of the pipeline; or
+ * 2) return, indicating the pipeline is complete for these pixels.
+ *
+ * Some obvious stages that typically return are those that write a color to a destination pointer,
+ * but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
+ */
+
+class SkRasterPipeline {
+public:
+ struct Stage;
+ using Fn = void(SK_VECTORCALL *)(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,  // Current stage, x, then eight vector arguments.
+ Sk4f,Sk4f,Sk4f,Sk4f);
+ struct Stage {
+ template <typename T>
+ T ctx() { return static_cast<T>(fCtx); }  // Returns the context pointer passed to append(), cast to T.
+
+ void SK_VECTORCALL next(size_t x, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+ Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+ // Stages are logically a pipeline, and physically are contiguous in an array.
+ // To get to the next stage, we just increment our pointer to the next array element.
+ fNext(this+1, x, v0,v1,v2,v3, v4,v5,v6,v7);  // Not valid from the final stage: this+1 would point past the array.
+ }
+
+ // It makes next() a good bit cheaper if we hold the next function to call here,
+ // rather than the logically simpler choice of the function implementing this stage.
+ Fn fNext;  // Holds this stage's own function until run() rewires the pipeline.
+ void* fCtx;
+ };
+
+
+ SkRasterPipeline();
+
+ // Run the pipeline constructed with append(), walking x through [0,n),
+ // generally in 4 pixel steps, but sometimes 1 pixel at a time.
+ void run(size_t n);
+
+ // Use this append() if your stage is sensitive to the number of pixels you're working with:
+ // - body will always be called for a full 4 pixels
+ // - tail will always be called for a single pixel
+ // Typically this is only an essential distinction for stages that read or write memory.
+ void append(Fn body, const void* body_ctx,
+ Fn tail, const void* tail_ctx);
+
+ // Most stages don't actually care if they're working on 4 or 1 pixel.
+ void append(Fn fn, const void* ctx = nullptr) {
+ this->append(fn, ctx, fn, ctx);  // Same function and context for both body and tail.
+ }
+
+ // Most 4 pixel or 1 pixel variants share the same context pointer.
+ void append(Fn body, Fn tail, const void* ctx = nullptr) {
+ this->append(body, ctx, tail, ctx);
+ }
+
+private:
+ using Stages = SkSTArray<10, Stage, /*MEM_COPY=*/true>;  // Presumably 10 stages of inline storage - confirm in SkTArray.h.
+
+ Stages fBody,  // Parallel arrays: entry i of each describes the same logical stage.
+ fTail;
+ bool fReadyToRun = false;  // Set by run() once fNext pointers are rewired; append() asserts it is still false.
+};
+
+#endif//SkRasterPipeline_DEFINED
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 6d4cade..91cd104 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -386,6 +386,7 @@
SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
+ SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 8952ff7..78cea3b 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -150,6 +150,7 @@
_mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
}
+ SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
new file mode 100644
index 0000000..1db0206
--- /dev/null
+++ b/tests/SkRasterPipelineTest.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Test.h"
+#include "SkRasterPipeline.h"
+
+// load needs two variants, one to load 4 values...
+static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x,
+ Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+ Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+ auto ptr = st->ctx<const float*>();  // ctx is the input float array; x indexes into it.
+ v0 = Sk4f{ptr[x+0]};  // One input value per vector; store() reads back only lane 0 of each.
+ v1 = Sk4f{ptr[x+1]};
+ v2 = Sk4f{ptr[x+2]};
+ v3 = Sk4f{ptr[x+3]};
+
+ st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);  // v4..v7 pass through unchanged.
+}
+
+// ...and one to load a single value.
+static void SK_VECTORCALL load_tail(SkRasterPipeline::Stage* st, size_t x,
+ Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+ Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+ auto ptr = st->ctx<const float*>();  // ctx is the input float array.
+ v0 = Sk4f{ptr[x]};  // Only v0 is set; v1..v3 keep whatever values they arrived with.
+
+ st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
+}
+
+// square doesn't really care how many of its inputs are active, nor does it need a context.
+static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x,
+ Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+ Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+ v0 *= v0;  // Squares all four vectors even in the tail pipeline, where only v0 was loaded.
+ v1 *= v1;
+ v2 *= v2;
+ v3 *= v3;
+ st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);  // st is used only to chain; ctx() is never called.
+}
+
+// Like load, store has a _tail variant. It ends the pipeline by returning.
+static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x,
+ Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+ Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+ auto ptr = st->ctx<float*>();  // ctx is the output float array.
+ ptr[x+0] = v0[0];  // Lane 0 of each vector carries the value for one output slot.
+ ptr[x+1] = v1[0];
+ ptr[x+2] = v2[0];
+ ptr[x+3] = v3[0];
+}  // No st->next(): this is the last stage.
+
+static void SK_VECTORCALL store_tail(SkRasterPipeline::Stage* st, size_t x,  // Single-value variant of store().
+ Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+ Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+ auto ptr = st->ctx<float*>();  // ctx is the output float array.
+ ptr[x+0] = v0[0];  // Writes only the one value that load_tail() loaded.
+}  // No st->next(): this is the last stage.
+
+DEF_TEST(SkRasterPipeline, r) {
+ // We'll build up and run a simple pipeline that exercises the salient
+ // mechanics of SkRasterPipeline:
+ // - context pointers
+ // - stages sensitive to the number of pixels
+ // - stages insensitive to the number of pixels
+ //
+ // This pipeline loads up some values, squares them, then writes them back to memory.
+
+ const float src_vals[] = { 1,2,3,4,5 };  // Five values: one 4-wide body step plus one tail step.
+ float dst_vals[] = { 0,0,0,0,0 };  // Zeroed so any slot left unwritten fails its assert below.
+
+ SkRasterPipeline p;
+ p.append(load, load_tail, src_vals);
+ p.append(square);  // Same function for body and tail, no context.
+ p.append(store, store_tail, dst_vals);
+
+ p.run(5);
+
+ REPORTER_ASSERT(r, dst_vals[0] == 1);  // Indices 0-3 come from the body pipeline.
+ REPORTER_ASSERT(r, dst_vals[1] == 4);
+ REPORTER_ASSERT(r, dst_vals[2] == 9);
+ REPORTER_ASSERT(r, dst_vals[3] == 16);
+ REPORTER_ASSERT(r, dst_vals[4] == 25);  // Index 4 comes from the tail pipeline.
+}