SkRasterPipeline preliminaries

    Re-uploading to see if I can get a CL number < 2^31.
    patch from issue 2147533002 at patchset 240001 (http://crrev.com/2147533002#ps240001)

Already reviewed at the other crrev link.
TBR=

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2147533002
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2144573004
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
new file mode 100644
index 0000000..b3b8798
--- /dev/null
+++ b/bench/SkRasterPipelineBench.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Benchmark.h"
+#include "SkRasterPipeline.h"
+#include "SkSRGB.h"
+
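+// N is not a multiple of 4, so run() will exercise the _tail stages as well as the 4-pixel bodies.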
+static const int N = 1023;
+
+static uint32_t dst[N],
+                src[N];
+static uint8_t mask[N];
+
+// We'll build up a somewhat realistic, useful pipeline:
+//   - load srgb src
+//   - scale src by 8-bit mask
+//   - load srgb dst
+//   - src = srcover(dst, src)
+//   - store src back as srgb
+// Every stage except for srcover interacts with memory, and so will need _tail variants.
+
+static void SK_VECTORCALL load_s_srgb(SkRasterPipeline::Stage* st, size_t x,
+                                      Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<const uint32_t*>() + x;
+
+    r = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
+              sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
+              sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
+              sk_linear_from_srgb[(ptr[3] >>  0) & 0xff] };
+
+    g = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  8) & 0xff],
+              sk_linear_from_srgb[(ptr[1] >>  8) & 0xff],
+              sk_linear_from_srgb[(ptr[2] >>  8) & 0xff],
+              sk_linear_from_srgb[(ptr[3] >>  8) & 0xff] };
+
+    b = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
+              sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
+              sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
+              sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
+
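+    // Alpha is stored linearly in sRGB, so it needs no table lookup: convert all four lanes at once.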
+    a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
+
+static void SK_VECTORCALL load_s_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
+                                           Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                           Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<const uint32_t*>() + x;
+
+    r = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
+    g = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
+    b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
+    a = Sk4f{                (*ptr >> 24) * (1/255.0f), 0,0,0 };
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
+
+static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
+                                      Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<const uint32_t*>() + x;
+
+    dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >>  0) & 0xff] };
+
+    dg = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  8) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >>  8) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >>  8) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >>  8) & 0xff] };
+
+    db = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
+
+    da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
+
+static void SK_VECTORCALL load_d_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
+                                           Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                           Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<const uint32_t*>() + x;
+
+    dr = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
+    dg = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
+    db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
+    da = Sk4f{                (*ptr >> 24) * (1/255.0f), 0,0,0 };
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
+
+static void SK_VECTORCALL scale_u8(SkRasterPipeline::Stage* st, size_t x,
+                                   Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<const uint8_t*>() + x;
+
+    auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
+    r *= c;
+    g *= c;
+    b *= c;
+    a *= c;
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
+
+static void SK_VECTORCALL scale_u8_tail(SkRasterPipeline::Stage* st, size_t x,
+                                        Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                        Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<const uint8_t*>() + x;
+
+    auto c = *ptr * (1/255.0f);
+    r *= c;
+    g *= c;
+    b *= c;
+    a *= c;
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
+
+static void SK_VECTORCALL srcover(SkRasterPipeline::Stage* st, size_t x,
+                                  Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                  Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
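+    // Classic src-over: result = src + (1 - src_alpha) * dst, applied to each channel.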
+    auto A = 1.0f - a;
+    r += dr * A;
+    g += dg * A;
+    b += db * A;
+    a += da * A;
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
+
+static Sk4f clamp(const Sk4f& x) {
+    return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
+}
+
+static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,
+                                     Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<uint32_t*>() + x;
+
+    r = clamp(sk_linear_to_srgb(r));
+    g = clamp(sk_linear_to_srgb(g));
+    b = clamp(sk_linear_to_srgb(b));
+    a = clamp(         255.0f * a );
+
+    ( SkNx_cast<int>(r)
+    | SkNx_cast<int>(g) << 8
+    | SkNx_cast<int>(b) << 16
+    | SkNx_cast<int>(a) << 24 ).store(ptr);
+}
+
+static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
+                                          Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                          Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    auto ptr = st->ctx<uint32_t*>() + x;
+
+    auto rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
+    rgba = {rgba[0], rgba[1], rgba[2], 255.0f*a[0]};
+    rgba = clamp(rgba);
+
+    SkNx_cast<uint8_t>(rgba).store(ptr);
+}
+
+class SkRasterPipelineBench : public Benchmark {
+public:
+    SkRasterPipelineBench() {}
+
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+    const char* onGetName() override { return "SkRasterPipelineBench"; }
+
+    void onDraw(int loops, SkCanvas*) override {
+        SkRasterPipeline p;
+        p.append(load_s_srgb, load_s_srgb_tail,  src);
+        p.append(   scale_u8,    scale_u8_tail, mask);
+        p.append(load_d_srgb, load_d_srgb_tail,  dst);
+        p.append(srcover);
+        p.append( store_srgb,  store_srgb_tail,  dst);
+
+        while (loops --> 0) {
+            p.run(N);
+        }
+    }
+};
+
+DEF_BENCH( return new SkRasterPipelineBench; )
diff --git a/gyp/core.gypi b/gyp/core.gypi
index 97e38c2..f494463 100644
--- a/gyp/core.gypi
+++ b/gyp/core.gypi
@@ -231,6 +231,7 @@
         '<(skia_src_path)/core/SkQuadClipper.cpp',
         '<(skia_src_path)/core/SkQuadClipper.h',
         '<(skia_src_path)/core/SkRasterClip.cpp',
+        '<(skia_src_path)/core/SkRasterPipeline.cpp',
         '<(skia_src_path)/core/SkRasterizer.cpp',
         '<(skia_src_path)/core/SkReadBuffer.h',
         '<(skia_src_path)/core/SkReadBuffer.cpp',
diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp
new file mode 100644
index 0000000..6a8f109
--- /dev/null
+++ b/src/core/SkRasterPipeline.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkRasterPipeline.h"
+
+SkRasterPipeline::SkRasterPipeline() {}
+
+void SkRasterPipeline::append(SkRasterPipeline::Fn body, const void* body_ctx,
+                              SkRasterPipeline::Fn tail, const void* tail_ctx) {
+    // We can't add more stages once run() has rewired the pipeline.
+    SkASSERT(!fReadyToRun);
+
+    // For now, just stash the stage's function in its own fNext slot.
+    // We'll rewire our stages before running the pipeline so fNext makes sense.
+    fBody.push_back({ body, const_cast<void*>(body_ctx) });
+    fTail.push_back({ tail, const_cast<void*>(tail_ctx) });
+}
+
+void SkRasterPipeline::run(size_t n) {
+    if (fBody.empty() || fTail.empty()) {
+        return;
+    }
+
+    if (!fReadyToRun) {
+        auto rewire = [](Stages* stages) {
+            SkASSERT(!stages->empty());
+
+            // Rotate the fNext pointers so they point to the next function to
+            // call, not the function we're currently calling, as set by append().
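+            // E.g. if three appended stages have functions A, B, C, then fNext
+            // starts out as [A,B,C] and ends up as [B,C,A]: each stage points
+            // at its successor, and back().fNext holds the first function to
+            // call (run() fetches it from there just below).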
+            auto start = stages->front().fNext;
+            for (int i = 0; i < stages->count() - 1; i++) {
+                (*stages)[i].fNext = (*stages)[i+1].fNext;
+            }
+            stages->back().fNext = start;  // This is a pretty handy place to stash this.
+        };
+        rewire(&fBody);
+        rewire(&fTail);
+        fReadyToRun = true;
+    }
+
+    // It's fastest to start uninitialized if the compilers all let us.  If not, the next fastest is 0.
+    Sk4f v;
+
+    auto start_body = fBody.back().fNext,  // See rewire().
+         start_tail = fTail.back().fNext;
+
+    auto body = fBody.begin(),
+         tail = fTail.begin();
+
+    size_t x = 0;
+    while (n >= 4) {
+        start_body(body, x, v,v,v,v, v,v,v,v);
+        x += 4;
+        n -= 4;
+    }
+    while (n > 0) {
+        start_tail(tail, x, v,v,v,v, v,v,v,v);
+        x += 1;
+        n -= 1;
+    }
+}
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
new file mode 100644
index 0000000..8ae7bb1
--- /dev/null
+++ b/src/core/SkRasterPipeline.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkRasterPipeline_DEFINED
+#define SkRasterPipeline_DEFINED
+
+#include "SkNx.h"
+#include "SkTArray.h"
+#include "SkTypes.h"
+
+/**
+ * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
+ *
+ * It's particularly designed for situations where the potential pipeline is extremely
+ * combinatoric: {N dst formats} x {M source formats} x {K mask formats} x {C transfer modes} ...
+ * No one wants to write specialized routines for all those combinations, and if we did, we'd
+ * end up bloating our code size dramatically.  SkRasterPipeline stages can be chained together
+ * at runtime, so we can scale this problem linearly rather than combinatorially.
+ *
+ * Each stage is represented by a function conforming to a common interface, SkRasterPipeline::Fn,
+ * and by an arbitrary context pointer.  Fn's arguments, and its sometimes-custom calling convention,
+ * are designed to maximize the amount of data we can pass along the pipeline cheaply.
+ * On many machines all arguments stay in registers the entire time.
+ *
+ * The meanings of the arguments to Fn are sometimes fixed...
+ *    - The Stage* always represents the current stage, mainly providing access to ctx().
+ *    - The size_t is always the destination x coordinate.  If you need y, put it in your context.
+ *    - By the time the shader's done, the first four vectors should hold source red,
+ *      green, blue, and alpha, up to 4 pixels' worth each.
+ *
+ * ...and sometimes flexible:
+ *    - In the shader, the first four vectors can be used for anything, e.g. sample coordinates.
+ *    - The last four vectors are scratch registers that can be used to communicate between
+ *      stages; transfer modes use these to hold the original destination pixel components.
+ *
+ * On some platforms the last four vectors are slower to work with than the other arguments.
+ *
+ * When done mutating its arguments and/or context, a stage can either:
+ *   1) call st->next() with its mutated arguments, chaining to the next stage of the pipeline; or
+ *   2) return, indicating the pipeline is complete for these pixels.
+ *
+ * Some obvious stages that typically return are those that write a color to a destination pointer,
+ * but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
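+ *
+ * For example, a minimal premultiply stage might look like this (a sketch only;
+ * "premul" is an illustrative name, not a stage provided here):
+ *
+ *    static void SK_VECTORCALL premul(SkRasterPipeline::Stage* st, size_t x,
+ *                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ *                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ *        r *= a;  g *= a;  b *= a;           // Scale color channels by alpha.
+ *        st->next(x, r,g,b,a, dr,dg,db,da);  // Chain on to the next stage.
+ *    }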
+ */
+
+class SkRasterPipeline {
+public:
+    struct Stage;
+    using Fn = void(SK_VECTORCALL *)(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
+                                                     Sk4f,Sk4f,Sk4f,Sk4f);
+    struct Stage {
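+        // Return this stage's context pointer, cast to the pointer type T the stage expects.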
+        template <typename T>
+        T ctx() { return static_cast<T>(fCtx); }
+
+        void SK_VECTORCALL next(size_t x, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                                          Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+            // Stages are logically a pipeline, and physically contiguous in an array.
+            // To get to the next stage, we just increment our pointer to the next array element.
+            fNext(this+1, x, v0,v1,v2,v3, v4,v5,v6,v7);
+        }
+
+        // It makes next() a good bit cheaper if we hold the next function to call here,
+        // rather than the logically simpler choice, the function implementing this stage.
+        Fn fNext;
+        void* fCtx;
+    };
+
+    SkRasterPipeline();
+
+    // Run the pipeline constructed with append(), walking x through [0,n),
+    // generally in 4 pixel steps, but sometimes 1 pixel at a time.
+    void run(size_t n);
+
+    // Use this append() if your stage is sensitive to the number of pixels you're working with:
+    //   - body will always be called for a full 4 pixels
+    //   - tail will always be called for a single pixel
+    // Typically this is only an essential distinction for stages that read or write memory.
+    void append(Fn body, const void* body_ctx,
+                Fn tail, const void* tail_ctx);
+
+    // Most stages don't actually care whether they're working on 4 pixels or 1.
+    void append(Fn fn, const void* ctx = nullptr) {
+        this->append(fn, ctx, fn, ctx);
+    }
+
+    // Usually a stage's 4-pixel and 1-pixel variants share the same context pointer.
+    void append(Fn body, Fn tail, const void* ctx = nullptr) {
+        this->append(body, ctx, tail, ctx);
+    }
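+
+    // A typical build-and-run sequence looks like this (a sketch; these particular
+    // stage functions are the ones defined in tests/SkRasterPipelineTest.cpp):
+    //    SkRasterPipeline p;
+    //    p.append(load,  load_tail,  src_vals);  // 4-pixel body + 1-pixel tail, shared ctx
+    //    p.append(square);                       // insensitive to pixel count, no ctx
+    //    p.append(store, store_tail, dst_vals);
+    //    p.run(n);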
+
+private:
+    using Stages = SkSTArray<10, Stage, /*MEM_COPY=*/true>;
+
+    Stages fBody,
+           fTail;
+    bool   fReadyToRun = false;
+};
+
+#endif//SkRasterPipeline_DEFINED
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 6d4cade..91cd104 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -386,6 +386,7 @@
     SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
 
+    SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
     SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
 
     SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 8952ff7..78cea3b 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -150,6 +150,7 @@
                                   _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
     }
 
+    SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
     SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
 
     SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
new file mode 100644
index 0000000..1db0206
--- /dev/null
+++ b/tests/SkRasterPipelineTest.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Test.h"
+#include "SkRasterPipeline.h"
+
+// load needs two variants, one to load 4 values...
+static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x,
+                               Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                               Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+    auto ptr = st->ctx<const float*>();
+    v0 = Sk4f{ptr[x+0]};
+    v1 = Sk4f{ptr[x+1]};
+    v2 = Sk4f{ptr[x+2]};
+    v3 = Sk4f{ptr[x+3]};
+
+    st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
+}
+
+// ...and one to load a single value.
+static void SK_VECTORCALL load_tail(SkRasterPipeline::Stage* st, size_t x,
+                                    Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                                    Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+    auto ptr = st->ctx<const float*>();
+    v0 = Sk4f{ptr[x]};
+
+    st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
+}
+
+// square doesn't really care how many of its inputs are active, nor does it need a context.
+static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x,
+                                 Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                                 Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+    v0 *= v0;
+    v1 *= v1;
+    v2 *= v2;
+    v3 *= v3;
+    st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
+}
+
+// Like load, store has a _tail variant.  It ends the pipeline by returning.
+static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x,
+                                Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                                Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+    auto ptr = st->ctx<float*>();
+    ptr[x+0] = v0[0];
+    ptr[x+1] = v1[0];
+    ptr[x+2] = v2[0];
+    ptr[x+3] = v3[0];
+}
+
+static void SK_VECTORCALL store_tail(SkRasterPipeline::Stage* st, size_t x,
+                                     Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                                     Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+    auto ptr = st->ctx<float*>();
+    ptr[x+0] = v0[0];
+}
+
+DEF_TEST(SkRasterPipeline, r) {
+    // We'll build up and run a simple pipeline that exercises the salient
+    // mechanics of SkRasterPipeline:
+    //    - context pointers
+    //    - stages sensitive to the number of pixels
+    //    - stages insensitive to the number of pixels
+    //
+    // This pipeline loads up some values, squares them, then writes them back to memory.
+
+    const float src_vals[] = { 1,2,3,4,5 };
+    float       dst_vals[] = { 0,0,0,0,0 };
+
+    SkRasterPipeline p;
+    p.append(load, load_tail, src_vals);
+    p.append(square);
+    p.append(store, store_tail, dst_vals);
+
+    p.run(5);
+
+    REPORTER_ASSERT(r, dst_vals[0] ==  1);
+    REPORTER_ASSERT(r, dst_vals[1] ==  4);
+    REPORTER_ASSERT(r, dst_vals[2] ==  9);
+    REPORTER_ASSERT(r, dst_vals[3] == 16);
+    REPORTER_ASSERT(r, dst_vals[4] == 25);
+}