trim another instruction off SkRasterPipeline overhead
The overhead of a stage today is 3 x86 instructions, typically looking something like this:
- movq (%rdi), %rax // Load the next stage function pointer.
- addq $0x10, %rdi // Step our progress ahead 16 bytes to that next stage.
- jmpq *%rax // Transfer control to that stage.
But if we make sure the pointer's in esi/rsi, we can use lodsd/lodsq to do those first two steps in one instruction:
- lodsq (%rsi), %rax (≈ movq (%rsi), %rax; addq $0x8, %rsi).
- jmpq *%rax
This CL rearranges things so we can take advantage of that and generally trim an instruction of overhead. Instead of a vector of {Fn, ctx} pairs, we flatten the pipeline down into a single interlaced program vector of void*, simply omitting any null context pointers. We pass the pointer to the program as the second argument to each Fn, which puts it in rsi. Together these two changes make fetching either the next Fn to call or the current context the same cheap lodsq instruction, encapsulated as load_and_increment().
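As a rough standalone sketch of the idea (simplified names and signatures, not Skia's -- the real Fn also carries x_tail and eight SkNf registers, and the real program is built in Compiled's constructor in the diff below), the interlaced program and the walk over it look like this:

    #include <cstdio>
    #include <vector>

    using Fn = void(*)(void** p);

    // Portable stand-in for load_and_increment(); on x86-64 GCC/Clang the real one is a single lodsq.
    static void* load_and_increment(void*** ptr) { return *(*ptr)++; }

    static void next(void** p) {
        auto fn = (Fn)load_and_increment(&p);    // fetch the next stage...
        fn(p);                                   // ...and jump into it
    }

    static void just_return(void**) {}           // the backstop that ends every program

    static void shout(void** p) {                // a stage that uses a context pointer
        auto msg = (const char*)load_and_increment(&p);   // its ctx is the very next slot
        std::printf("%s\n", msg);
        next(p);
    }

    static void no_op(void** p) {                // a ctx-free stage occupies just one slot
        next(p);
    }

    int main() {
        const char* hello = "hello";
        std::vector<void*> program = {
            (void*)&shout, (void*)hello,         // fn, then its ctx
            (void*)&no_op,                       // fn only; no null ctx slot wasted
            (void*)&just_return,
        };
        void** p = program.data();
        auto start = (Fn)load_and_increment(&p);
        start(p);                                // prints "hello"
    }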
Here's how the simple "modulate" blend stage changes:
vmulps %ymm4, %ymm0, %ymm0
vmulps %ymm5, %ymm1, %ymm1
vmulps %ymm6, %ymm2, %ymm2
vmulps %ymm7, %ymm3, %ymm3
movq (%rdi), %rax
addq $0x10, %rdi
jmpq *%rax
~~~~~~~~>
vmulps %ymm4, %ymm0, %ymm0
vmulps %ymm5, %ymm1, %ymm1
vmulps %ymm6, %ymm2, %ymm2
vmulps %ymm7, %ymm3, %ymm3
lodsq (%rsi), %rax
jmpq *%rax
This does make reading the current context a one-time, destructive operation: instead of each stage referring to a ctx field as a void*, the STAGE_CTX wrapper pulls the context pointer out of the program with load_and_increment() and hands it to the kernel already cast to its concrete type. No stage so far has ever referred to ctx twice, and it all still appears to inline, so this seems harmless. "matrix_2x3" is a good example of what stages that use context pointers end up looking like:
lodsq (%rsi), %rax
vbroadcastss (%rax), %ymm9
vbroadcastss 0x8(%rax), %ymm10
vbroadcastss 0x10(%rax), %ymm8
vfmadd231ps %ymm10, %ymm1, %ymm8
vfmadd231ps %ymm9, %ymm0, %ymm8
vbroadcastss 0x4(%rax), %ymm10
vbroadcastss 0xc(%rax), %ymm11
vbroadcastss 0x14(%rax), %ymm9
vfmadd231ps %ymm11, %ymm1, %ymm9
vfmadd231ps %ymm10, %ymm0, %ymm9
lodsq (%rsi), %rax
vmovaps %ymm8, %ymm0
vmovaps %ymm9, %ymm1
jmpq *%rax
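For reference, the C++ this compiles from barely changes; STAGE_CTX just hands the kernel its context already typed. This is the new matrix_2x3 from the diff below:

    STAGE_CTX(matrix_2x3, const float*) {
        auto m = ctx;
        auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[2], m[4])),
             G = SkNf_fma(r,m[1], SkNf_fma(g,m[3], m[5]));
        r = R;
        g = G;
    }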
We can't do this with MSVC: there's no intrinsic for it that I can find, it disallows inline assembly on x86-64, and rsi isn't used to pass arguments there anyway. ARM doesn't need it; it already does this in two instructions naturally. We could do the same for 32-bit x86, but I'd rather focus on x86-64.
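For completeness, here's load_and_increment() itself from the diff below, lightly annotated; everywhere we can't (or needn't) use lodsq it's just the plain pointer walk:

    SI void* load_and_increment(void*** ptr) {
    #if defined(__GNUC__) && defined(__x86_64__)
        // p was deliberately made Fn's second argument so it arrives in rsi.
        void* rax;
        __asm__("lodsq" : "=a"(rax), "+S"(*ptr));
        return rax;
    #else
        return *(*ptr)++;   // MSVC, ARM, 32-bit x86, ...
    #endif
    }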
It's unclear to me whether this makes anything faster, but it doesn't appear to make anything slower, and I think it makes both the code and the disassembly simpler.
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
Change-Id: Ia7b543a6718c75a33095371924003c5402b3445a
Reviewed-on: https://skia-review.googlesource.com/6271
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index e011541..1c4d13a 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -35,58 +35,95 @@
using SkNh = SkNx<N, uint16_t>;
using SkNb = SkNx<N, uint8_t>;
- struct Stage;
- using Fn = void(SK_VECTORCALL *)(Stage*, size_t x_tail, SkNf,SkNf,SkNf,SkNf,
- SkNf,SkNf,SkNf,SkNf);
- struct Stage { Fn next; void* ctx; };
-
+ using Fn = void(SK_VECTORCALL *)(size_t x_tail, void** p, SkNf,SkNf,SkNf,SkNf,
+ SkNf,SkNf,SkNf,SkNf);
// x_tail encodes two values x and tail as x*N+tail, where 0 <= tail < N.
// x is the induction variable we're walking along, incrementing by N each step.
// tail == 0 means work with a full N pixels; otherwise use only the low tail pixels.
+ //
+ // p is our program, a sequence of Fn to call interlaced with any void* context pointers. E.g.
+ // &load_8888
+ // (src ptr)
+ // &from_srgb
+ // &load_f16_d
+ // (dst ptr)
+ // &srcover
+ // &store_f16
+ // (dst ptr)
+ // &just_return
} // namespace
#define SI static inline
+// Basically, return *(*ptr)++, maybe faster than the compiler can do it.
+SI void* load_and_increment(void*** ptr) {
+ // We do this often enough that it's worth hyper-optimizing.
+ // x86 can do this in one instruction if ptr is in rsi.
+ // (This is why p is the second argument to Fn: it's passed in rsi.)
+#if defined(__GNUC__) && defined(__x86_64__)
+ void* rax;
+ __asm__("lodsq" : "=a"(rax), "+S"(*ptr));
+ return rax;
+#else
+ return *(*ptr)++;
+#endif
+}
+
// Stages are logically a pipeline, and physically are contiguous in an array.
// To get to the next stage, we just increment our pointer to the next array element.
-SI void SK_VECTORCALL next(Stage* st, size_t x_tail, SkNf r, SkNf g, SkNf b, SkNf a,
- SkNf dr, SkNf dg, SkNf db, SkNf da) {
- st->next(st+1, x_tail, r,g,b,a, dr,dg,db,da);
+SI void SK_VECTORCALL next(size_t x_tail, void** p, SkNf r, SkNf g, SkNf b, SkNf a,
+ SkNf dr, SkNf dg, SkNf db, SkNf da) {
+ auto next = (Fn)load_and_increment(&p);
+ next(x_tail,p, r,g,b,a, dr,dg,db,da);
}
// Stages defined below always call next.
// This is always the last stage, a backstop that actually returns to the caller when done.
-SI void SK_VECTORCALL just_return(Stage*, size_t, SkNf, SkNf, SkNf, SkNf,
+SI void SK_VECTORCALL just_return(size_t, void**, SkNf, SkNf, SkNf, SkNf,
SkNf, SkNf, SkNf, SkNf) {}
#define STAGE(name) \
- static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \
+ static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail, \
SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \
- SI void SK_VECTORCALL name(Stage* st, size_t x_tail, \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
- name##_kernel(st->ctx, x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da); \
- next(st, x_tail, r,g,b,a, dr,dg,db,da); \
+ name##_kernel(x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
} \
- static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \
+ static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail, \
SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)
+#define STAGE_CTX(name, Ctx) \
+ static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail, \
+ SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
+ SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
+ auto ctx = (Ctx)load_and_increment(&p); \
+ name##_kernel(ctx, x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
+ } \
+ static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail, \
+ SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
+ SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)
// Many xfermodes apply the same logic to each channel.
#define RGBA_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
- SI void SK_VECTORCALL name(Stage* st, size_t x_tail, \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = name##_kernel(a,a,da,da); \
- next(st, x_tail, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
@@ -95,14 +132,14 @@
#define RGB_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
- SI void SK_VECTORCALL name(Stage* st, size_t x_tail, \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = a + (da * (1.0f-a)); \
- next(st, x_tail, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
@@ -274,8 +311,8 @@
*a = SkHalfToFloat_finite_ftz(ah);
}
-STAGE(trace) {
- SkDebugf("%s\n", (const char*)ctx);
+STAGE_CTX(trace, const char*) {
+ SkDebugf("%s\n", ctx);
}
STAGE(registers) {
auto print = [](const char* name, const SkNf& v) {
@@ -332,11 +369,10 @@
b *= a;
}
-STAGE(set_rgb) {
- auto rgb = (const float*)ctx;
- r = rgb[0];
- g = rgb[1];
- b = rgb[2];
+STAGE_CTX(set_rgb, const float*) {
+ r = ctx[0];
+ g = ctx[1];
+ b = ctx[2];
}
STAGE(move_src_dst) {
@@ -381,7 +417,7 @@
// x^(141/64) = x^(128/64) * x^(12/64) * x^(1/64)
return SkNf::Max((x*x) * (x16*x16*x16) * (x64), 0.0f);
};
-
+
r = from_2dot2(r);
g = from_2dot2(g);
b = from_2dot2(b);
@@ -403,17 +439,16 @@
}
// The default shader produces a constant color (from the SkPaint).
-STAGE(constant_color) {
- auto color = (const SkPM4f*)ctx;
- r = color->r();
- g = color->g();
- b = color->b();
- a = color->a();
+STAGE_CTX(constant_color, const SkPM4f*) {
+ r = ctx->r();
+ g = ctx->g();
+ b = ctx->b();
+ a = ctx->a();
}
// s' = sc for a scalar c.
-STAGE(scale_1_float) {
- SkNf c = *(const float*)ctx;
+STAGE_CTX(scale_1_float, const float*) {
+ SkNf c = *ctx;
r *= c;
g *= c;
@@ -421,10 +456,10 @@
a *= c;
}
// s' = sc for 8-bit c.
-STAGE(scale_u8) {
- auto ptr = *(const uint8_t**)ctx + x;
-
+STAGE_CTX(scale_u8, const uint8_t**) {
+ auto ptr = *ctx + x;
SkNf c = SkNf_from_byte(load(tail, ptr));
+
r = r*c;
g = g*c;
b = b*c;
@@ -436,8 +471,8 @@
}
// s' = d(1-c) + sc, for a scalar c.
-STAGE(lerp_1_float) {
- SkNf c = *(const float*)ctx;
+STAGE_CTX(lerp_1_float, const float*) {
+ SkNf c = *ctx;
r = lerp(dr, r, c);
g = lerp(dg, g, c);
@@ -446,10 +481,10 @@
}
// s' = d(1-c) + sc for 8-bit c.
-STAGE(lerp_u8) {
- auto ptr = *(const uint8_t**)ctx + x;
-
+STAGE_CTX(lerp_u8, const uint8_t**) {
+ auto ptr = *ctx + x;
SkNf c = SkNf_from_byte(load(tail, ptr));
+
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
@@ -457,8 +492,8 @@
}
// s' = d(1-c) + sc for 565 c.
-STAGE(lerp_565) {
- auto ptr = *(const uint16_t**)ctx + x;
+STAGE_CTX(lerp_565, const uint16_t**) {
+ auto ptr = *ctx + x;
SkNf cr, cg, cb;
from_565(load(tail, ptr), &cr, &cg, &cb);
@@ -468,26 +503,26 @@
a = 1.0f;
}
-STAGE(load_565) {
- auto ptr = *(const uint16_t**)ctx + x;
+STAGE_CTX(load_565, const uint16_t**) {
+ auto ptr = *ctx + x;
from_565(load(tail, ptr), &r,&g,&b);
a = 1.0f;
}
-STAGE(load_565_d) {
- auto ptr = *(const uint16_t**)ctx + x;
+STAGE_CTX(load_565_d, const uint16_t**) {
+ auto ptr = *ctx + x;
from_565(load(tail, ptr), &dr,&dg,&db);
da = 1.0f;
}
-STAGE(store_565) {
- auto ptr = *(uint16_t**)ctx + x;
+STAGE_CTX(store_565, uint16_t**) {
+ auto ptr = *ctx + x;
store(tail, SkNx_cast<uint16_t>( SkNf_round(r, SK_R16_MASK) << SK_R16_SHIFT
| SkNf_round(g, SK_G16_MASK) << SK_G16_SHIFT
| SkNf_round(b, SK_B16_MASK) << SK_B16_SHIFT), ptr);
}
-STAGE(load_f16) {
- auto ptr = *(const uint64_t**)ctx + x;
+STAGE_CTX(load_f16, const uint64_t**) {
+ auto ptr = *ctx + x;
const void* src = ptr;
SkNx<N, uint64_t> px;
@@ -497,8 +532,8 @@
}
from_f16(src, &r, &g, &b, &a);
}
-STAGE(load_f16_d) {
- auto ptr = *(const uint64_t**)ctx + x;
+STAGE_CTX(load_f16_d, const uint64_t**) {
+ auto ptr = *ctx + x;
const void* src = ptr;
SkNx<N, uint64_t> px;
@@ -508,8 +543,8 @@
}
from_f16(src, &dr, &dg, &db, &da);
}
-STAGE(store_f16) {
- auto ptr = *(uint64_t**)ctx + x;
+STAGE_CTX(store_f16, uint64_t**) {
+ auto ptr = *ctx + x;
SkNx<N, uint64_t> px;
SkNh::Store4(tail ? (void*)&px : (void*)ptr, SkFloatToHalf_finite_ftz(r),
@@ -521,8 +556,8 @@
}
}
-STAGE(store_f32) {
- auto ptr = *(SkPM4f**)ctx + x;
+STAGE_CTX(store_f32, SkPM4f**) {
+ auto ptr = *ctx + x;
SkNx<N, SkPM4f> px;
SkNf::Store4(tail ? (void*)&px : (void*)ptr, r,g,b,a);
@@ -532,15 +567,15 @@
}
-STAGE(load_8888) {
- auto ptr = *(const uint32_t**)ctx + x;
+STAGE_CTX(load_8888, const uint32_t**) {
+ auto ptr = *ctx + x;
from_8888(load(tail, ptr), &r, &g, &b, &a);
}
-STAGE(load_8888_d) {
- auto ptr = *(const uint32_t**)ctx + x;
+STAGE_CTX(load_8888_d, const uint32_t**) {
+ auto ptr = *ctx + x;
from_8888(load(tail, ptr), &dr, &dg, &db, &da);
}
-STAGE(store_8888) {
+STAGE_CTX(store_8888, uint32_t**) {
auto byte = [](const SkNf& x, int ix) {
// Here's a neat trick: 0x47000000 == 32768.0f, and 0x470000ff == 32768.0f + (255/256.0f).
auto v = SkNf_fma(255/256.0f, x, 32768.0f);
@@ -551,35 +586,33 @@
return (SkNi::Load(&v) & 0xff) << (8*ix); // B or G
};
- auto ptr = *(uint32_t**)ctx + x;
+ auto ptr = *ctx + x;
store(tail, byte(r,0)|byte(g,1)|byte(b,2)|byte(a,3), (int*)ptr);
}
-STAGE(load_tables) {
- auto loadCtx = (const LoadTablesContext*)ctx;
- auto ptr = loadCtx->fSrc + x;
+STAGE_CTX(load_tables, const LoadTablesContext*) {
+ auto ptr = ctx->fSrc + x;
SkNu rgba = load(tail, ptr);
auto to_int = [](const SkNu& v) { return SkNi::Load(&v); };
- r = gather(tail, loadCtx->fR, to_int((rgba >> 0) & 0xff));
- g = gather(tail, loadCtx->fG, to_int((rgba >> 8) & 0xff));
- b = gather(tail, loadCtx->fB, to_int((rgba >> 16) & 0xff));
+ r = gather(tail, ctx->fR, to_int((rgba >> 0) & 0xff));
+ g = gather(tail, ctx->fG, to_int((rgba >> 8) & 0xff));
+ b = gather(tail, ctx->fB, to_int((rgba >> 16) & 0xff));
a = SkNf_from_byte(rgba >> 24);
}
-STAGE(store_tables) {
- auto storeCtx = (const StoreTablesContext*)ctx;
- auto ptr = storeCtx->fDst + x;
+STAGE_CTX(store_tables, const StoreTablesContext*) {
+ auto ptr = ctx->fDst + x;
- float scale = storeCtx->fCount - 1;
+ float scale = ctx->fCount - 1;
SkNi ri = SkNf_round(scale, r);
SkNi gi = SkNf_round(scale, g);
SkNi bi = SkNf_round(scale, b);
- store(tail, ( SkNx_cast<int>(gather(tail, storeCtx->fR, ri)) << 0
- | SkNx_cast<int>(gather(tail, storeCtx->fG, gi)) << 8
- | SkNx_cast<int>(gather(tail, storeCtx->fB, bi)) << 16
- | SkNf_round(255.0f, a) << 24), (int*)ptr);
+ store(tail, ( SkNx_cast<int>(gather(tail, ctx->fR, ri)) << 0
+ | SkNx_cast<int>(gather(tail, ctx->fG, gi)) << 8
+ | SkNx_cast<int>(gather(tail, ctx->fB, bi)) << 16
+ | SkNf_round(255.0f, a) << 24), (int*)ptr);
}
SI SkNf inv(const SkNf& x) { return 1.0f - x; }
@@ -640,16 +673,16 @@
r = g = b = 0;
}
-STAGE(matrix_2x3) {
- auto m = (const float*)ctx;
+STAGE_CTX(matrix_2x3, const float*) {
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[2], m[4])),
G = SkNf_fma(r,m[1], SkNf_fma(g,m[3], m[5]));
r = R;
g = G;
}
-STAGE(matrix_3x4) {
- auto m = (const float*)ctx;
+STAGE_CTX(matrix_3x4, const float*) {
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[3], SkNf_fma(b,m[6], m[ 9]))),
G = SkNf_fma(r,m[1], SkNf_fma(g,m[4], SkNf_fma(b,m[7], m[10]))),
@@ -658,8 +691,8 @@
g = G;
b = B;
}
-STAGE(matrix_4x5) {
- auto m = (const float*)ctx;
+STAGE_CTX(matrix_4x5, const float*) {
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[4], SkNf_fma(b,m[ 8], SkNf_fma(a,m[12], m[16])))),
G = SkNf_fma(r,m[1], SkNf_fma(g,m[5], SkNf_fma(b,m[ 9], SkNf_fma(a,m[13], m[17])))),
@@ -670,9 +703,9 @@
b = B;
a = A;
}
-STAGE(matrix_perspective) {
+STAGE_CTX(matrix_perspective, const float*) {
// N.B. unlike the matrix_NxM stages, this takes a row-major matrix.
- auto m = (const float*)ctx;
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[1], m[2])),
G = SkNf_fma(r,m[3], SkNf_fma(g,m[4], m[5])),
@@ -692,10 +725,10 @@
// Max(NaN, 0) = 0, but Max(0, NaN) = NaN, so we want this exact order to ensure NaN => 0
return SkNf::Min(SkNf::Max(SkNf::Load(result), 0.0f), 1.0f);
}
-STAGE(parametric_r) { r = parametric(r, *(const SkColorSpaceTransferFn*)ctx); }
-STAGE(parametric_g) { g = parametric(g, *(const SkColorSpaceTransferFn*)ctx); }
-STAGE(parametric_b) { b = parametric(b, *(const SkColorSpaceTransferFn*)ctx); }
-STAGE(parametric_a) { a = parametric(a, *(const SkColorSpaceTransferFn*)ctx); }
+STAGE_CTX(parametric_r, const SkColorSpaceTransferFn*) { r = parametric(r, *ctx); }
+STAGE_CTX(parametric_g, const SkColorSpaceTransferFn*) { g = parametric(g, *ctx); }
+STAGE_CTX(parametric_b, const SkColorSpaceTransferFn*) { b = parametric(b, *ctx); }
+STAGE_CTX(parametric_a, const SkColorSpaceTransferFn*) { a = parametric(a, *ctx); }
SI SkNf table(const SkNf& v, const SkTableTransferFn& table) {
float result[N];
@@ -705,13 +738,13 @@
// no need to clamp - tables are by-design [0,1] -> [0,1]
return SkNf::Load(result);
}
-STAGE(table_r) { r = table(r, *(const SkTableTransferFn*)ctx); }
-STAGE(table_g) { g = table(g, *(const SkTableTransferFn*)ctx); }
-STAGE(table_b) { b = table(b, *(const SkTableTransferFn*)ctx); }
-STAGE(table_a) { a = table(a, *(const SkTableTransferFn*)ctx); }
+STAGE_CTX(table_r, const SkTableTransferFn*) { r = table(r, *ctx); }
+STAGE_CTX(table_g, const SkTableTransferFn*) { g = table(g, *ctx); }
+STAGE_CTX(table_b, const SkTableTransferFn*) { b = table(b, *ctx); }
+STAGE_CTX(table_a, const SkTableTransferFn*) { a = table(a, *ctx); }
-STAGE(color_lookup_table) {
- const SkColorLookUpTable* colorLUT = (const SkColorLookUpTable*)ctx;
+STAGE_CTX(color_lookup_table, const SkColorLookUpTable*) {
+ const SkColorLookUpTable* colorLUT = ctx;
SkASSERT(3 == colorLUT->inputChannels() || 4 == colorLUT->inputChannels());
SkASSERT(3 == colorLUT->outputChannels());
float result[3][N];
@@ -782,33 +815,29 @@
result = SkNf::Min(result, nextafterf(l, 0));
return assert_in_tile(result, l);
}
-STAGE( clamp_x) { r = clamp (r, *(const float*)ctx); }
-STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); }
-STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); }
-STAGE( clamp_y) { g = clamp (g, *(const float*)ctx); }
-STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); }
-STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); }
+STAGE_CTX( clamp_x, const float*) { r = clamp (r, *ctx); }
+STAGE_CTX(repeat_x, const float*) { r = repeat(r, *ctx); }
+STAGE_CTX(mirror_x, const float*) { r = mirror(r, *ctx); }
+STAGE_CTX( clamp_y, const float*) { g = clamp (g, *ctx); }
+STAGE_CTX(repeat_y, const float*) { g = repeat(g, *ctx); }
+STAGE_CTX(mirror_y, const float*) { g = mirror(g, *ctx); }
-STAGE(save_xy) {
- auto sc = (SkImageShaderContext*)ctx;
-
- r.store(sc->x);
- g.store(sc->y);
+STAGE_CTX(save_xy, SkImageShaderContext*) {
+ r.store(ctx->x);
+ g.store(ctx->y);
// Whether bilinear or bicubic, all sample points have the same fractional offset (fx,fy).
// They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
// surrounding (x,y), all (0.5,0.5) off-center.
auto fract = [](const SkNf& v) { return v - v.floor(); };
- fract(r + 0.5f).store(sc->fx);
- fract(g + 0.5f).store(sc->fy);
+ fract(r + 0.5f).store(ctx->fx);
+ fract(g + 0.5f).store(ctx->fy);
}
-STAGE(accumulate) {
- auto sc = (const SkImageShaderContext*)ctx;
-
+STAGE_CTX(accumulate, const SkImageShaderContext*) {
// Bilinear and bicubic filtering are both separable, so we'll end up with independent
// scale contributions in x and y that we multiply together to get each pixel's scale factor.
- auto scale = SkNf::Load(sc->scalex) * SkNf::Load(sc->scaley);
+ auto scale = SkNf::Load(ctx->scalex) * SkNf::Load(ctx->scaley);
dr = SkNf_fma(scale, r, dr);
dg = SkNf_fma(scale, g, dg);
db = SkNf_fma(scale, b, db);
@@ -820,25 +849,21 @@
// At positive offsets, the x-axis contribution to that rectangular area is fx; (1-fx)
// at negative x offsets. The y-axis is treated symmetrically.
template <int Scale>
-SI void bilinear_x(void* ctx, SkNf* x) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *x = SkNf::Load(sc->x) + Scale*0.5f;
- auto fx = SkNf::Load(sc->fx);
- (Scale > 0 ? fx : (1.0f - fx)).store(sc->scalex);
+SI void bilinear_x(SkImageShaderContext* ctx, SkNf* x) {
+ *x = SkNf::Load(ctx->x) + Scale*0.5f;
+ auto fx = SkNf::Load(ctx->fx);
+ (Scale > 0 ? fx : (1.0f - fx)).store(ctx->scalex);
}
template <int Scale>
-SI void bilinear_y(void* ctx, SkNf* y) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *y = SkNf::Load(sc->y) + Scale*0.5f;
- auto fy = SkNf::Load(sc->fy);
- (Scale > 0 ? fy : (1.0f - fy)).store(sc->scaley);
+SI void bilinear_y(SkImageShaderContext* ctx, SkNf* y) {
+ *y = SkNf::Load(ctx->y) + Scale*0.5f;
+ auto fy = SkNf::Load(ctx->fy);
+ (Scale > 0 ? fy : (1.0f - fy)).store(ctx->scaley);
}
-STAGE(bilinear_nx) { bilinear_x<-1>(ctx, &r); }
-STAGE(bilinear_px) { bilinear_x<+1>(ctx, &r); }
-STAGE(bilinear_ny) { bilinear_y<-1>(ctx, &g); }
-STAGE(bilinear_py) { bilinear_y<+1>(ctx, &g); }
+STAGE_CTX(bilinear_nx, SkImageShaderContext*) { bilinear_x<-1>(ctx, &r); }
+STAGE_CTX(bilinear_px, SkImageShaderContext*) { bilinear_x<+1>(ctx, &r); }
+STAGE_CTX(bilinear_ny, SkImageShaderContext*) { bilinear_y<-1>(ctx, &g); }
+STAGE_CTX(bilinear_py, SkImageShaderContext*) { bilinear_y<+1>(ctx, &g); }
// In bilinear interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
@@ -859,94 +884,87 @@
}
template <int Scale>
-SI void bicubic_x(void* ctx, SkNf* x) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *x = SkNf::Load(sc->x) + Scale*0.5f;
- auto fx = SkNf::Load(sc->fx);
- if (Scale == -3) { return bicubic_far (1.0f - fx).store(sc->scalex); }
- if (Scale == -1) { return bicubic_near(1.0f - fx).store(sc->scalex); }
- if (Scale == +1) { return bicubic_near( fx).store(sc->scalex); }
- if (Scale == +3) { return bicubic_far ( fx).store(sc->scalex); }
+SI void bicubic_x(SkImageShaderContext* ctx, SkNf* x) {
+ *x = SkNf::Load(ctx->x) + Scale*0.5f;
+ auto fx = SkNf::Load(ctx->fx);
+ if (Scale == -3) { return bicubic_far (1.0f - fx).store(ctx->scalex); }
+ if (Scale == -1) { return bicubic_near(1.0f - fx).store(ctx->scalex); }
+ if (Scale == +1) { return bicubic_near( fx).store(ctx->scalex); }
+ if (Scale == +3) { return bicubic_far ( fx).store(ctx->scalex); }
SkDEBUGFAIL("unreachable");
}
template <int Scale>
-SI void bicubic_y(void* ctx, SkNf* y) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *y = SkNf::Load(sc->y) + Scale*0.5f;
- auto fy = SkNf::Load(sc->fy);
- if (Scale == -3) { return bicubic_far (1.0f - fy).store(sc->scaley); }
- if (Scale == -1) { return bicubic_near(1.0f - fy).store(sc->scaley); }
- if (Scale == +1) { return bicubic_near( fy).store(sc->scaley); }
- if (Scale == +3) { return bicubic_far ( fy).store(sc->scaley); }
+SI void bicubic_y(SkImageShaderContext* ctx, SkNf* y) {
+ *y = SkNf::Load(ctx->y) + Scale*0.5f;
+ auto fy = SkNf::Load(ctx->fy);
+ if (Scale == -3) { return bicubic_far (1.0f - fy).store(ctx->scaley); }
+ if (Scale == -1) { return bicubic_near(1.0f - fy).store(ctx->scaley); }
+ if (Scale == +1) { return bicubic_near( fy).store(ctx->scaley); }
+ if (Scale == +3) { return bicubic_far ( fy).store(ctx->scaley); }
SkDEBUGFAIL("unreachable");
}
-STAGE(bicubic_n3x) { bicubic_x<-3>(ctx, &r); }
-STAGE(bicubic_n1x) { bicubic_x<-1>(ctx, &r); }
-STAGE(bicubic_p1x) { bicubic_x<+1>(ctx, &r); }
-STAGE(bicubic_p3x) { bicubic_x<+3>(ctx, &r); }
+STAGE_CTX(bicubic_n3x, SkImageShaderContext*) { bicubic_x<-3>(ctx, &r); }
+STAGE_CTX(bicubic_n1x, SkImageShaderContext*) { bicubic_x<-1>(ctx, &r); }
+STAGE_CTX(bicubic_p1x, SkImageShaderContext*) { bicubic_x<+1>(ctx, &r); }
+STAGE_CTX(bicubic_p3x, SkImageShaderContext*) { bicubic_x<+3>(ctx, &r); }
-STAGE(bicubic_n3y) { bicubic_y<-3>(ctx, &g); }
-STAGE(bicubic_n1y) { bicubic_y<-1>(ctx, &g); }
-STAGE(bicubic_p1y) { bicubic_y<+1>(ctx, &g); }
-STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }
+STAGE_CTX(bicubic_n3y, SkImageShaderContext*) { bicubic_y<-3>(ctx, &g); }
+STAGE_CTX(bicubic_n1y, SkImageShaderContext*) { bicubic_y<-1>(ctx, &g); }
+STAGE_CTX(bicubic_p1y, SkImageShaderContext*) { bicubic_y<+1>(ctx, &g); }
+STAGE_CTX(bicubic_p3y, SkImageShaderContext*) { bicubic_y<+3>(ctx, &g); }
template <typename T>
-SI SkNi offset_and_ptr(T** ptr, const void* ctx, const SkNf& x, const SkNf& y) {
- auto sc = (const SkImageShaderContext*)ctx;
-
+SI SkNi offset_and_ptr(T** ptr, const SkImageShaderContext* ctx, const SkNf& x, const SkNf& y) {
SkNi ix = SkNx_cast<int>(x),
iy = SkNx_cast<int>(y);
- SkNi offset = iy*sc->stride + ix;
+ SkNi offset = iy*ctx->stride + ix;
- *ptr = (const T*)sc->pixels;
+ *ptr = (const T*)ctx->pixels;
return offset;
}
-STAGE(gather_a8) {
+STAGE_CTX(gather_a8, const SkImageShaderContext*) {
const uint8_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
r = g = b = 0.0f;
a = SkNf_from_byte(gather(tail, p, offset));
}
-STAGE(gather_i8) {
- auto sc = (const SkImageShaderContext*)ctx;
+STAGE_CTX(gather_i8, const SkImageShaderContext*) {
const uint8_t* p;
- SkNi offset = offset_and_ptr(&p, sc, r, g);
+ SkNi offset = offset_and_ptr(&p, ctx, r, g);
SkNi ix = SkNx_cast<int>(gather(tail, p, offset));
- from_8888(gather(tail, sc->ctable->readColors(), ix), &r, &g, &b, &a);
+ from_8888(gather(tail, ctx->ctable->readColors(), ix), &r, &g, &b, &a);
}
-STAGE(gather_g8) {
+STAGE_CTX(gather_g8, const SkImageShaderContext*) {
const uint8_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
r = g = b = SkNf_from_byte(gather(tail, p, offset));
a = 1.0f;
}
-STAGE(gather_565) {
+STAGE_CTX(gather_565, const SkImageShaderContext*) {
const uint16_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
from_565(gather(tail, p, offset), &r, &g, &b);
a = 1.0f;
}
-STAGE(gather_4444) {
+STAGE_CTX(gather_4444, const SkImageShaderContext*) {
const uint16_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
from_4444(gather(tail, p, offset), &r, &g, &b, &a);
}
-STAGE(gather_8888) {
+STAGE_CTX(gather_8888, const SkImageShaderContext*) {
const uint32_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
from_8888(gather(tail, p, offset), &r, &g, &b, &a);
}
-STAGE(gather_f16) {
+STAGE_CTX(gather_f16, const SkImageShaderContext*) {
const uint64_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
@@ -967,17 +985,15 @@
namespace {
struct Compiled {
- Compiled(const SkRasterPipeline::Stage* stages, int nstages) : fStages(nstages) {
- if (nstages == 0) {
- return;
+ Compiled(const SkRasterPipeline::Stage* stages, int nstages) {
+ fProgram.reserve(2*nstages+1);
+ for (int i = 0; i < nstages; i++) {
+ fProgram.push_back((void*)enum_to_Fn(stages[i].stage));
+ if (stages[i].ctx) {
+ fProgram.push_back(stages[i].ctx);
+ }
}
- fStart = enum_to_Fn(stages[0].stage);
- for (int i = 0; i < nstages-1; i++) {
- fStages[i].next = enum_to_Fn(stages[i+1].stage);
- fStages[i].ctx = stages[i].ctx;
- }
- fStages[nstages-1].next = just_return;
- fStages[nstages-1].ctx = stages[nstages-1].ctx;
+ fProgram.push_back((void*)just_return);
}
void operator()(size_t x, size_t y, size_t n) {
@@ -987,19 +1003,20 @@
_0 = SkNf(0),
_1 = SkNf(1);
+ void** p = fProgram.data();
+ auto start = (Fn)load_and_increment(&p);
while (n >= N) {
- fStart(fStages.data(), x*N, X,Y,_1,_0, _0,_0,_0,_0);
+ start(x*N, p, X,Y,_1,_0, _0,_0,_0,_0);
X += (float)N;
x += N;
n -= N;
}
if (n) {
- fStart(fStages.data(), x*N+n, X,Y,_1,_0, _0,_0,_0,_0);
+ start(x*N+n, p, X,Y,_1,_0, _0,_0,_0,_0);
}
}
- Fn fStart = just_return;
- std::vector<Stage> fStages;
+ std::vector<void*> fProgram;
};
}
@@ -1019,6 +1036,7 @@
#undef SI
#undef STAGE
+#undef STAGE_CTX
#undef RGBA_XFERMODE
#undef RGB_XFERMODE