/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkJumper.h"
#include "SkJumper_misc.h"     // SI, unaligned_load(), bit_cast()
#include "SkJumper_vectors.h"  // F, I32, U32, U16, U8, cast(), expand()

// Our fundamental vector depth is our pixel stride.
static const size_t kStride = sizeof(F) / sizeof(float);
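// (e.g. F is 8 floats wide on AVX/AVX2, 4 on SSE and NEON, and a single float in the
// portable build, so kStride is 8, 4, or 1 respectively.)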

// A reminder:
// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).

// You can use most constants in this file, but in a few rare exceptions we read from this struct.
using K = const SkJumper_constants;


// Let's start first with the mechanisms we use to build Stages.

// Our program is an array of void*, either
// - 1 void* per stage with no context pointer, the next stage;
// - 2 void* per stage with a context pointer, first the context pointer, then the next stage.
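// For example, a program for "load 8888 pixels, premultiply, done" would be laid out roughly as
//     { &load_8888, its_src_ctx, &premul, &just_return }
// (names illustrative; the actual entries are pointers to the WRAP()'d stage functions below).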

// load_and_inc() steps the program forward by 1 void*, returning that pointer.
SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
    // If program is in %rsi (we try to make this likely) then this is a single instruction.
    void* rax;
    asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
    return rax;
#else
    // On ARM *program++ compiles into pretty ideal code without any handholding.
    return *program++;
#endif
}

// LazyCtx doesn't do anything unless you call operator T*(), encapsulating the logic
// from above that stages without a context pointer are represented by just 1 void*.
struct LazyCtx {
    void*   ptr;
    void**& program;

    explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}

    template <typename T>
    operator T*() {
        if (!ptr) { ptr = load_and_inc(program); }
        return (T*)ptr;
    }
};

// A little wrapper macro to name Stages differently depending on the instruction set.
// That lets us link together several options.
#if !defined(JUMPER)
    #define WRAP(name) sk_##name
#elif defined(__aarch64__)
    #define WRAP(name) sk_##name##_aarch64
#elif defined(__arm__)
    #define WRAP(name) sk_##name##_vfp4
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw
#elif defined(__AVX__)
    #define WRAP(name) sk_##name##_avx
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2
#endif
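// e.g. when building the AVX2 flavor, WRAP(load_8888) names the symbol sk_load_8888_hsw;
// in the portable (non-JUMPER) build it's just sk_load_8888.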

// We're finally going to get to what a Stage function looks like!
// It's best to jump down to the #else case first, then to come back up here for AVX.

#if defined(JUMPER) && defined(__AVX__)
    // There's a big cost to switch between SSE and AVX, so we do a little
    // extra work to handle even the jagged <kStride tail in AVX mode.
    // Compared to normal stages, we maintain an extra tail register:
    //    tail == 0 ~~> work on a full kStride pixels
    //    tail != 0 ~~> work on only the first tail pixels
    // tail is always < kStride.
    using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);

    MAYBE_MSABI
    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
        F v{};
        auto start = (Stage*)load_and_inc(program);
        while (x + kStride <= limit) {
            start(x,program,k,0,    v,v,v,v, v,v,v,v);
            x += kStride;
        }
        if (size_t tail = limit - x) {
            start(x,program,k,tail, v,v,v,v, v,v,v,v);
        }
        return limit;
    }

    #define STAGE(name)                                                          \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
        extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail,  \
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) { \
            LazyCtx ctx(program);                                                \
            name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da);                        \
            auto next = (Stage*)load_and_inc(program);                           \
            next(x,program,k,tail, r,g,b,a, dr,dg,db,da);                        \
        }                                                                        \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)

#else
    // Other instruction sets (SSE, NEON, portable) can fall back on narrower
    // pipelines cheaply, which frees us to always assume tail==0.

    // Stages tail call between each other by following program as described above.
    // x is our induction variable, stepping forward kStride at a time.
    using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);

    // On Windows, start_pipeline() has a normal Windows ABI, and then the rest is System V.
    MAYBE_MSABI
    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
        F v{};
        auto start = (Stage*)load_and_inc(program);
        while (x + kStride <= limit) {
            start(x,program,k, v,v,v,v, v,v,v,v);
            x += kStride;
        }
        return x;
    }

    // This STAGE macro makes it easier to write stages, handling all the Stage chaining for you.
    #define STAGE(name)                                                          \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);    \
        extern "C" void WRAP(name)(size_t x, void** program, K* k,               \
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) { \
            LazyCtx ctx(program);                                                \
            name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da);                           \
            auto next = (Stage*)load_and_inc(program);                           \
            next(x,program,k, r,g,b,a, dr,dg,db,da);                             \
        }                                                                        \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,               \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
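// As a rough sketch of what that means: STAGE(premul) (defined below) expands to a premul_k()
// holding just the body, plus a WRAP(premul) entry point that constructs a LazyCtx over program,
// calls premul_k(), and then tail-calls the next stage loaded from program.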

// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}


// We could start defining normal Stages now.  But first, some helper functions.

// These load() and store() methods are tail-aware,
// but focus mainly on keeping the at-stride tail==0 case fast.
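// (e.g. with kStride == 8 and tail == 3, only elements [0..2] are read or written,
// and the remaining lanes of a load come back zeroed.)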

template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        switch (tail-1) {
            case 6: v[6] = src[6];
            case 5: v[5] = src[5];
            case 4: v[4] = src[4];
            case 3: v[3] = src[3];
            case 2: v[2] = src[2];
            case 1: v[1] = src[1];
            case 0: v[0] = src[0];
        }
        return v;
    }
#endif
    return unaligned_load<V>(src);
}

template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        switch (tail-1) {
            case 6: dst[6] = v[6];
            case 5: dst[5] = v[5];
            case 4: dst[4] = v[4];
            case 3: dst[3] = v[3];
            case 2: dst[2] = v[2];
            case 1: dst[1] = v[1];
            case 0: dst[0] = v[0];
        }
        return;
    }
#endif
    unaligned_store(dst, v);
}

// This doesn't look strictly necessary, but without it Clang would generate load() using
// compiler-generated constants that we can't support.  This version doesn't need constants.
#if defined(JUMPER) && defined(__AVX__)
    template <>
    inline U8 load(const uint8_t* src, size_t tail) {
        if (__builtin_expect(tail, 0)) {
            uint64_t v = 0;
            size_t shift = 0;
            #pragma nounroll
            while (tail --> 0) {
                v |= (uint64_t)*src++ << shift;
                shift += 8;
            }
            return unaligned_load<U8>(&v);
        }
        return unaligned_load<U8>(src);
    }
#endif

// AVX2 adds some mask loads and stores that make for shorter, faster code.
#if defined(JUMPER) && defined(__AVX2__)
    SI U32 mask(size_t tail) {
        // We go a little out of our way to avoid needing large constant values here.

        // It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
        // Start fully on, then shift away lanes from the top until we've got our mask.
        uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);
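        // (e.g. tail == 3 leaves mask == 0x0000000000ffffff, i.e. only the low 3 lanes on.)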

        // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
        return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
    }

    template <>
    inline U32 load(const uint32_t* src, size_t tail) {
        __builtin_assume(tail < kStride);
        if (__builtin_expect(tail, 0)) {
            return _mm256_maskload_epi32((const int*)src, mask(tail));
        }
        return unaligned_load<U32>(src);
    }

    template <>
    inline void store(uint32_t* dst, U32 v, size_t tail) {
        __builtin_assume(tail < kStride);
        if (__builtin_expect(tail, 0)) {
            return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
        }
        unaligned_store(dst, v);
    }
#endif

SI F from_byte(U8 b) {
    return cast(expand(b)) * (1/255.0f);
}
SI void from_565(U16 _565, F* r, F* g, F* b) {
    U32 wide = expand(_565);
    *r = cast(wide & (31<<11)) * (1.0f / (31<<11));
    *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5));
    *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0));
}
SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
    U32 wide = expand(_4444);
    *r = cast(wide & (15<<12)) * (1.0f / (15<<12));
    *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8));
    *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4));
    *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0));
}
SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
    *r = cast((_8888      ) & 0xff) * (1/255.0f);
    *g = cast((_8888 >>  8) & 0xff) * (1/255.0f);
    *b = cast((_8888 >> 16) & 0xff) * (1/255.0f);
    *a = cast((_8888 >> 24)       ) * (1/255.0f);
}

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}

// Now finally, normal Stages!

STAGE(seed_shader) {
    auto y = *(const int*)ctx;

    // It's important for speed to explicitly cast(x) and cast(y),
    // which has the effect of splatting them to vectors before converting to floats.
    // On Intel this breaks a data dependency on previous loop iterations' registers.
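    // (k->iota_F holds the per-lane offsets {0,1,2,...}, so lane i ends up centered at x+i+0.5.)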
    r = cast(x) + 0.5f + unaligned_load<F>(k->iota_F);
    g = cast(y) + 0.5f;
    b = 1.0f;
    a = 0;
    dr = dg = db = da = 0;
}

STAGE(dither) {
    auto c = (const SkJumper_DitherCtx*)ctx;

    // Get [(x,y), (x+1,y), (x+2,y), ...] loaded up in integer vectors.
    U32 X = x + unaligned_load<U32>(k->iota_U32),
        Y = (uint32_t)*c->y;

    // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
    // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].

    // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
    Y ^= X;

    // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
    // for 2^6 == 64 == 8x8 matrix values.  If X=abc and Y=def, we make fcebda.
    U32 M = (Y & 1) << 5 | (X & 1) << 4
          | (Y & 2) << 2 | (X & 2) << 1
          | (Y & 4) >> 1 | (X & 4) >> 2;
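    // (e.g. X=0b101 and Y=0b010 mix to M = 0b011001 = 25.)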

    // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
    // We want to make sure our dither is less than 0.5 in either direction to keep exact values
    // like 0 and 1 unchanged after rounding.
    F dither = cast(M) * (2/128.0f) - (63/128.0f);

    r += c->rate*dither;
    g += c->rate*dither;
    b += c->rate*dither;

    r = max(0, min(r, a));
    g = max(0, min(g, a));
    b = max(0, min(b, a));
}

// load 4 floats from memory, and splat them into r,g,b,a
STAGE(constant_color) {
    auto rgba = (const float*)ctx;
    r = rgba[0];
    g = rgba[1];
    b = rgba[2];
    a = rgba[3];
}

// load registers r,g,b,a from context (mirrors store_rgba)
STAGE(load_rgba) {
    auto ptr = (const float*)ctx;
    r = unaligned_load<F>(ptr + 0*kStride);
    g = unaligned_load<F>(ptr + 1*kStride);
    b = unaligned_load<F>(ptr + 2*kStride);
    a = unaligned_load<F>(ptr + 3*kStride);
}

// store registers r,g,b,a into context (mirrors load_rgba)
STAGE(store_rgba) {
    auto ptr = (float*)ctx;
    unaligned_store(ptr + 0*kStride, r);
    unaligned_store(ptr + 1*kStride, g);
    unaligned_store(ptr + 2*kStride, b);
    unaligned_store(ptr + 3*kStride, a);
}

// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

SI F inv(F x) { return 1.0f - x; }
SI F two(F x) { return x + x; }

BLEND_MODE(clear)    { return 0; }
BLEND_MODE(srcatop)  { return s*da + d*inv(sa); }
BLEND_MODE(dstatop)  { return d*sa + s*inv(da); }
BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)   { return s * inv(da); }
BLEND_MODE(dstout)   { return d * inv(sa); }
BLEND_MODE(srcover)  { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)  { return mad(s, inv(da), d); }

BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(plus_)    { return s + d; }
BLEND_MODE(screen)   { return s + d - s*d; }
BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); }
#undef BLEND_MODE

// Most other blend modes apply the same logic to colors, and srcover to alpha.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = mad(da, inv(a), a);                \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

BLEND_MODE(darken)     { return s + d -     max(s*da, d*sa) ; }
BLEND_MODE(lighten)    { return s + d -     min(s*da, d*sa) ; }
BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
BLEND_MODE(exclusion)  { return s + d - two(s*d); }

BLEND_MODE(colorburn) {
    return if_then_else(d == da, d + s*inv(da),
           if_then_else(s ==  0, s + d*inv(sa),
                                 sa*(da - min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(colordodge) {
    return if_then_else(d ==  0, d + s*inv(da),
           if_then_else(s == sa, s + d*inv(sa),
                                 sa*min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(hardlight) {
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
}
BLEND_MODE(overlay) {
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
}

BLEND_MODE(softlight) {
    F m  = if_then_else(da > 0, d / da, 0),
      s2 = two(s),
      m4 = two(two(m));

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1.
      darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2.
      liteDst = rcp(rsqrt(m)) - m,                 // Used in case 3.
      liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst);  // 2 or 3?
    return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc);  // 1 or (2 or 3)?
}
#undef BLEND_MODE

// We're basing our implementation of non-separable blend modes on
// https://www.w3.org/TR/compositing-1/#blendingnonseparable
// and
// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but ES' math has been better simplified.
//
// Anything extra we add beyond that is to make the math work with premul inputs.

SI F max(F r, F g, F b) { return max(r, max(g, b)); }
SI F min(F r, F g, F b) { return min(r, min(g, b)); }

SI F sat(F r, F g, F b) { return max(r,g,b) - min(r,g,b); }
SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; }

SI void set_sat(F* r, F* g, F* b, F s) {
    F mn  = min(*r,*g,*b),
      mx  = max(*r,*g,*b),
      sat = mx - mn;

    // Map min channel to 0, max channel to s, and scale the middle proportionally.
    auto scale = [=](F c) {
        return if_then_else(sat == 0, 0, (c - mn) * s / sat);
    };
    *r = scale(*r);
    *g = scale(*g);
    *b = scale(*b);
}
SI void set_lum(F* r, F* g, F* b, F l) {
    F diff = l - lum(*r, *g, *b);
    *r += diff;
    *g += diff;
    *b += diff;
}
SI void clip_color(F* r, F* g, F* b, F a) {
    F mn = min(*r, *g, *b),
      mx = max(*r, *g, *b),
      l  = lum(*r, *g, *b);

    auto clip = [=](F c) {
        c = if_then_else(mn >= 0, c, l + (c - l) * (    l) / (l - mn)   );
        c = if_then_else(mx >  a,    l + (c - l) * (a - l) / (mx - l), c);
        c = max(c, 0);  // Sometimes without this we may dip just a little negative.
        return c;
    };
    *r = clip(*r);
    *g = clip(*g);
    *b = clip(*b);
}

STAGE(hue) {
    F R = r*a,
      G = g*a,
      B = b*a;

    set_sat(&R, &G, &B, sat(dr,dg,db)*a);
    set_lum(&R, &G, &B, lum(dr,dg,db)*a);
    clip_color(&R,&G,&B, a*da);

    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}
STAGE(saturation) {
    F R = dr*a,
      G = dg*a,
      B = db*a;

    set_sat(&R, &G, &B, sat( r, g, b)*da);
    set_lum(&R, &G, &B, lum(dr,dg,db)* a);  // (This is not redundant.)
    clip_color(&R,&G,&B, a*da);

    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}
STAGE(color) {
    F R = r*da,
      G = g*da,
      B = b*da;

    set_lum(&R, &G, &B, lum(dr,dg,db)*a);
    clip_color(&R,&G,&B, a*da);

    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}
STAGE(luminosity) {
    F R = dr*a,
      G = dg*a,
      B = db*a;

    set_lum(&R, &G, &B, lum(r,g,b)*da);
    clip_color(&R,&G,&B, a*da);

    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}

STAGE(clamp_0) {
    r = max(r, 0);
    g = max(g, 0);
    b = max(b, 0);
    a = max(a, 0);
}

STAGE(clamp_1) {
    r = min(r, 1.0f);
    g = min(g, 1.0f);
    b = min(b, 1.0f);
    a = min(a, 1.0f);
}

STAGE(clamp_a) {
    a = min(a, 1.0f);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(set_rgb) {
    auto rgb = (const float*)ctx;
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}
STAGE(swap_rb) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}
STAGE(unpremul) {
    auto scale = if_then_else(a == 0, 0, 1.0f / a);
    r *= scale;
    g *= scale;
    b *= scale;
}

STAGE(from_srgb) {
    auto fn = [&](F s) {
        auto lo = s * (1/12.92f);
        auto hi = mad(s*s, mad(s, 0.3000f, 0.6975f), 0.0025f);
        return if_then_else(s < 0.055f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
STAGE(to_srgb) {
    auto fn = [&](F l) {
        F t = rsqrt(l);
        auto lo = l * 12.92f;
        auto hi = mad(t, mad(t, -0.0024542345f, 0.013832027f), 1.1334244f)
                * rcp(0.14513608f + t);
        return if_then_else(l < 0.00465985f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

STAGE(rgb_to_hsl) {
    F mx = max(max(r,g), b),
      mn = min(min(r,g), b),
      d  = mx - mn,
      d_rcp = 1.0f / d;

    F h = (1/6.0f) *
          if_then_else(mx == mn, 0,
          if_then_else(mx ==  r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0),
          if_then_else(mx ==  g, (b-r)*d_rcp + 2.0f,
                                 (r-g)*d_rcp + 4.0f)));

    F l = (mx + mn) * 0.5f;
    F s = if_then_else(mx == mn, 0,
                       d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn));

    r = h;
    g = s;
    b = l;
}
STAGE(hsl_to_rgb) {
    F h = r,
      s = g,
      l = b;

    F q = l + if_then_else(l >= 0.5f, s - l*s, l*s),
      p = 2.0f*l - q;

    auto hue_to_rgb = [&](F t) {
        t = fract(t);

        F r = p;
        r = if_then_else(t >= 4/6.0f, r, p + (q-p)*(4.0f - 6.0f*t));
        r = if_then_else(t >= 3/6.0f, r, q);
        r = if_then_else(t >= 1/6.0f, r, p + (q-p)*( 6.0f*t));
        return r;
    };

    r = if_then_else(s == 0, l, hue_to_rgb(h + (1/3.0f)));
    g = if_then_else(s == 0, l, hue_to_rgb(h           ));
    b = if_then_else(s == 0, l, hue_to_rgb(h - (1/3.0f)));
}

STAGE(scale_1_float) {
    auto c = *(const float*)ctx;

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = from_byte(scales);

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}

SI F lerp(F from, F to, F t) {
    return mad(to-from, t, from);
}

STAGE(lerp_1_float) {
    auto c = *(const float*)ctx;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = load<U8>(ptr, tail);
    auto c = from_byte(scales);

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    F cr,cg,cb;
    from_565(load<U16>(ptr, tail), &cr, &cg, &cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = max(lerp(da, a, cr), lerp(da, a, cg), lerp(da, a, cb));
}

STAGE(load_tables) {
    auto c = (const SkJumper_LoadTablesCtx*)ctx;

    auto px = load<U32>((const uint32_t*)c->src + x, tail);
    r = gather(c->r, (px      ) & 0xff);
    g = gather(c->g, (px >>  8) & 0xff);
    b = gather(c->b, (px >> 16) & 0xff);
    a = cast(        (px >> 24)) * (1/255.0f);
}
STAGE(load_tables_u16_be) {
    auto c = (const SkJumper_LoadTablesCtx*)ctx;
    auto ptr = (const uint16_t*)c->src + 4*x;

    U16 R,G,B,A;
    load4(ptr, tail, &R,&G,&B,&A);

    // c->src is big-endian, so & 0xff grabs the 8 most significant bits.
    r = gather(c->r, expand(R) & 0xff);
    g = gather(c->g, expand(G) & 0xff);
    b = gather(c->b, expand(B) & 0xff);
    a = (1/65535.0f) * cast(expand(bswap(A)));
}
STAGE(load_tables_rgb_u16_be) {
    auto c = (const SkJumper_LoadTablesCtx*)ctx;
    auto ptr = (const uint16_t*)c->src + 3*x;

    U16 R,G,B;
    load3(ptr, tail, &R,&G,&B);

    // c->src is big-endian, so & 0xff grabs the 8 most significant bits.
    r = gather(c->r, expand(R) & 0xff);
    g = gather(c->g, expand(G) & 0xff);
    b = gather(c->b, expand(B) & 0xff);
    a = 1.0f;
}

STAGE(byte_tables) {
    struct Tables { const uint8_t *r, *g, *b, *a; };
    auto tables = (const Tables*)ctx;

    r = from_byte(gather(tables->r, round(r, 255.0f)));
    g = from_byte(gather(tables->g, round(g, 255.0f)));
    b = from_byte(gather(tables->b, round(b, 255.0f)));
    a = from_byte(gather(tables->a, round(a, 255.0f)));
}

STAGE(byte_tables_rgb) {
    struct Tables { const uint8_t *r, *g, *b; int n; };
    auto tables = (const Tables*)ctx;

    F scale = tables->n - 1;
    r = from_byte(gather(tables->r, round(r, scale)));
    g = from_byte(gather(tables->g, round(g, scale)));
    b = from_byte(gather(tables->b, round(b, scale)));
}

SI F table(F v, const SkJumper_TableCtx* ctx) {
    return gather(ctx->table, round(v, ctx->size - 1));
}
STAGE(table_r) { r = table(r, ctx); }
STAGE(table_g) { g = table(g, ctx); }
STAGE(table_b) { b = table(b, ctx); }
STAGE(table_a) { a = table(a, ctx); }

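// parametric() evaluates a piecewise transfer function from seven coefficients,
//    v <= D ?  C*v + F  :  (A*v + B)^G + E,
// then clamps the result to [0,1].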
SI F parametric(F v, const SkJumper_ParametricTransferFunction* ctx) {
    F r = if_then_else(v <= ctx->D, mad(ctx->C, v, ctx->F)
                                  , approx_powf(mad(ctx->A, v, ctx->B), ctx->G) + ctx->E);
    return min(max(r, 0), 1.0f);  // Clamp to [0,1], with argument order mattering to handle NaN.
}
STAGE(parametric_r) { r = parametric(r, ctx); }
STAGE(parametric_g) { g = parametric(g, ctx); }
STAGE(parametric_b) { b = parametric(b, ctx); }
STAGE(parametric_a) { a = parametric(a, ctx); }

STAGE(lab_to_xyz) {
    F L = r * 100.0f,
      A = g * 255.0f - 128.0f,
      B = b * 255.0f - 128.0f;

    F Y = (L + 16.0f) * (1/116.0f),
      X = Y + A*(1/500.0f),
      Z = Y - B*(1/200.0f);

    X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
    Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
    Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));

    // Adjust to D50 illuminant.
    r = X * 0.96422f;
    g = Y            ;
    b = Z * 0.82521f;
}

STAGE(load_a8) {
    auto ptr = *(const uint8_t**)ctx + x;

    r = g = b = 0.0f;
    a = from_byte(load<U8>(ptr, tail));
}
STAGE(gather_a8) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    r = g = b = 0.0f;
    a = from_byte(gather(ptr, ix));
}
STAGE(store_a8) {
    auto ptr = *(uint8_t**)ctx + x;

    U8 packed = pack(pack(round(a, 255.0f)));
    store(ptr, packed, tail);
}

STAGE(load_g8) {
    auto ptr = *(const uint8_t**)ctx + x;

    r = g = b = from_byte(load<U8>(ptr, tail));
    a = 1.0f;
}
STAGE(gather_g8) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    r = g = b = from_byte(gather(ptr, ix));
    a = 1.0f;
}

STAGE(gather_i8) {
    auto c = (const SkJumper_GatherCtx*)ctx;
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    ix = expand(gather(ptr, ix));
    from_8888(gather(c->ctable, ix), &r,&g,&b,&a);
}

STAGE(load_565) {
    auto ptr = *(const uint16_t**)ctx + x;

    from_565(load<U16>(ptr, tail), &r,&g,&b);
    a = 1.0f;
}
STAGE(gather_565) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_565(gather(ptr, ix), &r,&g,&b);
    a = 1.0f;
}
STAGE(store_565) {
    auto ptr = *(uint16_t**)ctx + x;

    U16 px = pack( round(r, 31.0f) << 11
                 | round(g, 63.0f) <<  5
                 | round(b, 31.0f)      );
    store(ptr, px, tail);
}

STAGE(load_4444) {
    auto ptr = *(const uint16_t**)ctx + x;
    from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
}
STAGE(gather_4444) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_4444(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_4444) {
    auto ptr = *(uint16_t**)ctx + x;
    U16 px = pack( round(r, 15.0f) << 12
                 | round(g, 15.0f) <<  8
                 | round(b, 15.0f) <<  4
                 | round(a, 15.0f)      );
    store(ptr, px, tail);
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;
    from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
}
STAGE(gather_8888) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    from_8888(gather(ptr, ix), &r,&g,&b,&a);
}
STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

    U32 px = round(r, 255.0f)
           | round(g, 255.0f) <<  8
           | round(b, 255.0f) << 16
           | round(a, 255.0f) << 24;
    store(ptr, px, tail);
}

STAGE(load_f16) {
    auto ptr = *(const uint64_t**)ctx + x;

    U16 R,G,B,A;
    load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
    r = from_half(R);
    g = from_half(G);
    b = from_half(B);
    a = from_half(A);
}
STAGE(gather_f16) {
    const uint64_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r,g);
    auto px = gather(ptr, ix);

    U16 R,G,B,A;
    load4((const uint16_t*)&px,0, &R,&G,&B,&A);
    r = from_half(R);
    g = from_half(G);
    b = from_half(B);
    a = from_half(A);
}
STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;
    store4((uint16_t*)ptr,tail, to_half(r)
                              , to_half(g)
                              , to_half(b)
                              , to_half(a));
}

STAGE(load_u16_be) {
    auto ptr = *(const uint16_t**)ctx + 4*x;

    U16 R,G,B,A;
    load4(ptr,tail, &R,&G,&B,&A);

    r = (1/65535.0f) * cast(expand(bswap(R)));
    g = (1/65535.0f) * cast(expand(bswap(G)));
    b = (1/65535.0f) * cast(expand(bswap(B)));
    a = (1/65535.0f) * cast(expand(bswap(A)));
}
STAGE(load_rgb_u16_be) {
    auto ptr = *(const uint16_t**)ctx + 3*x;

    U16 R,G,B;
    load3(ptr,tail, &R,&G,&B);

    r = (1/65535.0f) * cast(expand(bswap(R)));
    g = (1/65535.0f) * cast(expand(bswap(G)));
    b = (1/65535.0f) * cast(expand(bswap(B)));
    a = 1.0f;
}
STAGE(store_u16_be) {
    auto ptr = *(uint16_t**)ctx + 4*x;

    U16 R = bswap(pack(round(r, 65535.0f))),
        G = bswap(pack(round(g, 65535.0f))),
        B = bswap(pack(round(b, 65535.0f))),
        A = bswap(pack(round(a, 65535.0f)));

    store4(ptr,tail, R,G,B,A);
}

STAGE(load_f32) {
    auto ptr = *(const float**)ctx + 4*x;
    load4(ptr,tail, &r,&g,&b,&a);
}
STAGE(store_f32) {
    auto ptr = *(float**)ctx + 4*x;
    store4(ptr,tail, r,g,b,a);
}

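// Tiling helpers for the stages below: clamp() pins v into [0,limit], repeat() wraps it into
// [0,limit), and mirror() reflects it back and forth with period 2*limit.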
SI F clamp(F v, float limit) {
    return min(max(0, v), limit);
}
SI F repeat(F v, float limit) {
    return v - floor_(v/limit)*limit;
}
SI F mirror(F v, float limit) {
    return abs_( (v-limit) - (limit+limit)*floor_((v-limit)/(limit+limit)) - limit );
}
STAGE(clamp_x)  { r = clamp (r, *(const float*)ctx); }
STAGE(clamp_y)  { g = clamp (g, *(const float*)ctx); }
STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); }
STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); }
STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); }
STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); }

STAGE( clamp_x_1) { r = clamp (r, 1.0f); }
STAGE(repeat_x_1) { r = repeat(r, 1.0f); }
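// mirror_x_1 is just mirror(r, 1.0f) with the limit folded in by hand.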
STAGE(mirror_x_1) { r = abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f ); }

STAGE(luminance_to_alpha) {
    a = r*0.2126f + g*0.7152f + b*0.0722f;
    r = g = b = 0;
}

STAGE(matrix_2x3) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[2], m[4])),
         G = mad(r,m[1], mad(g,m[3], m[5]));
    r = R;
    g = G;
}
STAGE(matrix_3x4) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
         G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
         B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
    r = R;
    g = G;
    b = B;
}
STAGE(matrix_4x5) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[4], mad(b,m[ 8], mad(a,m[12], m[16])))),
         G = mad(r,m[1], mad(g,m[5], mad(b,m[ 9], mad(a,m[13], m[17])))),
         B = mad(r,m[2], mad(g,m[6], mad(b,m[10], mad(a,m[14], m[18])))),
         A = mad(r,m[3], mad(g,m[7], mad(b,m[11], mad(a,m[15], m[19]))));
    r = R;
    g = G;
    b = B;
    a = A;
}
STAGE(matrix_4x3) {
    auto m = (const float*)ctx;
    auto X = r,
         Y = g;

    r = mad(X, m[0], mad(Y, m[4], m[ 8]));
    g = mad(X, m[1], mad(Y, m[5], m[ 9]));
    b = mad(X, m[2], mad(Y, m[6], m[10]));
    a = mad(X, m[3], mad(Y, m[7], m[11]));
}
STAGE(matrix_perspective) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[1], m[2])),
         G = mad(r,m[3], mad(g,m[4], m[5])),
         Z = mad(r,m[6], mad(g,m[7], m[8]));
    r = R * rcp(Z);
    g = G * rcp(Z);
}

SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                        F* r, F* g, F* b, F* a) {
    F fr, br, fg, bg, fb, bb, fa, ba;
#if defined(JUMPER) && defined(__AVX2__)
    if (c->stopCount <= 8) {
        fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
        br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
        fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx);
        bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx);
        fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx);
        bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx);
        fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx);
        ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx);
    } else
#endif
    {
        fr = gather(c->fs[0], idx);
        br = gather(c->bs[0], idx);
        fg = gather(c->fs[1], idx);
        bg = gather(c->bs[1], idx);
        fb = gather(c->fs[2], idx);
        bb = gather(c->bs[2], idx);
        fa = gather(c->fs[3], idx);
        ba = gather(c->bs[3], idx);
    }

    *r = mad(t, fr, br);
    *g = mad(t, fg, bg);
    *b = mad(t, fb, bb);
    *a = mad(t, fa, ba);
}

STAGE(evenly_spaced_gradient) {
    auto c = (const SkJumper_GradientCtx*)ctx;
    auto t = r;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE(gradient) {
    auto c = (const SkJumper_GradientCtx*)ctx;
    auto t = r;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE(evenly_spaced_2_stop_gradient) {
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = r;
    r = mad(t, c->f[0], c->b[0]);
    g = mad(t, c->f[1], c->b[1]);
    b = mad(t, c->f[2], c->b[2]);
    a = mad(t, c->f[3], c->b[3]);
}

STAGE(xy_to_unit_angle) {
    F X = r,
      Y = g;
    F xabs = abs_(X),
      yabs = abs_(Y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f + s
             * (-5.185396969318389892578125e-2f + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(X < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(Y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    r = phi;
}

STAGE(xy_to_radius) {
    F X2 = r * r,
      Y2 = g * g;
    r = sqrt_(X2 + Y2);
}

STAGE(save_xy) {
    auto c = (SkJumper_SamplerCtx*)ctx;

    // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
    // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
    // surrounding (x,y) at (0.5,0.5) off-center.
    F fx = fract(r + 0.5f),
      fy = fract(g + 0.5f);

    // Samplers will need to load x and fx, or y and fy.
    unaligned_store(c->x,  r);
    unaligned_store(c->y,  g);
    unaligned_store(c->fx, fx);
    unaligned_store(c->fy, fy);
}

STAGE(accumulate) {
    auto c = (const SkJumper_SamplerCtx*)ctx;

    // Bilinear and bicubic filters are both separable, so we produce independent contributions
    // from x and y, multiplying them together here to get each pixel's total scale factor.
    auto scale = unaligned_load<F>(c->scalex)
               * unaligned_load<F>(c->scaley);
    dr = mad(scale, r, dr);
    dg = mad(scale, g, dg);
    db = mad(scale, b, db);
    da = mad(scale, a, da);
}

// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
// are combined in direct proportion to their area overlapping that logical query pixel.
// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
// The y-axis is symmetric.

template <int kScale>
SI void bilinear_x(SkJumper_SamplerCtx* ctx, F* x) {
    *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f);
    F fx = unaligned_load<F>(ctx->fx);

    F scalex;
    if (kScale == -1) { scalex = 1.0f - fx; }
    if (kScale == +1) { scalex =        fx; }
    unaligned_store(ctx->scalex, scalex);
}
template <int kScale>
SI void bilinear_y(SkJumper_SamplerCtx* ctx, F* y) {
    *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f);
    F fy = unaligned_load<F>(ctx->fy);

    F scaley;
    if (kScale == -1) { scaley = 1.0f - fy; }
    if (kScale == +1) { scaley =        fy; }
    unaligned_store(ctx->scaley, scaley);
}

STAGE(bilinear_nx) { bilinear_x<-1>(ctx, &r); }
STAGE(bilinear_px) { bilinear_x<+1>(ctx, &r); }
STAGE(bilinear_ny) { bilinear_y<-1>(ctx, &g); }
STAGE(bilinear_py) { bilinear_y<+1>(ctx, &g); }


// In bicubic interpolation, the 16 pixels and +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
//
// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
// See GrCubicEffect for details of this particular filter.

SI F bicubic_near(F t) {
    // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
    return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f));
}
SI F bicubic_far(F t) {
    // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
    return (t*t)*mad((7/18.0f), t, (-6/18.0f));
}

template <int kScale>
SI void bicubic_x(SkJumper_SamplerCtx* ctx, F* x) {
    *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f);
    F fx = unaligned_load<F>(ctx->fx);

    F scalex;
    if (kScale == -3) { scalex = bicubic_far (1.0f - fx); }
    if (kScale == -1) { scalex = bicubic_near(1.0f - fx); }
    if (kScale == +1) { scalex = bicubic_near(       fx); }
    if (kScale == +3) { scalex = bicubic_far (       fx); }
    unaligned_store(ctx->scalex, scalex);
}
template <int kScale>
SI void bicubic_y(SkJumper_SamplerCtx* ctx, F* y) {
    *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f);
    F fy = unaligned_load<F>(ctx->fy);

    F scaley;
    if (kScale == -3) { scaley = bicubic_far (1.0f - fy); }
    if (kScale == -1) { scaley = bicubic_near(1.0f - fy); }
    if (kScale == +1) { scaley = bicubic_near(       fy); }
    if (kScale == +3) { scaley = bicubic_far (       fy); }
    unaligned_store(ctx->scaley, scaley);
}

STAGE(bicubic_n3x) { bicubic_x<-3>(ctx, &r); }
STAGE(bicubic_n1x) { bicubic_x<-1>(ctx, &r); }
STAGE(bicubic_p1x) { bicubic_x<+1>(ctx, &r); }
STAGE(bicubic_p3x) { bicubic_x<+3>(ctx, &r); }

STAGE(bicubic_n3y) { bicubic_y<-3>(ctx, &g); }
STAGE(bicubic_n1y) { bicubic_y<-1>(ctx, &g); }
STAGE(bicubic_p1y) { bicubic_y<+1>(ctx, &g); }
STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }

STAGE(callback) {
    auto c = (SkJumper_CallbackCtx*)ctx;
    store4(c->rgba,0, r,g,b,a);
    c->fn(c, tail ? tail : kStride);
    load4(c->read_from,0, &r,&g,&b,&a);
}