Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2017 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "SkJumper.h" |
Mike Klein | 5664e65 | 2017-05-01 16:01:38 -0400 | [diff] [blame] | 9 | #include "SkJumper_misc.h" // SI, unaligned_load(), bit_cast() |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 10 | #include "SkJumper_vectors.h" // F, I32, U32, U16, U8, cast(), expand() |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 11 | |
// Our fundamental vector depth is our pixel stride: how many pixels one
// F-vector holds, i.e. how many pixels each stage invocation works on.
static const size_t kStride = sizeof(F) / sizeof(float);
| 14 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 15 | // A reminder: |
| 16 | // Code guarded by defined(JUMPER) can assume that it will be compiled by Clang |
| 17 | // and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type. |
| 18 | // Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1). |
| 19 | |
// You can use most constants in this file, but in a few rare exceptions we read from this struct.
// (SkJumper_constants is the small blob of data passed in from the outside world.)
using K = const SkJumper_constants;
| 22 | |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 23 | |
// Let's start first with the mechanisms we use to build Stages.

// Our program is an array of void*, either
//   - 1 void* per stage with no context pointer, the next stage;
//   - 2 void* per stage with a context pointer, first the context pointer, then the next stage.

// load_and_inc() steps the program forward by 1 void*, returning that pointer.
SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
    // If program is in %rsi (we try to make this likely) then this is a single instruction.
    // lodsq loads 8 bytes from (%rsi) into %rax and bumps %rsi by 8 — exactly *program++.
    void* rax;
    asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
    return rax;
#else
    // On ARM *program++ compiles into pretty ideal code without any handholding.
    return *program++;
#endif
}
| 42 | |
Mike Klein | 8a823fa | 2017-04-05 17:29:26 -0400 | [diff] [blame] | 43 | // LazyCtx doesn't do anything unless you call operator T*(), encapsulating the logic |
| 44 | // from above that stages without a context pointer are represented by just 1 void*. |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 45 | struct LazyCtx { |
| 46 | void* ptr; |
| 47 | void**& program; |
| 48 | |
| 49 | explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {} |
| 50 | |
| 51 | template <typename T> |
| 52 | operator T*() { |
| 53 | if (!ptr) { ptr = load_and_inc(program); } |
| 54 | return (T*)ptr; |
| 55 | } |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 56 | }; |
| 57 | |
// A little wrapper macro to name Stages differently depending on the instruction set.
// That lets us link together several options (portable, SSE2, SSE4.1, AVX, HSW, NEON...).
#if !defined(JUMPER)
    #define WRAP(name) sk_##name            // Portable fallback, compiled by the normal toolchain.
#elif defined(__aarch64__)
    #define WRAP(name) sk_##name##_aarch64
#elif defined(__arm__)
    #define WRAP(name) sk_##name##_vfp4
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw
#elif defined(__AVX__)
    #define WRAP(name) sk_##name##_avx
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2
#endif
| 75 | |
// We're finally going to get to what a Stage function looks like!
// It's best to jump down to the #else case first, then to come back up here for AVX.

#if defined(JUMPER) && defined(__AVX__)
    // There's a big cost to switch between SSE and AVX, so we do a little
    // extra work to handle even the jagged <kStride tail in AVX mode.
    // Compared to normal stages, we maintain an extra tail register:
    //    tail == 0 ~~> work on a full kStride pixels
    //    tail != 0 ~~> work on only the first tail pixels
    // tail is always < kStride.
    using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);

    MAYBE_MSABI
    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
        F v{};  // All color registers start out zeroed.
        auto start = (Stage*)load_and_inc(program);
        // Full-stride body...
        while (x + kStride <= limit) {
            start(x,program,k,0, v,v,v,v, v,v,v,v);
            x += kStride;
        }
        // ...then one partial pass for any leftover pixels.
        if (size_t tail = limit - x) {
            start(x,program,k,tail, v,v,v,v, v,v,v,v);
        }
        return limit;
    }

    // STAGE(name) declares a private name##_k kernel, then defines the exported
    // WRAP(name) trampoline that runs the kernel and tail-calls the next stage.
    // The macro invocation is followed by the kernel's body.
    #define STAGE(name)                                                           \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,                \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);     \
        extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail,   \
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
            LazyCtx ctx(program);                                                 \
            name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da);                         \
            auto next = (Stage*)load_and_inc(program);                            \
            next(x,program,k,tail, r,g,b,a, dr,dg,db,da);                         \
        }                                                                         \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,                \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)

#else
    // Other instruction sets (SSE, NEON, portable) can fall back on narrower
    // pipelines cheaply, which frees us to always assume tail==0.

    // Stages tail call between each other by following program as described above.
    // x is our induction variable, stepping forward kStride at a time.
    using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);

    // On Windows, start_pipeline() has a normal Windows ABI, and then the rest is System V.
    MAYBE_MSABI
    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
        F v{};
        auto start = (Stage*)load_and_inc(program);
        while (x + kStride <= limit) {
            start(x,program,k, v,v,v,v, v,v,v,v);
            x += kStride;
        }
        // Note: we return x (how far we got), not limit — the caller handles any remainder.
        return x;
    }

    // This STAGE macro makes it easier to write stages, handling all the Stage chaining for you.
    // The kernel still takes a tail parameter (always 0 here) so stage bodies compile either way.
    #define STAGE(name)                                                           \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,                \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);     \
        extern "C" void WRAP(name)(size_t x, void** program, K* k,                \
                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
            LazyCtx ctx(program);                                                 \
            name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da);                            \
            auto next = (Stage*)load_and_inc(program);                            \
            next(x,program,k, r,g,b,a, dr,dg,db,da);                              \
        }                                                                         \
        SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail,                \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
| 149 | |
// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
// NOTE(review): in the AVX build Stage also carries a size_t tail argument that
// this signature omits — presumably benign with these calling conventions; confirm.
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
| 153 | |
| 154 | |
Mike Klein | 8a823fa | 2017-04-05 17:29:26 -0400 | [diff] [blame] | 155 | // We could start defining normal Stages now. But first, some helper functions. |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 156 | |
// These load() and store() methods are tail-aware,
// but focus mainly on keeping the at-stride tail==0 case fast.

// Load up to kStride values of T into a V, reading only the first `tail` lanes
// when tail != 0 (remaining lanes are zero).
template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        // tail is in [1, kStride-1], so tail-1 is in [0, 6]; each case falls
        // through intentionally to copy all lanes below it.
        switch (tail-1) {
            case 6: v[6] = src[6];  // fall through
            case 5: v[5] = src[5];  // fall through
            case 4: v[4] = src[4];  // fall through
            case 3: v[3] = src[3];  // fall through
            case 2: v[2] = src[2];  // fall through
            case 1: v[1] = src[1];  // fall through
            case 0: v[0] = src[0];
        }
        return v;
    }
#endif
    return unaligned_load<V>(src);
}
| 180 | |
// Store a V out as up to kStride values of T, writing only the first `tail`
// lanes when tail != 0.
template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
#if defined(JUMPER)
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        // Mirror image of load(): intentional fallthrough writes lanes [0, tail).
        switch (tail-1) {
            case 6: dst[6] = v[6];  // fall through
            case 5: dst[5] = v[5];  // fall through
            case 4: dst[4] = v[4];  // fall through
            case 3: dst[3] = v[3];  // fall through
            case 2: dst[2] = v[2];  // fall through
            case 1: dst[1] = v[1];  // fall through
            case 0: dst[0] = v[0];
        }
        return;
    }
#endif
    unaligned_store(dst, v);
}
| 200 | |
// This doesn't look strictly necessary, but without it Clang would generate load() using
// compiler-generated constants that we can't support. This version doesn't need constants.
#if defined(JUMPER) && defined(__AVX__)
    template <>
    inline U8 load(const uint8_t* src, size_t tail) {
        if (__builtin_expect(tail, 0)) {
            // Pack the first `tail` bytes into a uint64_t one byte at a time,
            // then reinterpret those 8 bytes as a U8 vector.
            uint64_t v = 0;
            size_t shift = 0;
            #pragma nounroll
            while (tail --> 0) {
                v |= (uint64_t)*src++ << shift;
                shift += 8;
            }
            return unaligned_load<U8>(&v);
        }
        return unaligned_load<U8>(src);
    }
#endif
| 219 | |
// AVX2 adds some mask loads and stores that make for shorter, faster code.
#if defined(JUMPER) && defined(__AVX2__)
    // Build a per-lane mask with lanes [0, tail) all-ones and the rest zero.
    SI U32 mask(size_t tail) {
        // We go a little out of our way to avoid needing large constant values here.

        // It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
        // Start fully on, then shift away lanes from the top until we've got our mask.
        uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);

        // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
        return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
    }

    template <>
    inline U32 load(const uint32_t* src, size_t tail) {
        __builtin_assume(tail < kStride);
        if (__builtin_expect(tail, 0)) {
            // Masked lanes read as zero; no access past src[tail-1].
            return _mm256_maskload_epi32((const int*)src, mask(tail));
        }
        return unaligned_load<U32>(src);
    }

    template <>
    inline void store(uint32_t* dst, U32 v, size_t tail) {
        __builtin_assume(tail < kStride);
        if (__builtin_expect(tail, 0)) {
            // Masked lanes are left untouched in memory.
            return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
        }
        unaligned_store(dst, v);
    }
#endif
| 251 | |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 252 | SI F from_byte(U8 b) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 253 | return cast(expand(b)) * (1/255.0f); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 254 | } |
Mike Klein | 64b9748 | 2017-03-14 17:35:04 -0700 | [diff] [blame] | 255 | SI void from_565(U16 _565, F* r, F* g, F* b) { |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 256 | U32 wide = expand(_565); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 257 | *r = cast(wide & (31<<11)) * (1.0f / (31<<11)); |
| 258 | *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5)); |
| 259 | *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0)); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 260 | } |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 261 | SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) { |
| 262 | U32 wide = expand(_4444); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 263 | *r = cast(wide & (15<<12)) * (1.0f / (15<<12)); |
| 264 | *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8)); |
| 265 | *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4)); |
| 266 | *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0)); |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 267 | } |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 268 | SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 269 | *r = cast((_8888 ) & 0xff) * (1/255.0f); |
| 270 | *g = cast((_8888 >> 8) & 0xff) * (1/255.0f); |
| 271 | *b = cast((_8888 >> 16) & 0xff) * (1/255.0f); |
| 272 | *a = cast((_8888 >> 24) ) * (1/255.0f); |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 273 | } |
| 274 | |
// Turn (x,y) sample coordinates into linear pixel indices, and hand back the
// base pixel pointer. NOTE(review): no clamping of x/y is visible here —
// presumably the caller has already limited them to the bitmap; confirm.
template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 280 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 281 | // Now finally, normal Stages! |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 282 | |
STAGE(seed_shader) {
    auto y = *(const int*)ctx;  // The context is just the row being shaded.

    // It's important for speed to explicitly cast(x) and cast(y),
    // which has the effect of splatting them to vectors before converting to floats.
    // On Intel this breaks a data dependency on previous loop iterations' registers.
    r = cast(x) + 0.5f + unaligned_load<F>(k->iota_F);  // Per-lane pixel-center x.
    g = cast(y) + 0.5f;                                 // Pixel-center y (same for all lanes).
    b = 1.0f;
    a = 0;
    dr = dg = db = da = 0;
}
| 295 | |
STAGE(dither) {
    auto c = (const SkJumper_DitherCtx*)ctx;

    // Get [(x,y), (x+1,y), (x+2,y), ...] loaded up in integer vectors.
    U32 X = x + unaligned_load<U32>(k->iota_U32),
        Y = (uint32_t)*c->y;

    // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
    // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].

    // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
    Y ^= X;

    // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
    // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda.
    U32 M = (Y & 1) << 5 | (X & 1) << 4
          | (Y & 2) << 2 | (X & 2) << 1
          | (Y & 4) >> 1 | (X & 4) >> 2;

    // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
    // We want to make sure our dither is less than 0.5 in either direction to keep exact values
    // like 0 and 1 unchanged after rounding.
    F dither = cast(M) * (2/128.0f) - (63/128.0f);

    r += c->rate*dither;
    g += c->rate*dither;
    b += c->rate*dither;

    // Keep the dithered premul color in range: each channel in [0, a].
    r = max(0, min(r, a));
    g = max(0, min(g, a));
    b = max(0, min(b, a));
}
| 328 | |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 329 | // load 4 floats from memory, and splat them into r,g,b,a |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 330 | STAGE(constant_color) { |
Mike Klein | 8a823fa | 2017-04-05 17:29:26 -0400 | [diff] [blame] | 331 | auto rgba = (const float*)ctx; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 332 | r = rgba[0]; |
| 333 | g = rgba[1]; |
| 334 | b = rgba[2]; |
| 335 | a = rgba[3]; |
| 336 | } |
| 337 | |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 338 | // load registers r,g,b,a from context (mirrors store_rgba) |
| 339 | STAGE(load_rgba) { |
| 340 | auto ptr = (const float*)ctx; |
| 341 | r = unaligned_load<F>(ptr + 0*kStride); |
| 342 | g = unaligned_load<F>(ptr + 1*kStride); |
| 343 | b = unaligned_load<F>(ptr + 2*kStride); |
| 344 | a = unaligned_load<F>(ptr + 3*kStride); |
| 345 | } |
| 346 | |
| 347 | // store registers r,g,b,a into context (mirrors load_rgba) |
| 348 | STAGE(store_rgba) { |
| 349 | auto ptr = (float*)ctx; |
Mike Klein | c33aa90 | 2017-05-15 10:20:48 -0400 | [diff] [blame] | 350 | unaligned_store(ptr + 0*kStride, r); |
| 351 | unaligned_store(ptr + 1*kStride, g); |
| 352 | unaligned_store(ptr + 2*kStride, b); |
| 353 | unaligned_store(ptr + 3*kStride, a); |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 354 | } |
| 355 | |
// Most blend modes apply the same logic to each channel.
// The macro invocation is followed by the per-channel body:
// s,d are the src/dst channel; sa,da the src/dst alphas (all premultiplied).
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 366 | |
SI F inv(F x) { return 1.0f - x; }  // The usual "one minus" coverage/alpha term.
SI F two(F x) { return x + x; }
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 369 | |
// Porter-Duff compositing modes and friends, one formula per premul channel.
BLEND_MODE(clear)    { return 0; }
BLEND_MODE(srcatop)  { return s*da + d*inv(sa); }
BLEND_MODE(dstatop)  { return d*sa + s*inv(da); }
BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)   { return s * inv(da); }
BLEND_MODE(dstout)   { return d * inv(sa); }
BLEND_MODE(srcover)  { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)  { return mad(s, inv(da), d); }

BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(plus_)    { return s + d; }
BLEND_MODE(screen)   { return s + d - s*d; }
BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); }
#undef BLEND_MODE
// Most other blend modes apply the same logic to colors, and srcover to alpha.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = mad(da, inv(a), a);                \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

BLEND_MODE(darken)     { return s + d -     max(s*da, d*sa) ; }
BLEND_MODE(lighten)    { return s + d -     min(s*da, d*sa) ; }
BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
BLEND_MODE(exclusion)  { return s + d - two(s*d); }
| 402 | |
BLEND_MODE(colorburn) {
    // Branches guard the divisions by s (resp. sa-s in colordodge) at their limits.
    return if_then_else(d == da, d + s*inv(da),
           if_then_else(s ==  0, s + d*inv(sa),
                                 sa*(da - min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(colordodge) {
    return if_then_else(d ==  0, d + s*inv(da),
           if_then_else(s == sa, s + d*inv(sa),
                                 sa*min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
}
BLEND_MODE(hardlight) {
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
}
BLEND_MODE(overlay) {
    // overlay is hardlight with src and dst roles swapped in the condition.
    return s*inv(da) + d*inv(sa)
         + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
}
| 421 | |
BLEND_MODE(softlight) {
    F m  = if_then_else(da > 0, d / da, 0),  // Unpremultiplied dst channel.
      s2 = two(s),
      m4 = two(two(m));

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1.
      darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2.
      liteDst = rcp(rsqrt(m)) - m,                 // Used in case 3 (sqrt(m) - m).
      liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst);  // 2 or 3?
    return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc);  // 1 or (2 or 3)?
}
#undef BLEND_MODE
Mike Klein | 61b8416 | 2017-03-31 11:48:14 -0400 | [diff] [blame] | 438 | |
// We're basing our implemenation of non-separable blend modes on
//    https://www.w3.org/TR/compositing-1/#blendingnonseparable.
// and
//    https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but ES' math has been better simplified.
//
// Anything extra we add beyond that is to make the math work with premul inputs.

// 3-way max/min over the color channels.
SI F max(F r, F g, F b) { return max(r, max(g, b)); }
SI F min(F r, F g, F b) { return min(r, min(g, b)); }

SI F sat(F r, F g, F b) { return max(r,g,b) - min(r,g,b); }       // HSL-style saturation.
SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; }   // Luma weights from the spec.
| 452 | |
| 453 | SI void set_sat(F* r, F* g, F* b, F s) { |
| 454 | F mn = min(*r,*g,*b), |
| 455 | mx = max(*r,*g,*b), |
| 456 | sat = mx - mn; |
| 457 | |
| 458 | // Map min channel to 0, max channel to s, and scale the middle proportionally. |
| 459 | auto scale = [=](F c) { |
| 460 | return if_then_else(sat == 0, 0, (c - mn) * s / sat); |
| 461 | }; |
| 462 | *r = scale(*r); |
| 463 | *g = scale(*g); |
| 464 | *b = scale(*b); |
| 465 | } |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 466 | SI void set_lum(F* r, F* g, F* b, F l) { |
| 467 | F diff = l - lum(*r, *g, *b); |
| 468 | *r += diff; |
| 469 | *g += diff; |
| 470 | *b += diff; |
| 471 | } |
// Pull any channel outside [0,a] back toward the luminance l, preserving l.
SI void clip_color(F* r, F* g, F* b, F a) {
    F mn = min(*r, *g, *b),
      mx = max(*r, *g, *b),
      l  = lum(*r, *g, *b);

    auto clip = [=](F c) {
        c = if_then_else(mn >= 0, c, l + (c - l) * (    l) / (l - mn)   );  // Fix undershoot.
        c = if_then_else(mx >  a,    l + (c - l) * (a - l) / (mx - l), c);  // Fix overshoot.
        c = max(c, 0);  // Sometimes without this we may dip just a little negative.
        return c;
    };
    *r = clip(*r);
    *g = clip(*g);
    *b = clip(*b);
}
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 487 | |
// hue = hue of src, with saturation and luminosity of dst (premul math throughout).
STAGE(hue) {
    F R = r*a,
      G = g*a,
      B = b*a;

    set_sat(&R, &G, &B, sat(dr,dg,db)*a);
    set_lum(&R, &G, &B, lum(dr,dg,db)*a);
    clip_color(&R,&G,&B, a*da);

    // Composite the blended color with the usual non-overlap terms, srcover alpha.
    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}
// saturation = saturation of src, with hue and luminosity of dst.
STAGE(saturation) {
    F R = dr*a,
      G = dg*a,
      B = db*a;

    set_sat(&R, &G, &B, sat( r, g, b)*da);
    set_lum(&R, &G, &B, lum(dr,dg,db)* a);  // (This is not redundant.)
    clip_color(&R,&G,&B, a*da);

    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}
// color = hue and saturation of src, with luminosity of dst.
STAGE(color) {
    F R = r*da,
      G = g*da,
      B = b*da;

    set_lum(&R, &G, &B, lum(dr,dg,db)*a);
    clip_color(&R,&G,&B, a*da);

    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}
// luminosity = luminosity of src, with hue and saturation of dst.
STAGE(luminosity) {
    F R = dr*a,
      G = dg*a,
      B = db*a;

    set_lum(&R, &G, &B, lum(r,g,b)*da);
    clip_color(&R,&G,&B, a*da);

    r = r*inv(da) + dr*inv(a) + R;
    g = g*inv(da) + dg*inv(a) + G;
    b = b*inv(da) + db*inv(a) + B;
    a = a + da - a*da;
}
| 542 | |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 543 | STAGE(clamp_0) { |
| 544 | r = max(r, 0); |
| 545 | g = max(g, 0); |
| 546 | b = max(b, 0); |
| 547 | a = max(a, 0); |
| 548 | } |
| 549 | |
| 550 | STAGE(clamp_1) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 551 | r = min(r, 1.0f); |
| 552 | g = min(g, 1.0f); |
| 553 | b = min(b, 1.0f); |
| 554 | a = min(a, 1.0f); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 555 | } |
| 556 | |
// Force a <= 1, then r,g,b <= a (maintains the premul invariant).
STAGE(clamp_a) {
    a = min(a, 1.0f);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}
| 563 | |
Mike Klein | d9e8225 | 2017-02-22 14:17:32 -0500 | [diff] [blame] | 564 | STAGE(set_rgb) { |
| 565 | auto rgb = (const float*)ctx; |
| 566 | r = rgb[0]; |
| 567 | g = rgb[1]; |
| 568 | b = rgb[2]; |
| 569 | } |
| 570 | STAGE(swap_rb) { |
| 571 | auto tmp = r; |
| 572 | r = b; |
| 573 | b = tmp; |
| 574 | } |
| 575 | |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 576 | STAGE(swap) { |
| 577 | auto swap = [](F& v, F& dv) { |
| 578 | auto tmp = v; |
| 579 | v = dv; |
| 580 | dv = tmp; |
| 581 | }; |
| 582 | swap(r, dr); |
| 583 | swap(g, dg); |
| 584 | swap(b, db); |
| 585 | swap(a, da); |
| 586 | } |
| 587 | STAGE(move_src_dst) { |
| 588 | dr = r; |
| 589 | dg = g; |
| 590 | db = b; |
| 591 | da = a; |
| 592 | } |
| 593 | STAGE(move_dst_src) { |
| 594 | r = dr; |
| 595 | g = dg; |
| 596 | b = db; |
| 597 | a = da; |
| 598 | } |
| 599 | |
| 600 | STAGE(premul) { |
| 601 | r = r * a; |
| 602 | g = g * a; |
| 603 | b = b * a; |
| 604 | } |
| 605 | STAGE(unpremul) { |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 606 | auto scale = if_then_else(a == 0, 0, 1.0f / a); |
| 607 | r *= scale; |
| 608 | g *= scale; |
| 609 | b *= scale; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 610 | } |
| 611 | |
// Decode sRGB-encoded channels to linear, leaving alpha untouched.
// The 2.4-power segment is replaced by a cheap fitted polynomial rather than
// a true powf(); the constants below are the fit, not the exact sRGB spec values.
STAGE(from_srgb) {
    auto fn = [&](F s) {
        auto lo = s * (1/12.92f);                               // linear segment near zero
        auto hi = mad(s*s, mad(s, 0.3000f, 0.6975f), 0.0025f);  // polynomial fit for the curved segment
        return if_then_else(s < 0.055f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
// Encode linear channels to sRGB, leaving alpha untouched.
// The 1/2.4-power segment is approximated by a rational function of rsqrt(l),
// using the fast rsqrt()/rcp() ops instead of a true powf().
STAGE(to_srgb) {
    auto fn = [&](F l) {
        F t = rsqrt(l);
        auto lo = l * 12.92f;                                     // linear segment near zero
        auto hi = mad(t, mad(t, -0.0024542345f, 0.013832027f), 1.1334244f)
                * rcp(0.14513608f + t);                           // fitted rational approximation
        return if_then_else(l < 0.00465985f, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}
| 634 | |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 635 | STAGE(rgb_to_hsl) { |
| 636 | F mx = max(max(r,g), b), |
| 637 | mn = min(min(r,g), b), |
| 638 | d = mx - mn, |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 639 | d_rcp = 1.0f / d; |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 640 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 641 | F h = (1/6.0f) * |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 642 | if_then_else(mx == mn, 0, |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 643 | if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0), |
| 644 | if_then_else(mx == g, (b-r)*d_rcp + 2.0f, |
| 645 | (r-g)*d_rcp + 4.0f))); |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 646 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 647 | F l = (mx + mn) * 0.5f; |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 648 | F s = if_then_else(mx == mn, 0, |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 649 | d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn)); |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 650 | |
| 651 | r = h; |
| 652 | g = s; |
| 653 | b = l; |
| 654 | } |
// Convert (h,s,l) in r,g,b back to RGB, inverse of rgb_to_hsl.
STAGE(hsl_to_rgb) {
    F h = r,
      s = g,
      l = b;

    // q and p bracket the output range for this lightness/saturation.
    F q = l + if_then_else(l >= 0.5f, s - l*s, l*s),
      p = 2.0f*l - q;

    // Piecewise-linear hue ramp, evaluated branchlessly from highest
    // threshold down so the earliest matching segment wins.
    auto hue_to_rgb = [&](F t) {
        t = fract(t);

        F r = p;
        r = if_then_else(t >= 4/6.0f, r, p + (q-p)*(4.0f - 6.0f*t));
        r = if_then_else(t >= 3/6.0f, r, q);
        r = if_then_else(t >= 1/6.0f, r, p + (q-p)*( 6.0f*t));
        return r;
    };

    // s == 0 means grey: all channels equal lightness.
    r = if_then_else(s == 0, l, hue_to_rgb(h + (1/3.0f)));
    g = if_then_else(s == 0, l, hue_to_rgb(h          ));
    b = if_then_else(s == 0, l, hue_to_rgb(h - (1/3.0f)));
}
| 677 | |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 678 | STAGE(scale_1_float) { |
| 679 | auto c = *(const float*)ctx; |
| 680 | |
| 681 | r = r * c; |
| 682 | g = g * c; |
| 683 | b = b * c; |
| 684 | a = a * c; |
| 685 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 686 | STAGE(scale_u8) { |
| 687 | auto ptr = *(const uint8_t**)ctx + x; |
| 688 | |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 689 | auto scales = load<U8>(ptr, tail); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 690 | auto c = from_byte(scales); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 691 | |
| 692 | r = r * c; |
| 693 | g = g * c; |
| 694 | b = b * c; |
| 695 | a = a * c; |
| 696 | } |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 697 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 698 | SI F lerp(F from, F to, F t) { |
| 699 | return mad(to-from, t, from); |
| 700 | } |
| 701 | |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 702 | STAGE(lerp_1_float) { |
| 703 | auto c = *(const float*)ctx; |
| 704 | |
| 705 | r = lerp(dr, r, c); |
| 706 | g = lerp(dg, g, c); |
| 707 | b = lerp(db, b, c); |
| 708 | a = lerp(da, a, c); |
| 709 | } |
Mike Klein | 2b76736 | 2017-02-22 13:52:40 -0500 | [diff] [blame] | 710 | STAGE(lerp_u8) { |
| 711 | auto ptr = *(const uint8_t**)ctx + x; |
| 712 | |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 713 | auto scales = load<U8>(ptr, tail); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 714 | auto c = from_byte(scales); |
Mike Klein | 2b76736 | 2017-02-22 13:52:40 -0500 | [diff] [blame] | 715 | |
| 716 | r = lerp(dr, r, c); |
| 717 | g = lerp(dg, g, c); |
| 718 | b = lerp(db, b, c); |
| 719 | a = lerp(da, a, c); |
| 720 | } |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 721 | STAGE(lerp_565) { |
| 722 | auto ptr = *(const uint16_t**)ctx + x; |
| 723 | |
| 724 | F cr,cg,cb; |
Mike Klein | 5224f46 | 2017-03-07 17:29:54 -0500 | [diff] [blame] | 725 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 726 | |
| 727 | r = lerp(dr, r, cr); |
| 728 | g = lerp(dg, g, cg); |
| 729 | b = lerp(db, b, cb); |
bungeman | 6f9f259 | 2017-05-10 13:50:12 -0400 | [diff] [blame] | 730 | a = max(lerp(da, a, cr), lerp(da, a, cg), lerp(da, a, cb)); |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 731 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 732 | |
| 733 | STAGE(load_tables) { |
Mike Klein | a3735cd | 2017-04-17 13:19:05 -0400 | [diff] [blame] | 734 | auto c = (const SkJumper_LoadTablesCtx*)ctx; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 735 | |
Mike Klein | a3735cd | 2017-04-17 13:19:05 -0400 | [diff] [blame] | 736 | auto px = load<U32>((const uint32_t*)c->src + x, tail); |
Mike Klein | 0aa742f | 2017-04-27 13:36:57 -0400 | [diff] [blame] | 737 | r = gather(c->r, (px ) & 0xff); |
| 738 | g = gather(c->g, (px >> 8) & 0xff); |
| 739 | b = gather(c->b, (px >> 16) & 0xff); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 740 | a = cast( (px >> 24)) * (1/255.0f); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 741 | } |
Mike Klein | a3735cd | 2017-04-17 13:19:05 -0400 | [diff] [blame] | 742 | STAGE(load_tables_u16_be) { |
| 743 | auto c = (const SkJumper_LoadTablesCtx*)ctx; |
| 744 | auto ptr = (const uint16_t*)c->src + 4*x; |
| 745 | |
| 746 | U16 R,G,B,A; |
| 747 | load4(ptr, tail, &R,&G,&B,&A); |
| 748 | |
Mike Klein | 0aa742f | 2017-04-27 13:36:57 -0400 | [diff] [blame] | 749 | // c->src is big-endian, so & 0xff grabs the 8 most signficant bits. |
| 750 | r = gather(c->r, expand(R) & 0xff); |
| 751 | g = gather(c->g, expand(G) & 0xff); |
| 752 | b = gather(c->b, expand(B) & 0xff); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 753 | a = (1/65535.0f) * cast(expand(bswap(A))); |
Mike Klein | a3735cd | 2017-04-17 13:19:05 -0400 | [diff] [blame] | 754 | } |
// Load big-endian RGB (no alpha) u16 pixels, remapping through lookup tables.
STAGE(load_tables_rgb_u16_be) {
    auto c = (const SkJumper_LoadTablesCtx*)ctx;
    auto ptr = (const uint16_t*)c->src + 3*x;    // 3 channels per pixel

    U16 R,G,B;
    load3(ptr, tail, &R,&G,&B);

    // c->src is big-endian, so & 0xff grabs the 8 most signficant bits.
    r = gather(c->r, expand(R) & 0xff);
    g = gather(c->g, expand(G) & 0xff);
    b = gather(c->b, expand(B) & 0xff);
    a = 1.0f;                                    // opaque: source has no alpha
}
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 768 | |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 769 | STAGE(byte_tables) { |
| 770 | struct Tables { const uint8_t *r, *g, *b, *a; }; |
| 771 | auto tables = (const Tables*)ctx; |
| 772 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 773 | r = from_byte(gather(tables->r, round(r, 255.0f))); |
| 774 | g = from_byte(gather(tables->g, round(g, 255.0f))); |
| 775 | b = from_byte(gather(tables->b, round(b, 255.0f))); |
| 776 | a = from_byte(gather(tables->a, round(a, 255.0f))); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 777 | } |
| 778 | |
// Remap r,g,b through n-entry byte lookup tables; alpha is untouched.
STAGE(byte_tables_rgb) {
    // NOTE(review): this local struct must mirror the layout the caller packs
    // into ctx — confirm against the stage's setup code.
    struct Tables { const uint8_t *r, *g, *b; int n; };
    auto tables = (const Tables*)ctx;

    // Scale [0,1] to the last valid table index, n-1.
    F scale = tables->n - 1;
    r = from_byte(gather(tables->r, round(r, scale)));
    g = from_byte(gather(tables->g, round(g, scale)));
    b = from_byte(gather(tables->b, round(b, scale)));
}
| 788 | |
Mike Klein | c7d9c0b | 2017-04-17 14:43:59 -0400 | [diff] [blame] | 789 | SI F table(F v, const SkJumper_TableCtx* ctx) { |
| 790 | return gather(ctx->table, round(v, ctx->size - 1)); |
| 791 | } |
// Apply the lookup table in ctx to a single channel.
STAGE(table_r) { r = table(r, ctx); }
STAGE(table_g) { g = table(g, ctx); }
STAGE(table_b) { b = table(b, ctx); }
STAGE(table_a) { a = table(a, ctx); }
| 796 | |
Mike Klein | 4437517 | 2017-04-17 19:32:05 -0400 | [diff] [blame] | 797 | SI F parametric(F v, const SkJumper_ParametricTransferFunction* ctx) { |
| 798 | F r = if_then_else(v <= ctx->D, mad(ctx->C, v, ctx->F) |
| 799 | , approx_powf(mad(ctx->A, v, ctx->B), ctx->G) + ctx->E); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 800 | return min(max(r, 0), 1.0f); // Clamp to [0,1], with argument order mattering to handle NaN. |
Mike Klein | 4437517 | 2017-04-17 19:32:05 -0400 | [diff] [blame] | 801 | } |
// Apply the parametric transfer function in ctx to a single channel.
STAGE(parametric_r) { r = parametric(r, ctx); }
STAGE(parametric_g) { g = parametric(g, ctx); }
STAGE(parametric_b) { b = parametric(b, ctx); }
STAGE(parametric_a) { a = parametric(a, ctx); }
| 806 | |
Mike Klein | 4e3e9f8 | 2017-04-20 11:04:29 -0400 | [diff] [blame] | 807 | STAGE(lab_to_xyz) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 808 | F L = r * 100.0f, |
| 809 | A = g * 255.0f - 128.0f, |
| 810 | B = b * 255.0f - 128.0f; |
Mike Klein | 4e3e9f8 | 2017-04-20 11:04:29 -0400 | [diff] [blame] | 811 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 812 | F Y = (L + 16.0f) * (1/116.0f), |
| 813 | X = Y + A*(1/500.0f), |
| 814 | Z = Y - B*(1/200.0f); |
Mike Klein | 4e3e9f8 | 2017-04-20 11:04:29 -0400 | [diff] [blame] | 815 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 816 | X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f)); |
| 817 | Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f)); |
| 818 | Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f)); |
Mike Klein | 4e3e9f8 | 2017-04-20 11:04:29 -0400 | [diff] [blame] | 819 | |
| 820 | // Adjust to D50 illuminant. |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 821 | r = X * 0.96422f; |
| 822 | g = Y ; |
| 823 | b = Z * 0.82521f; |
Mike Klein | 4e3e9f8 | 2017-04-20 11:04:29 -0400 | [diff] [blame] | 824 | } |
| 825 | |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 826 | STAGE(load_a8) { |
| 827 | auto ptr = *(const uint8_t**)ctx + x; |
| 828 | |
| 829 | r = g = b = 0.0f; |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 830 | a = from_byte(load<U8>(ptr, tail)); |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 831 | } |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 832 | STAGE(gather_a8) { |
| 833 | const uint8_t* ptr; |
| 834 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 835 | r = g = b = 0.0f; |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 836 | a = from_byte(gather(ptr, ix)); |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 837 | } |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 838 | STAGE(store_a8) { |
| 839 | auto ptr = *(uint8_t**)ctx + x; |
| 840 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 841 | U8 packed = pack(pack(round(a, 255.0f))); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 842 | store(ptr, packed, tail); |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 843 | } |
| 844 | |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 845 | STAGE(load_g8) { |
| 846 | auto ptr = *(const uint8_t**)ctx + x; |
| 847 | |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 848 | r = g = b = from_byte(load<U8>(ptr, tail)); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 849 | a = 1.0f; |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 850 | } |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 851 | STAGE(gather_g8) { |
| 852 | const uint8_t* ptr; |
| 853 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 854 | r = g = b = from_byte(gather(ptr, ix)); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 855 | a = 1.0f; |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 856 | } |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 857 | |
Mike Klein | 7d3d872 | 2017-04-06 17:53:18 -0400 | [diff] [blame] | 858 | STAGE(gather_i8) { |
Mike Klein | 994ef97 | 2017-04-12 11:51:11 -0400 | [diff] [blame] | 859 | auto c = (const SkJumper_GatherCtx*)ctx; |
Mike Klein | 7d3d872 | 2017-04-06 17:53:18 -0400 | [diff] [blame] | 860 | const uint8_t* ptr; |
| 861 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 862 | ix = expand(gather(ptr, ix)); |
| 863 | from_8888(gather(c->ctable, ix), &r,&g,&b,&a); |
| 864 | } |
| 865 | |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 866 | STAGE(load_565) { |
| 867 | auto ptr = *(const uint16_t**)ctx + x; |
| 868 | |
Mike Klein | 5224f46 | 2017-03-07 17:29:54 -0500 | [diff] [blame] | 869 | from_565(load<U16>(ptr, tail), &r,&g,&b); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 870 | a = 1.0f; |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 871 | } |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 872 | STAGE(gather_565) { |
| 873 | const uint16_t* ptr; |
| 874 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 875 | from_565(gather(ptr, ix), &r,&g,&b); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 876 | a = 1.0f; |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 877 | } |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 878 | STAGE(store_565) { |
| 879 | auto ptr = *(uint16_t**)ctx + x; |
| 880 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 881 | U16 px = pack( round(r, 31.0f) << 11 |
| 882 | | round(g, 63.0f) << 5 |
| 883 | | round(b, 31.0f) ); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 884 | store(ptr, px, tail); |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 885 | } |
| 886 | |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 887 | STAGE(load_4444) { |
| 888 | auto ptr = *(const uint16_t**)ctx + x; |
| 889 | from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); |
| 890 | } |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 891 | STAGE(gather_4444) { |
| 892 | const uint16_t* ptr; |
| 893 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 894 | from_4444(gather(ptr, ix), &r,&g,&b,&a); |
| 895 | } |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 896 | STAGE(store_4444) { |
| 897 | auto ptr = *(uint16_t**)ctx + x; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 898 | U16 px = pack( round(r, 15.0f) << 12 |
| 899 | | round(g, 15.0f) << 8 |
| 900 | | round(b, 15.0f) << 4 |
| 901 | | round(a, 15.0f) ); |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 902 | store(ptr, px, tail); |
| 903 | } |
| 904 | |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 905 | STAGE(load_8888) { |
| 906 | auto ptr = *(const uint32_t**)ctx + x; |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 907 | from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); |
| 908 | } |
| 909 | STAGE(gather_8888) { |
| 910 | const uint32_t* ptr; |
| 911 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 912 | from_8888(gather(ptr, ix), &r,&g,&b,&a); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 913 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 914 | STAGE(store_8888) { |
| 915 | auto ptr = *(uint32_t**)ctx + x; |
| 916 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 917 | U32 px = round(r, 255.0f) |
| 918 | | round(g, 255.0f) << 8 |
| 919 | | round(b, 255.0f) << 16 |
| 920 | | round(a, 255.0f) << 24; |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 921 | store(ptr, px, tail); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 922 | } |
| 923 | |
| 924 | STAGE(load_f16) { |
| 925 | auto ptr = *(const uint64_t**)ctx + x; |
| 926 | |
Mike Klein | 114e6b3 | 2017-04-03 22:21:15 -0400 | [diff] [blame] | 927 | U16 R,G,B,A; |
Mike Klein | fa6eb91 | 2017-04-05 10:18:27 -0400 | [diff] [blame] | 928 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
Mike Klein | 114e6b3 | 2017-04-03 22:21:15 -0400 | [diff] [blame] | 929 | r = from_half(R); |
| 930 | g = from_half(G); |
| 931 | b = from_half(B); |
| 932 | a = from_half(A); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 933 | } |
Mike Klein | 5f055f0 | 2017-04-06 20:02:11 -0400 | [diff] [blame] | 934 | STAGE(gather_f16) { |
| 935 | const uint64_t* ptr; |
| 936 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 937 | auto px = gather(ptr, ix); |
| 938 | |
| 939 | U16 R,G,B,A; |
| 940 | load4((const uint16_t*)&px,0, &R,&G,&B,&A); |
| 941 | r = from_half(R); |
| 942 | g = from_half(G); |
| 943 | b = from_half(B); |
| 944 | a = from_half(A); |
| 945 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 946 | STAGE(store_f16) { |
| 947 | auto ptr = *(uint64_t**)ctx + x; |
Mike Klein | fa6eb91 | 2017-04-05 10:18:27 -0400 | [diff] [blame] | 948 | store4((uint16_t*)ptr,tail, to_half(r) |
| 949 | , to_half(g) |
| 950 | , to_half(b) |
| 951 | , to_half(a)); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 952 | } |
| 953 | |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 954 | STAGE(load_u16_be) { |
Mike Klein | b382173 | 2017-04-17 10:58:05 -0400 | [diff] [blame] | 955 | auto ptr = *(const uint16_t**)ctx + 4*x; |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 956 | |
| 957 | U16 R,G,B,A; |
Mike Klein | b382173 | 2017-04-17 10:58:05 -0400 | [diff] [blame] | 958 | load4(ptr,tail, &R,&G,&B,&A); |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 959 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 960 | r = (1/65535.0f) * cast(expand(bswap(R))); |
| 961 | g = (1/65535.0f) * cast(expand(bswap(G))); |
| 962 | b = (1/65535.0f) * cast(expand(bswap(B))); |
| 963 | a = (1/65535.0f) * cast(expand(bswap(A))); |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 964 | } |
Mike Klein | b382173 | 2017-04-17 10:58:05 -0400 | [diff] [blame] | 965 | STAGE(load_rgb_u16_be) { |
| 966 | auto ptr = *(const uint16_t**)ctx + 3*x; |
| 967 | |
| 968 | U16 R,G,B; |
| 969 | load3(ptr,tail, &R,&G,&B); |
| 970 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 971 | r = (1/65535.0f) * cast(expand(bswap(R))); |
| 972 | g = (1/65535.0f) * cast(expand(bswap(G))); |
| 973 | b = (1/65535.0f) * cast(expand(bswap(B))); |
| 974 | a = 1.0f; |
Mike Klein | b382173 | 2017-04-17 10:58:05 -0400 | [diff] [blame] | 975 | } |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 976 | STAGE(store_u16_be) { |
Mike Klein | b382173 | 2017-04-17 10:58:05 -0400 | [diff] [blame] | 977 | auto ptr = *(uint16_t**)ctx + 4*x; |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 978 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 979 | U16 R = bswap(pack(round(r, 65535.0f))), |
| 980 | G = bswap(pack(round(g, 65535.0f))), |
| 981 | B = bswap(pack(round(b, 65535.0f))), |
| 982 | A = bswap(pack(round(a, 65535.0f))); |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 983 | |
Mike Klein | b382173 | 2017-04-17 10:58:05 -0400 | [diff] [blame] | 984 | store4(ptr,tail, R,G,B,A); |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 985 | } |
| 986 | |
Mike Klein | 14987eb | 2017-04-06 10:22:26 -0400 | [diff] [blame] | 987 | STAGE(load_f32) { |
| 988 | auto ptr = *(const float**)ctx + 4*x; |
| 989 | load4(ptr,tail, &r,&g,&b,&a); |
| 990 | } |
Mike Klein | 94fc0fe | 2017-03-03 14:05:32 -0500 | [diff] [blame] | 991 | STAGE(store_f32) { |
| 992 | auto ptr = *(float**)ctx + 4*x; |
Mike Klein | fa6eb91 | 2017-04-05 10:18:27 -0400 | [diff] [blame] | 993 | store4(ptr,tail, r,g,b,a); |
Mike Klein | 94fc0fe | 2017-03-03 14:05:32 -0500 | [diff] [blame] | 994 | } |
| 995 | |
Mike Klein | 64b9748 | 2017-03-14 17:35:04 -0700 | [diff] [blame] | 996 | SI F clamp(F v, float limit) { |
Mike Klein | 8ca3356 | 2017-05-23 08:07:43 -0400 | [diff] [blame] | 997 | return min(max(0, v), limit); |
Mike Klein | 9fe1b22 | 2017-02-24 11:04:50 -0500 | [diff] [blame] | 998 | } |
Mike Klein | 64b9748 | 2017-03-14 17:35:04 -0700 | [diff] [blame] | 999 | SI F repeat(F v, float limit) { |
Mike Klein | 8ca3356 | 2017-05-23 08:07:43 -0400 | [diff] [blame] | 1000 | return v - floor_(v/limit)*limit; |
Mike Klein | 9fe1b22 | 2017-02-24 11:04:50 -0500 | [diff] [blame] | 1001 | } |
Mike Klein | 64b9748 | 2017-03-14 17:35:04 -0700 | [diff] [blame] | 1002 | SI F mirror(F v, float limit) { |
Mike Klein | 8ca3356 | 2017-05-23 08:07:43 -0400 | [diff] [blame] | 1003 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)/(limit+limit)) - limit ); |
Mike Klein | 9fe1b22 | 2017-02-24 11:04:50 -0500 | [diff] [blame] | 1004 | } |
Mike Klein | 5224f46 | 2017-03-07 17:29:54 -0500 | [diff] [blame] | 1005 | STAGE(clamp_x) { r = clamp (r, *(const float*)ctx); } |
| 1006 | STAGE(clamp_y) { g = clamp (g, *(const float*)ctx); } |
| 1007 | STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); } |
| 1008 | STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); } |
| 1009 | STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); } |
| 1010 | STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1011 | |
Mike Klein | 8ca3356 | 2017-05-23 08:07:43 -0400 | [diff] [blame] | 1012 | STAGE( clamp_x_1) { r = clamp (r, 1.0f); } |
| 1013 | STAGE(repeat_x_1) { r = repeat(r, 1.0f); } |
Mike Klein | 9f85d68 | 2017-05-23 07:52:01 -0400 | [diff] [blame] | 1014 | STAGE(mirror_x_1) { r = abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f ); } |
| 1015 | |
Mike Klein | e9ed07d | 2017-03-07 12:28:11 -0500 | [diff] [blame] | 1016 | STAGE(luminance_to_alpha) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1017 | a = r*0.2126f + g*0.7152f + b*0.0722f; |
Mike Klein | e9ed07d | 2017-03-07 12:28:11 -0500 | [diff] [blame] | 1018 | r = g = b = 0; |
| 1019 | } |
| 1020 | |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1021 | STAGE(matrix_2x3) { |
| 1022 | auto m = (const float*)ctx; |
| 1023 | |
Mike Klein | b8d5275 | 2017-02-16 10:21:29 -0500 | [diff] [blame] | 1024 | auto R = mad(r,m[0], mad(g,m[2], m[4])), |
| 1025 | G = mad(r,m[1], mad(g,m[3], m[5])); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1026 | r = R; |
| 1027 | g = G; |
| 1028 | } |
| 1029 | STAGE(matrix_3x4) { |
| 1030 | auto m = (const float*)ctx; |
| 1031 | |
Mike Klein | b8d5275 | 2017-02-16 10:21:29 -0500 | [diff] [blame] | 1032 | auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))), |
| 1033 | G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))), |
| 1034 | B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11]))); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1035 | r = R; |
| 1036 | g = G; |
| 1037 | b = B; |
| 1038 | } |
Mike Klein | e9ed07d | 2017-03-07 12:28:11 -0500 | [diff] [blame] | 1039 | STAGE(matrix_4x5) { |
| 1040 | auto m = (const float*)ctx; |
| 1041 | |
| 1042 | auto R = mad(r,m[0], mad(g,m[4], mad(b,m[ 8], mad(a,m[12], m[16])))), |
| 1043 | G = mad(r,m[1], mad(g,m[5], mad(b,m[ 9], mad(a,m[13], m[17])))), |
| 1044 | B = mad(r,m[2], mad(g,m[6], mad(b,m[10], mad(a,m[14], m[18])))), |
| 1045 | A = mad(r,m[3], mad(g,m[7], mad(b,m[11], mad(a,m[15], m[19])))); |
| 1046 | r = R; |
| 1047 | g = G; |
| 1048 | b = B; |
| 1049 | a = A; |
| 1050 | } |
Mike Reed | 0264095 | 2017-05-19 15:32:13 -0400 | [diff] [blame] | 1051 | STAGE(matrix_4x3) { |
| 1052 | auto m = (const float*)ctx; |
| 1053 | auto X = r, |
| 1054 | Y = g; |
| 1055 | |
| 1056 | r = mad(X, m[0], mad(Y, m[4], m[ 8])); |
| 1057 | g = mad(X, m[1], mad(Y, m[5], m[ 9])); |
| 1058 | b = mad(X, m[2], mad(Y, m[6], m[10])); |
| 1059 | a = mad(X, m[3], mad(Y, m[7], m[11])); |
| 1060 | } |
Mike Klein | 11d2df0 | 2017-02-24 11:51:36 -0500 | [diff] [blame] | 1061 | STAGE(matrix_perspective) { |
| 1062 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
| 1063 | auto m = (const float*)ctx; |
| 1064 | |
| 1065 | auto R = mad(r,m[0], mad(g,m[1], m[2])), |
| 1066 | G = mad(r,m[3], mad(g,m[4], m[5])), |
| 1067 | Z = mad(r,m[6], mad(g,m[7], m[8])); |
| 1068 | r = R * rcp(Z); |
| 1069 | g = G * rcp(Z); |
| 1070 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1071 | |
Herb Derby | 4de1304 | 2017-05-15 10:49:39 -0400 | [diff] [blame] | 1072 | SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t, |
| 1073 | F* r, F* g, F* b, F* a) { |
| 1074 | F fr, br, fg, bg, fb, bb, fa, ba; |
| 1075 | #if defined(JUMPER) && defined(__AVX2__) |
| 1076 | if (c->stopCount <=8) { |
| 1077 | fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx); |
| 1078 | br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx); |
| 1079 | fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx); |
| 1080 | bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx); |
| 1081 | fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx); |
| 1082 | bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx); |
| 1083 | fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx); |
| 1084 | ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx); |
| 1085 | } else |
| 1086 | #endif |
| 1087 | { |
| 1088 | fr = gather(c->fs[0], idx); |
| 1089 | br = gather(c->bs[0], idx); |
| 1090 | fg = gather(c->fs[1], idx); |
| 1091 | bg = gather(c->bs[1], idx); |
| 1092 | fb = gather(c->fs[2], idx); |
| 1093 | bb = gather(c->bs[2], idx); |
| 1094 | fa = gather(c->fs[3], idx); |
| 1095 | ba = gather(c->bs[3], idx); |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 1096 | } |
| 1097 | |
Herb Derby | 4de1304 | 2017-05-15 10:49:39 -0400 | [diff] [blame] | 1098 | *r = mad(t, fr, br); |
| 1099 | *g = mad(t, fg, bg); |
| 1100 | *b = mad(t, fb, bb); |
| 1101 | *a = mad(t, fa, ba); |
| 1102 | } |
| 1103 | |
| 1104 | STAGE(evenly_spaced_gradient) { |
| 1105 | auto c = (const SkJumper_GradientCtx*)ctx; |
| 1106 | auto t = r; |
| 1107 | auto idx = trunc_(t * (c->stopCount-1)); |
| 1108 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
| 1109 | } |
| 1110 | |
// Gradient with arbitrary stop positions in c->ts: count how many stop
// positions t has passed to find its segment, then evaluate that segment.
STAGE(gradient) {
    auto c = (const SkJumper_GradientCtx*)ctx;
    auto t = r;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}
| 1123 | |
Mike Klein | 5c7960b | 2017-05-11 10:59:22 -0400 | [diff] [blame] | 1124 | STAGE(evenly_spaced_2_stop_gradient) { |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 1125 | struct Ctx { float f[4], b[4]; }; |
Mike Klein | 8a823fa | 2017-04-05 17:29:26 -0400 | [diff] [blame] | 1126 | auto c = (const Ctx*)ctx; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1127 | |
| 1128 | auto t = r; |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 1129 | r = mad(t, c->f[0], c->b[0]); |
| 1130 | g = mad(t, c->f[1], c->b[1]); |
| 1131 | b = mad(t, c->f[2], c->b[2]); |
| 1132 | a = mad(t, c->f[3], c->b[3]); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1133 | } |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1134 | |
Mike Klein | 5c7960b | 2017-05-11 10:59:22 -0400 | [diff] [blame] | 1135 | STAGE(xy_to_unit_angle) { |
Herb Derby | 7eb8698 | 2017-05-02 19:04:39 -0400 | [diff] [blame] | 1136 | F X = r, |
| 1137 | Y = g; |
| 1138 | F xabs = abs_(X), |
| 1139 | yabs = abs_(Y); |
| 1140 | |
| 1141 | F slope = min(xabs, yabs)/max(xabs, yabs); |
| 1142 | F s = slope * slope; |
| 1143 | |
| 1144 | // Use a 7th degree polynomial to approximate atan. |
| 1145 | // This was generated using sollya.gforge.inria.fr. |
| 1146 | // A float optimized polynomial was generated using the following command. |
| 1147 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
| 1148 | F phi = slope |
| 1149 | * (0.15912117063999176025390625f + s |
| 1150 | * (-5.185396969318389892578125e-2f + s |
| 1151 | * (2.476101927459239959716796875e-2f + s |
| 1152 | * (-7.0547382347285747528076171875e-3f)))); |
| 1153 | |
| 1154 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
| 1155 | phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi); |
| 1156 | phi = if_then_else(Y < 0.0f , 1.0f - phi , phi); |
| 1157 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
| 1158 | r = phi; |
| 1159 | } |
| 1160 | |
Herb Derby | 090fbf8 | 2017-05-08 15:10:36 -0400 | [diff] [blame] | 1161 | STAGE(xy_to_radius) { |
| 1162 | F X2 = r * r, |
| 1163 | Y2 = g * g; |
Mike Klein | fd35c74 | 2017-05-15 15:55:54 -0400 | [diff] [blame] | 1164 | r = sqrt_(X2 + Y2); |
Herb Derby | 090fbf8 | 2017-05-08 15:10:36 -0400 | [diff] [blame] | 1165 | } |
| 1166 | |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1167 | STAGE(save_xy) { |
| 1168 | auto c = (SkJumper_SamplerCtx*)ctx; |
| 1169 | |
| 1170 | // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy). |
| 1171 | // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid |
| 1172 | // surrounding (x,y) at (0.5,0.5) off-center. |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1173 | F fx = fract(r + 0.5f), |
| 1174 | fy = fract(g + 0.5f); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1175 | |
| 1176 | // Samplers will need to load x and fx, or y and fy. |
Mike Klein | c33aa90 | 2017-05-15 10:20:48 -0400 | [diff] [blame] | 1177 | unaligned_store(c->x, r); |
| 1178 | unaligned_store(c->y, g); |
| 1179 | unaligned_store(c->fx, fx); |
| 1180 | unaligned_store(c->fy, fy); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1181 | } |
| 1182 | |
| 1183 | STAGE(accumulate) { |
| 1184 | auto c = (const SkJumper_SamplerCtx*)ctx; |
| 1185 | |
| 1186 | // Bilinear and bicubic filters are both separable, so we produce independent contributions |
| 1187 | // from x and y, multiplying them together here to get each pixel's total scale factor. |
| 1188 | auto scale = unaligned_load<F>(c->scalex) |
| 1189 | * unaligned_load<F>(c->scaley); |
| 1190 | dr = mad(scale, r, dr); |
| 1191 | dg = mad(scale, g, dg); |
| 1192 | db = mad(scale, b, db); |
| 1193 | da = mad(scale, a, da); |
| 1194 | } |
| 1195 | |
| 1196 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
| 1197 | // are combined in direct proportion to their area overlapping that logical query pixel. |
| 1198 | // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x. |
| 1199 | // The y-axis is symmetric. |
| 1200 | |
| 1201 | template <int kScale> |
| 1202 | SI void bilinear_x(SkJumper_SamplerCtx* ctx, F* x) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1203 | *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1204 | F fx = unaligned_load<F>(ctx->fx); |
| 1205 | |
| 1206 | F scalex; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1207 | if (kScale == -1) { scalex = 1.0f - fx; } |
| 1208 | if (kScale == +1) { scalex = fx; } |
Mike Klein | c33aa90 | 2017-05-15 10:20:48 -0400 | [diff] [blame] | 1209 | unaligned_store(ctx->scalex, scalex); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1210 | } |
| 1211 | template <int kScale> |
| 1212 | SI void bilinear_y(SkJumper_SamplerCtx* ctx, F* y) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1213 | *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1214 | F fy = unaligned_load<F>(ctx->fy); |
| 1215 | |
| 1216 | F scaley; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1217 | if (kScale == -1) { scaley = 1.0f - fy; } |
| 1218 | if (kScale == +1) { scaley = fy; } |
Mike Klein | c33aa90 | 2017-05-15 10:20:48 -0400 | [diff] [blame] | 1219 | unaligned_store(ctx->scaley, scaley); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1220 | } |
| 1221 | |
// The four bilinear sample positions: x-0.5/x+0.5 crossed with y-0.5/y+0.5.
STAGE(bilinear_nx) { bilinear_x<-1>(ctx, &r); }
STAGE(bilinear_px) { bilinear_x<+1>(ctx, &r); }
STAGE(bilinear_ny) { bilinear_y<-1>(ctx, &g); }
STAGE(bilinear_py) { bilinear_y<+1>(ctx, &g); }
| 1226 | |
| 1227 | |
| 1228 | // In bicubic interpolation, the 16 pixels and +/- 0.5 and +/- 1.5 offsets from the sample |
| 1229 | // pixel center are combined with a non-uniform cubic filter, with higher values near the center. |
| 1230 | // |
| 1231 | // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets. |
| 1232 | // See GrCubicEffect for details of this particular filter. |
| 1233 | |
| 1234 | SI F bicubic_near(F t) { |
| 1235 | // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1236 | return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f)); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1237 | } |
| 1238 | SI F bicubic_far(F t) { |
| 1239 | // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1240 | return (t*t)*mad((7/18.0f), t, (-6/18.0f)); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1241 | } |
| 1242 | |
| 1243 | template <int kScale> |
| 1244 | SI void bicubic_x(SkJumper_SamplerCtx* ctx, F* x) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1245 | *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1246 | F fx = unaligned_load<F>(ctx->fx); |
| 1247 | |
| 1248 | F scalex; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1249 | if (kScale == -3) { scalex = bicubic_far (1.0f - fx); } |
| 1250 | if (kScale == -1) { scalex = bicubic_near(1.0f - fx); } |
| 1251 | if (kScale == +1) { scalex = bicubic_near( fx); } |
| 1252 | if (kScale == +3) { scalex = bicubic_far ( fx); } |
Mike Klein | c33aa90 | 2017-05-15 10:20:48 -0400 | [diff] [blame] | 1253 | unaligned_store(ctx->scalex, scalex); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1254 | } |
| 1255 | template <int kScale> |
| 1256 | SI void bicubic_y(SkJumper_SamplerCtx* ctx, F* y) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1257 | *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1258 | F fy = unaligned_load<F>(ctx->fy); |
| 1259 | |
| 1260 | F scaley; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1261 | if (kScale == -3) { scaley = bicubic_far (1.0f - fy); } |
| 1262 | if (kScale == -1) { scaley = bicubic_near(1.0f - fy); } |
| 1263 | if (kScale == +1) { scaley = bicubic_near( fy); } |
| 1264 | if (kScale == +3) { scaley = bicubic_far ( fy); } |
Mike Klein | c33aa90 | 2017-05-15 10:20:48 -0400 | [diff] [blame] | 1265 | unaligned_store(ctx->scaley, scaley); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 1266 | } |
| 1267 | |
// The 16 bicubic sample positions: x offsets {-1.5,-0.5,+0.5,+1.5} crossed with
// the same offsets in y (kScale counts half-pixels).
STAGE(bicubic_n3x) { bicubic_x<-3>(ctx, &r); }
STAGE(bicubic_n1x) { bicubic_x<-1>(ctx, &r); }
STAGE(bicubic_p1x) { bicubic_x<+1>(ctx, &r); }
STAGE(bicubic_p3x) { bicubic_x<+3>(ctx, &r); }

STAGE(bicubic_n3y) { bicubic_y<-3>(ctx, &g); }
STAGE(bicubic_n1y) { bicubic_y<-1>(ctx, &g); }
STAGE(bicubic_p1y) { bicubic_y<+1>(ctx, &g); }
STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }
Mike Klein | 7fee90c | 2017-04-07 16:55:09 -0400 | [diff] [blame] | 1277 | |
| 1278 | STAGE(callback) { |
Mike Klein | c17dc24 | 2017-04-20 16:21:57 -0400 | [diff] [blame] | 1279 | auto c = (SkJumper_CallbackCtx*)ctx; |
| 1280 | store4(c->rgba,0, r,g,b,a); |
| 1281 | c->fn(c, tail ? tail : kStride); |
| 1282 | load4(c->read_from,0, &r,&g,&b,&a); |
Mike Klein | 7fee90c | 2017-04-07 16:55:09 -0400 | [diff] [blame] | 1283 | } |