Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1 | /* |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 2 | * Copyright 2018 Google Inc. |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 8 | #ifndef SkRasterPipeline_opts_DEFINED |
| 9 | #define SkRasterPipeline_opts_DEFINED |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 10 | |
Brian Osman | 7190399 | 2020-05-12 13:40:01 -0400 | [diff] [blame] | 11 | #include "include/core/SkData.h" |
Mike Klein | c0bd9f9 | 2019-04-23 12:05:21 -0500 | [diff] [blame] | 12 | #include "include/core/SkTypes.h" |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 13 | #include "src/core/SkUtils.h" // unaligned_{load,store} |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 14 | #include <cstdint> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 15 | |
// Every function in this file should be declared through SI, which makes it static and
// inline.  Under Clang, SI additionally forces inlining.
#if !defined(__clang__)
    #define SI static inline
#else
    #define SI __attribute__((always_inline)) static inline
#endif
| 22 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 23 | template <typename Dst, typename Src> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 24 | SI Dst widen_cast(const Src& src) { |
John Stiles | 7109434 | 2020-07-24 10:05:43 -0400 | [diff] [blame] | 25 | static_assert(sizeof(Dst) > sizeof(Src)); |
| 26 | static_assert(std::is_trivially_copyable<Dst>::value); |
| 27 | static_assert(std::is_trivially_copyable<Src>::value); |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 28 | Dst dst; |
| 29 | memcpy(&dst, &src, sizeof(Src)); |
| 30 | return dst; |
| 31 | } |
| 32 | |
| 33 | // Our program is an array of void*, either |
| 34 | // - 1 void* per stage with no context pointer, the next stage; |
| 35 | // - 2 void* per stage with a context pointer, first the context pointer, then the next stage. |
| 36 | |
| 37 | // load_and_inc() steps the program forward by 1 void*, returning that pointer. |
| 38 | SI void* load_and_inc(void**& program) { |
| 39 | #if defined(__GNUC__) && defined(__x86_64__) |
| 40 | // If program is in %rsi (we try to make this likely) then this is a single instruction. |
| 41 | void* rax; |
| 42 | asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi. |
| 43 | return rax; |
| 44 | #else |
| 45 | // On ARM *program++ compiles into pretty ideal code without any handholding. |
| 46 | return *program++; |
| 47 | #endif |
| 48 | } |
| 49 | |
| 50 | // Lazily resolved on first cast. Does nothing if cast to Ctx::None. |
| 51 | struct Ctx { |
| 52 | struct None {}; |
| 53 | |
| 54 | void* ptr; |
| 55 | void**& program; |
| 56 | |
| 57 | explicit Ctx(void**& p) : ptr(nullptr), program(p) {} |
| 58 | |
| 59 | template <typename T> |
| 60 | operator T*() { |
| 61 | if (!ptr) { ptr = load_and_inc(program); } |
| 62 | return (T*)ptr; |
| 63 | } |
| 64 | operator None() { return None{}; } |
| 65 | }; |
| 66 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 67 | |
// Pick exactly one JUMPER_IS_* mode.  Only Clang gets the vector paths; any other compiler
// uses the portable scalar code.  Under Clang, NEON is preferred, then the highest
// available x86 SIMD level, and finally scalar.
#if defined(__clang__)
    #if defined(SK_ARM_HAS_NEON)
        #define JUMPER_IS_NEON
    #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
        #define JUMPER_IS_SKX
    #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
        #define JUMPER_IS_HSW
    #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
        #define JUMPER_IS_AVX
    #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
        #define JUMPER_IS_SSE41
    #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
        #define JUMPER_IS_SSE2
    #else
        #define JUMPER_IS_SCALAR
    #endif
#else
    #define JUMPER_IS_SCALAR
#endif
| 85 | |
// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
#if defined(SK_CPU_ARM32) && defined(__clang__) && !defined(__OPTIMIZE__)
    // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
    #if (defined(__apple_build_version__) && __clang_major__ < 9) || __clang_major__ < 5
        #define JUMPER_IS_SCALAR
    #endif

    // When the scalar fallback is selected, it replaces any NEON selection.
    #if defined(JUMPER_IS_SCALAR) && defined(JUMPER_IS_NEON)
        #undef JUMPER_IS_NEON
    #endif
#endif
| 99 | |
| 100 | #if defined(JUMPER_IS_SCALAR) |
Mike Klein | 5cc94cc | 2018-03-07 17:04:18 +0000 | [diff] [blame] | 101 | #include <math.h> |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 102 | #elif defined(JUMPER_IS_NEON) |
| 103 | #include <arm_neon.h> |
| 104 | #else |
| 105 | #include <immintrin.h> |
| 106 | #endif |
Mike Klein | 5cc94cc | 2018-03-07 17:04:18 +0000 | [diff] [blame] | 107 | |
// Notes:
// * rcp_fast and rcp_precise both produce a reciprocal.  rcp_fast is an estimate with at least
//   12 bits of precision, while rcp_precise should be accurate to full float precision.  On ARM,
//   rcp_precise requires 2 Newton-Raphson refinement steps because the hardware estimate has
//   8 bits of precision; on Intel it requires 1 refinement step because the hardware estimate
//   has 12 bits of precision.
John Stiles | bb30fc1 | 2021-09-21 22:39:27 +0000 | [diff] [blame] | 113 | |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 114 | namespace SK_OPTS_NS { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 115 | #if defined(JUMPER_IS_SCALAR) |
// This path should lead to portable scalar code: each "vector" type is a single value.
using U8  = uint8_t ;
using U16 = uint16_t;
using U32 = uint32_t;
using U64 = uint64_t;
using I32 = int32_t ;
using F   = float   ;
| 123 | |
| 124 | SI F mad(F f, F m, F a) { return f*m+a; } |
| 125 | SI F min(F a, F b) { return fminf(a,b); } |
| 126 | SI F max(F a, F b) { return fmaxf(a,b); } |
| 127 | SI F abs_ (F v) { return fabsf(v); } |
| 128 | SI F floor_(F v) { return floorf(v); } |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 129 | SI F rcp_fast(F v) { return 1.0f / v; } |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 130 | SI F rsqrt (F v) { return 1.0f / sqrtf(v); } |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 131 | SI F sqrt_ (F v) { return sqrtf(v); } |
| 132 | SI F rcp_precise (F v) { return 1.0f / v; } |
| 133 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 134 | SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); } |
| 135 | SI U16 pack(U32 v) { return (U16)v; } |
| 136 | SI U8 pack(U16 v) { return (U8)v; } |
| 137 | |
| 138 | SI F if_then_else(I32 c, F t, F e) { return c ? t : e; } |
| 139 | |
| 140 | template <typename T> |
| 141 | SI T gather(const T* p, U32 ix) { return p[ix]; } |
| 142 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 143 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
| 144 | *r = ptr[0]; |
| 145 | *g = ptr[1]; |
| 146 | } |
| 147 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
| 148 | ptr[0] = r; |
| 149 | ptr[1] = g; |
| 150 | } |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 151 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
| 152 | *r = ptr[0]; |
| 153 | *g = ptr[1]; |
| 154 | *b = ptr[2]; |
| 155 | } |
| 156 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
| 157 | *r = ptr[0]; |
| 158 | *g = ptr[1]; |
| 159 | *b = ptr[2]; |
| 160 | *a = ptr[3]; |
| 161 | } |
| 162 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
| 163 | ptr[0] = r; |
| 164 | ptr[1] = g; |
| 165 | ptr[2] = b; |
| 166 | ptr[3] = a; |
| 167 | } |
| 168 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 169 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
| 170 | *r = ptr[0]; |
| 171 | *g = ptr[1]; |
| 172 | } |
| 173 | SI void store2(float* ptr, size_t tail, F r, F g) { |
| 174 | ptr[0] = r; |
| 175 | ptr[1] = g; |
| 176 | } |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 177 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
| 178 | *r = ptr[0]; |
| 179 | *g = ptr[1]; |
| 180 | *b = ptr[2]; |
| 181 | *a = ptr[3]; |
| 182 | } |
| 183 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
| 184 | ptr[0] = r; |
| 185 | ptr[1] = g; |
| 186 | ptr[2] = b; |
| 187 | ptr[3] = a; |
| 188 | } |
| 189 | |
| 190 | #elif defined(JUMPER_IS_NEON) |
// Since we know we're using Clang, we can use its vector extensions.
// Every vector type on the NEON path has 4 lanes.
template <typename T> using V = T __attribute__((ext_vector_type(4)));
using U8  = V<uint8_t >;
using U16 = V<uint16_t>;
using U32 = V<uint32_t>;
using U64 = V<uint64_t>;
using I32 = V< int32_t>;
using F   = V<float   >;
| 199 | |
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
SI F min(F a, F b) {
    return vminq_f32(a,b);
}
SI F max(F a, F b) {
    return vmaxq_f32(a,b);
}
SI F abs_(F v) {
    return vabsq_f32(v);
}

// Hardware reciprocal estimate plus one refinement step.
SI F rcp_fast(F v) {
    auto estimate = vrecpeq_f32(v);
    auto step     = vrecpsq_f32(v, estimate);
    return step * estimate;
}
// rcp_fast() plus one more refinement step.
SI F rcp_precise(F v) {
    auto refined = rcp_fast(v);
    auto step    = vrecpsq_f32(v, refined);
    return step * refined;
}
// Hardware reciprocal-square-root estimate plus one refinement step.
SI F rsqrt(F v) {
    auto estimate = vrsqrteq_f32(v);
    auto step     = vrsqrtsq_f32(v, estimate*estimate);
    return step * estimate;
}

// Lane-wise narrowing conversions: 32 -> 16 bits and 16 -> 8 bits.
SI U16 pack(U32 v) {
    return __builtin_convertvector(v, U16);
}
SI U8 pack(U16 v) {
    return __builtin_convertvector(v, U8);
}

// Bitwise select: takes bits of t where the mask c is set, bits of e elsewhere.
SI F if_then_else(I32 c, F t, F e) {
    U32 mask = (U32)c;
    return vbslq_f32(mask, t, e);
}
| 212 | |
#if !defined(SK_CPU_ARM64)
// Without SK_CPU_ARM64 these are built from conversions and estimate/refine steps.
SI F mad(F f, F m, F a) {
    return vmlaq_f32(a,f,m);
}

// Round-trip through int32, then subtract 1 in any lane where that came out above v.
SI F floor_(F v) {
    F truncated = vcvtq_f32_s32(vcvtq_s32_f32(v));
    F overshoot = if_then_else(truncated > v, 1, 0);
    return truncated - overshoot;
}

SI F sqrt_(F v) {
    // Estimate and two refinement steps for e = rsqrt(v).
    auto e = vrsqrteq_f32(v);
    for (int step = 0; step < 2; step++) {
        e *= vrsqrtsq_f32(v,e*e);
    }
    return v*e;  // sqrt(v) == v*rsqrt(v).
}

// Adds 0.5 after scaling, then converts to unsigned.
SI U32 round(F v, F scale) {
    F biased = mad(v,scale,0.5f);
    return vcvtq_u32_f32(biased);
}
#else
// With SK_CPU_ARM64 each of these maps onto a single intrinsic.
SI F mad(F f, F m, F a) {
    return vfmaq_f32(a,f,m);
}
SI F floor_(F v) {
    return vrndmq_f32(v);
}
SI F sqrt_(F v) {
    return vsqrtq_f32(v);
}
SI U32 round(F v, F scale) {
    return vcvtnq_u32_f32(v*scale);
}
#endif
| 236 | |
| 237 | |
| 238 | template <typename T> |
| 239 | SI V<T> gather(const T* p, U32 ix) { |
| 240 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
| 241 | } |
// Loads interleaved 16-bit r,g pairs and de-interleaves them into *r and *g.
// tail == 0 loads a full 4 pixels; a nonzero tail loads the first 1, 2 or 3 pixels.
SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
    // Zero-fill first: the lane loads below read rg as an input, so without this the
    // lanes at and beyond tail would be read while still uninitialized.
    uint16x4x2_t rg{};
    if (__builtin_expect(tail,0)) {
        if (  true  ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
        if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
        if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
    } else {
        rg = vld2_u16(ptr);
    }
    *r = rg.val[0];
    *g = rg.val[1];
}
// Interleaves r and g and stores them as 16-bit r,g pairs.
// tail == 0 stores a full 4 pixels; a nonzero tail stores the first 1, 2 or 3 pixels.
SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
    const uint16x4x2_t rg = {{r,g}};
    if (__builtin_expect(tail,0)) {
        vst2_lane_u16(ptr + 0, rg, 0);
        if (tail > 1) { vst2_lane_u16(ptr + 2, rg, 1); }
        if (tail > 2) { vst2_lane_u16(ptr + 4, rg, 2); }
        return;
    }
    vst2_u16(ptr, rg);
}
// Loads interleaved 16-bit r,g,b triples and de-interleaves them into *r, *g and *b.
// tail == 0 loads a full 4 pixels; a nonzero tail loads the first 1, 2 or 3 pixels.
SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    // Zero-fill first: the lane loads below read rgb as an input, so without this the
    // lanes at and beyond tail would be read while still uninitialized.
    uint16x4x3_t rgb{};
    if (__builtin_expect(tail,0)) {
        if (  true  ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
        if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
        if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
    } else {
        rgb = vld3_u16(ptr);
    }
    *r = rgb.val[0];
    *g = rgb.val[1];
    *b = rgb.val[2];
}
// Loads interleaved 16-bit r,g,b,a quads and de-interleaves them into *r, *g, *b and *a.
// tail == 0 loads a full 4 pixels; a nonzero tail loads the first 1, 2 or 3 pixels.
SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    // Zero-fill first: the lane loads below read rgba as an input, so without this the
    // lanes at and beyond tail would be read while still uninitialized.
    uint16x4x4_t rgba{};
    if (__builtin_expect(tail,0)) {
        if (  true  ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
        if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
        if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
    } else {
        rgba = vld4_u16(ptr);
    }
    *r = rgba.val[0];
    *g = rgba.val[1];
    *b = rgba.val[2];
    *a = rgba.val[3];
}
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 290 | |
// Interleaves r, g, b and a and stores them as 16-bit r,g,b,a quads.
// tail == 0 stores a full 4 pixels; a nonzero tail stores the first 1, 2 or 3 pixels.
SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    const uint16x4x4_t rgba = {{r,g,b,a}};
    if (__builtin_expect(tail,0)) {
        vst4_lane_u16(ptr + 0, rgba, 0);
        if (tail > 1) { vst4_lane_u16(ptr + 4, rgba, 1); }
        if (tail > 2) { vst4_lane_u16(ptr + 8, rgba, 2); }
        return;
    }
    vst4_u16(ptr, rgba);
}
// Loads interleaved float r,g pairs and de-interleaves them into *r and *g.
// tail == 0 loads a full 4 pixels; a nonzero tail loads the first 1, 2 or 3 pixels.
SI void load2(const float* ptr, size_t tail, F* r, F* g) {
    // Zero-fill first: the lane loads below read rg as an input, so without this the
    // lanes at and beyond tail would be read while still uninitialized.
    float32x4x2_t rg{};
    if (__builtin_expect(tail,0)) {
        if (  true  ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); }
        if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
        if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
    } else {
        rg = vld2q_f32(ptr);
    }
    *r = rg.val[0];
    *g = rg.val[1];
}
// Interleaves r and g and stores them as float r,g pairs.
// tail == 0 stores a full 4 pixels; a nonzero tail stores the first 1, 2 or 3 pixels.
SI void store2(float* ptr, size_t tail, F r, F g) {
    const float32x4x2_t rg = {{r,g}};
    if (__builtin_expect(tail,0)) {
        vst2q_lane_f32(ptr + 0, rg, 0);
        if (tail > 1) { vst2q_lane_f32(ptr + 2, rg, 1); }
        if (tail > 2) { vst2q_lane_f32(ptr + 4, rg, 2); }
        return;
    }
    vst2q_f32(ptr, rg);
}
// Loads interleaved float r,g,b,a quads and de-interleaves them into *r, *g, *b and *a.
// tail == 0 loads a full 4 pixels; a nonzero tail loads the first 1, 2 or 3 pixels.
SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
    // Zero-fill first: the lane loads below read rgba as an input, so without this the
    // lanes at and beyond tail would be read while still uninitialized.
    float32x4x4_t rgba{};
    if (__builtin_expect(tail,0)) {
        if (  true  ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
        if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
        if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
    } else {
        rgba = vld4q_f32(ptr);
    }
    *r = rgba.val[0];
    *g = rgba.val[1];
    *b = rgba.val[2];
    *a = rgba.val[3];
}
// Interleaves r, g, b and a and stores them as float r,g,b,a quads.
// tail == 0 stores a full 4 pixels; a nonzero tail stores the first 1, 2 or 3 pixels.
SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
    const float32x4x4_t rgba = {{r,g,b,a}};
    if (__builtin_expect(tail,0)) {
        vst4q_lane_f32(ptr + 0, rgba, 0);
        if (tail > 1) { vst4q_lane_f32(ptr + 4, rgba, 1); }
        if (tail > 2) { vst4q_lane_f32(ptr + 8, rgba, 2); }
        return;
    }
    vst4q_f32(ptr, rgba);
}
| 344 | |
Herb Derby | 501e8e1 | 2021-09-28 14:51:04 -0400 | [diff] [blame] | 345 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
// These are __m256 and __m256i, but friendlier and strongly-typed.
// Every vector type on this path has 8 lanes.
template <typename T> using V = T __attribute__((ext_vector_type(8)));
using U8  = V<uint8_t >;
using U16 = V<uint16_t>;
using U32 = V<uint32_t>;
using U64 = V<uint64_t>;
using I32 = V< int32_t>;
using F   = V<float   >;
| 354 | |
| 355 | SI F mad(F f, F m, F a) { |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 356 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 357 | return _mm256_fmadd_ps(f,m,a); |
| 358 | #else |
| 359 | return f*m+a; |
| 360 | #endif |
| 361 | } |
| 362 | |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 363 | SI F min(F a, F b) { return _mm256_min_ps(a,b); } |
| 364 | SI F max(F a, F b) { return _mm256_max_ps(a,b); } |
| 365 | SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); } |
| 366 | SI F floor_(F v) { return _mm256_floor_ps(v); } |
| 367 | SI F rcp_fast(F v) { return _mm256_rcp_ps (v); } |
| 368 | SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); } |
| 369 | SI F sqrt_ (F v) { return _mm256_sqrt_ps (v); } |
| 370 | SI F rcp_precise (F v) { |
| 371 | F e = rcp_fast(v); |
| 372 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
| 373 | return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(2.0f)) * e; |
| 374 | #else |
| 375 | return e * (2.0f - v * e); |
| 376 | #endif |
| 377 | } |
John Stiles | bb30fc1 | 2021-09-21 22:39:27 +0000 | [diff] [blame] | 378 | |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 379 | |
| 380 | SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); } |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 381 | SI U16 pack(U32 v) { |
| 382 | return _mm_packus_epi32(_mm256_extractf128_si256(v, 0), |
| 383 | _mm256_extractf128_si256(v, 1)); |
| 384 | } |
| 385 | SI U8 pack(U16 v) { |
| 386 | auto r = _mm_packus_epi16(v,v); |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 387 | return sk_unaligned_load<U8>(&r); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 388 | } |
| 389 | |
| 390 | SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } |
| 391 | |
| 392 | template <typename T> |
| 393 | SI V<T> gather(const T* p, U32 ix) { |
| 394 | return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], |
| 395 | p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; |
| 396 | } |
#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX)
// Overloads that use the hardware gather instructions for 4- and 8-byte elements.
SI F gather(const float* p, U32 ix) {
    return _mm256_i32gather_ps(p, ix, 4);
}
SI U32 gather(const uint32_t* p, U32 ix) {
    return _mm256_i32gather_epi32(p, ix, 4);
}
SI U64 gather(const uint64_t* p, U32 ix) {
    // Each 64-bit gather takes four indices, so split ix into its two 128-bit halves.
    __m256i halves[] = {
        _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
        _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
    };
    return sk_bit_cast<U64>(halves);
}
#endif
| 408 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 409 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
| 410 | U16 _0123, _4567; |
| 411 | if (__builtin_expect(tail,0)) { |
| 412 | _0123 = _4567 = _mm_setzero_si128(); |
| 413 | auto* d = &_0123; |
| 414 | if (tail > 3) { |
| 415 | *d = _mm_loadu_si128(((__m128i*)ptr) + 0); |
| 416 | tail -= 4; |
| 417 | ptr += 8; |
| 418 | d = &_4567; |
| 419 | } |
| 420 | bool high = false; |
| 421 | if (tail > 1) { |
| 422 | *d = _mm_loadu_si64(ptr); |
| 423 | tail -= 2; |
| 424 | ptr += 4; |
| 425 | high = true; |
| 426 | } |
| 427 | if (tail > 0) { |
| 428 | (*d)[high ? 4 : 0] = *(ptr + 0); |
| 429 | (*d)[high ? 5 : 1] = *(ptr + 1); |
| 430 | } |
| 431 | } else { |
| 432 | _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
| 433 | _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
| 434 | } |
| 435 | *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16), |
| 436 | _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16)); |
| 437 | *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16), |
| 438 | _mm_srai_epi32(_4567, 16)); |
| 439 | } |
| 440 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
| 441 | auto _0123 = _mm_unpacklo_epi16(r, g), |
| 442 | _4567 = _mm_unpackhi_epi16(r, g); |
| 443 | if (__builtin_expect(tail,0)) { |
| 444 | const auto* s = &_0123; |
| 445 | if (tail > 3) { |
| 446 | _mm_storeu_si128((__m128i*)ptr, *s); |
| 447 | s = &_4567; |
| 448 | tail -= 4; |
| 449 | ptr += 8; |
| 450 | } |
| 451 | bool high = false; |
| 452 | if (tail > 1) { |
| 453 | _mm_storel_epi64((__m128i*)ptr, *s); |
| 454 | ptr += 4; |
| 455 | tail -= 2; |
| 456 | high = true; |
| 457 | } |
| 458 | if (tail > 0) { |
| 459 | if (high) { |
| 460 | *(int32_t*)ptr = _mm_extract_epi32(*s, 2); |
| 461 | } else { |
| 462 | *(int32_t*)ptr = _mm_cvtsi128_si32(*s); |
| 463 | } |
| 464 | } |
| 465 | } else { |
| 466 | _mm_storeu_si128((__m128i*)ptr + 0, _0123); |
| 467 | _mm_storeu_si128((__m128i*)ptr + 1, _4567); |
| 468 | } |
| 469 | } |
| 470 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 471 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
| 472 | __m128i _0,_1,_2,_3,_4,_5,_6,_7; |
| 473 | if (__builtin_expect(tail,0)) { |
| 474 | auto load_rgb = [](const uint16_t* src) { |
| 475 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
| 476 | return _mm_insert_epi16(v, src[2], 2); |
| 477 | }; |
| 478 | _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128(); |
| 479 | if ( true ) { _0 = load_rgb(ptr + 0); } |
| 480 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
| 481 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
| 482 | if (tail > 3) { _3 = load_rgb(ptr + 9); } |
| 483 | if (tail > 4) { _4 = load_rgb(ptr + 12); } |
| 484 | if (tail > 5) { _5 = load_rgb(ptr + 15); } |
| 485 | if (tail > 6) { _6 = load_rgb(ptr + 18); } |
| 486 | } else { |
| 487 | // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over. |
| 488 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ; |
| 489 | auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ; |
| 490 | auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ; |
| 491 | auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4); |
| 492 | _0 = _01; _1 = _mm_srli_si128(_01, 6); |
| 493 | _2 = _23; _3 = _mm_srli_si128(_23, 6); |
| 494 | _4 = _45; _5 = _mm_srli_si128(_45, 6); |
| 495 | _6 = _67; _7 = _mm_srli_si128(_67, 6); |
| 496 | } |
| 497 | |
| 498 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
| 499 | _13 = _mm_unpacklo_epi16(_1, _3), |
| 500 | _46 = _mm_unpacklo_epi16(_4, _6), |
| 501 | _57 = _mm_unpacklo_epi16(_5, _7); |
| 502 | |
| 503 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
| 504 | bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx |
| 505 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
| 506 | bx4567 = _mm_unpackhi_epi16(_46, _57); |
| 507 | |
| 508 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
| 509 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
| 510 | *b = _mm_unpacklo_epi64(bx0123, bx4567); |
| 511 | } |
| 512 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
| 513 | __m128i _01, _23, _45, _67; |
| 514 | if (__builtin_expect(tail,0)) { |
| 515 | auto src = (const double*)ptr; |
| 516 | _01 = _23 = _45 = _67 = _mm_setzero_si128(); |
| 517 | if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } |
| 518 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } |
| 519 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } |
| 520 | if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } |
| 521 | if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } |
| 522 | if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } |
| 523 | if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } |
| 524 | } else { |
| 525 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
| 526 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
| 527 | _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); |
| 528 | _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); |
| 529 | } |
| 530 | |
| 531 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
| 532 | _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 |
| 533 | _46 = _mm_unpacklo_epi16(_45, _67), |
| 534 | _57 = _mm_unpackhi_epi16(_45, _67); |
| 535 | |
| 536 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
| 537 | ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 |
| 538 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
| 539 | ba4567 = _mm_unpackhi_epi16(_46, _57); |
| 540 | |
| 541 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
| 542 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
| 543 | *b = _mm_unpacklo_epi64(ba0123, ba4567); |
| 544 | *a = _mm_unpackhi_epi64(ba0123, ba4567); |
| 545 | } |
| 546 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
| 547 | auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 |
| 548 | rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 |
| 549 | ba0123 = _mm_unpacklo_epi16(b, a), |
| 550 | ba4567 = _mm_unpackhi_epi16(b, a); |
| 551 | |
| 552 | auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), |
| 553 | _23 = _mm_unpackhi_epi32(rg0123, ba0123), |
| 554 | _45 = _mm_unpacklo_epi32(rg4567, ba4567), |
| 555 | _67 = _mm_unpackhi_epi32(rg4567, ba4567); |
| 556 | |
| 557 | if (__builtin_expect(tail,0)) { |
| 558 | auto dst = (double*)ptr; |
| 559 | if (tail > 0) { _mm_storel_pd(dst+0, _01); } |
| 560 | if (tail > 1) { _mm_storeh_pd(dst+1, _01); } |
| 561 | if (tail > 2) { _mm_storel_pd(dst+2, _23); } |
| 562 | if (tail > 3) { _mm_storeh_pd(dst+3, _23); } |
| 563 | if (tail > 4) { _mm_storel_pd(dst+4, _45); } |
| 564 | if (tail > 5) { _mm_storeh_pd(dst+5, _45); } |
| 565 | if (tail > 6) { _mm_storel_pd(dst+6, _67); } |
| 566 | } else { |
| 567 | _mm_storeu_si128((__m128i*)ptr + 0, _01); |
| 568 | _mm_storeu_si128((__m128i*)ptr + 1, _23); |
| 569 | _mm_storeu_si128((__m128i*)ptr + 2, _45); |
| 570 | _mm_storeu_si128((__m128i*)ptr + 3, _67); |
| 571 | } |
| 572 | } |
| 573 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 574 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
| 575 | F _0123, _4567; |
| 576 | if (__builtin_expect(tail, 0)) { |
| 577 | _0123 = _4567 = _mm256_setzero_ps(); |
| 578 | F* d = &_0123; |
| 579 | if (tail > 3) { |
| 580 | *d = _mm256_loadu_ps(ptr); |
| 581 | ptr += 8; |
| 582 | tail -= 4; |
| 583 | d = &_4567; |
| 584 | } |
| 585 | bool high = false; |
| 586 | if (tail > 1) { |
| 587 | *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr)); |
| 588 | ptr += 4; |
| 589 | tail -= 2; |
| 590 | high = true; |
| 591 | } |
| 592 | if (tail > 0) { |
| 593 | *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1) |
| 594 | : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0); |
| 595 | } |
| 596 | } else { |
| 597 | _0123 = _mm256_loadu_ps(ptr + 0); |
| 598 | _4567 = _mm256_loadu_ps(ptr + 8); |
| 599 | } |
| 600 | |
| 601 | F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20), |
| 602 | _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31); |
| 603 | |
| 604 | *r = _mm256_shuffle_ps(_0145, _2367, 0x88); |
| 605 | *g = _mm256_shuffle_ps(_0145, _2367, 0xDD); |
| 606 | } |
| 607 | SI void store2(float* ptr, size_t tail, F r, F g) { |
| 608 | F _0145 = _mm256_unpacklo_ps(r, g), |
| 609 | _2367 = _mm256_unpackhi_ps(r, g); |
| 610 | F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20), |
| 611 | _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31); |
| 612 | |
| 613 | if (__builtin_expect(tail, 0)) { |
| 614 | const __m256* s = &_0123; |
| 615 | if (tail > 3) { |
| 616 | _mm256_storeu_ps(ptr, *s); |
| 617 | s = &_4567; |
| 618 | tail -= 4; |
| 619 | ptr += 8; |
| 620 | } |
| 621 | bool high = false; |
| 622 | if (tail > 1) { |
| 623 | _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0)); |
| 624 | ptr += 4; |
| 625 | tail -= 2; |
| 626 | high = true; |
| 627 | } |
| 628 | if (tail > 0) { |
| 629 | *(ptr + 0) = (*s)[ high ? 4 : 0]; |
| 630 | *(ptr + 1) = (*s)[ high ? 5 : 1]; |
| 631 | } |
| 632 | } else { |
| 633 | _mm256_storeu_ps(ptr + 0, _0123); |
| 634 | _mm256_storeu_ps(ptr + 8, _4567); |
| 635 | } |
| 636 | } |
| 637 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 638 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
| 639 | F _04, _15, _26, _37; |
| 640 | _04 = _15 = _26 = _37 = 0; |
| 641 | switch (tail) { |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 642 | case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); [[fallthrough]]; |
| 643 | case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); [[fallthrough]]; |
| 644 | case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); [[fallthrough]]; |
| 645 | case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); [[fallthrough]]; |
| 646 | case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); [[fallthrough]]; |
| 647 | case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); [[fallthrough]]; |
| 648 | case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); [[fallthrough]]; |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 649 | case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0); |
| 650 | } |
| 651 | |
| 652 | F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5 |
| 653 | ba0145 = _mm256_unpackhi_ps(_04,_15), |
| 654 | rg2367 = _mm256_unpacklo_ps(_26,_37), |
| 655 | ba2367 = _mm256_unpackhi_ps(_26,_37); |
| 656 | |
| 657 | *r = _mm256_unpacklo_pd(rg0145, rg2367); |
| 658 | *g = _mm256_unpackhi_pd(rg0145, rg2367); |
| 659 | *b = _mm256_unpacklo_pd(ba0145, ba2367); |
| 660 | *a = _mm256_unpackhi_pd(ba0145, ba2367); |
| 661 | } |
| 662 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
| 663 | F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5 |
| 664 | rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ... |
| 665 | ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5 |
| 666 | ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ... |
| 667 | |
| 668 | F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 |
| 669 | _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... |
| 670 | _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... |
| 671 | _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... |
| 672 | |
| 673 | if (__builtin_expect(tail, 0)) { |
| 674 | if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); } |
| 675 | if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); } |
| 676 | if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); } |
| 677 | if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); } |
| 678 | if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); } |
| 679 | if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); } |
| 680 | if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); } |
| 681 | } else { |
| 682 | F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo |
| 683 | _23 = _mm256_permute2f128_ps(_26, _37, 32), |
| 684 | _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi |
| 685 | _67 = _mm256_permute2f128_ps(_26, _37, 49); |
| 686 | _mm256_storeu_ps(ptr+ 0, _01); |
| 687 | _mm256_storeu_ps(ptr+ 8, _23); |
| 688 | _mm256_storeu_ps(ptr+16, _45); |
| 689 | _mm256_storeu_ps(ptr+24, _67); |
| 690 | } |
| 691 | } |
| 692 | |
Herb Derby | 501e8e1 | 2021-09-28 14:51:04 -0400 | [diff] [blame] | 693 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
| 694 | template <typename T> using V = T __attribute__((ext_vector_type(4))); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 695 | using F = V<float >; |
| 696 | using I32 = V< int32_t>; |
| 697 | using U64 = V<uint64_t>; |
| 698 | using U32 = V<uint32_t>; |
| 699 | using U16 = V<uint16_t>; |
| 700 | using U8 = V<uint8_t >; |
| 701 | |
| 702 | SI F mad(F f, F m, F a) { return f*m+a; } |
| 703 | SI F min(F a, F b) { return _mm_min_ps(a,b); } |
| 704 | SI F max(F a, F b) { return _mm_max_ps(a,b); } |
| 705 | SI F abs_(F v) { return _mm_and_ps(v, 0-v); } |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 706 | SI F rcp_fast(F v) { return _mm_rcp_ps (v); } |
| 707 | SI F rcp_precise (F v) { F e = rcp_fast(v); return e * (2.0f - v * e); } |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 708 | SI F rsqrt (F v) { return _mm_rsqrt_ps(v); } |
| 709 | SI F sqrt_(F v) { return _mm_sqrt_ps (v); } |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 710 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 711 | SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); } |
| 712 | |
| 713 | SI U16 pack(U32 v) { |
| 714 | #if defined(JUMPER_IS_SSE41) |
| 715 | auto p = _mm_packus_epi32(v,v); |
| 716 | #else |
| 717 | // Sign extend so that _mm_packs_epi32() does the pack we want. |
| 718 | auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16); |
| 719 | p = _mm_packs_epi32(p,p); |
| 720 | #endif |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 721 | return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one. |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 722 | } |
| 723 | SI U8 pack(U16 v) { |
| 724 | auto r = widen_cast<__m128i>(v); |
| 725 | r = _mm_packus_epi16(r,r); |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 726 | return sk_unaligned_load<U8>(&r); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 727 | } |
| 728 | |
| 729 | SI F if_then_else(I32 c, F t, F e) { |
| 730 | return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); |
| 731 | } |
| 732 | |
| 733 | SI F floor_(F v) { |
| 734 | #if defined(JUMPER_IS_SSE41) |
| 735 | return _mm_floor_ps(v); |
| 736 | #else |
| 737 | F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); |
| 738 | return roundtrip - if_then_else(roundtrip > v, 1, 0); |
| 739 | #endif |
| 740 | } |
| 741 | |
| 742 | template <typename T> |
| 743 | SI V<T> gather(const T* p, U32 ix) { |
| 744 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
| 745 | } |
| 746 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 747 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
| 748 | __m128i _01; |
| 749 | if (__builtin_expect(tail,0)) { |
| 750 | _01 = _mm_setzero_si128(); |
| 751 | if (tail > 1) { |
| 752 | _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00 |
| 753 | if (tail > 2) { |
Robert Phillips | f73ef0b | 2019-09-24 13:00:42 -0400 | [diff] [blame] | 754 | _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00 |
| 755 | _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00 |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 756 | } |
| 757 | } else { |
Mike Klein | 0f55db5 | 2019-09-30 10:01:08 -0500 | [diff] [blame] | 758 | _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00 |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 759 | } |
| 760 | } else { |
| 761 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3 |
| 762 | } |
| 763 | auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3 |
| 764 | auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3 |
| 765 | |
| 766 | auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3 |
| 767 | auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3 |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 768 | *r = sk_unaligned_load<U16>(&R); |
| 769 | *g = sk_unaligned_load<U16>(&G); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 770 | } |
| 771 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
| 772 | U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)); |
| 773 | if (__builtin_expect(tail, 0)) { |
| 774 | if (tail > 1) { |
| 775 | _mm_storel_epi64((__m128i*)ptr, rg); |
| 776 | if (tail > 2) { |
| 777 | int32_t rgpair = rg[2]; |
| 778 | memcpy(ptr + 4, &rgpair, sizeof(rgpair)); |
| 779 | } |
| 780 | } else { |
| 781 | int32_t rgpair = rg[0]; |
| 782 | memcpy(ptr, &rgpair, sizeof(rgpair)); |
| 783 | } |
| 784 | } else { |
| 785 | _mm_storeu_si128((__m128i*)ptr + 0, rg); |
| 786 | } |
| 787 | } |
| 788 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 789 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
| 790 | __m128i _0, _1, _2, _3; |
| 791 | if (__builtin_expect(tail,0)) { |
| 792 | _1 = _2 = _3 = _mm_setzero_si128(); |
| 793 | auto load_rgb = [](const uint16_t* src) { |
| 794 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
| 795 | return _mm_insert_epi16(v, src[2], 2); |
| 796 | }; |
| 797 | if ( true ) { _0 = load_rgb(ptr + 0); } |
| 798 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
| 799 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
| 800 | } else { |
| 801 | // Load slightly weirdly to make sure we don't load past the end of 4x48 bits. |
| 802 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , |
| 803 | _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); |
| 804 | |
| 805 | // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). |
| 806 | _0 = _01; |
| 807 | _1 = _mm_srli_si128(_01, 6); |
| 808 | _2 = _23; |
| 809 | _3 = _mm_srli_si128(_23, 6); |
| 810 | } |
| 811 | |
| 812 | // De-interlace to R,G,B. |
| 813 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
| 814 | _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx |
| 815 | |
| 816 | auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
| 817 | G = _mm_srli_si128(R, 8), |
| 818 | B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx |
| 819 | |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 820 | *r = sk_unaligned_load<U16>(&R); |
| 821 | *g = sk_unaligned_load<U16>(&G); |
| 822 | *b = sk_unaligned_load<U16>(&B); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 823 | } |
| 824 | |
| 825 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
| 826 | __m128i _01, _23; |
| 827 | if (__builtin_expect(tail,0)) { |
| 828 | _01 = _23 = _mm_setzero_si128(); |
| 829 | auto src = (const double*)ptr; |
| 830 | if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00 |
| 831 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1 |
| 832 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00 |
| 833 | } else { |
| 834 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1 |
| 835 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3 |
| 836 | } |
| 837 | |
| 838 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
| 839 | _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 |
| 840 | |
| 841 | auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
| 842 | ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 |
| 843 | |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 844 | *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0); |
| 845 | *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4); |
| 846 | *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0); |
| 847 | *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 848 | } |
| 849 | |
| 850 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
| 851 | auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), |
| 852 | ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); |
| 853 | |
| 854 | if (__builtin_expect(tail, 0)) { |
| 855 | auto dst = (double*)ptr; |
| 856 | if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); } |
| 857 | if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); } |
| 858 | if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); } |
| 859 | } else { |
| 860 | _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); |
| 861 | _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); |
| 862 | } |
| 863 | } |
| 864 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 865 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
| 866 | F _01, _23; |
| 867 | if (__builtin_expect(tail, 0)) { |
| 868 | _01 = _23 = _mm_setzero_si128(); |
| 869 | if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); } |
| 870 | if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); } |
| 871 | if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); } |
| 872 | } else { |
| 873 | _01 = _mm_loadu_ps(ptr + 0); |
| 874 | _23 = _mm_loadu_ps(ptr + 4); |
| 875 | } |
| 876 | *r = _mm_shuffle_ps(_01, _23, 0x88); |
| 877 | *g = _mm_shuffle_ps(_01, _23, 0xDD); |
| 878 | } |
| 879 | SI void store2(float* ptr, size_t tail, F r, F g) { |
| 880 | F _01 = _mm_unpacklo_ps(r, g), |
| 881 | _23 = _mm_unpackhi_ps(r, g); |
| 882 | if (__builtin_expect(tail, 0)) { |
| 883 | if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); } |
| 884 | if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); } |
| 885 | if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); } |
| 886 | } else { |
| 887 | _mm_storeu_ps(ptr + 0, _01); |
| 888 | _mm_storeu_ps(ptr + 4, _23); |
| 889 | } |
| 890 | } |
| 891 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 892 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
| 893 | F _0, _1, _2, _3; |
| 894 | if (__builtin_expect(tail, 0)) { |
| 895 | _1 = _2 = _3 = _mm_setzero_si128(); |
| 896 | if ( true ) { _0 = _mm_loadu_ps(ptr + 0); } |
| 897 | if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); } |
| 898 | if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); } |
| 899 | } else { |
| 900 | _0 = _mm_loadu_ps(ptr + 0); |
| 901 | _1 = _mm_loadu_ps(ptr + 4); |
| 902 | _2 = _mm_loadu_ps(ptr + 8); |
| 903 | _3 = _mm_loadu_ps(ptr +12); |
| 904 | } |
| 905 | _MM_TRANSPOSE4_PS(_0,_1,_2,_3); |
| 906 | *r = _0; |
| 907 | *g = _1; |
| 908 | *b = _2; |
| 909 | *a = _3; |
| 910 | } |
| 911 | |
| 912 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
| 913 | _MM_TRANSPOSE4_PS(r,g,b,a); |
| 914 | if (__builtin_expect(tail, 0)) { |
| 915 | if ( true ) { _mm_storeu_ps(ptr + 0, r); } |
| 916 | if (tail > 1) { _mm_storeu_ps(ptr + 4, g); } |
| 917 | if (tail > 2) { _mm_storeu_ps(ptr + 8, b); } |
| 918 | } else { |
| 919 | _mm_storeu_ps(ptr + 0, r); |
| 920 | _mm_storeu_ps(ptr + 4, g); |
| 921 | _mm_storeu_ps(ptr + 8, b); |
| 922 | _mm_storeu_ps(ptr +12, a); |
| 923 | } |
| 924 | } |
| 925 | #endif |
| 926 | |
| 927 | // We need to be careful with casts. |
| 928 | // (F)x means cast x to float in the portable path, but bit_cast x to float in the others. |
| 929 | // These named casts and bit_cast() are always what they seem to be. |
| 930 | #if defined(JUMPER_IS_SCALAR) |
| 931 | SI F cast (U32 v) { return (F)v; } |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 932 | SI F cast64(U64 v) { return (F)v; } |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 933 | SI U32 trunc_(F v) { return (U32)v; } |
| 934 | SI U32 expand(U16 v) { return (U32)v; } |
| 935 | SI U32 expand(U8 v) { return (U32)v; } |
| 936 | #else |
| 937 | SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 938 | SI F cast64(U64 v) { return __builtin_convertvector( v, F); } |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 939 | SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); } |
| 940 | SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); } |
| 941 | SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); } |
| 942 | #endif |
| 943 | |
| 944 | template <typename V> |
| 945 | SI V if_then_else(I32 c, V t, V e) { |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 946 | return sk_bit_cast<V>(if_then_else(c, sk_bit_cast<F>(t), sk_bit_cast<F>(e))); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 947 | } |
| 948 | |
| 949 | SI U16 bswap(U16 x) { |
| 950 | #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) |
| 951 | // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes |
| 952 | // when generating code for SSE2 and SSE4.1. We'll do it manually... |
| 953 | auto v = widen_cast<__m128i>(x); |
| 954 | v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8); |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 955 | return sk_unaligned_load<U16>(&v); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 956 | #else |
| 957 | return (x<<8) | (x>>8); |
| 958 | #endif |
| 959 | } |
| 960 | |
| 961 | SI F fract(F v) { return v - floor_(v); } |
| 962 | |
| 963 | // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. |
| 964 | SI F approx_log2(F x) { |
| 965 | // e - 127 is a fair approximation of log2(x) in its own right... |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 966 | F e = cast(sk_bit_cast<U32>(x)) * (1.0f / (1<<23)); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 967 | |
| 968 | // ... but using the mantissa to refine its error is _much_ better. |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 969 | F m = sk_bit_cast<F>((sk_bit_cast<U32>(x) & 0x007fffff) | 0x3f000000); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 970 | return e |
| 971 | - 124.225514990f |
| 972 | - 1.498030302f * m |
| 973 | - 1.725879990f / (0.3520887068f + m); |
| 974 | } |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 975 | |
| 976 | SI F approx_log(F x) { |
| 977 | const float ln2 = 0.69314718f; |
| 978 | return ln2 * approx_log2(x); |
| 979 | } |
| 980 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 981 | SI F approx_pow2(F x) { |
| 982 | F f = fract(x); |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 983 | return sk_bit_cast<F>(round(1.0f * (1<<23), |
| 984 | x + 121.274057500f |
| 985 | - 1.490129070f * f |
| 986 | + 27.728023300f / (4.84252568f - f))); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 987 | } |
| 988 | |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 989 | SI F approx_exp(F x) { |
| 990 | const float log2_e = 1.4426950408889634074f; |
| 991 | return approx_pow2(log2_e * x); |
| 992 | } |
| 993 | |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 994 | SI F approx_powf(F x, F y) { |
Mike Klein | 229befe | 2018-10-26 12:07:57 -0400 | [diff] [blame] | 995 | return if_then_else((x == 0)|(x == 1), x |
Mike Klein | 229befe | 2018-10-26 12:07:57 -0400 | [diff] [blame] | 996 | , approx_pow2(approx_log2(x) * y)); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 997 | } |
| 998 | |
| 999 | SI F from_half(U16 h) { |
Mike Klein | 7aacb0b | 2019-07-02 13:23:06 -0500 | [diff] [blame] | 1000 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
| 1001 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 1002 | return vcvt_f32_f16(h); |
| 1003 | |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 1004 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 1005 | return _mm256_cvtph_ps(h); |
| 1006 | |
| 1007 | #else |
| 1008 | // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias. |
| 1009 | U32 sem = expand(h), |
| 1010 | s = sem & 0x8000, |
| 1011 | em = sem ^ s; |
| 1012 | |
| 1013 | // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero. |
| 1014 | auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here. |
| 1015 | return if_then_else(denorm, F(0) |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 1016 | , sk_bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) )); |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 1017 | #endif |
| 1018 | } |
| 1019 | |
| 1020 | SI U16 to_half(F f) { |
Mike Klein | 7aacb0b | 2019-07-02 13:23:06 -0500 | [diff] [blame] | 1021 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
| 1022 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 1023 | return vcvt_f16_f32(f); |
| 1024 | |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 1025 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 1026 | return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); |
| 1027 | |
| 1028 | #else |
| 1029 | // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias. |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 1030 | U32 sem = sk_bit_cast<U32>(f), |
Mike Klein | adc78d5 | 2018-01-01 09:06:37 -0500 | [diff] [blame] | 1031 | s = sem & 0x80000000, |
| 1032 | em = sem ^ s; |
| 1033 | |
| 1034 | // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero. |
| 1035 | auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. |
| 1036 | return pack(if_then_else(denorm, U32(0) |
| 1037 | , (s>>16) + (em>>13) - ((127-15)<<10))); |
| 1038 | #endif |
| 1039 | } |
| 1040 | |
| 1041 | // Our fundamental vector depth is our pixel stride. |
| 1042 | static const size_t N = sizeof(F) / sizeof(float); |
| 1043 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1044 | // We're finally going to get to what a Stage function looks like! |
Mike Klein | 0e4d096 | 2017-09-27 11:04:34 -0400 | [diff] [blame] | 1045 | // tail == 0 ~~> work on a full N pixels |
Mike Klein | b5e4842 | 2017-05-30 18:09:29 -0400 | [diff] [blame] | 1046 | // tail != 0 ~~> work on only the first tail pixels |
Mike Klein | 0e4d096 | 2017-09-27 11:04:34 -0400 | [diff] [blame] | 1047 | // tail is always < N. |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1048 | |
// Pick the calling convention (ABI) used by all non-externally-facing stage functions,
// and decide whether to build narrow (compromise) or wide (ideal) stages.
#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM32)
    // 32-bit ARM with NEON: this ABI lets us pass vectors more efficiently.
    // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
    #define ABI __attribute__((pcs("aapcs-vfp")))
    #define JUMPER_NARROW_STAGES 1
#elif defined(_MSC_VER)
    // Even if not vectorized, __vectorcall lets us pass {r,g,b,a} as registers,
    // instead of {b,a} on the stack.  Narrow stages work best for __vectorcall.
    #define ABI __vectorcall
    #define JUMPER_NARROW_STAGES 1
#elif defined(SK_CPU_ARM64) || defined(__x86_64__)
    // These platforms are ideal for wider stages, and their default ABI is ideal.
    #define ABI
    #define JUMPER_NARROW_STAGES 0
#else
    // 32-bit or unknown... shunt them down the narrow path.
    // Odds are these have few registers and are better off there.
    #define ABI
    #define JUMPER_NARROW_STAGES 1
#endif
| 1071 | |
| 1072 | #if JUMPER_NARROW_STAGES |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1073 | struct Params { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1074 | size_t dx, dy, tail; |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1075 | F dr,dg,db,da; |
| 1076 | }; |
Mike Klein | 376fd31 | 2017-12-11 16:53:26 -0500 | [diff] [blame] | 1077 | using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a); |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1078 | #else |
| 1079 | // We keep program the second argument, so that it's passed in rsi for load_and_inc(). |
Mike Klein | 376fd31 | 2017-12-11 16:53:26 -0500 | [diff] [blame] | 1080 | using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F); |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1081 | #endif |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1082 | |
Mike Klein | 376fd31 | 2017-12-11 16:53:26 -0500 | [diff] [blame] | 1083 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 1084 | static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) { |
Mike Klein | 376fd31 | 2017-12-11 16:53:26 -0500 | [diff] [blame] | 1085 | auto start = (Stage)load_and_inc(program); |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1086 | const size_t x0 = dx; |
| 1087 | for (; dy < ylimit; dy++) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 1088 | #if JUMPER_NARROW_STAGES |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1089 | Params params = { x0,dy,0, 0,0,0,0 }; |
| 1090 | while (params.dx + N <= xlimit) { |
Mike Klein | abb8bb3 | 2017-09-27 11:12:01 -0400 | [diff] [blame] | 1091 | start(¶ms,program, 0,0,0,0); |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1092 | params.dx += N; |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1093 | } |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1094 | if (size_t tail = xlimit - params.dx) { |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1095 | params.tail = tail; |
Mike Klein | abb8bb3 | 2017-09-27 11:12:01 -0400 | [diff] [blame] | 1096 | start(¶ms,program, 0,0,0,0); |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1097 | } |
| 1098 | #else |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1099 | dx = x0; |
| 1100 | while (dx + N <= xlimit) { |
| 1101 | start(0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
| 1102 | dx += N; |
Mike Klein | 45c16fa | 2017-07-18 18:15:13 -0400 | [diff] [blame] | 1103 | } |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1104 | if (size_t tail = xlimit - dx) { |
| 1105 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
Mike Klein | 45c16fa | 2017-07-18 18:15:13 -0400 | [diff] [blame] | 1106 | } |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1107 | #endif |
Mike Klein | 3b92b69 | 2017-07-18 11:30:25 -0400 | [diff] [blame] | 1108 | } |
| 1109 | } |
| 1110 | |
// STAGE(name, ctx-args...) { body } defines a stage in two pieces:
//   name##_k  the SI kernel whose body follows the macro invocation; it sees dx, dy, tail
//             and references to r,g,b,a, dr,dg,db,da.
//   name      the function with the Stage signature: it calls name##_k with Ctx{program},
//             then fetches the next stage with load_and_inc() and calls it.
// The macro forward-declares the kernel, defines the wrapper, and ends with the kernel's
// signature so that the user-written body attaches to it.
#if JUMPER_NARROW_STAGES
    #define STAGE(name, ...)                                                      \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,          \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);     \
        static void ABI name(Params* params, void** program,                      \
                             F r, F g, F b, F a) {                                \
            name##_k(Ctx{program}, params->dx, params->dy, params->tail,          \
                     r, g, b, a,                                                  \
                     params->dr, params->dg, params->db, params->da);             \
            auto next_stage = (Stage)load_and_inc(program);                       \
            next_stage(params, program, r, g, b, a);                              \
        }                                                                         \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,          \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
    #define STAGE(name, ...)                                                      \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,          \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);     \
        static void ABI name(size_t tail, void** program, size_t dx, size_t dy,   \
                             F r, F g, F b, F a, F dr, F dg, F db, F da) {        \
            name##_k(Ctx{program}, dx, dy, tail, r, g, b, a, dr, dg, db, da);     \
            auto next_stage = (Stage)load_and_inc(program);                       \
            next_stage(tail, program, dx, dy, r, g, b, a, dr, dg, db, da);        \
        }                                                                         \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,          \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
Mike Klein | b5e4842 | 2017-05-30 18:09:29 -0400 | [diff] [blame] | 1137 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1138 | |
| 1139 | // just_return() is a simple no-op stage that only exists to end the chain, |
| 1140 | // returning back up to start_pipeline(), and from there to the caller. |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 1141 | #if JUMPER_NARROW_STAGES |
Mike Klein | 4d4b3aa | 2018-03-21 13:07:35 -0400 | [diff] [blame] | 1142 | static void ABI just_return(Params*, void**, F,F,F,F) {} |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1143 | #else |
Mike Klein | 4d4b3aa | 2018-03-21 13:07:35 -0400 | [diff] [blame] | 1144 | static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {} |
Mike Klein | f1b24e0 | 2017-07-27 12:31:34 -0400 | [diff] [blame] | 1145 | #endif |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1146 | |
| 1147 | |
Mike Klein | 8a823fa | 2017-04-05 17:29:26 -0400 | [diff] [blame] | 1148 | // We could start defining normal Stages now. But first, some helper functions. |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1149 | |
| 1150 | // These load() and store() methods are tail-aware, |
| 1151 | // but focus mainly on keeping the at-stride tail==0 case fast. |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1152 | |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1153 | template <typename V, typename T> |
Mike Klein | 64b9748 | 2017-03-14 17:35:04 -0700 | [diff] [blame] | 1154 | SI V load(const T* src, size_t tail) { |
Mike Klein | d6e1286 | 2017-08-28 12:18:26 -0400 | [diff] [blame] | 1155 | #if !defined(JUMPER_IS_SCALAR) |
Mike Klein | 0e4d096 | 2017-09-27 11:04:34 -0400 | [diff] [blame] | 1156 | __builtin_assume(tail < N); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1157 | if (__builtin_expect(tail, 0)) { |
| 1158 | V v{}; // Any inactive lanes are zeroed. |
Mike Klein | c4fcbed | 2017-06-26 16:12:48 -0400 | [diff] [blame] | 1159 | switch (tail) { |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 1160 | case 7: v[6] = src[6]; [[fallthrough]]; |
| 1161 | case 6: v[5] = src[5]; [[fallthrough]]; |
| 1162 | case 5: v[4] = src[4]; [[fallthrough]]; |
Mike Klein | c4fcbed | 2017-06-26 16:12:48 -0400 | [diff] [blame] | 1163 | case 4: memcpy(&v, src, 4*sizeof(T)); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 1164 | case 3: v[2] = src[2]; [[fallthrough]]; |
Mike Klein | c4fcbed | 2017-06-26 16:12:48 -0400 | [diff] [blame] | 1165 | case 2: memcpy(&v, src, 2*sizeof(T)); break; |
| 1166 | case 1: memcpy(&v, src, 1*sizeof(T)); break; |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1167 | } |
| 1168 | return v; |
| 1169 | } |
| 1170 | #endif |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1171 | return sk_unaligned_load<V>(src); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1172 | } |
| 1173 | |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1174 | template <typename V, typename T> |
Mike Klein | 64b9748 | 2017-03-14 17:35:04 -0700 | [diff] [blame] | 1175 | SI void store(T* dst, V v, size_t tail) { |
Mike Klein | d6e1286 | 2017-08-28 12:18:26 -0400 | [diff] [blame] | 1176 | #if !defined(JUMPER_IS_SCALAR) |
Mike Klein | 0e4d096 | 2017-09-27 11:04:34 -0400 | [diff] [blame] | 1177 | __builtin_assume(tail < N); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1178 | if (__builtin_expect(tail, 0)) { |
Mike Klein | c4fcbed | 2017-06-26 16:12:48 -0400 | [diff] [blame] | 1179 | switch (tail) { |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 1180 | case 7: dst[6] = v[6]; [[fallthrough]]; |
| 1181 | case 6: dst[5] = v[5]; [[fallthrough]]; |
| 1182 | case 5: dst[4] = v[4]; [[fallthrough]]; |
Mike Klein | c4fcbed | 2017-06-26 16:12:48 -0400 | [diff] [blame] | 1183 | case 4: memcpy(dst, &v, 4*sizeof(T)); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 1184 | case 3: dst[2] = v[2]; [[fallthrough]]; |
Mike Klein | c4fcbed | 2017-06-26 16:12:48 -0400 | [diff] [blame] | 1185 | case 2: memcpy(dst, &v, 2*sizeof(T)); break; |
| 1186 | case 1: memcpy(dst, &v, 1*sizeof(T)); break; |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1187 | } |
| 1188 | return; |
| 1189 | } |
| 1190 | #endif |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1191 | sk_unaligned_store(dst, v); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1192 | } |
| 1193 | |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1194 | SI F from_byte(U8 b) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1195 | return cast(expand(b)) * (1/255.0f); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1196 | } |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 1197 | SI F from_short(U16 s) { |
| 1198 | return cast(expand(s)) * (1/65535.0f); |
| 1199 | } |
Mike Klein | 64b9748 | 2017-03-14 17:35:04 -0700 | [diff] [blame] | 1200 | SI void from_565(U16 _565, F* r, F* g, F* b) { |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 1201 | U32 wide = expand(_565); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1202 | *r = cast(wide & (31<<11)) * (1.0f / (31<<11)); |
| 1203 | *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5)); |
| 1204 | *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0)); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1205 | } |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 1206 | SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) { |
| 1207 | U32 wide = expand(_4444); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1208 | *r = cast(wide & (15<<12)) * (1.0f / (15<<12)); |
| 1209 | *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8)); |
| 1210 | *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4)); |
| 1211 | *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0)); |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 1212 | } |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 1213 | SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1214 | *r = cast((_8888 ) & 0xff) * (1/255.0f); |
| 1215 | *g = cast((_8888 >> 8) & 0xff) * (1/255.0f); |
| 1216 | *b = cast((_8888 >> 16) & 0xff) * (1/255.0f); |
| 1217 | *a = cast((_8888 >> 24) ) * (1/255.0f); |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 1218 | } |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 1219 | SI void from_88(U16 _88, F* r, F* g) { |
| 1220 | U32 wide = expand(_88); |
| 1221 | *r = cast((wide ) & 0xff) * (1/255.0f); |
| 1222 | *g = cast((wide >> 8) & 0xff) * (1/255.0f); |
| 1223 | } |
Mike Klein | ac568a9 | 2018-01-25 09:09:32 -0500 | [diff] [blame] | 1224 | SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) { |
| 1225 | *r = cast((rgba ) & 0x3ff) * (1/1023.0f); |
| 1226 | *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f); |
| 1227 | *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f); |
| 1228 | *a = cast((rgba >> 30) ) * (1/ 3.0f); |
| 1229 | } |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 1230 | SI void from_1616(U32 _1616, F* r, F* g) { |
| 1231 | *r = cast((_1616 ) & 0xffff) * (1/65535.0f); |
| 1232 | *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f); |
| 1233 | } |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 1234 | SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) { |
| 1235 | *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f); |
| 1236 | *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f); |
| 1237 | *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f); |
| 1238 | *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f); |
| 1239 | } |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 1240 | |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1241 | // Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory. |
Mike Klein | 45c16fa | 2017-07-18 18:15:13 -0400 | [diff] [blame] | 1242 | template <typename T> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1243 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1244 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
Mike Klein | 45c16fa | 2017-07-18 18:15:13 -0400 | [diff] [blame] | 1245 | } |
| 1246 | |
Mike Klein | 1fa9c43 | 2017-12-11 09:59:47 -0500 | [diff] [blame] | 1247 | // clamp v to [0,limit). |
| 1248 | SI F clamp(F v, F limit) { |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 1249 | F inclusive = sk_bit_cast<F>( sk_bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive. |
Mike Klein | 1fa9c43 | 2017-12-11 09:59:47 -0500 | [diff] [blame] | 1250 | return min(max(0, v), inclusive); |
| 1251 | } |
| 1252 | |
Mike Klein | 45c16fa | 2017-07-18 18:15:13 -0400 | [diff] [blame] | 1253 | // Used by gather_ stages to calculate the base pointer and a vector of indices to load. |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 1254 | template <typename T> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1255 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
Mike Klein | f3b4e16 | 2017-09-22 15:32:59 -0400 | [diff] [blame] | 1256 | x = clamp(x, ctx->width); |
| 1257 | y = clamp(y, ctx->height); |
| 1258 | |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 1259 | *ptr = (const T*)ctx->pixels; |
| 1260 | return trunc_(y)*ctx->stride + trunc_(x); |
| 1261 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1262 | |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1263 | // We often have a nominally [0,1] float value we need to scale and convert to an integer, |
| 1264 | // whether for a table lookup or to pack back down into bytes for storage. |
| 1265 | // |
| 1266 | // In practice, especially when dealing with interesting color spaces, that notionally |
| 1267 | // [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp. |
| 1268 | // |
| 1269 | // You can adjust the expected input to [0,bias] by tweaking that parameter. |
| 1270 | SI U32 to_unorm(F v, F scale, F bias = 1.0f) { |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1271 | // Any time we use round() we probably want to use to_unorm(). |
| 1272 | return round(min(max(0, v), bias), scale); |
| 1273 | } |
| 1274 | |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 1275 | SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); } |
| 1276 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1277 | // Now finally, normal Stages! |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1278 | |
Mike Klein | e8de024 | 2018-03-10 12:37:11 -0500 | [diff] [blame] | 1279 | STAGE(seed_shader, Ctx::None) { |
| 1280 | static const float iota[] = { |
| 1281 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
| 1282 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
| 1283 | }; |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1284 | // It's important for speed to explicitly cast(dx) and cast(dy), |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1285 | // which has the effect of splatting them to vectors before converting to floats. |
| 1286 | // On Intel this breaks a data dependency on previous loop iterations' registers. |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1287 | r = cast(dx) + sk_unaligned_load<F>(iota); |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1288 | g = cast(dy) + 0.5f; |
Mike Klein | 2229b57 | 2017-04-21 10:30:29 -0400 | [diff] [blame] | 1289 | b = 1.0f; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1290 | a = 0; |
| 1291 | dr = dg = db = da = 0; |
| 1292 | } |
| 1293 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1294 | STAGE(dither, const float* rate) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1295 | // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors. |
Mike Klein | 856b3c3 | 2017-08-29 13:38:09 -0400 | [diff] [blame] | 1296 | uint32_t iota[] = {0,1,2,3,4,5,6,7}; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1297 | U32 X = dx + sk_unaligned_load<U32>(iota), |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1298 | Y = dy; |
Mike Klein | 581e698 | 2017-05-03 13:05:13 -0400 | [diff] [blame] | 1299 | |
| 1300 | // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering. |
| 1301 | // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ]. |
| 1302 | |
| 1303 | // We only need X and X^Y from here on, so it's easier to just think of that as "Y". |
| 1304 | Y ^= X; |
| 1305 | |
| 1306 | // We'll mix the bottom 3 bits of each of X and Y to make 6 bits, |
| 1307 | // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda. |
| 1308 | U32 M = (Y & 1) << 5 | (X & 1) << 4 |
| 1309 | | (Y & 2) << 2 | (X & 2) << 1 |
| 1310 | | (Y & 4) >> 1 | (X & 4) >> 2; |
| 1311 | |
Mike Klein | db711c9 | 2017-05-03 17:57:48 -0400 | [diff] [blame] | 1312 | // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon. |
| 1313 | // We want to make sure our dither is less than 0.5 in either direction to keep exact values |
| 1314 | // like 0 and 1 unchanged after rounding. |
| 1315 | F dither = cast(M) * (2/128.0f) - (63/128.0f); |
Mike Klein | 581e698 | 2017-05-03 13:05:13 -0400 | [diff] [blame] | 1316 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1317 | r += *rate*dither; |
| 1318 | g += *rate*dither; |
| 1319 | b += *rate*dither; |
Mike Klein | 7e68bc9 | 2017-05-16 12:03:15 -0400 | [diff] [blame] | 1320 | |
| 1321 | r = max(0, min(r, a)); |
| 1322 | g = max(0, min(g, a)); |
| 1323 | b = max(0, min(b, a)); |
Mike Klein | 581e698 | 2017-05-03 13:05:13 -0400 | [diff] [blame] | 1324 | } |
| 1325 | |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 1326 | // load 4 floats from memory, and splat them into r,g,b,a |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1327 | STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
Mike Klein | 1a2e3e1 | 2017-08-03 11:24:13 -0400 | [diff] [blame] | 1328 | r = c->r; |
| 1329 | g = c->g; |
| 1330 | b = c->b; |
| 1331 | a = c->a; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1332 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1333 | STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
Mike Klein | cd3e13a | 2018-07-10 15:52:06 +0000 | [diff] [blame] | 1334 | r = c->r; |
| 1335 | g = c->g; |
| 1336 | b = c->b; |
| 1337 | a = c->a; |
| 1338 | } |
Mike Reed | 9318a6c | 2019-08-16 16:16:25 -0400 | [diff] [blame] | 1339 | // load 4 floats from memory, and splat them into dr,dg,db,da |
| 1340 | STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
| 1341 | dr = c->r; |
| 1342 | dg = c->g; |
| 1343 | db = c->b; |
| 1344 | da = c->a; |
| 1345 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1346 | |
Mike Reed | c91e387 | 2017-07-05 14:12:37 -0400 | [diff] [blame] | 1347 | // splats opaque-black into r,g,b,a |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1348 | STAGE(black_color, Ctx::None) { |
Mike Reed | c91e387 | 2017-07-05 14:12:37 -0400 | [diff] [blame] | 1349 | r = g = b = 0.0f; |
| 1350 | a = 1.0f; |
| 1351 | } |
| 1352 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1353 | STAGE(white_color, Ctx::None) { |
Mike Reed | c91e387 | 2017-07-05 14:12:37 -0400 | [diff] [blame] | 1354 | r = g = b = a = 1.0f; |
| 1355 | } |
| 1356 | |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 1357 | // load registers r,g,b,a from context (mirrors store_rgba) |
Mike Reed | 5e398c2 | 2019-03-08 11:50:35 -0500 | [diff] [blame] | 1358 | STAGE(load_src, const float* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1359 | r = sk_unaligned_load<F>(ptr + 0*N); |
| 1360 | g = sk_unaligned_load<F>(ptr + 1*N); |
| 1361 | b = sk_unaligned_load<F>(ptr + 2*N); |
| 1362 | a = sk_unaligned_load<F>(ptr + 3*N); |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 1363 | } |
| 1364 | |
| 1365 | // store registers r,g,b,a into context (mirrors load_rgba) |
Mike Reed | 5e398c2 | 2019-03-08 11:50:35 -0500 | [diff] [blame] | 1366 | STAGE(store_src, float* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1367 | sk_unaligned_store(ptr + 0*N, r); |
| 1368 | sk_unaligned_store(ptr + 1*N, g); |
| 1369 | sk_unaligned_store(ptr + 2*N, b); |
| 1370 | sk_unaligned_store(ptr + 3*N, a); |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 1371 | } |
Mike Reed | 121c2af | 2020-03-10 14:02:56 -0400 | [diff] [blame] | 1372 | STAGE(store_src_a, float* ptr) { |
| 1373 | sk_unaligned_store(ptr, a); |
| 1374 | } |
Mike Reed | 9959f72 | 2017-05-15 09:34:22 -0400 | [diff] [blame] | 1375 | |
Mike Reed | 5e398c2 | 2019-03-08 11:50:35 -0500 | [diff] [blame] | 1376 | // load registers dr,dg,db,da from context (mirrors store_dst) |
| 1377 | STAGE(load_dst, const float* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1378 | dr = sk_unaligned_load<F>(ptr + 0*N); |
| 1379 | dg = sk_unaligned_load<F>(ptr + 1*N); |
| 1380 | db = sk_unaligned_load<F>(ptr + 2*N); |
| 1381 | da = sk_unaligned_load<F>(ptr + 3*N); |
Mike Reed | 5e398c2 | 2019-03-08 11:50:35 -0500 | [diff] [blame] | 1382 | } |
| 1383 | |
| 1384 | // store registers dr,dg,db,da into context (mirrors load_dst) |
| 1385 | STAGE(store_dst, float* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1386 | sk_unaligned_store(ptr + 0*N, dr); |
| 1387 | sk_unaligned_store(ptr + 1*N, dg); |
| 1388 | sk_unaligned_store(ptr + 2*N, db); |
| 1389 | sk_unaligned_store(ptr + 3*N, da); |
Mike Reed | 5e398c2 | 2019-03-08 11:50:35 -0500 | [diff] [blame] | 1390 | } |
| 1391 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1392 | // Most blend modes apply the same logic to each channel. |
Mike Klein | aaca1e4 | 2017-03-31 09:29:01 -0400 | [diff] [blame] | 1393 | #define BLEND_MODE(name) \ |
| 1394 | SI F name##_channel(F s, F d, F sa, F da); \ |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1395 | STAGE(name, Ctx::None) { \ |
Mike Klein | aaca1e4 | 2017-03-31 09:29:01 -0400 | [diff] [blame] | 1396 | r = name##_channel(r,dr,a,da); \ |
| 1397 | g = name##_channel(g,dg,a,da); \ |
| 1398 | b = name##_channel(b,db,a,da); \ |
| 1399 | a = name##_channel(a,da,a,da); \ |
| 1400 | } \ |
| 1401 | SI F name##_channel(F s, F d, F sa, F da) |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1402 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1403 | SI F inv(F x) { return 1.0f - x; } |
Mike Klein | 66b09ab | 2017-03-31 10:29:40 -0400 | [diff] [blame] | 1404 | SI F two(F x) { return x + x; } |
Yuqian Li | 7741c75 | 2017-12-11 14:17:47 -0500 | [diff] [blame] | 1405 | |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1406 | |
Mike Klein | aaca1e4 | 2017-03-31 09:29:01 -0400 | [diff] [blame] | 1407 | BLEND_MODE(clear) { return 0; } |
| 1408 | BLEND_MODE(srcatop) { return s*da + d*inv(sa); } |
| 1409 | BLEND_MODE(dstatop) { return d*sa + s*inv(da); } |
| 1410 | BLEND_MODE(srcin) { return s * da; } |
| 1411 | BLEND_MODE(dstin) { return d * sa; } |
| 1412 | BLEND_MODE(srcout) { return s * inv(da); } |
| 1413 | BLEND_MODE(dstout) { return d * inv(sa); } |
| 1414 | BLEND_MODE(srcover) { return mad(d, inv(sa), s); } |
| 1415 | BLEND_MODE(dstover) { return mad(s, inv(da), d); } |
| 1416 | |
| 1417 | BLEND_MODE(modulate) { return s*d; } |
| 1418 | BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; } |
Mike Klein | b90c080 | 2019-03-15 14:03:41 +0000 | [diff] [blame] | 1419 | BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa. |
Mike Klein | aaca1e4 | 2017-03-31 09:29:01 -0400 | [diff] [blame] | 1420 | BLEND_MODE(screen) { return s + d - s*d; } |
| 1421 | BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); } |
Mike Klein | 66b09ab | 2017-03-31 10:29:40 -0400 | [diff] [blame] | 1422 | #undef BLEND_MODE |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1423 | |
| 1424 | // Most other blend modes apply the same logic to colors, and srcover to alpha. |
Mike Klein | 66b09ab | 2017-03-31 10:29:40 -0400 | [diff] [blame] | 1425 | #define BLEND_MODE(name) \ |
| 1426 | SI F name##_channel(F s, F d, F sa, F da); \ |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1427 | STAGE(name, Ctx::None) { \ |
Mike Klein | 66b09ab | 2017-03-31 10:29:40 -0400 | [diff] [blame] | 1428 | r = name##_channel(r,dr,a,da); \ |
| 1429 | g = name##_channel(g,dg,a,da); \ |
| 1430 | b = name##_channel(b,db,a,da); \ |
| 1431 | a = mad(da, inv(a), a); \ |
| 1432 | } \ |
| 1433 | SI F name##_channel(F s, F d, F sa, F da) |
| 1434 | |
| 1435 | BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; } |
| 1436 | BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; } |
| 1437 | BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); } |
| 1438 | BLEND_MODE(exclusion) { return s + d - two(s*d); } |
| 1439 | |
Mike Klein | 61b8416 | 2017-03-31 11:48:14 -0400 | [diff] [blame] | 1440 | BLEND_MODE(colorburn) { |
Florin Malita | 59a62ed | 2017-08-23 12:08:37 -0400 | [diff] [blame] | 1441 | return if_then_else(d == da, d + s*inv(da), |
| 1442 | if_then_else(s == 0, /* s + */ d*inv(sa), |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 1443 | sa*(da - min(da, (da-d)*sa*rcp_fast(s))) + s*inv(da) + d*inv(sa))); |
Mike Klein | 61b8416 | 2017-03-31 11:48:14 -0400 | [diff] [blame] | 1444 | } |
| 1445 | BLEND_MODE(colordodge) { |
Florin Malita | 59a62ed | 2017-08-23 12:08:37 -0400 | [diff] [blame] | 1446 | return if_then_else(d == 0, /* d + */ s*inv(da), |
| 1447 | if_then_else(s == sa, s + d*inv(sa), |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 1448 | sa*min(da, (d*sa)*rcp_fast(sa - s)) + s*inv(da) + d*inv(sa))); |
Mike Klein | 61b8416 | 2017-03-31 11:48:14 -0400 | [diff] [blame] | 1449 | } |
| 1450 | BLEND_MODE(hardlight) { |
| 1451 | return s*inv(da) + d*inv(sa) |
| 1452 | + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s))); |
| 1453 | } |
| 1454 | BLEND_MODE(overlay) { |
| 1455 | return s*inv(da) + d*inv(sa) |
| 1456 | + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s))); |
| 1457 | } |
| 1458 | |
| 1459 | BLEND_MODE(softlight) { |
| 1460 | F m = if_then_else(da > 0, d / da, 0), |
| 1461 | s2 = two(s), |
| 1462 | m4 = two(two(m)); |
| 1463 | |
| 1464 | // The logic forks three ways: |
| 1465 | // 1. dark src? |
| 1466 | // 2. light src, dark dst? |
| 1467 | // 3. light src, light dst? |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1468 | F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1. |
| 1469 | darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2. |
Mike Klein | 395274e | 2021-04-22 12:41:56 -0500 | [diff] [blame] | 1470 | #if defined(SK_RASTER_PIPELINE_LEGACY_RCP_RSQRT) |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 1471 | liteDst = rcp_fast(rsqrt(m)) - m, // Used in case 3. |
Mike Klein | 395274e | 2021-04-22 12:41:56 -0500 | [diff] [blame] | 1472 | #else |
| 1473 | liteDst = sqrt_(m) - m, |
| 1474 | #endif |
Mike Klein | 61b8416 | 2017-03-31 11:48:14 -0400 | [diff] [blame] | 1475 | liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3? |
| 1476 | return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)? |
| 1477 | } |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1478 | #undef BLEND_MODE |
Mike Klein | 61b8416 | 2017-03-31 11:48:14 -0400 | [diff] [blame] | 1479 | |
// We're basing our implementation of non-separable blend modes on
| 1481 | // https://www.w3.org/TR/compositing-1/#blendingnonseparable. |
| 1482 | // and |
| 1483 | // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf |
| 1484 | // They're equivalent, but ES' math has been better simplified. |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1485 | // |
| 1486 | // Anything extra we add beyond that is to make the math work with premul inputs. |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1487 | |
Mike Klein | 5d835d0 | 2019-10-16 13:28:55 -0500 | [diff] [blame] | 1488 | SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); } |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1489 | SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; } |
| 1490 | |
| 1491 | SI void set_sat(F* r, F* g, F* b, F s) { |
Mike Klein | 5d835d0 | 2019-10-16 13:28:55 -0500 | [diff] [blame] | 1492 | F mn = min(*r, min(*g,*b)), |
| 1493 | mx = max(*r, max(*g,*b)), |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1494 | sat = mx - mn; |
| 1495 | |
| 1496 | // Map min channel to 0, max channel to s, and scale the middle proportionally. |
| 1497 | auto scale = [=](F c) { |
| 1498 | return if_then_else(sat == 0, 0, (c - mn) * s / sat); |
| 1499 | }; |
| 1500 | *r = scale(*r); |
| 1501 | *g = scale(*g); |
| 1502 | *b = scale(*b); |
| 1503 | } |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1504 | SI void set_lum(F* r, F* g, F* b, F l) { |
| 1505 | F diff = l - lum(*r, *g, *b); |
| 1506 | *r += diff; |
| 1507 | *g += diff; |
| 1508 | *b += diff; |
| 1509 | } |
| 1510 | SI void clip_color(F* r, F* g, F* b, F a) { |
Mike Klein | 5d835d0 | 2019-10-16 13:28:55 -0500 | [diff] [blame] | 1511 | F mn = min(*r, min(*g, *b)), |
| 1512 | mx = max(*r, max(*g, *b)), |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1513 | l = lum(*r, *g, *b); |
| 1514 | |
| 1515 | auto clip = [=](F c) { |
| 1516 | c = if_then_else(mn >= 0, c, l + (c - l) * ( l) / (l - mn) ); |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1517 | c = if_then_else(mx > a, l + (c - l) * (a - l) / (mx - l), c); |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1518 | c = max(c, 0); // Sometimes without this we may dip just a little negative. |
| 1519 | return c; |
| 1520 | }; |
| 1521 | *r = clip(*r); |
| 1522 | *g = clip(*g); |
| 1523 | *b = clip(*b); |
| 1524 | } |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1525 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1526 | STAGE(hue, Ctx::None) { |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1527 | F R = r*a, |
| 1528 | G = g*a, |
| 1529 | B = b*a; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1530 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1531 | set_sat(&R, &G, &B, sat(dr,dg,db)*a); |
| 1532 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
| 1533 | clip_color(&R,&G,&B, a*da); |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1534 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1535 | r = r*inv(da) + dr*inv(a) + R; |
| 1536 | g = g*inv(da) + dg*inv(a) + G; |
| 1537 | b = b*inv(da) + db*inv(a) + B; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1538 | a = a + da - a*da; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1539 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1540 | STAGE(saturation, Ctx::None) { |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1541 | F R = dr*a, |
| 1542 | G = dg*a, |
| 1543 | B = db*a; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1544 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1545 | set_sat(&R, &G, &B, sat( r, g, b)*da); |
| 1546 | set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.) |
| 1547 | clip_color(&R,&G,&B, a*da); |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1548 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1549 | r = r*inv(da) + dr*inv(a) + R; |
| 1550 | g = g*inv(da) + dg*inv(a) + G; |
| 1551 | b = b*inv(da) + db*inv(a) + B; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1552 | a = a + da - a*da; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1553 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1554 | STAGE(color, Ctx::None) { |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1555 | F R = r*da, |
| 1556 | G = g*da, |
| 1557 | B = b*da; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1558 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1559 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
| 1560 | clip_color(&R,&G,&B, a*da); |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1561 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1562 | r = r*inv(da) + dr*inv(a) + R; |
| 1563 | g = g*inv(da) + dg*inv(a) + G; |
| 1564 | b = b*inv(da) + db*inv(a) + B; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1565 | a = a + da - a*da; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1566 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1567 | STAGE(luminosity, Ctx::None) { |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1568 | F R = dr*a, |
| 1569 | G = dg*a, |
| 1570 | B = db*a; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1571 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1572 | set_lum(&R, &G, &B, lum(r,g,b)*da); |
| 1573 | clip_color(&R,&G,&B, a*da); |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1574 | |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1575 | r = r*inv(da) + dr*inv(a) + R; |
| 1576 | g = g*inv(da) + dg*inv(a) + G; |
| 1577 | b = b*inv(da) + db*inv(a) + B; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1578 | a = a + da - a*da; |
Mike Klein | bb33833 | 2017-05-04 12:42:52 -0400 | [diff] [blame] | 1579 | } |
| 1580 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1581 | STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1582 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
Mike Klein | 5062626 | 2017-05-25 13:06:57 -0400 | [diff] [blame] | 1583 | |
| 1584 | U32 dst = load<U32>(ptr, tail); |
| 1585 | dr = cast((dst ) & 0xff); |
| 1586 | dg = cast((dst >> 8) & 0xff); |
| 1587 | db = cast((dst >> 16) & 0xff); |
| 1588 | da = cast((dst >> 24) ); |
| 1589 | // {dr,dg,db,da} are in [0,255] |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1590 | // { r, g, b, a} are in [0, 1] (but may be out of gamut) |
Mike Klein | 5062626 | 2017-05-25 13:06:57 -0400 | [diff] [blame] | 1591 | |
| 1592 | r = mad(dr, inv(a), r*255.0f); |
| 1593 | g = mad(dg, inv(a), g*255.0f); |
| 1594 | b = mad(db, inv(a), b*255.0f); |
| 1595 | a = mad(da, inv(a), a*255.0f); |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1596 | // { r, g, b, a} are now in [0,255] (but may be out of gamut) |
Mike Klein | 5062626 | 2017-05-25 13:06:57 -0400 | [diff] [blame] | 1597 | |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1598 | // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased. |
| 1599 | dst = to_unorm(r, 1, 255) |
| 1600 | | to_unorm(g, 1, 255) << 8 |
| 1601 | | to_unorm(b, 1, 255) << 16 |
| 1602 | | to_unorm(a, 1, 255) << 24; |
Mike Klein | 5062626 | 2017-05-25 13:06:57 -0400 | [diff] [blame] | 1603 | store(ptr, dst, tail); |
| 1604 | } |
| 1605 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1606 | STAGE(clamp_0, Ctx::None) { |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1607 | r = max(r, 0); |
| 1608 | g = max(g, 0); |
| 1609 | b = max(b, 0); |
| 1610 | a = max(a, 0); |
| 1611 | } |
| 1612 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1613 | STAGE(clamp_1, Ctx::None) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1614 | r = min(r, 1.0f); |
| 1615 | g = min(g, 1.0f); |
| 1616 | b = min(b, 1.0f); |
| 1617 | a = min(a, 1.0f); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1618 | } |
| 1619 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1620 | STAGE(clamp_a, Ctx::None) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1621 | a = min(a, 1.0f); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1622 | r = min(r, a); |
| 1623 | g = min(g, a); |
| 1624 | b = min(b, a); |
| 1625 | } |
| 1626 | |
Mike Klein | eb50f43 | 2018-09-07 11:08:53 -0400 | [diff] [blame] | 1627 | STAGE(clamp_gamut, Ctx::None) { |
Mike Klein | 6495a4f | 2020-01-08 12:45:36 -0600 | [diff] [blame] | 1628 | a = min(max(a, 0), 1.0f); |
Mike Klein | eb50f43 | 2018-09-07 11:08:53 -0400 | [diff] [blame] | 1629 | r = min(max(r, 0), a); |
| 1630 | g = min(max(g, 0), a); |
| 1631 | b = min(max(b, 0), a); |
| 1632 | } |
| 1633 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1634 | STAGE(set_rgb, const float* rgb) { |
Mike Klein | d9e8225 | 2017-02-22 14:17:32 -0500 | [diff] [blame] | 1635 | r = rgb[0]; |
| 1636 | g = rgb[1]; |
| 1637 | b = rgb[2]; |
| 1638 | } |
Mike Klein | be56949 | 2018-09-14 09:34:21 -0400 | [diff] [blame] | 1639 | STAGE(unbounded_set_rgb, const float* rgb) { |
| 1640 | r = rgb[0]; |
| 1641 | g = rgb[1]; |
| 1642 | b = rgb[2]; |
| 1643 | } |
Mike Klein | 1a3eb52 | 2018-10-18 10:11:00 -0400 | [diff] [blame] | 1644 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1645 | STAGE(swap_rb, Ctx::None) { |
Mike Klein | d9e8225 | 2017-02-22 14:17:32 -0500 | [diff] [blame] | 1646 | auto tmp = r; |
| 1647 | r = b; |
| 1648 | b = tmp; |
| 1649 | } |
Mike Klein | 1a3eb52 | 2018-10-18 10:11:00 -0400 | [diff] [blame] | 1650 | STAGE(swap_rb_dst, Ctx::None) { |
| 1651 | auto tmp = dr; |
| 1652 | dr = db; |
| 1653 | db = tmp; |
| 1654 | } |
Mike Klein | d9e8225 | 2017-02-22 14:17:32 -0500 | [diff] [blame] | 1655 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1656 | STAGE(move_src_dst, Ctx::None) { |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1657 | dr = r; |
| 1658 | dg = g; |
| 1659 | db = b; |
| 1660 | da = a; |
| 1661 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1662 | STAGE(move_dst_src, Ctx::None) { |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1663 | r = dr; |
| 1664 | g = dg; |
| 1665 | b = db; |
| 1666 | a = da; |
| 1667 | } |
Brian Osman | 9f1e06a | 2021-08-10 14:39:18 -0400 | [diff] [blame] | 1668 | STAGE(swap_src_dst, Ctx::None) { |
| 1669 | std::swap(r, dr); |
| 1670 | std::swap(g, dg); |
| 1671 | std::swap(b, db); |
| 1672 | std::swap(a, da); |
| 1673 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1674 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1675 | STAGE(premul, Ctx::None) { |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1676 | r = r * a; |
| 1677 | g = g * a; |
| 1678 | b = b * a; |
| 1679 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1680 | STAGE(premul_dst, Ctx::None) { |
Mike Reed | 883c9bc | 2017-07-19 10:57:53 -0400 | [diff] [blame] | 1681 | dr = dr * da; |
| 1682 | dg = dg * da; |
| 1683 | db = db * da; |
| 1684 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1685 | STAGE(unpremul, Ctx::None) { |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 1686 | float inf = sk_bit_cast<float>(0x7f800000); |
Mike Klein | a65f2f0 | 2017-10-11 13:05:24 -0400 | [diff] [blame] | 1687 | auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0); |
Mike Klein | 08aa88d | 2017-05-12 12:59:24 -0400 | [diff] [blame] | 1688 | r *= scale; |
| 1689 | g *= scale; |
| 1690 | b *= scale; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1691 | } |
| 1692 | |
Mike Klein | ac568a9 | 2018-01-25 09:09:32 -0500 | [diff] [blame] | 1693 | STAGE(force_opaque , Ctx::None) { a = 1; } |
| 1694 | STAGE(force_opaque_dst, Ctx::None) { da = 1; } |
| 1695 | |
Florin Malita | a8392b7 | 2019-10-23 17:37:35 -0400 | [diff] [blame] | 1696 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
| 1697 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
| 1698 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
| 1699 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1700 | STAGE(rgb_to_hsl, Ctx::None) { |
Mike Klein | 5d835d0 | 2019-10-16 13:28:55 -0500 | [diff] [blame] | 1701 | F mx = max(r, max(g,b)), |
| 1702 | mn = min(r, min(g,b)), |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1703 | d = mx - mn, |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1704 | d_rcp = 1.0f / d; |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1705 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1706 | F h = (1/6.0f) * |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1707 | if_then_else(mx == mn, 0, |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1708 | if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0), |
| 1709 | if_then_else(mx == g, (b-r)*d_rcp + 2.0f, |
| 1710 | (r-g)*d_rcp + 4.0f))); |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1711 | |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1712 | F l = (mx + mn) * 0.5f; |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1713 | F s = if_then_else(mx == mn, 0, |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1714 | d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn)); |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1715 | |
| 1716 | r = h; |
| 1717 | g = s; |
| 1718 | b = l; |
| 1719 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1720 | STAGE(hsl_to_rgb, Ctx::None) { |
Florin Malita | a8392b7 | 2019-10-23 17:37:35 -0400 | [diff] [blame] | 1721 | // See GrRGBToHSLFilterEffect.fp |
| 1722 | |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1723 | F h = r, |
| 1724 | s = g, |
Florin Malita | a8392b7 | 2019-10-23 17:37:35 -0400 | [diff] [blame] | 1725 | l = b, |
| 1726 | c = (1.0f - abs_(2.0f * l - 1)) * s; |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1727 | |
Florin Malita | a8392b7 | 2019-10-23 17:37:35 -0400 | [diff] [blame] | 1728 | auto hue_to_rgb = [&](F hue) { |
| 1729 | F q = clamp_01(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f); |
| 1730 | return (q - 0.5f) * c + l; |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1731 | }; |
| 1732 | |
Florin Malita | a8392b7 | 2019-10-23 17:37:35 -0400 | [diff] [blame] | 1733 | r = hue_to_rgb(h + 0.0f/3.0f); |
| 1734 | g = hue_to_rgb(h + 2.0f/3.0f); |
| 1735 | b = hue_to_rgb(h + 1.0f/3.0f); |
Mike Klein | db1cbcb | 2017-04-12 08:35:41 -0400 | [diff] [blame] | 1736 | } |
| 1737 | |
Mike Klein | fb126fa | 2017-08-24 13:06:23 -0400 | [diff] [blame] | 1738 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
| 1739 | SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) { |
Mike Klein | 5d835d0 | 2019-10-16 13:28:55 -0500 | [diff] [blame] | 1740 | return if_then_else(a < da, min(cr, min(cg,cb)) |
| 1741 | , max(cr, max(cg,cb))); |
Mike Klein | fb126fa | 2017-08-24 13:06:23 -0400 | [diff] [blame] | 1742 | } |
| 1743 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1744 | STAGE(scale_1_float, const float* c) { |
| 1745 | r = r * *c; |
| 1746 | g = g * *c; |
| 1747 | b = b * *c; |
| 1748 | a = a * *c; |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 1749 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1750 | STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1751 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1752 | |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1753 | auto scales = load<U8>(ptr, tail); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1754 | auto c = from_byte(scales); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1755 | |
| 1756 | r = r * c; |
| 1757 | g = g * c; |
| 1758 | b = b * c; |
| 1759 | a = a * c; |
| 1760 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1761 | STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1762 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
Mike Klein | fb126fa | 2017-08-24 13:06:23 -0400 | [diff] [blame] | 1763 | |
| 1764 | F cr,cg,cb; |
| 1765 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
| 1766 | |
| 1767 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
| 1768 | |
| 1769 | r = r * cr; |
| 1770 | g = g * cg; |
| 1771 | b = b * cb; |
| 1772 | a = a * ca; |
| 1773 | } |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 1774 | |
Mike Klein | b9c4a6f | 2017-04-03 13:54:55 -0400 | [diff] [blame] | 1775 | SI F lerp(F from, F to, F t) { |
| 1776 | return mad(to-from, t, from); |
| 1777 | } |
| 1778 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 1779 | STAGE(lerp_1_float, const float* c) { |
| 1780 | r = lerp(dr, r, *c); |
| 1781 | g = lerp(dg, g, *c); |
| 1782 | b = lerp(db, b, *c); |
| 1783 | a = lerp(da, a, *c); |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 1784 | } |
Mike Reed | 121c2af | 2020-03-10 14:02:56 -0400 | [diff] [blame] | 1785 | STAGE(scale_native, const float scales[]) { |
| 1786 | auto c = sk_unaligned_load<F>(scales); |
| 1787 | r = r * c; |
| 1788 | g = g * c; |
| 1789 | b = b * c; |
| 1790 | a = a * c; |
| 1791 | } |
Mike Reed | 79a7542 | 2019-03-15 15:45:09 -0400 | [diff] [blame] | 1792 | STAGE(lerp_native, const float scales[]) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 1793 | auto c = sk_unaligned_load<F>(scales); |
Mike Reed | 79a7542 | 2019-03-15 15:45:09 -0400 | [diff] [blame] | 1794 | r = lerp(dr, r, c); |
| 1795 | g = lerp(dg, g, c); |
| 1796 | b = lerp(db, b, c); |
| 1797 | a = lerp(da, a, c); |
| 1798 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1799 | STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1800 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
Mike Klein | 2b76736 | 2017-02-22 13:52:40 -0500 | [diff] [blame] | 1801 | |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1802 | auto scales = load<U8>(ptr, tail); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1803 | auto c = from_byte(scales); |
Mike Klein | 2b76736 | 2017-02-22 13:52:40 -0500 | [diff] [blame] | 1804 | |
| 1805 | r = lerp(dr, r, c); |
| 1806 | g = lerp(dg, g, c); |
| 1807 | b = lerp(db, b, c); |
| 1808 | a = lerp(da, a, c); |
| 1809 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1810 | STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1811 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 1812 | |
| 1813 | F cr,cg,cb; |
Mike Klein | 5224f46 | 2017-03-07 17:29:54 -0500 | [diff] [blame] | 1814 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 1815 | |
Mike Klein | fb126fa | 2017-08-24 13:06:23 -0400 | [diff] [blame] | 1816 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
| 1817 | |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 1818 | r = lerp(dr, r, cr); |
| 1819 | g = lerp(dg, g, cg); |
| 1820 | b = lerp(db, b, cb); |
Mike Klein | fb126fa | 2017-08-24 13:06:23 -0400 | [diff] [blame] | 1821 | a = lerp(da, a, ca); |
Mike Klein | e3d4421 | 2017-02-24 08:21:18 -0500 | [diff] [blame] | 1822 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 1823 | |
Mike Klein | eda2ac2 | 2018-11-06 11:53:59 -0500 | [diff] [blame] | 1824 | STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
| 1825 | auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), |
| 1826 | aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy); |
| 1827 | |
| 1828 | F mul = from_byte(load<U8>(mptr, tail)), |
| 1829 | add = from_byte(load<U8>(aptr, tail)); |
| 1830 | |
| 1831 | r = mad(r, mul, add); |
| 1832 | g = mad(g, mul, add); |
| 1833 | b = mad(b, mul, add); |
| 1834 | } |
| 1835 | |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 1836 | STAGE(byte_tables, const void* ctx) { |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1837 | struct Tables { const uint8_t *r, *g, *b, *a; }; |
| 1838 | auto tables = (const Tables*)ctx; |
| 1839 | |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1840 | r = from_byte(gather(tables->r, to_unorm(r, 255))); |
| 1841 | g = from_byte(gather(tables->g, to_unorm(g, 255))); |
| 1842 | b = from_byte(gather(tables->b, to_unorm(b, 255))); |
| 1843 | a = from_byte(gather(tables->a, to_unorm(a, 255))); |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1844 | } |
| 1845 | |
Mike Klein | b1c77e4 | 2018-09-06 15:23:29 -0400 | [diff] [blame] | 1846 | SI F strip_sign(F x, U32* sign) { |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 1847 | U32 bits = sk_bit_cast<U32>(x); |
Mike Klein | b1c77e4 | 2018-09-06 15:23:29 -0400 | [diff] [blame] | 1848 | *sign = bits & 0x80000000; |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 1849 | return sk_bit_cast<F>(bits ^ *sign); |
Mike Klein | b1c77e4 | 2018-09-06 15:23:29 -0400 | [diff] [blame] | 1850 | } |
Mike Klein | c4e4063 | 2018-09-05 15:16:52 -0400 | [diff] [blame] | 1851 | |
Mike Klein | b1c77e4 | 2018-09-06 15:23:29 -0400 | [diff] [blame] | 1852 | SI F apply_sign(F x, U32 sign) { |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 1853 | return sk_bit_cast<F>(sign | sk_bit_cast<U32>(x)); |
Mike Klein | b1c77e4 | 2018-09-06 15:23:29 -0400 | [diff] [blame] | 1854 | } |
Mike Klein | c4e4063 | 2018-09-05 15:16:52 -0400 | [diff] [blame] | 1855 | |
Brian Osman | 5deadca | 2019-01-24 12:18:17 -0500 | [diff] [blame] | 1856 | STAGE(parametric, const skcms_TransferFunction* ctx) { |
Mike Klein | 4eebd9e | 2018-07-11 14:49:51 -0400 | [diff] [blame] | 1857 | auto fn = [&](F v) { |
Mike Klein | c4e4063 | 2018-09-05 15:16:52 -0400 | [diff] [blame] | 1858 | U32 sign; |
| 1859 | v = strip_sign(v, &sign); |
| 1860 | |
Brian Osman | 5deadca | 2019-01-24 12:18:17 -0500 | [diff] [blame] | 1861 | F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f) |
| 1862 | , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e); |
Mike Klein | 33d3d31 | 2018-09-05 17:52:25 -0400 | [diff] [blame] | 1863 | return apply_sign(r, sign); |
Mike Klein | 4eebd9e | 2018-07-11 14:49:51 -0400 | [diff] [blame] | 1864 | }; |
| 1865 | r = fn(r); |
| 1866 | g = fn(g); |
| 1867 | b = fn(b); |
Mike Klein | 4437517 | 2017-04-17 19:32:05 -0400 | [diff] [blame] | 1868 | } |
Mike Klein | 4437517 | 2017-04-17 19:32:05 -0400 | [diff] [blame] | 1869 | |
Mike Klein | 1ce03a6 | 2019-04-23 08:00:35 -0500 | [diff] [blame] | 1870 | STAGE(gamma_, const float* G) { |
Mike Klein | c4e4063 | 2018-09-05 15:16:52 -0400 | [diff] [blame] | 1871 | auto fn = [&](F v) { |
| 1872 | U32 sign; |
| 1873 | v = strip_sign(v, &sign); |
| 1874 | return apply_sign(approx_powf(v, *G), sign); |
| 1875 | }; |
| 1876 | r = fn(r); |
| 1877 | g = fn(g); |
| 1878 | b = fn(b); |
| 1879 | } |
| 1880 | |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 1881 | STAGE(PQish, const skcms_TransferFunction* ctx) { |
| 1882 | auto fn = [&](F v) { |
| 1883 | U32 sign; |
| 1884 | v = strip_sign(v, &sign); |
| 1885 | |
| 1886 | F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0) |
| 1887 | / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)), |
| 1888 | ctx->f); |
| 1889 | |
| 1890 | return apply_sign(r, sign); |
| 1891 | }; |
| 1892 | r = fn(r); |
| 1893 | g = fn(g); |
| 1894 | b = fn(b); |
| 1895 | } |
| 1896 | |
| 1897 | STAGE(HLGish, const skcms_TransferFunction* ctx) { |
| 1898 | auto fn = [&](F v) { |
| 1899 | U32 sign; |
| 1900 | v = strip_sign(v, &sign); |
| 1901 | |
| 1902 | const float R = ctx->a, G = ctx->b, |
Mike Klein | 627c002 | 2021-01-07 10:50:01 -0600 | [diff] [blame] | 1903 | a = ctx->c, b = ctx->d, c = ctx->e, |
| 1904 | K = ctx->f + 1.0f; |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 1905 | |
| 1906 | F r = if_then_else(v*R <= 1, approx_powf(v*R, G) |
| 1907 | , approx_exp((v-c)*a) + b); |
| 1908 | |
Mike Klein | 627c002 | 2021-01-07 10:50:01 -0600 | [diff] [blame] | 1909 | return K * apply_sign(r, sign); |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 1910 | }; |
| 1911 | r = fn(r); |
| 1912 | g = fn(g); |
| 1913 | b = fn(b); |
| 1914 | } |
| 1915 | |
| 1916 | STAGE(HLGinvish, const skcms_TransferFunction* ctx) { |
| 1917 | auto fn = [&](F v) { |
| 1918 | U32 sign; |
| 1919 | v = strip_sign(v, &sign); |
| 1920 | |
| 1921 | const float R = ctx->a, G = ctx->b, |
Mike Klein | 627c002 | 2021-01-07 10:50:01 -0600 | [diff] [blame] | 1922 | a = ctx->c, b = ctx->d, c = ctx->e, |
| 1923 | K = ctx->f + 1.0f; |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 1924 | |
Mike Klein | 627c002 | 2021-01-07 10:50:01 -0600 | [diff] [blame] | 1925 | v /= K; |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 1926 | F r = if_then_else(v <= 1, R * approx_powf(v, G) |
| 1927 | , a * approx_log(v - b) + c); |
| 1928 | |
| 1929 | return apply_sign(r, sign); |
| 1930 | }; |
| 1931 | r = fn(r); |
| 1932 | g = fn(g); |
| 1933 | b = fn(b); |
| 1934 | } |
| 1935 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1936 | STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1937 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 1938 | |
| 1939 | r = g = b = 0.0f; |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1940 | a = from_byte(load<U8>(ptr, tail)); |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 1941 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1942 | STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1943 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
Mike Reed | 279091e | 2017-06-27 16:58:00 -0400 | [diff] [blame] | 1944 | |
| 1945 | dr = dg = db = 0.0f; |
| 1946 | da = from_byte(load<U8>(ptr, tail)); |
| 1947 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1948 | STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 1949 | const uint8_t* ptr; |
| 1950 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 1951 | r = g = b = 0.0f; |
Mike Klein | 40de6da | 2017-04-07 13:09:29 -0400 | [diff] [blame] | 1952 | a = from_byte(gather(ptr, ix)); |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 1953 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1954 | STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1955 | auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 1956 | |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1957 | U8 packed = pack(pack(to_unorm(a, 255))); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1958 | store(ptr, packed, tail); |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 1959 | } |
Brian Osman | a7a2324 | 2022-02-08 10:34:38 -0500 | [diff] [blame] | 1960 | STAGE(store_r8, const SkRasterPipeline_MemoryCtx* ctx) { |
| 1961 | auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); |
| 1962 | |
| 1963 | U8 packed = pack(pack(to_unorm(r, 255))); |
| 1964 | store(ptr, packed, tail); |
| 1965 | } |
Mike Klein | 420e38f | 2017-02-24 09:05:14 -0500 | [diff] [blame] | 1966 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1967 | STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1968 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 1969 | |
Mike Klein | 5224f46 | 2017-03-07 17:29:54 -0500 | [diff] [blame] | 1970 | from_565(load<U16>(ptr, tail), &r,&g,&b); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1971 | a = 1.0f; |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 1972 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1973 | STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1974 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
Mike Reed | 279091e | 2017-06-27 16:58:00 -0400 | [diff] [blame] | 1975 | |
| 1976 | from_565(load<U16>(ptr, tail), &dr,&dg,&db); |
| 1977 | da = 1.0f; |
| 1978 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1979 | STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 1980 | const uint16_t* ptr; |
| 1981 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 1982 | from_565(gather(ptr, ix), &r,&g,&b); |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 1983 | a = 1.0f; |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 1984 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1985 | STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1986 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 1987 | |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 1988 | U16 px = pack( to_unorm(r, 31) << 11 |
| 1989 | | to_unorm(g, 63) << 5 |
| 1990 | | to_unorm(b, 31) ); |
Mike Klein | c31858b | 2017-03-01 13:07:40 -0500 | [diff] [blame] | 1991 | store(ptr, px, tail); |
Mike Klein | 3f81f37 | 2017-02-23 13:03:57 -0500 | [diff] [blame] | 1992 | } |
| 1993 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1994 | STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1995 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 1996 | from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); |
| 1997 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 1998 | STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 1999 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
Mike Reed | 279091e | 2017-06-27 16:58:00 -0400 | [diff] [blame] | 2000 | from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da); |
| 2001 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2002 | STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 21bd3e4 | 2017-04-06 16:32:29 -0400 | [diff] [blame] | 2003 | const uint16_t* ptr; |
| 2004 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 2005 | from_4444(gather(ptr, ix), &r,&g,&b,&a); |
| 2006 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2007 | STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2008 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 2009 | U16 px = pack( to_unorm(r, 15) << 12 |
| 2010 | | to_unorm(g, 15) << 8 |
| 2011 | | to_unorm(b, 15) << 4 |
| 2012 | | to_unorm(a, 15) ); |
Mike Klein | f809fef | 2017-03-31 13:52:45 -0400 | [diff] [blame] | 2013 | store(ptr, px, tail); |
| 2014 | } |
| 2015 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2016 | STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2017 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 2018 | from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); |
| 2019 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2020 | STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2021 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
Mike Reed | 279091e | 2017-06-27 16:58:00 -0400 | [diff] [blame] | 2022 | from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
| 2023 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2024 | STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | dec4ea8 | 2017-04-06 15:04:05 -0400 | [diff] [blame] | 2025 | const uint32_t* ptr; |
| 2026 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 2027 | from_8888(gather(ptr, ix), &r,&g,&b,&a); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2028 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2029 | STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2030 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
Mike Klein | 3b92b69 | 2017-07-18 11:30:25 -0400 | [diff] [blame] | 2031 | |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 2032 | U32 px = to_unorm(r, 255) |
| 2033 | | to_unorm(g, 255) << 8 |
| 2034 | | to_unorm(b, 255) << 16 |
| 2035 | | to_unorm(a, 255) << 24; |
Mike Klein | 3b92b69 | 2017-07-18 11:30:25 -0400 | [diff] [blame] | 2036 | store(ptr, px, tail); |
| 2037 | } |
| 2038 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2039 | STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
Robert Phillips | d470e1b | 2019-09-04 15:05:35 -0400 | [diff] [blame] | 2040 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
| 2041 | from_88(load<U16>(ptr, tail), &r, &g); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2042 | b = 0; |
| 2043 | a = 1; |
Robert Phillips | d470e1b | 2019-09-04 15:05:35 -0400 | [diff] [blame] | 2044 | } |
| 2045 | STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2046 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
| 2047 | from_88(load<U16>(ptr, tail), &dr, &dg); |
| 2048 | db = 0; |
| 2049 | da = 1; |
| 2050 | } |
| 2051 | STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
| 2052 | const uint16_t* ptr; |
| 2053 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
| 2054 | from_88(gather(ptr, ix), &r, &g); |
| 2055 | b = 0; |
| 2056 | a = 1; |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2057 | } |
| 2058 | STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
Robert Phillips | d470e1b | 2019-09-04 15:05:35 -0400 | [diff] [blame] | 2059 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy); |
| 2060 | U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 ); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2061 | store(ptr, px, tail); |
| 2062 | } |
| 2063 | |
| 2064 | STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2065 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
| 2066 | r = g = b = 0; |
| 2067 | a = from_short(load<U16>(ptr, tail)); |
| 2068 | } |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 2069 | STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2070 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
| 2071 | dr = dg = db = 0.0f; |
| 2072 | da = from_short(load<U16>(ptr, tail)); |
| 2073 | } |
| 2074 | STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) { |
| 2075 | const uint16_t* ptr; |
| 2076 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
| 2077 | r = g = b = 0.0f; |
| 2078 | a = from_short(gather(ptr, ix)); |
| 2079 | } |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2080 | STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2081 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
| 2082 | |
| 2083 | U16 px = pack(to_unorm(a, 65535)); |
| 2084 | store(ptr, px, tail); |
| 2085 | } |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 2086 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2087 | STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2088 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2089 | b = 0; a = 1; |
| 2090 | from_1616(load<U32>(ptr, tail), &r,&g); |
| 2091 | } |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 2092 | STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2093 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
| 2094 | from_1616(load<U32>(ptr, tail), &dr, &dg); |
| 2095 | db = 0; |
| 2096 | da = 1; |
| 2097 | } |
| 2098 | STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) { |
| 2099 | const uint32_t* ptr; |
| 2100 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
| 2101 | from_1616(gather(ptr, ix), &r, &g); |
| 2102 | b = 0; |
| 2103 | a = 1; |
| 2104 | } |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2105 | STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2106 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
| 2107 | |
| 2108 | U32 px = to_unorm(r, 65535) |
| 2109 | | to_unorm(g, 65535) << 16; |
| 2110 | store(ptr, px, tail); |
| 2111 | } |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 2112 | |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 2113 | STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2114 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 2115 | from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a); |
| 2116 | } |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2117 | STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2118 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
| 2119 | from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da); |
| 2120 | } |
| 2121 | STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) { |
| 2122 | const uint64_t* ptr; |
| 2123 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
| 2124 | from_16161616(gather(ptr, ix), &r, &g, &b, &a); |
| 2125 | } |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 2126 | STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2127 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy); |
| 2128 | |
| 2129 | U16 R = pack(to_unorm(r, 65535)), |
| 2130 | G = pack(to_unorm(g, 65535)), |
| 2131 | B = pack(to_unorm(b, 65535)), |
| 2132 | A = pack(to_unorm(a, 65535)); |
| 2133 | |
| 2134 | store4(ptr,tail, R,G,B,A); |
| 2135 | } |
| 2136 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2137 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2138 | STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | ac568a9 | 2018-01-25 09:09:32 -0500 | [diff] [blame] | 2139 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
| 2140 | from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a); |
| 2141 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2142 | STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | ac568a9 | 2018-01-25 09:09:32 -0500 | [diff] [blame] | 2143 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
| 2144 | from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
| 2145 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2146 | STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | ac568a9 | 2018-01-25 09:09:32 -0500 | [diff] [blame] | 2147 | const uint32_t* ptr; |
| 2148 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 2149 | from_1010102(gather(ptr, ix), &r,&g,&b,&a); |
| 2150 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2151 | STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | ac568a9 | 2018-01-25 09:09:32 -0500 | [diff] [blame] | 2152 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
| 2153 | |
| 2154 | U32 px = to_unorm(r, 1023) |
| 2155 | | to_unorm(g, 1023) << 10 |
| 2156 | | to_unorm(b, 1023) << 20 |
| 2157 | | to_unorm(a, 3) << 30; |
| 2158 | store(ptr, px, tail); |
| 2159 | } |
| 2160 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2161 | STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2162 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2163 | |
Mike Klein | 114e6b3 | 2017-04-03 22:21:15 -0400 | [diff] [blame] | 2164 | U16 R,G,B,A; |
Mike Klein | fa6eb91 | 2017-04-05 10:18:27 -0400 | [diff] [blame] | 2165 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
Mike Klein | 114e6b3 | 2017-04-03 22:21:15 -0400 | [diff] [blame] | 2166 | r = from_half(R); |
| 2167 | g = from_half(G); |
| 2168 | b = from_half(B); |
| 2169 | a = from_half(A); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2170 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2171 | STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2172 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
Mike Reed | 279091e | 2017-06-27 16:58:00 -0400 | [diff] [blame] | 2173 | |
| 2174 | U16 R,G,B,A; |
| 2175 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
| 2176 | dr = from_half(R); |
| 2177 | dg = from_half(G); |
| 2178 | db = from_half(B); |
| 2179 | da = from_half(A); |
| 2180 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2181 | STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 5f055f0 | 2017-04-06 20:02:11 -0400 | [diff] [blame] | 2182 | const uint64_t* ptr; |
| 2183 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 2184 | auto px = gather(ptr, ix); |
| 2185 | |
| 2186 | U16 R,G,B,A; |
| 2187 | load4((const uint16_t*)&px,0, &R,&G,&B,&A); |
| 2188 | r = from_half(R); |
| 2189 | g = from_half(G); |
| 2190 | b = from_half(B); |
| 2191 | a = from_half(A); |
| 2192 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2193 | STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2194 | auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy); |
Mike Klein | fa6eb91 | 2017-04-05 10:18:27 -0400 | [diff] [blame] | 2195 | store4((uint16_t*)ptr,tail, to_half(r) |
| 2196 | , to_half(g) |
| 2197 | , to_half(b) |
| 2198 | , to_half(a)); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2199 | } |
| 2200 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2201 | STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | b437913 | 2017-10-17 16:06:49 -0400 | [diff] [blame] | 2202 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy); |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 2203 | |
Mike Klein | 37155d4 | 2017-12-15 09:55:03 -0500 | [diff] [blame] | 2204 | U16 R = bswap(pack(to_unorm(r, 65535))), |
| 2205 | G = bswap(pack(to_unorm(g, 65535))), |
| 2206 | B = bswap(pack(to_unorm(b, 65535))), |
| 2207 | A = bswap(pack(to_unorm(a, 65535))); |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 2208 | |
Mike Klein | b382173 | 2017-04-17 10:58:05 -0400 | [diff] [blame] | 2209 | store4(ptr,tail, R,G,B,A); |
Mike Klein | 3146bb9 | 2017-04-05 14:45:02 -0400 | [diff] [blame] | 2210 | } |
| 2211 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2212 | STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2213 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
| 2214 | |
| 2215 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
| 2216 | r = 0; |
| 2217 | g = 0; |
| 2218 | b = 0; |
| 2219 | a = from_half(A); |
| 2220 | } |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2221 | STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2222 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
| 2223 | |
| 2224 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
| 2225 | dr = dg = db = 0.0f; |
| 2226 | da = from_half(A); |
| 2227 | } |
| 2228 | STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) { |
| 2229 | const uint16_t* ptr; |
| 2230 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
| 2231 | r = g = b = 0.0f; |
| 2232 | a = from_half(gather(ptr, ix)); |
| 2233 | } |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2234 | STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2235 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
| 2236 | store(ptr, to_half(a), tail); |
| 2237 | } |
| 2238 | |
| 2239 | STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2240 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2241 | |
| 2242 | U16 R,G; |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2243 | load2((const uint16_t*)ptr, tail, &R, &G); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2244 | r = from_half(R); |
| 2245 | g = from_half(G); |
| 2246 | b = 0; |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2247 | a = 1; |
| 2248 | } |
| 2249 | STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2250 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
| 2251 | |
| 2252 | U16 R,G; |
| 2253 | load2((const uint16_t*)ptr, tail, &R, &G); |
| 2254 | dr = from_half(R); |
| 2255 | dg = from_half(G); |
| 2256 | db = 0; |
| 2257 | da = 1; |
| 2258 | } |
| 2259 | STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) { |
| 2260 | const uint32_t* ptr; |
| 2261 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
| 2262 | auto px = gather(ptr, ix); |
| 2263 | |
| 2264 | U16 R,G; |
| 2265 | load2((const uint16_t*)&px, 0, &R, &G); |
| 2266 | r = from_half(R); |
| 2267 | g = from_half(G); |
| 2268 | b = 0; |
| 2269 | a = 1; |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2270 | } |
| 2271 | STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 2272 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2273 | store2((uint16_t*)ptr, tail, to_half(r) |
| 2274 | , to_half(g)); |
| 2275 | } |
| 2276 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2277 | STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 3785471 | 2018-06-26 11:43:06 -0400 | [diff] [blame] | 2278 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
Mike Klein | 14987eb | 2017-04-06 10:22:26 -0400 | [diff] [blame] | 2279 | load4(ptr,tail, &r,&g,&b,&a); |
| 2280 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2281 | STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 3785471 | 2018-06-26 11:43:06 -0400 | [diff] [blame] | 2282 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
Mike Reed | 279091e | 2017-06-27 16:58:00 -0400 | [diff] [blame] | 2283 | load4(ptr,tail, &dr,&dg,&db,&da); |
| 2284 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2285 | STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 3785471 | 2018-06-26 11:43:06 -0400 | [diff] [blame] | 2286 | const float* ptr; |
| 2287 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
| 2288 | r = gather(ptr, 4*ix + 0); |
| 2289 | g = gather(ptr, 4*ix + 1); |
| 2290 | b = gather(ptr, 4*ix + 2); |
| 2291 | a = gather(ptr, 4*ix + 3); |
| 2292 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2293 | STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 3785471 | 2018-06-26 11:43:06 -0400 | [diff] [blame] | 2294 | auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy); |
Mike Klein | fa6eb91 | 2017-04-05 10:18:27 -0400 | [diff] [blame] | 2295 | store4(ptr,tail, r,g,b,a); |
Mike Klein | 94fc0fe | 2017-03-03 14:05:32 -0500 | [diff] [blame] | 2296 | } |
| 2297 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2298 | STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2299 | auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy); |
| 2300 | load2(ptr, tail, &r, &g); |
| 2301 | b = 0; |
| 2302 | a = 1; |
| 2303 | } |
| 2304 | STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
| 2305 | auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy); |
| 2306 | store2(ptr, tail, r, g); |
| 2307 | } |
| 2308 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2309 | SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) { |
Mike Klein | f3b4e16 | 2017-09-22 15:32:59 -0400 | [diff] [blame] | 2310 | return v - floor_(v*ctx->invScale)*ctx->scale; |
Mike Klein | 0cc60b8 | 2017-06-22 11:00:17 -0700 | [diff] [blame] | 2311 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2312 | SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) { |
Mike Reed | 51e46d5 | 2017-06-23 14:21:25 -0400 | [diff] [blame] | 2313 | auto limit = ctx->scale; |
| 2314 | auto invLimit = ctx->invScale; |
Mike Klein | f3b4e16 | 2017-09-22 15:32:59 -0400 | [diff] [blame] | 2315 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
Mike Klein | 0cc60b8 | 2017-06-22 11:00:17 -0700 | [diff] [blame] | 2316 | } |
Mike Klein | f3b4e16 | 2017-09-22 15:32:59 -0400 | [diff] [blame] | 2317 | // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images). |
| 2318 | // The gather stages will hard clamp the output of these stages to [0,limit)... |
| 2319 | // we just need to do the basic repeat or mirroring. |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2320 | STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); } |
| 2321 | STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); } |
| 2322 | STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); } |
| 2323 | STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); } |
Mike Klein | 0cc60b8 | 2017-06-22 11:00:17 -0700 | [diff] [blame] | 2324 | |
Mike Klein | a3b8895 | 2017-10-05 13:21:31 -0400 | [diff] [blame] | 2325 | STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); } |
| 2326 | STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); } |
| 2327 | STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); } |
Mike Klein | 9f85d68 | 2017-05-23 07:52:01 -0400 | [diff] [blame] | 2328 | |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2329 | // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain: |
| 2330 | // mask == 0x00000000 if the coordinate(s) are out of bounds |
| 2331 | // mask == 0xFFFFFFFF if the coordinate(s) are in bounds |
| 2332 | // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0 |
| 2333 | // if either of the coordinates were out of bounds. |
| 2334 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2335 | STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2336 | auto w = ctx->limit_x; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2337 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w))); |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2338 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2339 | STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2340 | auto h = ctx->limit_y; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2341 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h))); |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2342 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2343 | STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2344 | auto w = ctx->limit_x; |
| 2345 | auto h = ctx->limit_y; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2346 | sk_unaligned_store(ctx->mask, |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2347 | cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h))); |
| 2348 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2349 | STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2350 | auto mask = sk_unaligned_load<U32>(ctx->mask); |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 2351 | r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); |
| 2352 | g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); |
| 2353 | b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); |
| 2354 | a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); |
Mike Reed | dfc0e91 | 2018-02-16 12:40:18 -0500 | [diff] [blame] | 2355 | } |
| 2356 | |
Mike Klein | b1df5e5 | 2018-10-17 17:06:03 -0400 | [diff] [blame] | 2357 | STAGE(alpha_to_gray, Ctx::None) { |
| 2358 | r = g = b = a; |
| 2359 | a = 1; |
| 2360 | } |
| 2361 | STAGE(alpha_to_gray_dst, Ctx::None) { |
| 2362 | dr = dg = db = da; |
| 2363 | da = 1; |
| 2364 | } |
Brian Osman | a7a2324 | 2022-02-08 10:34:38 -0500 | [diff] [blame] | 2365 | STAGE(alpha_to_red, Ctx::None) { |
| 2366 | r = a; |
| 2367 | a = 1; |
| 2368 | } |
| 2369 | STAGE(alpha_to_red_dst, Ctx::None) { |
| 2370 | dr = da; |
| 2371 | da = 1; |
| 2372 | } |
| 2373 | |
Mike Klein | da69d59 | 2019-07-11 07:38:31 -0500 | [diff] [blame] | 2374 | STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2375 | a = r*0.2126f + g*0.7152f + b*0.0722f; |
Mike Klein | e9ed07d | 2017-03-07 12:28:11 -0500 | [diff] [blame] | 2376 | r = g = b = 0; |
| 2377 | } |
Brian Salomon | 01ff538 | 2020-12-15 16:06:26 -0500 | [diff] [blame] | 2378 | STAGE(bt709_luminance_or_luma_to_rgb, Ctx::None) { |
| 2379 | r = g = b = r*0.2126f + g*0.7152f + b*0.0722f; |
| 2380 | } |
Mike Klein | e9ed07d | 2017-03-07 12:28:11 -0500 | [diff] [blame] | 2381 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2382 | STAGE(matrix_translate, const float* m) { |
Mike Reed | 7aad8cc | 2017-07-05 12:33:06 -0400 | [diff] [blame] | 2383 | r += m[0]; |
| 2384 | g += m[1]; |
| 2385 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2386 | STAGE(matrix_scale_translate, const float* m) { |
Mike Klein | f04ff76 | 2017-10-20 15:50:12 -0400 | [diff] [blame] | 2387 | r = mad(r,m[0], m[2]); |
| 2388 | g = mad(g,m[1], m[3]); |
Mike Reed | 7aad8cc | 2017-07-05 12:33:06 -0400 | [diff] [blame] | 2389 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2390 | STAGE(matrix_2x3, const float* m) { |
Herb Derby | 97bf728 | 2021-10-06 11:00:39 -0400 | [diff] [blame] | 2391 | auto R = mad(r,m[0], mad(g,m[1], m[2])), |
| 2392 | G = mad(r,m[3], mad(g,m[4], m[5])); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2393 | r = R; |
| 2394 | g = G; |
| 2395 | } |
Mike Klein | b82edcc | 2018-07-10 18:25:03 +0000 | [diff] [blame] | 2396 | STAGE(matrix_3x3, const float* m) { |
| 2397 | auto R = mad(r,m[0], mad(g,m[3], b*m[6])), |
| 2398 | G = mad(r,m[1], mad(g,m[4], b*m[7])), |
| 2399 | B = mad(r,m[2], mad(g,m[5], b*m[8])); |
| 2400 | r = R; |
| 2401 | g = G; |
| 2402 | b = B; |
| 2403 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2404 | STAGE(matrix_3x4, const float* m) { |
Mike Klein | b8d5275 | 2017-02-16 10:21:29 -0500 | [diff] [blame] | 2405 | auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))), |
| 2406 | G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))), |
| 2407 | B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11]))); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2408 | r = R; |
| 2409 | g = G; |
| 2410 | b = B; |
| 2411 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2412 | STAGE(matrix_4x5, const float* m) { |
Mike Reed | 361a640 | 2019-04-23 12:19:00 -0400 | [diff] [blame] | 2413 | auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))), |
| 2414 | G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))), |
| 2415 | B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))), |
| 2416 | A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19])))); |
Mike Klein | e9ed07d | 2017-03-07 12:28:11 -0500 | [diff] [blame] | 2417 | r = R; |
| 2418 | g = G; |
| 2419 | b = B; |
| 2420 | a = A; |
| 2421 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2422 | STAGE(matrix_4x3, const float* m) { |
Mike Reed | 0264095 | 2017-05-19 15:32:13 -0400 | [diff] [blame] | 2423 | auto X = r, |
| 2424 | Y = g; |
| 2425 | |
| 2426 | r = mad(X, m[0], mad(Y, m[4], m[ 8])); |
| 2427 | g = mad(X, m[1], mad(Y, m[5], m[ 9])); |
| 2428 | b = mad(X, m[2], mad(Y, m[6], m[10])); |
| 2429 | a = mad(X, m[3], mad(Y, m[7], m[11])); |
| 2430 | } |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2431 | STAGE(matrix_perspective, const float* m) { |
Mike Klein | 11d2df0 | 2017-02-24 11:51:36 -0500 | [diff] [blame] | 2432 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
Mike Klein | 11d2df0 | 2017-02-24 11:51:36 -0500 | [diff] [blame] | 2433 | auto R = mad(r,m[0], mad(g,m[1], m[2])), |
| 2434 | G = mad(r,m[3], mad(g,m[4], m[5])), |
| 2435 | Z = mad(r,m[6], mad(g,m[7], m[8])); |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 2436 | r = R * rcp_precise(Z); |
| 2437 | g = G * rcp_precise(Z); |
Mike Klein | 11d2df0 | 2017-02-24 11:51:36 -0500 | [diff] [blame] | 2438 | } |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2439 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2440 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
Herb Derby | 4de1304 | 2017-05-15 10:49:39 -0400 | [diff] [blame] | 2441 | F* r, F* g, F* b, F* a) { |
| 2442 | F fr, br, fg, bg, fb, bb, fa, ba; |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 2443 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Herb Derby | 4de1304 | 2017-05-15 10:49:39 -0400 | [diff] [blame] | 2444 | if (c->stopCount <=8) { |
| 2445 | fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx); |
| 2446 | br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx); |
| 2447 | fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx); |
| 2448 | bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx); |
| 2449 | fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx); |
| 2450 | bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx); |
| 2451 | fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx); |
| 2452 | ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx); |
| 2453 | } else |
| 2454 | #endif |
| 2455 | { |
| 2456 | fr = gather(c->fs[0], idx); |
| 2457 | br = gather(c->bs[0], idx); |
| 2458 | fg = gather(c->fs[1], idx); |
| 2459 | bg = gather(c->bs[1], idx); |
| 2460 | fb = gather(c->fs[2], idx); |
| 2461 | bb = gather(c->bs[2], idx); |
| 2462 | fa = gather(c->fs[3], idx); |
| 2463 | ba = gather(c->bs[3], idx); |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 2464 | } |
| 2465 | |
Herb Derby | 4de1304 | 2017-05-15 10:49:39 -0400 | [diff] [blame] | 2466 | *r = mad(t, fr, br); |
| 2467 | *g = mad(t, fg, bg); |
| 2468 | *b = mad(t, fb, bb); |
| 2469 | *a = mad(t, fa, ba); |
| 2470 | } |
| 2471 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2472 | STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
Herb Derby | 4de1304 | 2017-05-15 10:49:39 -0400 | [diff] [blame] | 2473 | auto t = r; |
| 2474 | auto idx = trunc_(t * (c->stopCount-1)); |
| 2475 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
| 2476 | } |
| 2477 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2478 | STAGE(gradient, const SkRasterPipeline_GradientCtx* c) { |
Herb Derby | 4de1304 | 2017-05-15 10:49:39 -0400 | [diff] [blame] | 2479 | auto t = r; |
| 2480 | U32 idx = 0; |
| 2481 | |
| 2482 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
| 2483 | for (size_t i = 1; i < c->stopCount; i++) { |
| 2484 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
| 2485 | } |
| 2486 | |
| 2487 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 2488 | } |
| 2489 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2490 | STAGE(evenly_spaced_2_stop_gradient, const void* ctx) { |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 2491 | struct Ctx { float f[4], b[4]; }; |
Mike Klein | 8a823fa | 2017-04-05 17:29:26 -0400 | [diff] [blame] | 2492 | auto c = (const Ctx*)ctx; |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2493 | |
| 2494 | auto t = r; |
Herb Derby | 7b4202d | 2017-04-10 10:52:34 -0400 | [diff] [blame] | 2495 | r = mad(t, c->f[0], c->b[0]); |
| 2496 | g = mad(t, c->f[1], c->b[1]); |
| 2497 | b = mad(t, c->f[2], c->b[2]); |
| 2498 | a = mad(t, c->f[3], c->b[3]); |
Mike Klein | e1caee1 | 2017-02-15 13:31:12 -0500 | [diff] [blame] | 2499 | } |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2500 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2501 | STAGE(xy_to_unit_angle, Ctx::None) { |
Herb Derby | 7eb8698 | 2017-05-02 19:04:39 -0400 | [diff] [blame] | 2502 | F X = r, |
| 2503 | Y = g; |
| 2504 | F xabs = abs_(X), |
| 2505 | yabs = abs_(Y); |
| 2506 | |
| 2507 | F slope = min(xabs, yabs)/max(xabs, yabs); |
| 2508 | F s = slope * slope; |
| 2509 | |
| 2510 | // Use a 7th degree polynomial to approximate atan. |
| 2511 | // This was generated using sollya.gforge.inria.fr. |
| 2512 | // A float optimized polynomial was generated using the following command. |
| 2513 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
| 2514 | F phi = slope |
| 2515 | * (0.15912117063999176025390625f + s |
| 2516 | * (-5.185396969318389892578125e-2f + s |
| 2517 | * (2.476101927459239959716796875e-2f + s |
| 2518 | * (-7.0547382347285747528076171875e-3f)))); |
| 2519 | |
| 2520 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
| 2521 | phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi); |
| 2522 | phi = if_then_else(Y < 0.0f , 1.0f - phi , phi); |
| 2523 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
| 2524 | r = phi; |
| 2525 | } |
| 2526 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2527 | STAGE(xy_to_radius, Ctx::None) { |
Herb Derby | 090fbf8 | 2017-05-08 15:10:36 -0400 | [diff] [blame] | 2528 | F X2 = r * r, |
| 2529 | Y2 = g * g; |
Mike Klein | fd35c74 | 2017-05-15 15:55:54 -0400 | [diff] [blame] | 2530 | r = sqrt_(X2 + Y2); |
Herb Derby | 090fbf8 | 2017-05-08 15:10:36 -0400 | [diff] [blame] | 2531 | } |
| 2532 | |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2533 | // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works. |
| 2534 | |
| 2535 | STAGE(negate_x, Ctx::None) { r = -r; } |
| 2536 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2537 | STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) { |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2538 | F x = r, y = g, &t = r; |
| 2539 | t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0 |
| 2540 | } |
| 2541 | |
| 2542 | STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) { |
| 2543 | F x = r, y = g, &t = r; |
| 2544 | t = x + y*y / x; // (x^2 + y^2) / x |
| 2545 | } |
| 2546 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2547 | STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) { |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2548 | F x = r, y = g, &t = r; |
| 2549 | t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
| 2550 | } |
| 2551 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2552 | STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) { |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2553 | F x = r, y = g, &t = r; |
| 2554 | t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
| 2555 | } |
| 2556 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2557 | STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) { |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2558 | F x = r, y = g, &t = r; |
| 2559 | t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
| 2560 | } |
| 2561 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2562 | STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) { |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2563 | F& t = r; |
| 2564 | t = t + ctx->fP1; // ctx->fP1 = f |
| 2565 | } |
| 2566 | |
| 2567 | STAGE(alter_2pt_conical_unswap, Ctx::None) { |
| 2568 | F& t = r; |
| 2569 | t = 1 - t; |
| 2570 | } |
| 2571 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2572 | STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) { |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2573 | F& t = r; |
| 2574 | auto is_degenerate = (t != t); // NaN |
| 2575 | t = if_then_else(is_degenerate, F(0), t); |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2576 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2577 | } |
| 2578 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2579 | STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) { |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2580 | F& t = r; |
| 2581 | auto is_degenerate = (t <= 0) | (t != t); |
| 2582 | t = if_then_else(is_degenerate, F(0), t); |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2583 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
Yuqian Li | d208a88 | 2018-01-04 10:08:42 -0500 | [diff] [blame] | 2584 | } |
| 2585 | |
Mike Klein | f7729c2 | 2017-09-27 11:42:30 -0400 | [diff] [blame] | 2586 | STAGE(apply_vector_mask, const uint32_t* ctx) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2587 | const U32 mask = sk_unaligned_load<U32>(ctx); |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 2588 | r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); |
| 2589 | g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); |
| 2590 | b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); |
| 2591 | a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); |
Florin Malita | 2e40900 | 2017-06-28 14:46:54 -0400 | [diff] [blame] | 2592 | } |
| 2593 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2594 | STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) { |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2595 | // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy). |
| 2596 | // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid |
| 2597 | // surrounding (x,y) at (0.5,0.5) off-center. |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2598 | F fx = fract(r + 0.5f), |
| 2599 | fy = fract(g + 0.5f); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2600 | |
| 2601 | // Samplers will need to load x and fx, or y and fy. |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2602 | sk_unaligned_store(c->x, r); |
| 2603 | sk_unaligned_store(c->y, g); |
| 2604 | sk_unaligned_store(c->fx, fx); |
| 2605 | sk_unaligned_store(c->fy, fy); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2606 | } |
| 2607 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2608 | STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) { |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2609 | // Bilinear and bicubic filters are both separable, so we produce independent contributions |
| 2610 | // from x and y, multiplying them together here to get each pixel's total scale factor. |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2611 | auto scale = sk_unaligned_load<F>(c->scalex) |
| 2612 | * sk_unaligned_load<F>(c->scaley); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2613 | dr = mad(scale, r, dr); |
| 2614 | dg = mad(scale, g, dg); |
| 2615 | db = mad(scale, b, db); |
| 2616 | da = mad(scale, a, da); |
| 2617 | } |
| 2618 | |
| 2619 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
| 2620 | // are combined in direct proportion to their area overlapping that logical query pixel. |
| 2621 | // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x. |
| 2622 | // The y-axis is symmetric. |
| 2623 | |
| 2624 | template <int kScale> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2625 | SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2626 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
| 2627 | F fx = sk_unaligned_load<F>(ctx->fx); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2628 | |
| 2629 | F scalex; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2630 | if (kScale == -1) { scalex = 1.0f - fx; } |
| 2631 | if (kScale == +1) { scalex = fx; } |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2632 | sk_unaligned_store(ctx->scalex, scalex); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2633 | } |
| 2634 | template <int kScale> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2635 | SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2636 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
| 2637 | F fy = sk_unaligned_load<F>(ctx->fy); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2638 | |
| 2639 | F scaley; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2640 | if (kScale == -1) { scaley = 1.0f - fy; } |
| 2641 | if (kScale == +1) { scaley = fy; } |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2642 | sk_unaligned_store(ctx->scaley, scaley); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2643 | } |
| 2644 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2645 | STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); } |
| 2646 | STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); } |
| 2647 | STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); } |
| 2648 | STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); } |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2649 | |
| 2650 | |
| 2651 | // In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample |
| 2652 | // pixel center are combined with a non-uniform cubic filter, with higher values near the center. |
| 2653 | // |
| 2654 | // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets. |
| 2655 | // See GrCubicEffect for details of this particular filter. |
| 2656 | |
| 2657 | SI F bicubic_near(F t) { |
| 2658 | // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2659 | return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f)); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2660 | } |
| 2661 | SI F bicubic_far(F t) { |
| 2662 | // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2663 | return (t*t)*mad((7/18.0f), t, (-6/18.0f)); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2664 | } |
| 2665 | |
| 2666 | template <int kScale> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2667 | SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2668 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
| 2669 | F fx = sk_unaligned_load<F>(ctx->fx); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2670 | |
| 2671 | F scalex; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2672 | if (kScale == -3) { scalex = bicubic_far (1.0f - fx); } |
| 2673 | if (kScale == -1) { scalex = bicubic_near(1.0f - fx); } |
| 2674 | if (kScale == +1) { scalex = bicubic_near( fx); } |
| 2675 | if (kScale == +3) { scalex = bicubic_far ( fx); } |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2676 | sk_unaligned_store(ctx->scalex, scalex); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2677 | } |
| 2678 | template <int kScale> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2679 | SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2680 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
| 2681 | F fy = sk_unaligned_load<F>(ctx->fy); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2682 | |
| 2683 | F scaley; |
Mike Klein | fe560a8 | 2017-05-01 12:56:35 -0400 | [diff] [blame] | 2684 | if (kScale == -3) { scaley = bicubic_far (1.0f - fy); } |
| 2685 | if (kScale == -1) { scaley = bicubic_near(1.0f - fy); } |
| 2686 | if (kScale == +1) { scaley = bicubic_near( fy); } |
| 2687 | if (kScale == +3) { scaley = bicubic_far ( fy); } |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 2688 | sk_unaligned_store(ctx->scaley, scaley); |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2689 | } |
| 2690 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2691 | STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); } |
| 2692 | STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); } |
| 2693 | STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); } |
| 2694 | STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); } |
Mike Klein | 0a90449 | 2017-04-12 12:52:48 -0400 | [diff] [blame] | 2695 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2696 | STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); } |
| 2697 | STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); } |
| 2698 | STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); } |
| 2699 | STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); } |
Mike Klein | 7fee90c | 2017-04-07 16:55:09 -0400 | [diff] [blame] | 2700 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 2701 | STAGE(callback, SkRasterPipeline_CallbackCtx* c) { |
Mike Klein | c17dc24 | 2017-04-20 16:21:57 -0400 | [diff] [blame] | 2702 | store4(c->rgba,0, r,g,b,a); |
Mike Klein | 0e4d096 | 2017-09-27 11:04:34 -0400 | [diff] [blame] | 2703 | c->fn(c, tail ? tail : N); |
Mike Klein | c17dc24 | 2017-04-20 16:21:57 -0400 | [diff] [blame] | 2704 | load4(c->read_from,0, &r,&g,&b,&a); |
Mike Klein | 7fee90c | 2017-04-07 16:55:09 -0400 | [diff] [blame] | 2705 | } |
Mike Klein | c2f876b | 2017-08-09 18:23:25 -0400 | [diff] [blame] | 2706 | |
Mike Klein | 3cbcb73 | 2017-10-25 12:38:25 -0400 | [diff] [blame] | 2707 | STAGE(gauss_a_to_rgba, Ctx::None) { |
| 2708 | // x = 1 - x; |
| 2709 | // exp(-x * x * 4) - 0.018f; |
| 2710 | // ... now approximate with quartic |
| 2711 | // |
| 2712 | const float c4 = -2.26661229133605957031f; |
| 2713 | const float c3 = 2.89795351028442382812f; |
| 2714 | const float c2 = 0.21345567703247070312f; |
| 2715 | const float c1 = 0.15489584207534790039f; |
| 2716 | const float c0 = 0.00030726194381713867f; |
| 2717 | a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0); |
| 2718 | r = a; |
| 2719 | g = a; |
| 2720 | b = a; |
| 2721 | } |
Mike Klein | 1fa9c43 | 2017-12-11 09:59:47 -0500 | [diff] [blame] | 2722 | |
Mike Klein | 0100562 | 2019-08-13 12:22:17 -0400 | [diff] [blame] | 2723 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { |
| 2724 | // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here. |
| 2725 | switch (mode) { |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 2726 | case SkTileMode::kDecal: |
Mike Klein | 0100562 | 2019-08-13 12:22:17 -0400 | [diff] [blame] | 2727 | case SkTileMode::kClamp: return v; |
| 2728 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; |
| 2729 | case SkTileMode::kMirror: |
| 2730 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
| 2731 | } |
| 2732 | SkUNREACHABLE; |
| 2733 | } |
| 2734 | |
| 2735 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, |
| 2736 | F* r, F* g, F* b, F* a) { |
| 2737 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); |
| 2738 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); |
| 2739 | |
| 2740 | switch (ctx->ct) { |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 2741 | default: *r = *g = *b = *a = 0; |
Mike Klein | 0100562 | 2019-08-13 12:22:17 -0400 | [diff] [blame] | 2742 | break; |
| 2743 | |
| 2744 | case kRGBA_8888_SkColorType: |
| 2745 | case kBGRA_8888_SkColorType: { |
| 2746 | const uint32_t* ptr; |
| 2747 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
| 2748 | from_8888(gather(ptr, ix), r,g,b,a); |
| 2749 | if (ctx->ct == kBGRA_8888_SkColorType) { |
| 2750 | std::swap(*r,*b); |
| 2751 | } |
| 2752 | } break; |
| 2753 | } |
| 2754 | } |
| 2755 | |
| 2756 | template <int D> |
| 2757 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, |
| 2758 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], |
| 2759 | F* r, F* g, F* b, F* a) { |
| 2760 | |
| 2761 | float start = -0.5f*(D-1); |
| 2762 | |
| 2763 | *r = *g = *b = *a = 0; |
| 2764 | F y = cy + start; |
| 2765 | for (int j = 0; j < D; j++, y += 1.0f) { |
| 2766 | F x = cx + start; |
| 2767 | for (int i = 0; i < D; i++, x += 1.0f) { |
| 2768 | F R,G,B,A; |
| 2769 | sample(ctx, x,y, &R,&G,&B,&A); |
| 2770 | |
| 2771 | F w = wx[i] * wy[j]; |
| 2772 | *r = mad(w,R,*r); |
| 2773 | *g = mad(w,G,*g); |
| 2774 | *b = mad(w,B,*b); |
| 2775 | *a = mad(w,A,*a); |
| 2776 | } |
| 2777 | } |
| 2778 | } |
| 2779 | |
| 2780 | STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { |
| 2781 | F x = r, fx = fract(x + 0.5f), |
| 2782 | y = g, fy = fract(y + 0.5f); |
| 2783 | const F wx[] = {1.0f - fx, fx}; |
| 2784 | const F wy[] = {1.0f - fy, fy}; |
| 2785 | |
| 2786 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
| 2787 | } |
| 2788 | STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) { |
| 2789 | F x = r, fx = fract(x + 0.5f), |
| 2790 | y = g, fy = fract(y + 0.5f); |
| 2791 | const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) }; |
| 2792 | const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) }; |
| 2793 | |
| 2794 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
| 2795 | } |
| 2796 | |
Mike Klein | ad82b40 | 2019-10-17 20:13:14 +0000 | [diff] [blame] | 2797 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
Mike Klein | dfa1de9 | 2019-10-17 12:34:22 -0500 | [diff] [blame] | 2798 | STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | ad82b40 | 2019-10-17 20:13:14 +0000 | [diff] [blame] | 2799 | // (cx,cy) are the center of our sample. |
| 2800 | F cx = r, |
| 2801 | cy = g; |
Mike Klein | dfa1de9 | 2019-10-17 12:34:22 -0500 | [diff] [blame] | 2802 | |
Mike Klein | ad82b40 | 2019-10-17 20:13:14 +0000 | [diff] [blame] | 2803 | // All sample points are at the same fractional offset (fx,fy). |
| 2804 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
| 2805 | F fx = fract(cx + 0.5f), |
| 2806 | fy = fract(cy + 0.5f); |
| 2807 | |
| 2808 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
| 2809 | r = g = b = a = 0; |
| 2810 | |
John Stiles | 14f8d79 | 2021-08-10 16:22:22 -0400 | [diff] [blame] | 2811 | for (float py = -0.5f; py <= +0.5f; py += 1.0f) |
| 2812 | for (float px = -0.5f; px <= +0.5f; px += 1.0f) { |
Mike Klein | ad82b40 | 2019-10-17 20:13:14 +0000 | [diff] [blame] | 2813 | // (x,y) are the coordinates of this sample point. |
John Stiles | 14f8d79 | 2021-08-10 16:22:22 -0400 | [diff] [blame] | 2814 | F x = cx + px, |
| 2815 | y = cy + py; |
Mike Klein | ad82b40 | 2019-10-17 20:13:14 +0000 | [diff] [blame] | 2816 | |
| 2817 | // ix_and_ptr() will clamp to the image's bounds for us. |
| 2818 | const uint32_t* ptr; |
| 2819 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
| 2820 | |
| 2821 | F sr,sg,sb,sa; |
| 2822 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
| 2823 | |
| 2824 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
| 2825 | // are combined in direct proportion to their area overlapping that logical query pixel. |
| 2826 | // At positive offsets, the x-axis contribution to that rectangle is fx, |
| 2827 | // or (1-fx) at negative x. Same deal for y. |
John Stiles | 14f8d79 | 2021-08-10 16:22:22 -0400 | [diff] [blame] | 2828 | F sx = (px > 0) ? fx : 1.0f - fx, |
| 2829 | sy = (py > 0) ? fy : 1.0f - fy, |
Mike Klein | ad82b40 | 2019-10-17 20:13:14 +0000 | [diff] [blame] | 2830 | area = sx * sy; |
| 2831 | |
| 2832 | r += sr * area; |
| 2833 | g += sg * area; |
| 2834 | b += sb * area; |
| 2835 | a += sa * area; |
| 2836 | } |
Mike Klein | dfa1de9 | 2019-10-17 12:34:22 -0500 | [diff] [blame] | 2837 | } |
Mike Reed | 78eedba | 2019-07-31 16:39:15 -0400 | [diff] [blame] | 2838 | |
Mike Klein | ad82b40 | 2019-10-17 20:13:14 +0000 | [diff] [blame] | 2839 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
| 2840 | STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
| 2841 | // (cx,cy) are the center of our sample. |
| 2842 | F cx = r, |
| 2843 | cy = g; |
| 2844 | |
| 2845 | // All sample points are at the same fractional offset (fx,fy). |
| 2846 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
| 2847 | F fx = fract(cx + 0.5f), |
| 2848 | fy = fract(cy + 0.5f); |
| 2849 | |
| 2850 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
| 2851 | r = g = b = a = 0; |
| 2852 | |
| 2853 | const F scaley[4] = { |
| 2854 | bicubic_far (1.0f - fy), bicubic_near(1.0f - fy), |
| 2855 | bicubic_near( fy), bicubic_far ( fy), |
| 2856 | }; |
| 2857 | const F scalex[4] = { |
| 2858 | bicubic_far (1.0f - fx), bicubic_near(1.0f - fx), |
| 2859 | bicubic_near( fx), bicubic_far ( fx), |
| 2860 | }; |
| 2861 | |
| 2862 | F sample_y = cy - 1.5f; |
| 2863 | for (int yy = 0; yy <= 3; ++yy) { |
| 2864 | F sample_x = cx - 1.5f; |
| 2865 | for (int xx = 0; xx <= 3; ++xx) { |
| 2866 | F scale = scalex[xx] * scaley[yy]; |
| 2867 | |
| 2868 | // ix_and_ptr() will clamp to the image's bounds for us. |
| 2869 | const uint32_t* ptr; |
| 2870 | U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y); |
| 2871 | |
| 2872 | F sr,sg,sb,sa; |
| 2873 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
| 2874 | |
| 2875 | r = mad(scale, sr, r); |
| 2876 | g = mad(scale, sg, g); |
| 2877 | b = mad(scale, sb, b); |
| 2878 | a = mad(scale, sa, a); |
| 2879 | |
| 2880 | sample_x += 1; |
| 2881 | } |
| 2882 | sample_y += 1; |
| 2883 | } |
Mike Reed | 78eedba | 2019-07-31 16:39:15 -0400 | [diff] [blame] | 2884 | } |
| 2885 | |
Jim Van Verth | d6245fc | 2022-02-15 16:30:59 -0500 | [diff] [blame] | 2886 | // ~~~~~~ skgpu::Swizzle stage ~~~~~~ // |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2887 | |
| 2888 | STAGE(swizzle, void* ctx) { |
| 2889 | auto ir = r, ig = g, ib = b, ia = a; |
| 2890 | F* o[] = {&r, &g, &b, &a}; |
| 2891 | char swiz[4]; |
| 2892 | memcpy(swiz, &ctx, sizeof(swiz)); |
| 2893 | |
| 2894 | for (int i = 0; i < 4; ++i) { |
| 2895 | switch (swiz[i]) { |
| 2896 | case 'r': *o[i] = ir; break; |
| 2897 | case 'g': *o[i] = ig; break; |
| 2898 | case 'b': *o[i] = ib; break; |
| 2899 | case 'a': *o[i] = ia; break; |
Brian Salomon | f30b1c1 | 2019-06-20 12:25:02 -0400 | [diff] [blame] | 2900 | case '0': *o[i] = F(0); break; |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 2901 | case '1': *o[i] = F(1); break; |
| 2902 | default: break; |
| 2903 | } |
| 2904 | } |
| 2905 | } |
| 2906 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 2907 | namespace lowp { |
Mike Klein | 419709d | 2018-10-11 22:05:14 -0400 | [diff] [blame] | 2908 | #if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 2909 | // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually), |
| 2910 | // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the |
| 2911 | // highp float pipeline. |
// Declare one null function pointer per stage listed in SK_RASTER_PIPELINE_STAGES,
// plus a null just_return.
#define M(st) static void (*st)(void) = nullptr;
    SK_RASTER_PIPELINE_STAGES(M)
#undef M
static void (*just_return)(void) = nullptr;

// Empty stand-in for the lowp start_pipeline in builds that generate no lowp stages.
static void start_pipeline(size_t,size_t,size_t,size_t, void**) {}
| 2918 | |
| 2919 | #else // We are compiling vector code with Clang... let's make some lowp stages! |
| 2920 | |
// Lowp vector types. Every type has the same lane count: 16 on HSW/SKX, 8 elsewhere.
#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX)
    #define SK_LOWP_LANES 16
#else
    #define SK_LOWP_LANES 8
#endif
using U8  = uint8_t  __attribute__((ext_vector_type(SK_LOWP_LANES)));
using U16 = uint16_t __attribute__((ext_vector_type(SK_LOWP_LANES)));
using I16 = int16_t  __attribute__((ext_vector_type(SK_LOWP_LANES)));
using I32 = int32_t  __attribute__((ext_vector_type(SK_LOWP_LANES)));
using U32 = uint32_t __attribute__((ext_vector_type(SK_LOWP_LANES)));
using I64 = int64_t  __attribute__((ext_vector_type(SK_LOWP_LANES)));
using U64 = uint64_t __attribute__((ext_vector_type(SK_LOWP_LANES)));
using F   = float    __attribute__((ext_vector_type(SK_LOWP_LANES)));
#undef SK_LOWP_LANES

// Number of lanes in each vector above.
static const size_t N = sizeof(U16) / sizeof(uint16_t);
| 2942 | |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 2943 | // Once again, some platforms benefit from a restricted Stage calling convention, |
| 2944 | // but others can pass tons and tons of registers and we're happy to exploit that. |
| 2945 | // It's exactly the same decision and implementation strategy as the F stages above. |
| 2946 | #if JUMPER_NARROW_STAGES |
| 2947 | struct Params { |
| 2948 | size_t dx, dy, tail; |
| 2949 | U16 dr,dg,db,da; |
| 2950 | }; |
| 2951 | using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a); |
| 2952 | #else |
| 2953 | // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. |
| 2954 | using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, |
| 2955 | U16 r, U16 g, U16 b, U16 a, |
| 2956 | U16 dr, U16 dg, U16 db, U16 da); |
| 2957 | #endif |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 2958 | |
| 2959 | static void start_pipeline(const size_t x0, const size_t y0, |
| 2960 | const size_t xlimit, const size_t ylimit, void** program) { |
| 2961 | auto start = (Stage)load_and_inc(program); |
| 2962 | for (size_t dy = y0; dy < ylimit; dy++) { |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 2963 | #if JUMPER_NARROW_STAGES |
| 2964 | Params params = { x0,dy,0, 0,0,0,0 }; |
| 2965 | for (; params.dx + N <= xlimit; params.dx += N) { |
| 2966 | start(¶ms,program, 0,0,0,0); |
| 2967 | } |
| 2968 | if (size_t tail = xlimit - params.dx) { |
| 2969 | params.tail = tail; |
| 2970 | start(¶ms,program, 0,0,0,0); |
| 2971 | } |
| 2972 | #else |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 2973 | size_t dx = x0; |
| 2974 | for (; dx + N <= xlimit; dx += N) { |
| 2975 | start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
| 2976 | } |
| 2977 | if (size_t tail = xlimit - dx) { |
| 2978 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
| 2979 | } |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 2980 | #endif |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 2981 | } |
| 2982 | } |
| 2983 | |
// A stage with the same signature as Stage that does nothing and calls no further stage.
#if JUMPER_NARROW_STAGES
    static void ABI just_return(Params*, void**, U16,U16,U16,U16) {}
#else
    static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
#endif
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 2989 | |
| 2990 | // All stages use the same function call ABI to chain into each other, but there are three types: |
| 2991 | // GG: geometry in, geometry out -- think, a matrix |
| 2992 | // GP: geometry in, pixels out. -- think, a memory gather |
| 2993 | // PP: pixels in, pixels out. -- think, a blend mode |
| 2994 | // |
| 2995 | // (Some stages ignore their inputs or produce no logical output. That's perfectly fine.) |
| 2996 | // |
| 2997 | // These three STAGE_ macros let you define each type of stage, |
| 2998 | // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate. |
| 2999 | |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3000 | #if JUMPER_NARROW_STAGES |
Mike Klein | 8354c52 | 2018-12-19 10:45:14 -0500 | [diff] [blame] | 3001 | #define STAGE_GG(name, ...) \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3002 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
Mike Klein | 8354c52 | 2018-12-19 10:45:14 -0500 | [diff] [blame] | 3003 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
| 3004 | auto x = join<F>(r,g), \ |
| 3005 | y = join<F>(b,a); \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3006 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \ |
Mike Klein | 8354c52 | 2018-12-19 10:45:14 -0500 | [diff] [blame] | 3007 | split(x, &r,&g); \ |
| 3008 | split(y, &b,&a); \ |
| 3009 | auto next = (Stage)load_and_inc(program); \ |
| 3010 | next(params,program, r,g,b,a); \ |
| 3011 | } \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3012 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3013 | |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3014 | #define STAGE_GP(name, ...) \ |
| 3015 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
| 3016 | U16& r, U16& g, U16& b, U16& a, \ |
| 3017 | U16& dr, U16& dg, U16& db, U16& da); \ |
Mike Klein | 4d4b3aa | 2018-03-21 13:07:35 -0400 | [diff] [blame] | 3018 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3019 | auto x = join<F>(r,g), \ |
| 3020 | y = join<F>(b,a); \ |
| 3021 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \ |
| 3022 | params->dr,params->dg,params->db,params->da); \ |
| 3023 | auto next = (Stage)load_and_inc(program); \ |
| 3024 | next(params,program, r,g,b,a); \ |
| 3025 | } \ |
| 3026 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
| 3027 | U16& r, U16& g, U16& b, U16& a, \ |
| 3028 | U16& dr, U16& dg, U16& db, U16& da) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3029 | |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3030 | #define STAGE_PP(name, ...) \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3031 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3032 | U16& r, U16& g, U16& b, U16& a, \ |
| 3033 | U16& dr, U16& dg, U16& db, U16& da); \ |
Mike Klein | 4d4b3aa | 2018-03-21 13:07:35 -0400 | [diff] [blame] | 3034 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3035 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3036 | params->dr,params->dg,params->db,params->da); \ |
| 3037 | auto next = (Stage)load_and_inc(program); \ |
| 3038 | next(params,program, r,g,b,a); \ |
| 3039 | } \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3040 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3041 | U16& r, U16& g, U16& b, U16& a, \ |
| 3042 | U16& dr, U16& dg, U16& db, U16& da) |
| 3043 | #else |
| 3044 | #define STAGE_GG(name, ...) \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3045 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
Mike Klein | 4d4b3aa | 2018-03-21 13:07:35 -0400 | [diff] [blame] | 3046 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3047 | U16 r, U16 g, U16 b, U16 a, \ |
| 3048 | U16 dr, U16 dg, U16 db, U16 da) { \ |
| 3049 | auto x = join<F>(r,g), \ |
| 3050 | y = join<F>(b,a); \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3051 | name##_k(Ctx{program}, dx,dy,tail, x,y); \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3052 | split(x, &r,&g); \ |
| 3053 | split(y, &b,&a); \ |
| 3054 | auto next = (Stage)load_and_inc(program); \ |
| 3055 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
| 3056 | } \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3057 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3058 | |
| 3059 | #define STAGE_GP(name, ...) \ |
| 3060 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
| 3061 | U16& r, U16& g, U16& b, U16& a, \ |
| 3062 | U16& dr, U16& dg, U16& db, U16& da); \ |
Mike Klein | 4d4b3aa | 2018-03-21 13:07:35 -0400 | [diff] [blame] | 3063 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3064 | U16 r, U16 g, U16 b, U16 a, \ |
| 3065 | U16 dr, U16 dg, U16 db, U16 da) { \ |
| 3066 | auto x = join<F>(r,g), \ |
| 3067 | y = join<F>(b,a); \ |
| 3068 | name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \ |
| 3069 | auto next = (Stage)load_and_inc(program); \ |
| 3070 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
| 3071 | } \ |
| 3072 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
| 3073 | U16& r, U16& g, U16& b, U16& a, \ |
| 3074 | U16& dr, U16& dg, U16& db, U16& da) |
| 3075 | |
| 3076 | #define STAGE_PP(name, ...) \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3077 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3078 | U16& r, U16& g, U16& b, U16& a, \ |
| 3079 | U16& dr, U16& dg, U16& db, U16& da); \ |
Mike Klein | 4d4b3aa | 2018-03-21 13:07:35 -0400 | [diff] [blame] | 3080 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3081 | U16 r, U16 g, U16 b, U16 a, \ |
| 3082 | U16 dr, U16 dg, U16 db, U16 da) { \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3083 | name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3084 | auto next = (Stage)load_and_inc(program); \ |
| 3085 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
| 3086 | } \ |
Mike Klein | 4c249ff | 2019-03-18 11:57:58 -0500 | [diff] [blame] | 3087 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
Mike Klein | a46623b | 2018-03-10 10:27:24 -0500 | [diff] [blame] | 3088 | U16& r, U16& g, U16& b, U16& a, \ |
| 3089 | U16& dr, U16& dg, U16& db, U16& da) |
| 3090 | #endif |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3091 | |
| 3092 | // ~~~~~~ Commonly used helper functions ~~~~~~ // |
| 3093 | |
Brian Osman | 1b61d86 | 2021-11-03 15:19:28 -0400 | [diff] [blame] | 3094 | /** |
 * Helpers to do properly rounded division (by 255). The ideal answer we want to compute is slow,
| 3096 | * thanks to a division by a non-power of two: |
| 3097 | * [1] (v + 127) / 255 |
| 3098 | * |
| 3099 | * There is a two-step process that computes the correct answer for all inputs: |
| 3100 | * [2] (v + 128 + ((v + 128) >> 8)) >> 8 |
| 3101 | * |
| 3102 | * There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time: |
| 3103 | * [3] (v + 255) >> 8; |
| 3104 | * |
| 3105 | * We offer two different implementations here, depending on the requirements of the calling stage. |
| 3106 | */ |
| 3107 | |
| 3108 | /** |
| 3109 | * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast |
| 3110 | * as [3]), and uses [3] elsewhere. |
| 3111 | */ |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3112 | SI U16 div255(U16 v) { |
Brian Osman | 1b61d86 | 2021-11-03 15:19:28 -0400 | [diff] [blame] | 3113 | #if defined(JUMPER_IS_NEON) |
| 3114 | // With NEON we can compute [2] just as fast as [3], so let's be correct. |
| 3115 | // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up: |
Mike Klein | d8853ec | 2018-03-10 11:34:53 -0500 | [diff] [blame] | 3116 | return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3117 | #else |
Brian Osman | 1b61d86 | 2021-11-03 15:19:28 -0400 | [diff] [blame] | 3118 | // Otherwise, use [3], which is never wrong by more than 1: |
| 3119 | return (v+255)/256; |
| 3120 | #endif |
| 3121 | } |
| 3122 | |
| 3123 | /** |
| 3124 | * div255_accurate guarantees the right answer on all platforms, at the expense of performance. |
| 3125 | */ |
| 3126 | SI U16 div255_accurate(U16 v) { |
| 3127 | #if defined(JUMPER_IS_NEON) |
| 3128 | // Our NEON implementation of div255 is already correct for all inputs: |
| 3129 | return div255(v); |
| 3130 | #else |
| 3131 | // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics: |
| 3132 | v += 128; |
| 3133 | return (v+(v/256))/256; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3134 | #endif |
| 3135 | } |
| 3136 | |
| 3137 | SI U16 inv(U16 v) { return 255-v; } |
| 3138 | |
| 3139 | SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); } |
| 3140 | SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); } |
| 3141 | |
| 3142 | SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); } |
| 3143 | SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3144 | |
| 3145 | SI U16 from_float(float f) { return f * 255.0f + 0.5f; } |
| 3146 | |
| 3147 | SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); } |
| 3148 | |
| 3149 | template <typename D, typename S> |
| 3150 | SI D cast(S src) { |
| 3151 | return __builtin_convertvector(src, D); |
| 3152 | } |
| 3153 | |
| 3154 | template <typename D, typename S> |
| 3155 | SI void split(S v, D* lo, D* hi) { |
| 3156 | static_assert(2*sizeof(D) == sizeof(S), ""); |
| 3157 | memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D)); |
| 3158 | memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D)); |
| 3159 | } |
| 3160 | template <typename D, typename S> |
| 3161 | SI D join(S lo, S hi) { |
| 3162 | static_assert(sizeof(D) == 2*sizeof(S), ""); |
| 3163 | D v; |
| 3164 | memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S)); |
| 3165 | memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S)); |
| 3166 | return v; |
| 3167 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3168 | |
| 3169 | SI F if_then_else(I32 c, F t, F e) { |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 3170 | return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) ); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3171 | } |
| 3172 | SI F max(F x, F y) { return if_then_else(x < y, y, x); } |
| 3173 | SI F min(F x, F y) { return if_then_else(x < y, x, y); } |
| 3174 | |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 3175 | SI I32 if_then_else(I32 c, I32 t, I32 e) { |
| 3176 | return (t & c) | (e & ~c); |
| 3177 | } |
| 3178 | SI I32 max(I32 x, I32 y) { return if_then_else(x < y, y, x); } |
| 3179 | SI I32 min(I32 x, I32 y) { return if_then_else(x < y, x, y); } |
| 3180 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3181 | SI F mad(F f, F m, F a) { return f*m+a; } |
| 3182 | SI U32 trunc_(F x) { return (U32)cast<I32>(x); } |
| 3183 | |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 3184 | // Use approximate instructions and one Newton-Raphson step to calculate 1/x. |
| 3185 | SI F rcp_precise(F x) { |
| 3186 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
| 3187 | __m256 lo,hi; |
| 3188 | split(x, &lo,&hi); |
| 3189 | return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi)); |
| 3190 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
| 3191 | __m128 lo,hi; |
| 3192 | split(x, &lo,&hi); |
| 3193 | return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi)); |
| 3194 | #elif defined(JUMPER_IS_NEON) |
| 3195 | float32x4_t lo,hi; |
| 3196 | split(x, &lo,&hi); |
| 3197 | return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi)); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3198 | #else |
| 3199 | return 1.0f / x; |
| 3200 | #endif |
| 3201 | } |
| 3202 | SI F sqrt_(F x) { |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3203 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3204 | __m256 lo,hi; |
| 3205 | split(x, &lo,&hi); |
| 3206 | return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi)); |
Mike Klein | 83e86eb | 2018-08-31 10:19:21 -0400 | [diff] [blame] | 3207 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3208 | __m128 lo,hi; |
| 3209 | split(x, &lo,&hi); |
| 3210 | return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi)); |
Mike Klein | 15eb1e9 | 2018-08-31 11:21:27 -0400 | [diff] [blame] | 3211 | #elif defined(SK_CPU_ARM64) |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3212 | float32x4_t lo,hi; |
| 3213 | split(x, &lo,&hi); |
| 3214 | return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi)); |
Mike Klein | 73d7ffc | 2018-07-25 09:19:23 -0400 | [diff] [blame] | 3215 | #elif defined(JUMPER_IS_NEON) |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3216 | auto sqrt = [](float32x4_t v) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3217 | auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v). |
| 3218 | est *= vrsqrtsq_f32(v,est*est); |
| 3219 | est *= vrsqrtsq_f32(v,est*est); |
| 3220 | return v*est; // sqrt(v) == v*rsqrt(v). |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3221 | }; |
| 3222 | float32x4_t lo,hi; |
| 3223 | split(x, &lo,&hi); |
| 3224 | return join<F>(sqrt(lo), sqrt(hi)); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3225 | #else |
| 3226 | return F{ |
| 3227 | sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]), |
| 3228 | sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]), |
| 3229 | }; |
| 3230 | #endif |
| 3231 | } |
| 3232 | |
| 3233 | SI F floor_(F x) { |
Mike Klein | 15eb1e9 | 2018-08-31 11:21:27 -0400 | [diff] [blame] | 3234 | #if defined(SK_CPU_ARM64) |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3235 | float32x4_t lo,hi; |
| 3236 | split(x, &lo,&hi); |
| 3237 | return join<F>(vrndmq_f32(lo), vrndmq_f32(hi)); |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3238 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3239 | __m256 lo,hi; |
| 3240 | split(x, &lo,&hi); |
| 3241 | return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi)); |
Mike Klein | 83e86eb | 2018-08-31 10:19:21 -0400 | [diff] [blame] | 3242 | #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
Mike Klein | e304a8a | 2018-05-31 10:49:51 -0400 | [diff] [blame] | 3243 | __m128 lo,hi; |
| 3244 | split(x, &lo,&hi); |
| 3245 | return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi)); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3246 | #else |
| 3247 | F roundtrip = cast<F>(cast<I32>(x)); |
| 3248 | return roundtrip - if_then_else(roundtrip > x, F(1), F(0)); |
| 3249 | #endif |
| 3250 | } |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 3251 | |
| 3252 | // scaled_mult interprets a and b as number on [-1, 1) which are numbers in Q15 format. Functionally |
| 3253 | // this multiply is: |
| 3254 | // (2 * a * b + (1 << 15)) >> 16 |
| 3255 | // The result is a number on [-1, 1). |
| 3256 | // Note: on neon this is a saturating multiply while the others are not. |
| 3257 | SI I16 scaled_mult(I16 a, I16 b) { |
| 3258 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
| 3259 | return _mm256_mulhrs_epi16(a, b); |
| 3260 | #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
| 3261 | return _mm_mulhrs_epi16(a, b); |
| 3262 | #elif defined(SK_CPU_ARM64) |
| 3263 | return vqrdmulhq_s16(a, b); |
| 3264 | #elif defined(JUMPER_IS_NEON) |
| 3265 | return vqrdmulhq_s16(a, b); |
| 3266 | #else |
| 3267 | const I32 roundingTerm = 1 << 14; |
| 3268 | return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15); |
| 3269 | #endif |
| 3270 | } |
| 3271 | |
| 3272 | // This sum is to support lerp where the result will always be a positive number. In general, |
| 3273 | // a sum like this would require an additional bit, but because we know the range of the result |
| 3274 | // we know that the extra bit will always be zero. |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 3275 | SI U16 constrained_add(I16 a, U16 b) { |
Herb Derby | 37035fc | 2021-09-29 17:49:12 -0400 | [diff] [blame] | 3276 | #if defined(SK_DEBUG) |
| 3277 | for (size_t i = 0; i < N; i++) { |
| 3278 | // Ensure that a + b is on the interval [0, UINT16_MAX] |
| 3279 | int ia = a[i], |
| 3280 | ib = b[i]; |
| 3281 | // Use 65535 here because fuchsia's compiler evaluates UINT16_MAX - ib, which is |
| 3282 | // 65536U - ib, as an uint32_t instead of an int32_t. This was forcing ia to be |
| 3283 | // interpreted as an uint32_t. |
| 3284 | SkASSERT(-ib <= ia && ia <= 65535 - ib); |
| 3285 | } |
| 3286 | #endif |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 3287 | return b + a; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 3288 | } |
| 3289 | |
Mike Klein | 8e3426f | 2018-04-16 12:56:24 -0400 | [diff] [blame] | 3290 | SI F fract(F x) { return x - floor_(x); } |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 3291 | SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3292 | |
| 3293 | // ~~~~~~ Basic / misc. stages ~~~~~~ // |
| 3294 | |
Mike Klein | e8de024 | 2018-03-10 12:37:11 -0500 | [diff] [blame] | 3295 | STAGE_GG(seed_shader, Ctx::None) { |
| 3296 | static const float iota[] = { |
| 3297 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
| 3298 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
| 3299 | }; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 3300 | x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3301 | y = cast<F>(I32(dy)) + 0.5f; |
| 3302 | } |
| 3303 | |
| 3304 | STAGE_GG(matrix_translate, const float* m) { |
| 3305 | x += m[0]; |
| 3306 | y += m[1]; |
| 3307 | } |
| 3308 | STAGE_GG(matrix_scale_translate, const float* m) { |
| 3309 | x = mad(x,m[0], m[2]); |
| 3310 | y = mad(y,m[1], m[3]); |
| 3311 | } |
| 3312 | STAGE_GG(matrix_2x3, const float* m) { |
Herb Derby | 97bf728 | 2021-10-06 11:00:39 -0400 | [diff] [blame] | 3313 | auto X = mad(x,m[0], mad(y,m[1], m[2])), |
| 3314 | Y = mad(x,m[3], mad(y,m[4], m[5])); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3315 | x = X; |
| 3316 | y = Y; |
| 3317 | } |
| 3318 | STAGE_GG(matrix_perspective, const float* m) { |
| 3319 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
| 3320 | auto X = mad(x,m[0], mad(y,m[1], m[2])), |
| 3321 | Y = mad(x,m[3], mad(y,m[4], m[5])), |
| 3322 | Z = mad(x,m[6], mad(y,m[7], m[8])); |
Herb Derby | 9f6be8e | 2021-09-15 17:25:01 -0400 | [diff] [blame] | 3323 | x = X * rcp_precise(Z); |
| 3324 | y = Y * rcp_precise(Z); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3325 | } |
| 3326 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3327 | STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3328 | r = c->rgba[0]; |
| 3329 | g = c->rgba[1]; |
| 3330 | b = c->rgba[2]; |
| 3331 | a = c->rgba[3]; |
| 3332 | } |
Mike Reed | 9318a6c | 2019-08-16 16:16:25 -0400 | [diff] [blame] | 3333 | STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
| 3334 | dr = c->rgba[0]; |
| 3335 | dg = c->rgba[1]; |
| 3336 | db = c->rgba[2]; |
| 3337 | da = c->rgba[3]; |
| 3338 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3339 | STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; } |
| 3340 | STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; } |
| 3341 | |
| 3342 | STAGE_PP(set_rgb, const float rgb[3]) { |
| 3343 | r = from_float(rgb[0]); |
| 3344 | g = from_float(rgb[1]); |
| 3345 | b = from_float(rgb[2]); |
| 3346 | } |
| 3347 | |
Mike Klein | ea045b5 | 2018-08-23 12:13:58 -0400 | [diff] [blame] | 3348 | STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ } |
| 3349 | STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ } |
| 3350 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3351 | STAGE_PP(clamp_a, Ctx::None) { |
| 3352 | r = min(r, a); |
| 3353 | g = min(g, a); |
| 3354 | b = min(b, a); |
| 3355 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3356 | |
Mike Klein | eb50f43 | 2018-09-07 11:08:53 -0400 | [diff] [blame] | 3357 | STAGE_PP(clamp_gamut, Ctx::None) { |
| 3358 | // It shouldn't be possible to get out-of-gamut |
| 3359 | // colors when working in lowp. |
| 3360 | } |
| 3361 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3362 | STAGE_PP(premul, Ctx::None) { |
Brian Osman | 1b61d86 | 2021-11-03 15:19:28 -0400 | [diff] [blame] | 3363 | r = div255_accurate(r * a); |
| 3364 | g = div255_accurate(g * a); |
| 3365 | b = div255_accurate(b * a); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3366 | } |
| 3367 | STAGE_PP(premul_dst, Ctx::None) { |
Brian Osman | 1b61d86 | 2021-11-03 15:19:28 -0400 | [diff] [blame] | 3368 | dr = div255_accurate(dr * da); |
| 3369 | dg = div255_accurate(dg * da); |
| 3370 | db = div255_accurate(db * da); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3371 | } |
| 3372 | |
| 3373 | STAGE_PP(force_opaque , Ctx::None) { a = 255; } |
| 3374 | STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; } |
| 3375 | |
| 3376 | STAGE_PP(swap_rb, Ctx::None) { |
| 3377 | auto tmp = r; |
| 3378 | r = b; |
| 3379 | b = tmp; |
| 3380 | } |
Mike Klein | 1a3eb52 | 2018-10-18 10:11:00 -0400 | [diff] [blame] | 3381 | STAGE_PP(swap_rb_dst, Ctx::None) { |
| 3382 | auto tmp = dr; |
| 3383 | dr = db; |
| 3384 | db = tmp; |
| 3385 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3386 | |
| 3387 | STAGE_PP(move_src_dst, Ctx::None) { |
| 3388 | dr = r; |
| 3389 | dg = g; |
| 3390 | db = b; |
| 3391 | da = a; |
| 3392 | } |
| 3393 | |
| 3394 | STAGE_PP(move_dst_src, Ctx::None) { |
| 3395 | r = dr; |
| 3396 | g = dg; |
| 3397 | b = db; |
| 3398 | a = da; |
| 3399 | } |
| 3400 | |
Brian Osman | 9f1e06a | 2021-08-10 14:39:18 -0400 | [diff] [blame] | 3401 | STAGE_PP(swap_src_dst, Ctx::None) { |
| 3402 | std::swap(r, dr); |
| 3403 | std::swap(g, dg); |
| 3404 | std::swap(b, db); |
| 3405 | std::swap(a, da); |
| 3406 | } |
| 3407 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3408 | // ~~~~~~ Blend modes ~~~~~~ // |
| 3409 | |
| 3410 | // The same logic applied to all 4 channels. |
| 3411 | #define BLEND_MODE(name) \ |
| 3412 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
| 3413 | STAGE_PP(name, Ctx::None) { \ |
| 3414 | r = name##_channel(r,dr,a,da); \ |
| 3415 | g = name##_channel(g,dg,a,da); \ |
| 3416 | b = name##_channel(b,db,a,da); \ |
| 3417 | a = name##_channel(a,da,a,da); \ |
| 3418 | } \ |
| 3419 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
| 3420 | |
| 3421 | BLEND_MODE(clear) { return 0; } |
| 3422 | BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); } |
| 3423 | BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); } |
| 3424 | BLEND_MODE(srcin) { return div255( s*da ); } |
| 3425 | BLEND_MODE(dstin) { return div255( d*sa ); } |
| 3426 | BLEND_MODE(srcout) { return div255( s*inv(da) ); } |
| 3427 | BLEND_MODE(dstout) { return div255( d*inv(sa) ); } |
| 3428 | BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); } |
| 3429 | BLEND_MODE(dstover) { return d + div255( s*inv(da) ); } |
| 3430 | BLEND_MODE(modulate) { return div255( s*d ); } |
| 3431 | BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); } |
Mike Klein | b90c080 | 2019-03-15 14:03:41 +0000 | [diff] [blame] | 3432 | BLEND_MODE(plus_) { return min(s+d, 255); } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3433 | BLEND_MODE(screen) { return s + d - div255( s*d ); } |
| 3434 | BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); } |
| 3435 | #undef BLEND_MODE |
| 3436 | |
| 3437 | // The same logic applied to color, and srcover for alpha. |
| 3438 | #define BLEND_MODE(name) \ |
| 3439 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
| 3440 | STAGE_PP(name, Ctx::None) { \ |
| 3441 | r = name##_channel(r,dr,a,da); \ |
| 3442 | g = name##_channel(g,dg,a,da); \ |
| 3443 | b = name##_channel(b,db,a,da); \ |
| 3444 | a = a + div255( da*inv(a) ); \ |
| 3445 | } \ |
| 3446 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
| 3447 | |
| 3448 | BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); } |
| 3449 | BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); } |
| 3450 | BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); } |
| 3451 | BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); } |
| 3452 | |
| 3453 | BLEND_MODE(hardlight) { |
| 3454 | return div255( s*inv(da) + d*inv(sa) + |
| 3455 | if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
| 3456 | } |
| 3457 | BLEND_MODE(overlay) { |
| 3458 | return div255( s*inv(da) + d*inv(sa) + |
| 3459 | if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
| 3460 | } |
| 3461 | #undef BLEND_MODE |
| 3462 | |
| 3463 | // ~~~~~~ Helpers for interacting with memory ~~~~~~ // |
| 3464 | |
| 3465 | template <typename T> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3466 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3467 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
| 3468 | } |
| 3469 | |
| 3470 | template <typename T> |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3471 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3472 | // Exclusive -> inclusive. |
John Stiles | 36e0849 | 2020-07-24 09:56:05 -0400 | [diff] [blame] | 3473 | const F w = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1), |
| 3474 | h = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1); |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3475 | |
| 3476 | x = min(max(0, x), w); |
| 3477 | y = min(max(0, y), h); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3478 | |
| 3479 | *ptr = (const T*)ctx->pixels; |
| 3480 | return trunc_(y)*ctx->stride + trunc_(x); |
| 3481 | } |
| 3482 | |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 3483 | template <typename T> |
| 3484 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) { |
| 3485 | // Exclusive -> inclusive. |
| 3486 | const I32 w = ctx->width - 1, |
| 3487 | h = ctx->height - 1; |
| 3488 | |
| 3489 | U32 ax = cast<U32>(min(max(0, x), w)), |
| 3490 | ay = cast<U32>(min(max(0, y), h)); |
| 3491 | |
| 3492 | *ptr = (const T*)ctx->pixels; |
| 3493 | return ay * ctx->stride + ax; |
| 3494 | } |
| 3495 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3496 | template <typename V, typename T> |
| 3497 | SI V load(const T* ptr, size_t tail) { |
| 3498 | V v = 0; |
| 3499 | switch (tail & (N-1)) { |
| 3500 | case 0: memcpy(&v, ptr, sizeof(v)); break; |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3501 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3502 | case 15: v[14] = ptr[14]; [[fallthrough]]; |
| 3503 | case 14: v[13] = ptr[13]; [[fallthrough]]; |
| 3504 | case 13: v[12] = ptr[12]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3505 | case 12: memcpy(&v, ptr, 12*sizeof(T)); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3506 | case 11: v[10] = ptr[10]; [[fallthrough]]; |
| 3507 | case 10: v[ 9] = ptr[ 9]; [[fallthrough]]; |
| 3508 | case 9: v[ 8] = ptr[ 8]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3509 | case 8: memcpy(&v, ptr, 8*sizeof(T)); break; |
| 3510 | #endif |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3511 | case 7: v[ 6] = ptr[ 6]; [[fallthrough]]; |
| 3512 | case 6: v[ 5] = ptr[ 5]; [[fallthrough]]; |
| 3513 | case 5: v[ 4] = ptr[ 4]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3514 | case 4: memcpy(&v, ptr, 4*sizeof(T)); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3515 | case 3: v[ 2] = ptr[ 2]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3516 | case 2: memcpy(&v, ptr, 2*sizeof(T)); break; |
| 3517 | case 1: v[ 0] = ptr[ 0]; |
| 3518 | } |
| 3519 | return v; |
| 3520 | } |
| 3521 | template <typename V, typename T> |
| 3522 | SI void store(T* ptr, size_t tail, V v) { |
| 3523 | switch (tail & (N-1)) { |
| 3524 | case 0: memcpy(ptr, &v, sizeof(v)); break; |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3525 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3526 | case 15: ptr[14] = v[14]; [[fallthrough]]; |
| 3527 | case 14: ptr[13] = v[13]; [[fallthrough]]; |
| 3528 | case 13: ptr[12] = v[12]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3529 | case 12: memcpy(ptr, &v, 12*sizeof(T)); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3530 | case 11: ptr[10] = v[10]; [[fallthrough]]; |
| 3531 | case 10: ptr[ 9] = v[ 9]; [[fallthrough]]; |
| 3532 | case 9: ptr[ 8] = v[ 8]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3533 | case 8: memcpy(ptr, &v, 8*sizeof(T)); break; |
| 3534 | #endif |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3535 | case 7: ptr[ 6] = v[ 6]; [[fallthrough]]; |
| 3536 | case 6: ptr[ 5] = v[ 5]; [[fallthrough]]; |
| 3537 | case 5: ptr[ 4] = v[ 4]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3538 | case 4: memcpy(ptr, &v, 4*sizeof(T)); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3539 | case 3: ptr[ 2] = v[ 2]; [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3540 | case 2: memcpy(ptr, &v, 2*sizeof(T)); break; |
| 3541 | case 1: ptr[ 0] = v[ 0]; |
| 3542 | } |
| 3543 | } |
| 3544 | |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3545 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3546 | template <typename V, typename T> |
| 3547 | SI V gather(const T* ptr, U32 ix) { |
| 3548 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
| 3549 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], |
| 3550 | ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]], |
| 3551 | ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], }; |
| 3552 | } |
| 3553 | |
| 3554 | template<> |
Kevin Lubick | b5502b2 | 2018-03-12 10:17:06 -0400 | [diff] [blame] | 3555 | F gather(const float* ptr, U32 ix) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3556 | __m256i lo, hi; |
| 3557 | split(ix, &lo, &hi); |
| 3558 | |
Kevin Lubick | b5502b2 | 2018-03-12 10:17:06 -0400 | [diff] [blame] | 3559 | return join<F>(_mm256_i32gather_ps(ptr, lo, 4), |
| 3560 | _mm256_i32gather_ps(ptr, hi, 4)); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3561 | } |
| 3562 | |
| 3563 | template<> |
Kevin Lubick | b5502b2 | 2018-03-12 10:17:06 -0400 | [diff] [blame] | 3564 | U32 gather(const uint32_t* ptr, U32 ix) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3565 | __m256i lo, hi; |
| 3566 | split(ix, &lo, &hi); |
| 3567 | |
Kevin Lubick | b5502b2 | 2018-03-12 10:17:06 -0400 | [diff] [blame] | 3568 | return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4), |
| 3569 | _mm256_i32gather_epi32(ptr, hi, 4)); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3570 | } |
| 3571 | #else |
| 3572 | template <typename V, typename T> |
| 3573 | SI V gather(const T* ptr, U32 ix) { |
| 3574 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
| 3575 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], }; |
| 3576 | } |
| 3577 | #endif |
| 3578 | |
| 3579 | |
| 3580 | // ~~~~~~ 32-bit memory loads and stores ~~~~~~ // |
| 3581 | |
| 3582 | SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 3583 | #if 1 && defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3584 | // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely. |
| 3585 | __m256i _01,_23; |
| 3586 | split(rgba, &_01, &_23); |
| 3587 | __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20), |
| 3588 | _13 = _mm256_permute2x128_si256(_01,_23, 0x31); |
| 3589 | rgba = join<U32>(_02, _13); |
| 3590 | |
| 3591 | auto cast_U16 = [](U32 v) -> U16 { |
| 3592 | __m256i _02,_13; |
| 3593 | split(v, &_02,&_13); |
| 3594 | return _mm256_packus_epi32(_02,_13); |
| 3595 | }; |
| 3596 | #else |
| 3597 | auto cast_U16 = [](U32 v) -> U16 { |
| 3598 | return cast<U16>(v); |
| 3599 | }; |
| 3600 | #endif |
| 3601 | *r = cast_U16(rgba & 65535) & 255; |
| 3602 | *g = cast_U16(rgba & 65535) >> 8; |
| 3603 | *b = cast_U16(rgba >> 16) & 255; |
| 3604 | *a = cast_U16(rgba >> 16) >> 8; |
| 3605 | } |
| 3606 | |
| 3607 | SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
Mike Klein | 73d7ffc | 2018-07-25 09:19:23 -0400 | [diff] [blame] | 3608 | #if 1 && defined(JUMPER_IS_NEON) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3609 | uint8x8x4_t rgba; |
| 3610 | switch (tail & (N-1)) { |
| 3611 | case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3612 | case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; |
| 3613 | case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; |
| 3614 | case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; |
| 3615 | case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; |
| 3616 | case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; |
| 3617 | case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3618 | case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0); |
| 3619 | } |
| 3620 | *r = cast<U16>(rgba.val[0]); |
| 3621 | *g = cast<U16>(rgba.val[1]); |
| 3622 | *b = cast<U16>(rgba.val[2]); |
| 3623 | *a = cast<U16>(rgba.val[3]); |
| 3624 | #else |
| 3625 | from_8888(load<U32>(ptr, tail), r,g,b,a); |
| 3626 | #endif |
| 3627 | } |
| 3628 | SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
Mike Klein | 73d7ffc | 2018-07-25 09:19:23 -0400 | [diff] [blame] | 3629 | #if 1 && defined(JUMPER_IS_NEON) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3630 | uint8x8x4_t rgba = {{ |
| 3631 | cast<U8>(r), |
| 3632 | cast<U8>(g), |
| 3633 | cast<U8>(b), |
| 3634 | cast<U8>(a), |
| 3635 | }}; |
| 3636 | switch (tail & (N-1)) { |
| 3637 | case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3638 | case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; |
| 3639 | case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; |
| 3640 | case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; |
| 3641 | case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; |
| 3642 | case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; |
| 3643 | case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3644 | case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0); |
| 3645 | } |
| 3646 | #else |
| 3647 | store(ptr, tail, cast<U32>(r | (g<<8)) << 0 |
| 3648 | | cast<U32>(b | (a<<8)) << 16); |
| 3649 | #endif |
| 3650 | } |
| 3651 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3652 | STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3653 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
| 3654 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3655 | STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3656 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
| 3657 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3658 | STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3659 | store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a); |
| 3660 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3661 | STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3662 | const uint32_t* ptr; |
| 3663 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
| 3664 | from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a); |
| 3665 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3666 | |
| 3667 | // ~~~~~~ 16-bit memory loads and stores ~~~~~~ // |
| 3668 | |
| 3669 | SI void from_565(U16 rgb, U16* r, U16* g, U16* b) { |
| 3670 | // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0 |
| 3671 | U16 R = (rgb >> 11) & 31, |
| 3672 | G = (rgb >> 5) & 63, |
| 3673 | B = (rgb >> 0) & 31; |
| 3674 | |
| 3675 | // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit. |
| 3676 | *r = (R << 3) | (R >> 2); |
| 3677 | *g = (G << 2) | (G >> 4); |
| 3678 | *b = (B << 3) | (B >> 2); |
| 3679 | } |
| 3680 | SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
| 3681 | from_565(load<U16>(ptr, tail), r,g,b); |
| 3682 | } |
| 3683 | SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) { |
Mike Klein | 1c94143 | 2019-02-27 14:22:55 -0600 | [diff] [blame] | 3684 | // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f. |
| 3685 | // (Don't feel like you need to find some fundamental truth in these... |
| 3686 | // they were brute-force searched.) |
| 3687 | U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half. |
| 3688 | G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly. |
| 3689 | B = (b * 9 + 36) / 74; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3690 | // Pack them back into 15|rrrrr gggggg bbbbb|0. |
| 3691 | store(ptr, tail, R << 11 |
| 3692 | | G << 5 |
| 3693 | | B << 0); |
| 3694 | } |
| 3695 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3696 | STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3697 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b); |
| 3698 | a = 255; |
| 3699 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3700 | STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3701 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db); |
| 3702 | da = 255; |
| 3703 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3704 | STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3705 | store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b); |
| 3706 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3707 | STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3708 | const uint16_t* ptr; |
| 3709 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
| 3710 | from_565(gather<U16>(ptr, ix), &r, &g, &b); |
| 3711 | a = 255; |
| 3712 | } |
| 3713 | |
| 3714 | SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) { |
| 3715 | // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0. |
| 3716 | U16 R = (rgba >> 12) & 15, |
| 3717 | G = (rgba >> 8) & 15, |
| 3718 | B = (rgba >> 4) & 15, |
| 3719 | A = (rgba >> 0) & 15; |
| 3720 | |
| 3721 | // Scale [0,15] to [0,255]. |
| 3722 | *r = (R << 4) | R; |
| 3723 | *g = (G << 4) | G; |
| 3724 | *b = (B << 4) | B; |
| 3725 | *a = (A << 4) | A; |
| 3726 | } |
| 3727 | SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
| 3728 | from_4444(load<U16>(ptr, tail), r,g,b,a); |
| 3729 | } |
| 3730 | SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
Mike Klein | 1c94143 | 2019-02-27 14:22:55 -0600 | [diff] [blame] | 3731 | // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f). |
| 3732 | U16 R = (r + 8) / 17, |
| 3733 | G = (g + 8) / 17, |
| 3734 | B = (b + 8) / 17, |
| 3735 | A = (a + 8) / 17; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3736 | // Pack them back into 15|rrrr gggg bbbb aaaa|0. |
| 3737 | store(ptr, tail, R << 12 |
| 3738 | | G << 8 |
| 3739 | | B << 4 |
| 3740 | | A << 0); |
| 3741 | } |
| 3742 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3743 | STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3744 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
| 3745 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3746 | STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3747 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
| 3748 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3749 | STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3750 | store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a); |
| 3751 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3752 | STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3753 | const uint16_t* ptr; |
| 3754 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
| 3755 | from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a); |
| 3756 | } |
| 3757 | |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 3758 | SI void from_88(U16 rg, U16* r, U16* g) { |
| 3759 | *r = (rg & 0xFF); |
| 3760 | *g = (rg >> 8); |
| 3761 | } |
| 3762 | |
| 3763 | SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
| 3764 | #if 1 && defined(JUMPER_IS_NEON) |
| 3765 | uint8x8x2_t rg; |
| 3766 | switch (tail & (N-1)) { |
| 3767 | case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3768 | case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; |
| 3769 | case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; |
| 3770 | case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; |
| 3771 | case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; |
| 3772 | case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; |
| 3773 | case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 3774 | case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0); |
| 3775 | } |
| 3776 | *r = cast<U16>(rg.val[0]); |
| 3777 | *g = cast<U16>(rg.val[1]); |
| 3778 | #else |
| 3779 | from_88(load<U16>(ptr, tail), r,g); |
| 3780 | #endif |
| 3781 | } |
| 3782 | |
| 3783 | SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
| 3784 | #if 1 && defined(JUMPER_IS_NEON) |
| 3785 | uint8x8x2_t rg = {{ |
| 3786 | cast<U8>(r), |
| 3787 | cast<U8>(g), |
| 3788 | }}; |
| 3789 | switch (tail & (N-1)) { |
| 3790 | case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break; |
John Stiles | 30212b7 | 2020-06-11 17:55:07 -0400 | [diff] [blame] | 3791 | case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; |
| 3792 | case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; |
| 3793 | case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; |
| 3794 | case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; |
| 3795 | case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; |
| 3796 | case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 3797 | case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0); |
| 3798 | } |
| 3799 | #else |
| 3800 | store(ptr, tail, cast<U16>(r | (g<<8)) << 0); |
| 3801 | #endif |
| 3802 | } |
| 3803 | |
| 3804 | STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
Robert Phillips | d470e1b | 2019-09-04 15:05:35 -0400 | [diff] [blame] | 3805 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g); |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 3806 | b = 0; |
Brian Salomon | f30b1c1 | 2019-06-20 12:25:02 -0400 | [diff] [blame] | 3807 | a = 255; |
Robert Phillips | d470e1b | 2019-09-04 15:05:35 -0400 | [diff] [blame] | 3808 | } |
| 3809 | STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
| 3810 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg); |
| 3811 | db = 0; |
| 3812 | da = 255; |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 3813 | } |
| 3814 | STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
| 3815 | store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g); |
| 3816 | } |
Robert Phillips | d470e1b | 2019-09-04 15:05:35 -0400 | [diff] [blame] | 3817 | STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
| 3818 | const uint16_t* ptr; |
| 3819 | U32 ix = ix_and_ptr(&ptr, ctx, x, y); |
| 3820 | from_88(gather<U16>(ptr, ix), &r, &g); |
| 3821 | b = 0; |
| 3822 | a = 255; |
| 3823 | } |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 3824 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3825 | // ~~~~~~ 8-bit memory loads and stores ~~~~~~ // |
| 3826 | |
| 3827 | SI U16 load_8(const uint8_t* ptr, size_t tail) { |
| 3828 | return cast<U16>(load<U8>(ptr, tail)); |
| 3829 | } |
| 3830 | SI void store_8(uint8_t* ptr, size_t tail, U16 v) { |
| 3831 | store(ptr, tail, cast<U8>(v)); |
| 3832 | } |
| 3833 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3834 | STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3835 | r = g = b = 0; |
| 3836 | a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
| 3837 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3838 | STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3839 | dr = dg = db = 0; |
| 3840 | da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
| 3841 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3842 | STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3843 | store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a); |
| 3844 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3845 | STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3846 | const uint8_t* ptr; |
| 3847 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
| 3848 | r = g = b = 0; |
| 3849 | a = cast<U16>(gather<U8>(ptr, ix)); |
| 3850 | } |
Brian Osman | a7a2324 | 2022-02-08 10:34:38 -0500 | [diff] [blame] | 3851 | STAGE_PP(store_r8, const SkRasterPipeline_MemoryCtx* ctx) { |
| 3852 | store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, r); |
| 3853 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3854 | |
Mike Klein | b1df5e5 | 2018-10-17 17:06:03 -0400 | [diff] [blame] | 3855 | STAGE_PP(alpha_to_gray, Ctx::None) { |
| 3856 | r = g = b = a; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3857 | a = 255; |
| 3858 | } |
Mike Klein | b1df5e5 | 2018-10-17 17:06:03 -0400 | [diff] [blame] | 3859 | STAGE_PP(alpha_to_gray_dst, Ctx::None) { |
| 3860 | dr = dg = db = da; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3861 | da = 255; |
| 3862 | } |
Brian Osman | a7a2324 | 2022-02-08 10:34:38 -0500 | [diff] [blame] | 3863 | STAGE_PP(alpha_to_red, Ctx::None) { |
| 3864 | r = a; |
| 3865 | a = 255; |
| 3866 | } |
| 3867 | STAGE_PP(alpha_to_red_dst, Ctx::None) { |
| 3868 | dr = da; |
| 3869 | da = 255; |
| 3870 | } |
| 3871 | |
Mike Klein | da69d59 | 2019-07-11 07:38:31 -0500 | [diff] [blame] | 3872 | STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3873 | a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. |
| 3874 | r = g = b = 0; |
| 3875 | } |
Brian Salomon | 01ff538 | 2020-12-15 16:06:26 -0500 | [diff] [blame] | 3876 | STAGE_PP(bt709_luminance_or_luma_to_rgb, Ctx::None) { |
| 3877 | r = g = b =(r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. |
| 3878 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3879 | |
| 3880 | // ~~~~~~ Whole-register src / dst loads and stores ~~~~~~ // |
| 3881 | |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3882 | STAGE_PP(load_src, const uint16_t* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 3883 | r = sk_unaligned_load<U16>(ptr + 0*N); |
| 3884 | g = sk_unaligned_load<U16>(ptr + 1*N); |
| 3885 | b = sk_unaligned_load<U16>(ptr + 2*N); |
| 3886 | a = sk_unaligned_load<U16>(ptr + 3*N); |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3887 | } |
| 3888 | STAGE_PP(store_src, uint16_t* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 3889 | sk_unaligned_store(ptr + 0*N, r); |
| 3890 | sk_unaligned_store(ptr + 1*N, g); |
| 3891 | sk_unaligned_store(ptr + 2*N, b); |
| 3892 | sk_unaligned_store(ptr + 3*N, a); |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3893 | } |
Mike Reed | 121c2af | 2020-03-10 14:02:56 -0400 | [diff] [blame] | 3894 | STAGE_PP(store_src_a, uint16_t* ptr) { |
| 3895 | sk_unaligned_store(ptr, a); |
| 3896 | } |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3897 | STAGE_PP(load_dst, const uint16_t* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 3898 | dr = sk_unaligned_load<U16>(ptr + 0*N); |
| 3899 | dg = sk_unaligned_load<U16>(ptr + 1*N); |
| 3900 | db = sk_unaligned_load<U16>(ptr + 2*N); |
| 3901 | da = sk_unaligned_load<U16>(ptr + 3*N); |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3902 | } |
| 3903 | STAGE_PP(store_dst, uint16_t* ptr) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 3904 | sk_unaligned_store(ptr + 0*N, dr); |
| 3905 | sk_unaligned_store(ptr + 1*N, dg); |
| 3906 | sk_unaligned_store(ptr + 2*N, db); |
| 3907 | sk_unaligned_store(ptr + 3*N, da); |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3908 | } |
| 3909 | |
| 3910 | // ~~~~~~ Coverage scales / lerps ~~~~~~ // |
| 3911 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3912 | STAGE_PP(scale_1_float, const float* f) { |
| 3913 | U16 c = from_float(*f); |
| 3914 | r = div255( r * c ); |
| 3915 | g = div255( g * c ); |
| 3916 | b = div255( b * c ); |
| 3917 | a = div255( a * c ); |
| 3918 | } |
| 3919 | STAGE_PP(lerp_1_float, const float* f) { |
| 3920 | U16 c = from_float(*f); |
| 3921 | r = lerp(dr, r, c); |
| 3922 | g = lerp(dg, g, c); |
| 3923 | b = lerp(db, b, c); |
| 3924 | a = lerp(da, a, c); |
| 3925 | } |
Mike Reed | 121c2af | 2020-03-10 14:02:56 -0400 | [diff] [blame] | 3926 | STAGE_PP(scale_native, const uint16_t scales[]) { |
| 3927 | auto c = sk_unaligned_load<U16>(scales); |
| 3928 | r = div255( r * c ); |
| 3929 | g = div255( g * c ); |
| 3930 | b = div255( b * c ); |
| 3931 | a = div255( a * c ); |
| 3932 | } |
| 3933 | |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3934 | STAGE_PP(lerp_native, const uint16_t scales[]) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 3935 | auto c = sk_unaligned_load<U16>(scales); |
Mike Reed | 895e1ee | 2019-03-16 13:16:54 -0400 | [diff] [blame] | 3936 | r = lerp(dr, r, c); |
| 3937 | g = lerp(dg, g, c); |
| 3938 | b = lerp(db, b, c); |
| 3939 | a = lerp(da, a, c); |
| 3940 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3941 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3942 | STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3943 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
| 3944 | r = div255( r * c ); |
| 3945 | g = div255( g * c ); |
| 3946 | b = div255( b * c ); |
| 3947 | a = div255( a * c ); |
| 3948 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3949 | STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3950 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
| 3951 | r = lerp(dr, r, c); |
| 3952 | g = lerp(dg, g, c); |
| 3953 | b = lerp(db, b, c); |
| 3954 | a = lerp(da, a, c); |
| 3955 | } |
| 3956 | |
| 3957 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
| 3958 | SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) { |
Mike Klein | 5d835d0 | 2019-10-16 13:28:55 -0500 | [diff] [blame] | 3959 | return if_then_else(a < da, min(cr, min(cg,cb)) |
| 3960 | , max(cr, max(cg,cb))); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3961 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3962 | STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3963 | U16 cr,cg,cb; |
| 3964 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
| 3965 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
| 3966 | |
| 3967 | r = div255( r * cr ); |
| 3968 | g = div255( g * cg ); |
| 3969 | b = div255( b * cb ); |
| 3970 | a = div255( a * ca ); |
| 3971 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 3972 | STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3973 | U16 cr,cg,cb; |
| 3974 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
| 3975 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
| 3976 | |
| 3977 | r = lerp(dr, r, cr); |
| 3978 | g = lerp(dg, g, cg); |
| 3979 | b = lerp(db, b, cb); |
| 3980 | a = lerp(da, a, ca); |
| 3981 | } |
| 3982 | |
Mike Klein | eda2ac2 | 2018-11-06 11:53:59 -0500 | [diff] [blame] | 3983 | STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
| 3984 | U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail), |
| 3985 | add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail); |
| 3986 | |
| 3987 | r = min(div255(r*mul) + add, a); |
| 3988 | g = min(div255(g*mul) + add, a); |
| 3989 | b = min(div255(b*mul) + add, a); |
| 3990 | } |
| 3991 | |
| 3992 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 3993 | // ~~~~~~ Gradient stages ~~~~~~ // |
| 3994 | |
| 3995 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
| 3996 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
| 3997 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
| 3998 | |
| 3999 | STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); } |
| 4000 | STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); } |
| 4001 | STAGE_GG(mirror_x_1, Ctx::None) { |
| 4002 | auto two = [](F x){ return x+x; }; |
| 4003 | x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f )); |
| 4004 | } |
| 4005 | |
| 4006 | SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); } |
| 4007 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4008 | STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4009 | auto w = ctx->limit_x; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 4010 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w))); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4011 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4012 | STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4013 | auto h = ctx->limit_y; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 4014 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h))); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4015 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4016 | STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4017 | auto w = ctx->limit_x; |
| 4018 | auto h = ctx->limit_y; |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 4019 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h))); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4020 | } |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4021 | STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
Mike Klein | 7a177b4 | 2019-06-17 17:17:47 -0500 | [diff] [blame] | 4022 | auto mask = sk_unaligned_load<U16>(ctx->mask); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4023 | r = r & mask; |
| 4024 | g = g & mask; |
| 4025 | b = b & mask; |
| 4026 | a = a & mask; |
| 4027 | } |
| 4028 | |
Mike Klein | 24de648 | 2018-09-07 12:05:29 -0400 | [diff] [blame] | 4029 | SI void round_F_to_U16(F R, F G, F B, F A, bool interpolatedInPremul, |
| 4030 | U16* r, U16* g, U16* b, U16* a) { |
| 4031 | auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); }; |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4032 | |
Mike Klein | 24de648 | 2018-09-07 12:05:29 -0400 | [diff] [blame] | 4033 | F limit = interpolatedInPremul ? A |
| 4034 | : 1; |
| 4035 | *r = round(min(max(0,R), limit)); |
| 4036 | *g = round(min(max(0,G), limit)); |
| 4037 | *b = round(min(max(0,B), limit)); |
| 4038 | *a = round(A); // we assume alpha is already in [0,1]. |
| 4039 | } |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4040 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4041 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4042 | U16* r, U16* g, U16* b, U16* a) { |
| 4043 | |
| 4044 | F fr, fg, fb, fa, br, bg, bb, ba; |
Mike Klein | 51d35ed | 2020-04-24 08:16:22 -0500 | [diff] [blame] | 4045 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4046 | if (c->stopCount <=8) { |
| 4047 | __m256i lo, hi; |
| 4048 | split(idx, &lo, &hi); |
| 4049 | |
| 4050 | fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo), |
| 4051 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi)); |
| 4052 | br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo), |
| 4053 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi)); |
| 4054 | fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo), |
| 4055 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi)); |
| 4056 | bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo), |
| 4057 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi)); |
| 4058 | fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo), |
| 4059 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi)); |
| 4060 | bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo), |
| 4061 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi)); |
| 4062 | fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo), |
| 4063 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi)); |
| 4064 | ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo), |
| 4065 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi)); |
| 4066 | } else |
| 4067 | #endif |
| 4068 | { |
| 4069 | fr = gather<F>(c->fs[0], idx); |
| 4070 | fg = gather<F>(c->fs[1], idx); |
| 4071 | fb = gather<F>(c->fs[2], idx); |
| 4072 | fa = gather<F>(c->fs[3], idx); |
| 4073 | br = gather<F>(c->bs[0], idx); |
| 4074 | bg = gather<F>(c->bs[1], idx); |
| 4075 | bb = gather<F>(c->bs[2], idx); |
| 4076 | ba = gather<F>(c->bs[3], idx); |
| 4077 | } |
Mike Klein | 24de648 | 2018-09-07 12:05:29 -0400 | [diff] [blame] | 4078 | round_F_to_U16(mad(t, fr, br), |
| 4079 | mad(t, fg, bg), |
| 4080 | mad(t, fb, bb), |
| 4081 | mad(t, fa, ba), |
| 4082 | c->interpolatedInPremul, |
| 4083 | r,g,b,a); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4084 | } |
| 4085 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4086 | STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4087 | auto t = x; |
| 4088 | U32 idx = 0; |
| 4089 | |
| 4090 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
| 4091 | for (size_t i = 1; i < c->stopCount; i++) { |
| 4092 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
| 4093 | } |
| 4094 | |
| 4095 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
| 4096 | } |
| 4097 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4098 | STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4099 | auto t = x; |
| 4100 | auto idx = trunc_(t * (c->stopCount-1)); |
| 4101 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
| 4102 | } |
| 4103 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4104 | STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4105 | auto t = x; |
Mike Klein | 24de648 | 2018-09-07 12:05:29 -0400 | [diff] [blame] | 4106 | round_F_to_U16(mad(t, c->f[0], c->b[0]), |
| 4107 | mad(t, c->f[1], c->b[1]), |
| 4108 | mad(t, c->f[2], c->b[2]), |
| 4109 | mad(t, c->f[3], c->b[3]), |
| 4110 | c->interpolatedInPremul, |
| 4111 | &r,&g,&b,&a); |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4112 | } |
| 4113 | |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4114 | SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4115 | #if !defined(SK_SUPPORT_LEGACY_BILERP_HIGHP) |
| 4116 | STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
| 4117 | // Quantize sample point and transform into lerp coordinates converting them to 16.16 fixed |
| 4118 | // point number. |
| 4119 | I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768, |
| 4120 | qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768; |
| 4121 | |
| 4122 | // Calculate screen coordinates sx & sy by flooring qx and qy. |
| 4123 | I32 sx = qx >> 16, |
| 4124 | sy = qy >> 16; |
| 4125 | |
| 4126 | // We are going to perform a change of parameters for qx on [0, 1) to tx on [-1, 1). |
| 4127 | // This will put tx in Q15 format for use with q_mult. |
| 4128 | // Calculate tx and ty on the interval of [-1, 1). Give {qx} and {qy} are on the interval |
| 4129 | // [0, 1), where {v} is fract(v), we can transform to tx in the following manner ty follows |
| 4130 | // the same math: |
| 4131 | // tx = 2 * {qx} - 1, so |
| 4132 | // {qx} = (tx + 1) / 2. |
| 4133 | // Calculate {qx} - 1 and {qy} - 1 where the {} operation is handled by the cast, and the - 1 |
| 4134 | // is handled by the ^ 0x8000, dividing by 2 is deferred and handled in lerpX and lerpY in |
| 4135 | // order to use the full 16-bit resolution. |
| 4136 | I16 tx = cast<I16>(qx ^ 0x8000), |
| 4137 | ty = cast<I16>(qy ^ 0x8000); |
| 4138 | |
| 4139 | // Substituting the {qx} by the equation for tx from above into the lerp equation where v is |
| 4140 | // the lerped value: |
| 4141 | // v = {qx}*(R - L) + L, |
| 4142 | // v = 1/2*(tx + 1)*(R - L) + L |
| 4143 | // 2 * v = (tx + 1)*(R - L) + 2*L |
| 4144 | // = tx*R - tx*L + R - L + 2*L |
| 4145 | // = tx*(R - L) + (R + L). |
| 4146 | // Since R and L are on [0, 255] we need them on the interval [0, 1/2] to get them into form |
| 4147 | // for Q15_mult. If L and R where in 16.16 format, this would be done by dividing by 2^9. In |
| 4148 | // code, we can multiply by 2^7 to get the value directly. |
| 4149 | // 2 * v = tx*(R - L) + (R + L) |
| 4150 | // 2^-9 * 2 * v = tx*(R - L)*2^-9 + (R + L)*2^-9 |
| 4151 | // 2^-8 * v = 2^-9 * (tx*(R - L) + (R + L)) |
| 4152 | // v = 1/2 * (tx*(R - L) + (R + L)) |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4153 | auto lerpX = [&](U16 left, U16 right) -> U16 { |
| 4154 | I16 width = (I16)(right - left) << 7; |
| 4155 | U16 middle = (right + left) << 7; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4156 | // The constrained_add is the most subtle part of lerp. The first term is on the interval |
| 4157 | // [-1, 1), and the second term is on the interval is on the interval [0, 1) because |
| 4158 | // both terms are too high by a factor of 2 which will be handled below. (Both R and L are |
| 4159 | // on [0, 1/2), but the sum R + L is on the interval [0, 1).) Generally, the sum below |
| 4160 | // should overflow, but because we know that sum produces an output on the |
| 4161 | // interval [0, 1) we know that the extra bit that would be needed will always be 0. So |
| 4162 | // we need to be careful to treat this sum as an unsigned positive number in the divide |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4163 | // by 2 below. Add +1 for rounding. |
| 4164 | U16 v2 = constrained_add(scaled_mult(tx, width), middle) + 1; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4165 | // Divide by 2 to calculate v and at the same time bring the intermediate value onto the |
| 4166 | // interval [0, 1/2] to set up for the lerpY. |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4167 | return v2 >> 1; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4168 | }; |
| 4169 | |
| 4170 | const uint32_t* ptr; |
| 4171 | U32 ix = ix_and_ptr(&ptr, ctx, sx, sy); |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4172 | U16 leftR, leftG, leftB, leftA; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4173 | from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA); |
| 4174 | |
| 4175 | ix = ix_and_ptr(&ptr, ctx, sx+1, sy); |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4176 | U16 rightR, rightG, rightB, rightA; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4177 | from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA); |
| 4178 | |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4179 | U16 topR = lerpX(leftR, rightR), |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4180 | topG = lerpX(leftG, rightG), |
| 4181 | topB = lerpX(leftB, rightB), |
| 4182 | topA = lerpX(leftA, rightA); |
| 4183 | |
| 4184 | ix = ix_and_ptr(&ptr, ctx, sx, sy+1); |
| 4185 | from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA); |
| 4186 | |
| 4187 | ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1); |
| 4188 | from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA); |
| 4189 | |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4190 | U16 bottomR = lerpX(leftR, rightR), |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4191 | bottomG = lerpX(leftG, rightG), |
| 4192 | bottomB = lerpX(leftB, rightB), |
| 4193 | bottomA = lerpX(leftA, rightA); |
| 4194 | |
| 4195 | // lerpY plays the same mathematical tricks as lerpX, but the final divide is by 256 resulting |
| 4196 | // in a value on [0, 255]. |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4197 | auto lerpY = [&](U16 top, U16 bottom) -> U16 { |
| 4198 | I16 width = (I16)bottom - top; |
| 4199 | U16 middle = bottom + top; |
| 4200 | // Add + 0x80 for rounding. |
| 4201 | U16 blend = constrained_add(scaled_mult(ty, width), middle) + 0x80; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4202 | |
Herb Derby | 2f5cfb6 | 2021-09-28 17:37:00 -0400 | [diff] [blame] | 4203 | return blend >> 8; |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4204 | }; |
| 4205 | |
| 4206 | r = lerpY(topR, bottomR); |
| 4207 | g = lerpY(topG, bottomG); |
| 4208 | b = lerpY(topB, bottomB); |
| 4209 | a = lerpY(topA, bottomA); |
| 4210 | } |
| 4211 | #endif // SK_SUPPORT_LEGACY_BILERP_HIGHP |
| 4212 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4213 | STAGE_GG(xy_to_unit_angle, Ctx::None) { |
| 4214 | F xabs = abs_(x), |
| 4215 | yabs = abs_(y); |
| 4216 | |
| 4217 | F slope = min(xabs, yabs)/max(xabs, yabs); |
| 4218 | F s = slope * slope; |
| 4219 | |
| 4220 | // Use a 7th degree polynomial to approximate atan. |
| 4221 | // This was generated using sollya.gforge.inria.fr. |
| 4222 | // A float optimized polynomial was generated using the following command. |
| 4223 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
| 4224 | F phi = slope |
| 4225 | * (0.15912117063999176025390625f + s |
| 4226 | * (-5.185396969318389892578125e-2f + s |
| 4227 | * (2.476101927459239959716796875e-2f + s |
| 4228 | * (-7.0547382347285747528076171875e-3f)))); |
| 4229 | |
| 4230 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
| 4231 | phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi); |
| 4232 | phi = if_then_else(y < 0.0f , 1.0f - phi , phi); |
| 4233 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
| 4234 | x = phi; |
| 4235 | } |
| 4236 | STAGE_GG(xy_to_radius, Ctx::None) { |
| 4237 | x = sqrt_(x*x + y*y); |
| 4238 | } |
| 4239 | |
| 4240 | // ~~~~~~ Compound stages ~~~~~~ // |
| 4241 | |
Mike Klein | b11ab57 | 2018-10-24 06:42:14 -0400 | [diff] [blame] | 4242 | STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4243 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
| 4244 | |
| 4245 | load_8888_(ptr, tail, &dr,&dg,&db,&da); |
| 4246 | r = r + div255( dr*inv(a) ); |
| 4247 | g = g + div255( dg*inv(a) ); |
| 4248 | b = b + div255( db*inv(a) ); |
| 4249 | a = a + div255( da*inv(a) ); |
| 4250 | store_8888_(ptr, tail, r,g,b,a); |
| 4251 | } |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4252 | |
Jim Van Verth | d6245fc | 2022-02-15 16:30:59 -0500 | [diff] [blame] | 4253 | // ~~~~~~ skgpu::Swizzle stage ~~~~~~ // |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4254 | |
| 4255 | STAGE_PP(swizzle, void* ctx) { |
| 4256 | auto ir = r, ig = g, ib = b, ia = a; |
| 4257 | U16* o[] = {&r, &g, &b, &a}; |
| 4258 | char swiz[4]; |
| 4259 | memcpy(swiz, &ctx, sizeof(swiz)); |
| 4260 | |
| 4261 | for (int i = 0; i < 4; ++i) { |
| 4262 | switch (swiz[i]) { |
| 4263 | case 'r': *o[i] = ir; break; |
| 4264 | case 'g': *o[i] = ig; break; |
| 4265 | case 'b': *o[i] = ib; break; |
| 4266 | case 'a': *o[i] = ia; break; |
Brian Salomon | f30b1c1 | 2019-06-20 12:25:02 -0400 | [diff] [blame] | 4267 | case '0': *o[i] = U16(0); break; |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4268 | case '1': *o[i] = U16(255); break; |
| 4269 | default: break; |
| 4270 | } |
| 4271 | } |
| 4272 | } |
| 4273 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4274 | // Now we'll add null stand-ins for stages we haven't implemented in lowp. |
| 4275 | // If a pipeline uses these stages, it'll boot it out of lowp into highp. |
// NOT_IMPLEMENTED(st) declares a file-local function pointer named `st`, set to nullptr.
Mike Klein | 8b0f9d1 | 2019-01-03 11:26:57 -0500 | [diff] [blame] | 4276 | #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr; |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4277 | NOT_IMPLEMENTED(callback) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4278 | NOT_IMPLEMENTED(unbounded_set_rgb) |
| 4279 | NOT_IMPLEMENTED(unbounded_uniform_color) |
| 4280 | NOT_IMPLEMENTED(unpremul) |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 4281 | NOT_IMPLEMENTED(dither) |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 4282 | NOT_IMPLEMENTED(load_16161616) |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 4283 | NOT_IMPLEMENTED(load_16161616_dst) |
Brian Salomon | d608e22 | 2019-06-12 17:42:58 -0400 | [diff] [blame] | 4284 | NOT_IMPLEMENTED(store_16161616) |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 4285 | NOT_IMPLEMENTED(gather_16161616) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4286 | NOT_IMPLEMENTED(load_a16) |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 4287 | NOT_IMPLEMENTED(load_a16_dst) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4288 | NOT_IMPLEMENTED(store_a16) |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 4289 | NOT_IMPLEMENTED(gather_a16) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4290 | NOT_IMPLEMENTED(load_rg1616) |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 4291 | NOT_IMPLEMENTED(load_rg1616_dst) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4292 | NOT_IMPLEMENTED(store_rg1616) |
Robert Phillips | 429f0d3 | 2019-09-11 17:03:28 -0400 | [diff] [blame] | 4293 | NOT_IMPLEMENTED(gather_rg1616) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4294 | NOT_IMPLEMENTED(load_f16) |
| 4295 | NOT_IMPLEMENTED(load_f16_dst) |
| 4296 | NOT_IMPLEMENTED(store_f16) |
| 4297 | NOT_IMPLEMENTED(gather_f16) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4298 | NOT_IMPLEMENTED(load_af16) |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 4299 | NOT_IMPLEMENTED(load_af16_dst) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4300 | NOT_IMPLEMENTED(store_af16) |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 4301 | NOT_IMPLEMENTED(gather_af16) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4302 | NOT_IMPLEMENTED(load_rgf16) |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 4303 | NOT_IMPLEMENTED(load_rgf16_dst) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4304 | NOT_IMPLEMENTED(store_rgf16) |
Robert Phillips | 17a3a0b | 2019-09-18 13:56:54 -0400 | [diff] [blame] | 4305 | NOT_IMPLEMENTED(gather_rgf16) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4306 | NOT_IMPLEMENTED(load_f32) |
| 4307 | NOT_IMPLEMENTED(load_f32_dst) |
| 4308 | NOT_IMPLEMENTED(store_f32) |
| 4309 | NOT_IMPLEMENTED(gather_f32) |
Brian Salomon | 217522c | 2019-06-11 15:55:30 -0400 | [diff] [blame] | 4310 | NOT_IMPLEMENTED(load_rgf32) |
| 4311 | NOT_IMPLEMENTED(store_rgf32) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4312 | NOT_IMPLEMENTED(load_1010102) |
| 4313 | NOT_IMPLEMENTED(load_1010102_dst) |
| 4314 | NOT_IMPLEMENTED(store_1010102) |
| 4315 | NOT_IMPLEMENTED(gather_1010102) |
| 4316 | NOT_IMPLEMENTED(store_u16_be) |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 4317 | NOT_IMPLEMENTED(byte_tables) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4318 | NOT_IMPLEMENTED(colorburn) |
| 4319 | NOT_IMPLEMENTED(colordodge) |
| 4320 | NOT_IMPLEMENTED(softlight) |
| 4321 | NOT_IMPLEMENTED(hue) |
| 4322 | NOT_IMPLEMENTED(saturation) |
| 4323 | NOT_IMPLEMENTED(color) |
| 4324 | NOT_IMPLEMENTED(luminosity) |
| 4325 | NOT_IMPLEMENTED(matrix_3x3) |
| 4326 | NOT_IMPLEMENTED(matrix_3x4) |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 4327 | NOT_IMPLEMENTED(matrix_4x5) |
| 4328 | NOT_IMPLEMENTED(matrix_4x3) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4329 | NOT_IMPLEMENTED(parametric) |
Mike Klein | 1ce03a6 | 2019-04-23 08:00:35 -0500 | [diff] [blame] | 4330 | NOT_IMPLEMENTED(gamma_) |
Brian Osman | 11e6aa8 | 2019-10-16 13:58:42 -0400 | [diff] [blame] | 4331 | NOT_IMPLEMENTED(PQish) |
| 4332 | NOT_IMPLEMENTED(HLGish) |
| 4333 | NOT_IMPLEMENTED(HLGinvish) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4334 | NOT_IMPLEMENTED(rgb_to_hsl) |
| 4335 | NOT_IMPLEMENTED(hsl_to_rgb) |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 4336 | NOT_IMPLEMENTED(gauss_a_to_rgba) |
| 4337 | NOT_IMPLEMENTED(mirror_x) |
| 4338 | NOT_IMPLEMENTED(repeat_x) |
| 4339 | NOT_IMPLEMENTED(mirror_y) |
| 4340 | NOT_IMPLEMENTED(repeat_y) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4341 | NOT_IMPLEMENTED(negate_x) |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 4342 | NOT_IMPLEMENTED(bilinear) |
// bilerp_clamp_8888 is only listed here when SK_SUPPORT_LEGACY_BILERP_HIGHP is defined.
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4343 | #if defined(SK_SUPPORT_LEGACY_BILERP_HIGHP) |
Herb Derby | 907c593 | 2021-09-27 21:09:55 +0000 | [diff] [blame] | 4344 | NOT_IMPLEMENTED(bilerp_clamp_8888) |
Herb Derby | 86eb628 | 2021-08-27 18:21:02 -0400 | [diff] [blame] | 4345 | #endif |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 4346 | NOT_IMPLEMENTED(bicubic) |
Mike Reed | 78eedba | 2019-07-31 16:39:15 -0400 | [diff] [blame] | 4347 | NOT_IMPLEMENTED(bicubic_clamp_8888) |
Mike Klein | 3d95597 | 2021-02-08 15:17:45 -0600 | [diff] [blame] | 4348 | NOT_IMPLEMENTED(bilinear_nx) |
| 4349 | NOT_IMPLEMENTED(bilinear_ny) |
| 4350 | NOT_IMPLEMENTED(bilinear_px) |
| 4351 | NOT_IMPLEMENTED(bilinear_py) |
| 4352 | NOT_IMPLEMENTED(bicubic_n3x) |
| 4353 | NOT_IMPLEMENTED(bicubic_n1x) |
| 4354 | NOT_IMPLEMENTED(bicubic_p1x) |
| 4355 | NOT_IMPLEMENTED(bicubic_p3x) |
| 4356 | NOT_IMPLEMENTED(bicubic_n3y) |
| 4357 | NOT_IMPLEMENTED(bicubic_n1y) |
| 4358 | NOT_IMPLEMENTED(bicubic_p1y) |
| 4359 | NOT_IMPLEMENTED(bicubic_p3y) |
| 4360 | NOT_IMPLEMENTED(save_xy) |
| 4361 | NOT_IMPLEMENTED(accumulate) |
Mike Klein | 05bf931 | 2018-12-19 10:05:03 -0500 | [diff] [blame] | 4362 | NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved) |
| 4363 | NOT_IMPLEMENTED(xy_to_2pt_conical_strip) |
| 4364 | NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle) |
| 4365 | NOT_IMPLEMENTED(xy_to_2pt_conical_smaller) |
| 4366 | NOT_IMPLEMENTED(xy_to_2pt_conical_greater) |
| 4367 | NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal) |
| 4368 | NOT_IMPLEMENTED(alter_2pt_conical_unswap) |
| 4369 | NOT_IMPLEMENTED(mask_2pt_conical_nan) |
| 4370 | NOT_IMPLEMENTED(mask_2pt_conical_degenerates) |
| 4371 | NOT_IMPLEMENTED(apply_vector_mask) |
// End of the NOT_IMPLEMENTED list: undefine the helper macro so it is not visible past this point.
| 4372 | #undef NOT_IMPLEMENTED |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4373 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4374 | #endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages |
| 4375 | } // namespace lowp |
| 4376 | |
| 4377 | } // namespace SK_OPTS_NS |
| 4378 | |
Mike Klein | b296d92 | 2021-01-14 09:24:46 -0600 | [diff] [blame] | 4379 | #undef SI |
| 4380 | |
Mike Klein | 1b9b7d5 | 2018-02-27 10:37:40 -0500 | [diff] [blame] | 4381 | #endif//SkRasterPipeline_opts_DEFINED |