/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This file is very similar to SkSplicer_stages.cpp, and you will want to read through that file
// first before trying to understand this one. We'll note only key differences here.

#include "SkSplicer_shared.h"
#include <string.h>

#if !defined(__clang__)
    #error This file is not like the rest of Skia. It must be compiled with clang.
#endif

#if defined(__aarch64__)
    #include <arm_neon.h>

    // In this file, F is a vector of SkFixed15.
    // See SkFixed15.h for notes on its various operations.
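    // (A reference note, assuming the conventions in SkFixed15.h: SkFixed15 is an unsigned
    // 16-bit fixed-point format for values in [0,1], with 1.0 represented as 0x8000 == 32768.
    // You can sanity-check that below: load_8888 maps byte 255 to 32768, and clamp_1 clamps
    // against k->_1.)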
    struct F {
        using V = uint16_t __attribute__((ext_vector_type(8)));

        V vec;

        F(uint16x8_t v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return vqaddq_u16(vec, o.vec); }
        F operator-(F o) const { return vqsubq_u16(vec, o.vec); }
        F operator*(F o) const {
            return vsraq_n_u16(vabsq_s16(vqrdmulhq_s16(vec, o.vec)),
                               vandq_s16(vec, o.vec), 15);
        }
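        // A sketch of why that multiply works (SkFixed15.h has the canonical story):
        // vqrdmulhq_s16 computes a rounded, saturating (a*b*2 + 0x8000) >> 16, i.e. a
        // rounded (a*b) >> 15, but it reads 0x8000 (our 1.0) as the int16 -32768.
        // vabsq_s16 fixes the sign, and the vsraq_n_u16 term adds back 1 exactly when
        // both inputs are 1.0 (the only time bit 15 is set in a&b): 0x8000 * 0x8000
        // saturates to 0x7fff, and (0x8000 & 0x8000) >> 15 == 1 brings us to 0x8000.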
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return vminq_u16(a,b); }
    static F max(F a, F b) { return vmaxq_u16(a,b); }

#elif defined(__ARM_NEON__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    struct F {
        using V = uint16_t __attribute__((ext_vector_type(4)));

        V vec;

        F(uint16x4_t v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return vqadd_u16(vec, o.vec); }
        F operator-(F o) const { return vqsub_u16(vec, o.vec); }
        F operator*(F o) const {
            return vsra_n_u16(vabs_s16(vqrdmulh_s16(vec, o.vec)),
                              vand_s16(vec, o.vec), 15);
        }
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return vmin_u16(a,b); }
    static F max(F a, F b) { return vmax_u16(a,b); }

#else
    #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
        #error On x86, compile with -mavx2 -mfma -mf16c.
    #endif
    #include <immintrin.h>

    struct F {
        using V = uint16_t __attribute__((ext_vector_type(16)));

        V vec;
        F(__m256i v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return _mm256_adds_epu16(vec, o.vec); }
        F operator-(F o) const { return _mm256_subs_epu16(vec, o.vec); }
        F operator*(F o) const { return _mm256_abs_epi16(_mm256_mulhrs_epi16(vec, o.vec)); }
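        // (No fixup term is needed here: by Intel's definition, _mm256_mulhrs_epi16 returns
        // 0x8000 when both inputs are 0x8000 -- the one case where its rounded
        // (a*b*2 + 0x8000) >> 16 wraps -- and _mm256_abs_epi16(0x8000) stays 0x8000,
        // which is exactly our 1.0 * 1.0 == 1.0.)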
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return _mm256_min_epu16(a,b); }
    static F max(F a, F b) { return _mm256_max_epu16(a,b); }
#endif

// No platform actually supports FMA for SkFixed15.
// This fma() helper just makes it easier to port stages to lowp.
static F fma(F f, F m, F a) { return f*m+a; }

#if defined(__ARM_NEON__)
    #define C extern "C" __attribute__((pcs("aapcs-vfp")))
#else
    #define C extern "C"
#endif

// We use a set of constants suitable for SkFixed15 math.
using K = const SkSplicer_constants_lowp;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);

// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
// registers. This shouldn't affect performance or how you write STAGEs in any way.
C void done(size_t, size_t, void*, K*, F::V,F::V,F::V,F::V, F::V,F::V,F::V,F::V);

#define STAGE(name)                                                           \
    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
    C void name##_lowp(size_t x, size_t limit, void* ctx, K* k,               \
                       F::V R, F::V G, F::V B, F::V A,                        \
                       F::V DR, F::V DG, F::V DB, F::V DA) {                  \
        F r = R, g = G, b = B, a = A, dr = DR, dg = DG, db = DB, da = DA;     \
        name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
        done    (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
    }                                                                         \
    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
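
// For illustration, STAGE(clear) mechanically expands to roughly:
//
//     static void clear_k(size_t& x, size_t limit, void* ctx, K* k,
//                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);
//     C void clear_lowp(size_t x, size_t limit, void* ctx, K* k, ...) {
//         ... copy the eight F::V arguments into F locals ...
//         clear_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);
//         done   (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);
//     }
//     static void clear_k(...)   // <- the body written below attaches here
//
// So name##_lowp is the spliced entry point, and each STAGE body is really name##_k.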

STAGE(inc_x) {
    x += sizeof(F) / sizeof(uint16_t);
}
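// (sizeof(F) / sizeof(uint16_t) is the number of pixels each stripe covers:
//  8 on aarch64, 4 on ARMv7, 16 with AVX2, matching the ext_vector_type widths above.)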

STAGE(clear) {
    r = g = b = a = 0;
}

STAGE(plus_) {
    r = r + dr;
    g = g + dg;
    b = b + db;
    a = a + da;
}

STAGE(srcover) {
    auto A = F(k->_1) - a;
    r = fma(dr, A, r);
    g = fma(dg, A, g);
    b = fma(db, A, b);
    a = fma(da, A, a);
}
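// dstover is just srcover with the roles of source and destination swapped,
// so we can reuse srcover_k with its color arguments exchanged.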
STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); }

STAGE(clamp_1) {
    r = min(r, k->_1);
    g = min(g, k->_1);
    b = min(b, k->_1);
    a = min(a, k->_1);
}

STAGE(clamp_a) {
    a = min(a, k->_1);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;

#if defined(__aarch64__)
    auto to_fixed15 = [](uint8x8_t u8) {
        // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 (see SkFixed15.h)
        //
        // Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
        // and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
        auto u16 = vshll_n_u8(u8, 7);     // u16 = u8*128
        return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
    };
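    // Spot check of the endpoints, using the math above: u8 == 255 gives
    // u16 = 255*128 = 32640, then 32640 + round(32640/256) = 32640 + 128 = 32768 (1.0);
    // u8 == 0 gives 0. So both ends land exactly on SkFixed15 0.0 and 1.0.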

    uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
    r = to_fixed15(rgba.val[0]);
    g = to_fixed15(rgba.val[1]);
    b = to_fixed15(rgba.val[2]);
    a = to_fixed15(rgba.val[3]);

#elif defined(__ARM_NEON__)
    auto to_fixed15 = [](uint8x8_t u8) {
        // Same as aarch64, but only keeping the bottom 4 lanes.
        auto u16 = vshll_n_u8(u8, 7);
        return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
    };

    // I can't get quite the code generation I want using vld4_lane_u8(),
    // so we're going to drop into assembly to do the loads. :/

    uint8x8_t R,G,B,A;
    asm("vld4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
        "vld4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
        "vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
        "vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
        : "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
    r = to_fixed15(R);
    g = to_fixed15(G);
    b = to_fixed15(B);
    a = to_fixed15(A);

#else
    auto to_fixed15 = [k](__m128i u8) {
        F u16 = _mm256_cvtepu8_epi16(u8);
        return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
    };
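    // Spot check, as on aarch64: u16 == 255 gives (255<<7) + (255>>1) + ((255+1)>>8)
    // == 32640 + 127 + 1 == 32768 (1.0), and u16 == 0 gives 0, so the endpoints are
    // exact here too.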

    // TODO: shorter, more confusing, faster with 256-bit loads and shuffles

    // Load 16 interleaved pixels.
    auto _0123 = _mm_loadu_si128((const __m128i*)ptr + 0),
         _4567 = _mm_loadu_si128((const __m128i*)ptr + 1),
         _89AB = _mm_loadu_si128((const __m128i*)ptr + 2),
         _CDEF = _mm_loadu_si128((const __m128i*)ptr + 3);

    // We've got an awful lot of unpacking to do to transpose this...
    auto _0415 = _mm_unpacklo_epi8(_0123, _4567),   // r04 g04 b04 a04 r15 g15 b15 a15
         _2637 = _mm_unpackhi_epi8(_0123, _4567),   // r26 g26 b26 a26 r37 g37 b37 a37
         _8C9D = _mm_unpacklo_epi8(_89AB, _CDEF),
         _AEBF = _mm_unpackhi_epi8(_89AB, _CDEF);

    auto _0246 = _mm_unpacklo_epi8(_0415, _2637),   // r0246 g0246 b0246 a0246
         _1357 = _mm_unpackhi_epi8(_0415, _2637),   // r1357 g1357 b1357 a1357
         _8ACE = _mm_unpacklo_epi8(_8C9D, _AEBF),
         _9BDF = _mm_unpackhi_epi8(_8C9D, _AEBF);

    auto rg_01234567 = _mm_unpacklo_epi8(_0246, _1357),  // r01234567 g01234567
         ba_01234567 = _mm_unpackhi_epi8(_0246, _1357),  // b01234567 a01234567
         rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF),  // r89ABCDEF g89ABCDEF
         ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF);  // b89ABCDEF a89ABCDEF

    r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
    g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
    b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
    a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
#endif
}

STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

#if defined(__aarch64__)
    auto from_fixed15 = [](F v) {
        // The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
        // But what's really most important is that all bytes round trip.

        // We can do this in NEON in one instruction, a saturating narrowing right shift:
        return vqshrn_n_u16(v, 7);
    };
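    // Round-trip spot check: 1.0 == 32768 narrows to min(32768>>7, 255) == 255, and the
    // canonical math agrees: (32768 - (32768>>8)) >> 7 == (32768 - 128) >> 7 == 255.
    // 0 maps to 0, and vqshrn's saturation keeps everything else in [0,255].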

    uint8x8x4_t rgba = {{
        from_fixed15(r),
        from_fixed15(g),
        from_fixed15(b),
        from_fixed15(a),
    }};
    vst4_u8((uint8_t*)ptr, rgba);
#elif defined(__ARM_NEON__)
    auto from_fixed15 = [](F v) {
        // Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
        F whatever;
        return vqshrn_n_u16(vcombine_u16(v, whatever), 7);
    };

    // As in load_8888, I can't get quite the ideal code generation using vst4_lane_u8().
    asm("vst4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
        "vst4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
        "vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
        "vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
        : "+r"(ptr)
        : "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
        : "memory");

#else
    auto from_fixed15 = [](F v) {
        // See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
        // Here we take a different approach: (v saturated+ v) >> 8.
        v = (v+v) >> 8;
        return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
                                _mm256_extracti128_si256(v, 1));
    };
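    // Spot check: F::operator+ saturates, so 1.0 == 32768 gives (32768 + 32768) -> 65535,
    // and 65535 >> 8 == 255; 0 stays 0. Below saturation, (v+v)>>8 == v>>7, which is close
    // enough to the canonical math that all 256 byte values still round trip.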

    auto R = from_fixed15(r),
         G = from_fixed15(g),
         B = from_fixed15(b),
         A = from_fixed15(a);

    auto rg_01234567 = _mm_unpacklo_epi8(R,G),  // rg0 rg1 rg2 ... rg7
         rg_89ABCDEF = _mm_unpackhi_epi8(R,G),  // rg8 rg9 rgA ... rgF
         ba_01234567 = _mm_unpacklo_epi8(B,A),
         ba_89ABCDEF = _mm_unpackhi_epi8(B,A);
    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi16(rg_01234567, ba_01234567));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi16(rg_01234567, ba_01234567));
    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi16(rg_89ABCDEF, ba_89ABCDEF));
    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi16(rg_89ABCDEF, ba_89ABCDEF));
#endif
}