| /* |
| * Copyright 2017 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| // This file is very similar to SkSplicer_stages.cpp, and you will want to read through that file |
| // first before trying to understand this one. We'll note only key differences here. |
| |
| #include "SkSplicer_shared.h" |
| #include <string.h> |
| |
| #if !defined(__clang__) |
| #error This file is not like the rest of Skia. It must be compiled with clang. |
| #endif |
| |
| #if defined(__aarch64__) |
| #include <arm_neon.h> |
| |
| // In this file, F is a vector of SkFixed15. |
| // See SkFixed15.h for notes on its various operations. |
| struct F { |
| using V = uint16_t __attribute__((ext_vector_type(8))); |
| |
| V vec; |
| |
| F(uint16x8_t v) : vec(v) {} |
| operator V() const { return vec; } |
| |
| F() = default; |
| F(uint16_t v) : vec(v) {} |
| |
| F operator+(F o) const { return vqaddq_u16(vec, o.vec); } |
| F operator-(F o) const { return vqsubq_u16(vec, o.vec); } |
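    // SkFixed15 multiply is (a*b + (1<<14)) >> 15, with rounding.  A rough scalar sketch:
    //     uint16_t mul(uint16_t a, uint16_t b) { return ((uint32_t)a*b + (1<<14)) >> 15; }
    // vqrdmulhq_s16 computes exactly that, except as int16 it saturates 0x8000 * 0x8000
    // (1.0 * 1.0) to 0x7fff, and the product goes negative when exactly one input is 0x8000.
    // vabs fixes the sign, and vsra adds back the bit lost to saturation: in SkFixed15's
    // range [0, 0x8000], vec & o.vec has bit 15 set only when both inputs are 0x8000.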
| F operator*(F o) const { |
| return vsraq_n_u16(vabsq_s16(vqrdmulhq_s16(vec, o.vec)), |
| vandq_s16(vec, o.vec), 15); |
| } |
| F operator>>(int k) const { return vec >> k; } |
| F operator<<(int k) const { return vec << k; } |
| }; |
| static F min(F a, F b) { return vminq_u16(a,b); } |
| static F max(F a, F b) { return vmaxq_u16(a,b); } |
| |
| #elif defined(__ARM_NEON__) |
| #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__) |
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
| #endif |
| #include <arm_neon.h> |
| |
| struct F { |
| using V = uint16_t __attribute__((ext_vector_type(4))); |
| |
| V vec; |
| |
| F(uint16x4_t v) : vec(v) {} |
| operator V() const { return vec; } |
| |
| F() = default; |
| F(uint16_t v) : vec(v) {} |
| |
| F operator+(F o) const { return vqadd_u16(vec, o.vec); } |
| F operator-(F o) const { return vqsub_u16(vec, o.vec); } |
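    // Same SkFixed15 multiply trick as the aarch64 operator* above, on 4 lanes.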
| F operator*(F o) const { |
| return vsra_n_u16(vabs_s16(vqrdmulh_s16(vec, o.vec)), |
| vand_s16(vec, o.vec), 15); |
| } |
| F operator>>(int k) const { return vec >> k; } |
| F operator<<(int k) const { return vec << k; } |
| }; |
| static F min(F a, F b) { return vmin_u16(a,b); } |
| static F max(F a, F b) { return vmax_u16(a,b); } |
| |
| #else |
| #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__) |
| #error On x86, compile with -mavx2 -mfma -mf16c. |
| #endif |
| #include <immintrin.h> |
| |
| struct F { |
| using V = uint16_t __attribute__((ext_vector_type(16))); |
| |
| V vec; |
| |
    F(__m256i v) : vec(v) {}
| operator V() const { return vec; } |
| |
| F() = default; |
| F(uint16_t v) : vec(v) {} |
| |
| F operator+(F o) const { return _mm256_adds_epu16(vec, o.vec); } |
| F operator-(F o) const { return _mm256_subs_epu16(vec, o.vec); } |
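    // _mm256_mulhrs_epi16 is the same (a*b + (1<<14)) >> 15 rounding multiply, but it wraps
    // 0x8000 * 0x8000 to -0x8000 instead of saturating, so abs alone is enough here:
    // |-0x8000| stays 0x8000, exactly the answer we want.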
| F operator*(F o) const { return _mm256_abs_epi16(_mm256_mulhrs_epi16(vec, o.vec)); } |
| F operator>>(int k) const { return vec >> k; } |
| F operator<<(int k) const { return vec << k; } |
| }; |
| static F min(F a, F b) { return _mm256_min_epu16(a,b); } |
| static F max(F a, F b) { return _mm256_max_epu16(a,b); } |
| #endif |
| |
| // No platform actually supports FMA for SkFixed15. |
// This fma() function just makes it easier to port stages to lowp.
| static F fma(F f, F m, F a) { return f*m+a; } |
| |
| #if defined(__ARM_NEON__) |
| #define C extern "C" __attribute__((pcs("aapcs-vfp"))) |
| #else |
| #define C extern "C" |
| #endif |
| |
| // We use a set of constants suitable for SkFixed15 math. |
| using K = const SkSplicer_constants_lowp; |
| using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F); |
| |
| // The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in |
| // registers. This shouldn't affect performance or how you write STAGEs in any way. |
| C void done(size_t, size_t, void*, K*, F::V,F::V,F::V,F::V, F::V,F::V,F::V,F::V); |
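// (done() is deliberately left undefined in this file: every stage tail-calls it, and,
// roughly speaking, the splicer treats that call as the seam where the next piece of the
// pipeline gets spliced in.)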
| |
| #define STAGE(name) \ |
| static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \ |
| F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
| C void name##_lowp(size_t x, size_t limit, void* ctx, K* k, \ |
| F::V R, F::V G, F::V B, F::V A, \ |
| F::V DR, F::V DG, F::V DB, F::V DA) { \ |
| F r = R, g = G, b = B, a = A, dr = DR, dg = DG, db = DB, da = DA; \ |
| name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \ |
| done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \ |
| } \ |
| static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \ |
| F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
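
// So STAGE(clear), for example, expands to an extern "C" clear_lowp() that unpacks the raw
// vector arguments into F's, runs clear_k(), then hands everything off to done(), with the
// braces after the macro becoming the body of
//     static void clear_k(size_t& x, size_t limit, void* ctx, K* k,
//                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) {
//         r = g = b = a = 0;
//     }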
| |
| STAGE(inc_x) { |
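    // One vector's worth of pixels: 8 on aarch64, 4 on ARMv7, 16 with AVX2.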
| x += sizeof(F) / sizeof(uint16_t); |
| } |
| |
| STAGE(clear) { |
| r = g = b = a = 0; |
| } |
| |
| STAGE(plus_) { |
| r = r + dr; |
| g = g + dg; |
| b = b + db; |
| a = a + da; |
| } |
| |
| STAGE(srcover) { |
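    // k->_1 is 1.0 in SkFixed15 (0x8000), making A the usual (1-a) term.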
| auto A = F(k->_1) - a; |
| r = fma(dr, A, r); |
| g = fma(dg, A, g); |
| b = fma(db, A, b); |
| a = fma(da, A, a); |
| } |
| STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); } |
| |
| STAGE(clamp_1) { |
| r = min(r, k->_1); |
| g = min(g, k->_1); |
| b = min(b, k->_1); |
| a = min(a, k->_1); |
| } |
| |
| STAGE(clamp_a) { |
| a = min(a, k->_1); |
| r = min(r, a); |
| g = min(g, a); |
| b = min(b, a); |
| } |
| |
| STAGE(swap) { |
| auto swap = [](F& v, F& dv) { |
| auto tmp = v; |
| v = dv; |
| dv = tmp; |
| }; |
| swap(r, dr); |
| swap(g, dg); |
| swap(b, db); |
| swap(a, da); |
| } |
| STAGE(move_src_dst) { |
| dr = r; |
| dg = g; |
| db = b; |
| da = a; |
| } |
| STAGE(move_dst_src) { |
| r = dr; |
| g = dg; |
| b = db; |
| a = da; |
| } |
| |
| STAGE(premul) { |
| r = r * a; |
| g = g * a; |
| b = b * a; |
| } |
| |
| STAGE(load_8888) { |
| auto ptr = *(const uint32_t**)ctx + x; |
| |
| #if defined(__aarch64__) |
| auto to_fixed15 = [](uint8x8_t u8) { |
        // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8  (see SkFixed15.h)
| // |
| // Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0 |
| // and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions! |
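        // E.g. u8 = 255: u16 = 255*128 = 32640, and 32640 + ((32640+128)>>8) = 32768, i.e. 1.0.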
| auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128 |
| return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding |
| }; |
| |
| uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr); |
| r = to_fixed15(rgba.val[0]); |
| g = to_fixed15(rgba.val[1]); |
| b = to_fixed15(rgba.val[2]); |
| a = to_fixed15(rgba.val[3]); |
| |
| #elif defined(__ARM_NEON__) |
| auto to_fixed15 = [](uint8x8_t u8) { |
| // Same as aarch64, but only keeping the bottom 4 lanes. |
| auto u16 = vshll_n_u8(u8, 7); |
| return vget_low_u16(vrsraq_n_u16(u16, u16, 8)); |
| }; |
| |
| // I can't get quite the code generation I want using vld4_lane_u8(), |
| // so we're going to drop into assembly to do the loads. :/ |
| |
| uint8x8_t R,G,B,A; |
| asm("vld4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n" |
| "vld4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n" |
| "vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n" |
| "vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n" |
| : "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A)); |
| r = to_fixed15(R); |
| g = to_fixed15(G); |
| b = to_fixed15(B); |
| a = to_fixed15(A); |
| |
| #else |
| auto to_fixed15 = [k](__m128i u8) { |
| F u16 = _mm256_cvtepu8_epi16(u8); |
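        // Canonical SkFixed15 math; e.g. u8 = 255 gives 32640 + 127 + 1 = 32768, i.e. 1.0.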
| return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8); |
| }; |
| |
| // TODO: shorter, more confusing, faster with 256-bit loads and shuffles |
| |
    // Load 16 interleaved pixels.
| auto _0123 = _mm_loadu_si128((const __m128i*)ptr + 0), |
| _4567 = _mm_loadu_si128((const __m128i*)ptr + 1), |
| _89AB = _mm_loadu_si128((const __m128i*)ptr + 2), |
| _CDEF = _mm_loadu_si128((const __m128i*)ptr + 3); |
| |
| // We've got an awful lot of unpacking to do to transpose this... |
| auto _0415 = _mm_unpacklo_epi8(_0123, _4567), // r04 g04 b04 a04 r15 g15 b15 a15 |
| _2637 = _mm_unpackhi_epi8(_0123, _4567), // r26 g26 b26 a26 r37 g37 b37 a37 |
| _8C9D = _mm_unpacklo_epi8(_89AB, _CDEF), |
| _AEBF = _mm_unpackhi_epi8(_89AB, _CDEF); |
| |
| auto _0246 = _mm_unpacklo_epi8(_0415, _2637), // r0246 g0246 b0246 a0246 |
| _1357 = _mm_unpackhi_epi8(_0415, _2637), // r1357 g1357 b1357 a1357 |
| _8ACE = _mm_unpacklo_epi8(_8C9D, _AEBF), |
| _9BDF = _mm_unpackhi_epi8(_8C9D, _AEBF); |
| |
| auto rg_01234567 = _mm_unpacklo_epi8(_0246, _1357), // r01234567 g01234567 |
| ba_01234567 = _mm_unpackhi_epi8(_0246, _1357), // b01234567 a01234567 |
| rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF), // r89ABCDEF g89ABCDEF |
| ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF); // b89ABCDEF a89ABCDEF |
| |
| r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF)); |
| g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF)); |
| b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF)); |
| a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF)); |
| #endif |
| } |
| |
| STAGE(store_8888) { |
| auto ptr = *(uint32_t**)ctx + x; |
| |
| #if defined(__aarch64__) |
| auto from_fixed15 = [](F v) { |
| // The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7. |
| // But what's really most important is that all bytes round trip. |
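        // E.g. 1.0 (32768): the canonical math gives (32768-128)>>7 = 255, and the saturating
        // narrow below gives 32768>>7 = 256, clamped to 255.  0 likewise stays 0.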
| |
| // We can do this in NEON in one instruction, a saturating narrowing right shift: |
| return vqshrn_n_u16(v, 7); |
| }; |
| |
| uint8x8x4_t rgba = {{ |
| from_fixed15(r), |
| from_fixed15(g), |
| from_fixed15(b), |
| from_fixed15(a), |
| }}; |
| vst4_u8((uint8_t*)ptr, rgba); |
| #elif defined(__ARM_NEON__) |
| auto from_fixed15 = [](F v) { |
| // Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes. |
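        // The pad half is garbage, but the stores below only ever touch lanes [0..3].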
| F whatever; |
| return vqshrn_n_u16(vcombine_u8(v, whatever), 7); |
| }; |
| |
| // As in load_8888, I can't get quite the ideal code generation using vst4_lane_u8(). |
| asm("vst4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n" |
| "vst4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n" |
| "vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n" |
| "vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n" |
| : "+r"(ptr) |
| : "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a)) |
| : "memory"); |
| |
| #else |
| auto from_fixed15 = [](F v) { |
| // See the note in aarch64's from_fixed15(). The same roundtrip goal applies here. |
| // Here we take a different approach: (v saturated+ v) >> 8. |
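        // E.g. 1.0 (32768): 32768 saturated+ 32768 = 65535, and 65535>>8 = 255.  0 stays 0.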
| v = (v+v) >> 8; |
| return _mm_packus_epi16(_mm256_extracti128_si256(v, 0), |
| _mm256_extracti128_si256(v, 1)); |
| }; |
| |
| auto R = from_fixed15(r), |
| G = from_fixed15(g), |
| B = from_fixed15(b), |
| A = from_fixed15(a); |
| |
| auto rg_01234567 = _mm_unpacklo_epi8(R,G), // rg0 rg1 rg2 ... rg7 |
| rg_89ABCDEF = _mm_unpackhi_epi8(R,G), // rg8 rg9 rgA ... rgF |
| ba_01234567 = _mm_unpacklo_epi8(B,A), |
| ba_89ABCDEF = _mm_unpackhi_epi8(B,A); |
| _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi16(rg_01234567, ba_01234567)); |
| _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi16(rg_01234567, ba_01234567)); |
| _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi16(rg_89ABCDEF, ba_89ABCDEF)); |
| _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi16(rg_89ABCDEF, ba_89ABCDEF)); |
| #endif |
| } |