Port S32A_opaque blit row to SkOpts.
This should be a pixel-for-pixel (i.e. bug-for-bug) port.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1820313002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1820313002
diff --git a/gyp/opts.gypi b/gyp/opts.gypi
index 3401a80..f3f61ef 100644
--- a/gyp/opts.gypi
+++ b/gyp/opts.gypi
@@ -49,7 +49,6 @@
'<(skia_src_path)/opts/SkOpts_ssse3.cpp',
],
'sse41_sources': [
- '<(skia_src_path)/opts/SkBlitRow_opts_SSE4.cpp',
'<(skia_src_path)/opts/SkOpts_sse41.cpp',
],
# These targets are empty, but XCode doesn't like that, so add an empty file to each.
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
index 75330ae..80c7242 100644
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@@ -52,35 +52,6 @@
}
}
-static void S32A_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha) {
- SkASSERT(255 == alpha);
- if (count > 0) {
-#ifdef UNROLL
- if (count & 1) {
- *dst = SkPMSrcOver(*(src++), *dst);
- dst += 1;
- count -= 1;
- }
-
- const SkPMColor* SK_RESTRICT srcEnd = src + count;
- while (src != srcEnd) {
- *dst = SkPMSrcOver(*(src++), *dst);
- dst += 1;
- *dst = SkPMSrcOver(*(src++), *dst);
- dst += 1;
- }
-#else
- do {
- *dst = SkPMSrcOver(*src, *dst);
- src += 1;
- dst += 1;
- } while (--count > 0);
-#endif
- }
-}
-
static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
@@ -115,7 +86,7 @@
static const SkBlitRow::Proc32 gDefault_Procs32[] = {
S32_Opaque_BlitRow32,
S32_Blend_BlitRow32,
- S32A_Opaque_BlitRow32,
+ nullptr,
S32A_Blend_BlitRow32
};
@@ -124,6 +95,11 @@
// just so we don't crash
flags &= kFlags32_Mask;
+ if (flags == 2) {
+ // S32A_Opaque_BlitRow32 has been ported to SkOpts, but not the others yet.
+ return SkOpts::blit_row_s32a_opaque;
+ }
+
SkBlitRow::Proc32 proc = PlatformProcs32(flags);
if (nullptr == proc) {
proc = gDefault_Procs32[flags];
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index e34b6d7..50659d4 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -63,6 +63,7 @@
#endif
namespace SkOpts {
+
// Define default function pointer values here...
// If our global compile options are set high enough, these defaults might even be
// CPU-specialized, e.g. a typical x86-64 machine might start with SSE2 defaults.
@@ -84,7 +85,8 @@
decltype(blit_mask_d32_a8) blit_mask_d32_a8 = sk_default::blit_mask_d32_a8;
- decltype(blit_row_color32) blit_row_color32 = sk_default::blit_row_color32;
+ decltype(blit_row_color32) blit_row_color32 = sk_default::blit_row_color32;
+ decltype(blit_row_s32a_opaque) blit_row_s32a_opaque = sk_default::blit_row_s32a_opaque;
decltype(matrix_translate) matrix_translate = sk_default::matrix_translate;
decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 42c47a5..3989047 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -39,6 +39,7 @@
extern void (*blit_mask_d32_a8)(SkPMColor*, size_t, const SkAlpha*, size_t, SkColor, int, int);
extern void (*blit_row_color32)(SkPMColor*, const SkPMColor*, int, SkPMColor);
+ extern void (*blit_row_s32a_opaque)(SkPMColor*, const SkPMColor*, int, U8CPU);
// This function is an optimized version of SkColorCubeFilter::filterSpan
extern void (*color_cube_filter_span)(const SkPMColor[],
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index f000f30..94d9ee9 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h
@@ -9,6 +9,12 @@
#define SkBlitRow_opts_DEFINED
#include "Sk4px.h"
+#include "SkColorPriv.h"
+#include "SkMSAN.h"
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+ #include "SkColor_opts_SSE2.h"
+#endif
namespace SK_OPTS_NS {
@@ -17,7 +23,8 @@
// and it's quite a bit faster than blend_perfect.
//
// blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one.
-static void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
+static inline
+void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
unsigned invA = 255 - SkGetPackedA32(color);
invA += invA >> 7;
SkASSERT(invA < 256); // We've should have already handled alpha == 0 externally.
@@ -30,6 +37,167 @@
});
}
+static inline
+void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
+ SkASSERT(alpha == 0xFF);
+ sk_msan_assert_initialized(src, src+len);
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+ while (len >= 16) {
+ // Load 16 source pixels.
+ auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
+ s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
+ s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
+ s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
+
+ const auto alphaMask = _mm_set1_epi32(0xFF000000);
+
+ auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+ if (_mm_testz_si128(ORed, alphaMask)) {
+ // All 16 source pixels are transparent. Nothing to do.
+ src += 16;
+ dst += 16;
+ len -= 16;
+ continue;
+ }
+
+ auto d0 = (__m128i*)(dst) + 0,
+ d1 = (__m128i*)(dst) + 1,
+ d2 = (__m128i*)(dst) + 2,
+ d3 = (__m128i*)(dst) + 3;
+
+ auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+ if (_mm_testc_si128(ANDed, alphaMask)) {
+ // All 16 source pixels are opaque. SrcOver becomes Src.
+ _mm_storeu_si128(d0, s0);
+ _mm_storeu_si128(d1, s1);
+ _mm_storeu_si128(d2, s2);
+ _mm_storeu_si128(d3, s3);
+ src += 16;
+ dst += 16;
+ len -= 16;
+ continue;
+ }
+
+ // TODO: This math is wrong.
+ // Do SrcOver.
+ _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
+ _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
+ _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
+ _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
+ src += 16;
+ dst += 16;
+ len -= 16;
+ }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+ while (len >= 16) {
+ // Load 16 source pixels.
+ auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
+ s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
+ s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
+ s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
+
+ const auto alphaMask = _mm_set1_epi32(0xFF000000);
+
+ auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
+ _mm_setzero_si128()))) {
+ // All 16 source pixels are transparent. Nothing to do.
+ src += 16;
+ dst += 16;
+ len -= 16;
+ continue;
+ }
+
+ auto d0 = (__m128i*)(dst) + 0,
+ d1 = (__m128i*)(dst) + 1,
+ d2 = (__m128i*)(dst) + 2,
+ d3 = (__m128i*)(dst) + 3;
+
+ auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
+ alphaMask))) {
+ // All 16 source pixels are opaque. SrcOver becomes Src.
+ _mm_storeu_si128(d0, s0);
+ _mm_storeu_si128(d1, s1);
+ _mm_storeu_si128(d2, s2);
+ _mm_storeu_si128(d3, s3);
+ src += 16;
+ dst += 16;
+ len -= 16;
+ continue;
+ }
+
+ // TODO: This math is wrong.
+ // Do SrcOver.
+ _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
+ _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
+ _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
+ _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
+
+ src += 16;
+ dst += 16;
+ len -= 16;
+ }
+
+#elif defined(SK_ARM_HAS_NEON)
+ while (len >= 4) {
+ if ((src[0] | src[1] | src[2] | src[3]) == 0x00000000) {
+            // All 4 source pixels are transparent.  Nothing to do.
+ src += 4;
+ dst += 4;
+ len -= 4;
+ continue;
+ }
+
+ if ((src[0] & src[1] & src[2] & src[3]) >= 0xFF000000) {
+            // All 4 source pixels are opaque.  SrcOver becomes Src.
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ src += 4;
+ dst += 4;
+ len -= 4;
+ continue;
+ }
+
+ // Load 4 source and destination pixels.
+ auto src0 = vreinterpret_u8_u32(vld1_u32(src+0)),
+ src2 = vreinterpret_u8_u32(vld1_u32(src+2)),
+ dst0 = vreinterpret_u8_u32(vld1_u32(dst+0)),
+ dst2 = vreinterpret_u8_u32(vld1_u32(dst+2));
+
+ // TODO: This math is wrong.
+ const uint8x8_t alphas = vcreate_u8(0x0707070703030303);
+ auto invSA0_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src0, alphas)),
+ invSA2_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src2, alphas));
+
+ auto dstInvSA0 = vmulq_u16(invSA0_w, vmovl_u8(dst0)),
+ dstInvSA2 = vmulq_u16(invSA2_w, vmovl_u8(dst2));
+
+ dst0 = vadd_u8(src0, vshrn_n_u16(dstInvSA0, 8));
+ dst2 = vadd_u8(src2, vshrn_n_u16(dstInvSA2, 8));
+
+ vst1_u32(dst+0, vreinterpret_u32_u8(dst0));
+ vst1_u32(dst+2, vreinterpret_u32_u8(dst2));
+
+ src += 4;
+ dst += 4;
+ len -= 4;
+ }
+#endif
+
+ while (len-- > 0) {
+ if (*src) {
+ *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
+ }
+ src++;
+ dst++;
+ }
+}
+
} // SK_OPTS_NS
#endif//SkBlitRow_opts_DEFINED
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 1929918..a6fbc25 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -67,61 +67,6 @@
}
}
-void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha) {
- sk_msan_assert_initialized(src, src+count);
-
- SkASSERT(alpha == 255);
- if (count <= 0) {
- return;
- }
-
- int count16 = count / 16;
- __m128i* dst4 = (__m128i*)dst;
- const __m128i* src4 = (const __m128i*)src;
-
- for (int i = 0; i < count16 * 4; i += 4) {
- // Load 16 source pixels.
- __m128i s0 = _mm_loadu_si128(src4+i+0),
- s1 = _mm_loadu_si128(src4+i+1),
- s2 = _mm_loadu_si128(src4+i+2),
- s3 = _mm_loadu_si128(src4+i+3);
-
- const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
- const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
- __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
- if (0xffff == _mm_movemask_epi8(cmp)) {
- // All 16 source pixels are fully transparent. There's nothing to do!
- continue;
- }
- const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
- cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
- if (0xffff == _mm_movemask_epi8(cmp)) {
- // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
- _mm_storeu_si128(dst4+i+0, s0);
- _mm_storeu_si128(dst4+i+1, s1);
- _mm_storeu_si128(dst4+i+2, s2);
- _mm_storeu_si128(dst4+i+3, s3);
- continue;
- }
- // The general slow case: do the blend for all 16 pixels.
- _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
- _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
- _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
- _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
- }
-
- // Wrap up the last <= 15 pixels.
- SkASSERT(count - (count16*16) <= 15);
- for (int i = count16*16; i < count; i++) {
- // This check is not really necessarily, but it prevents pointless autovectorization.
- if (src[i] & 0xFF000000) {
- dst[i] = SkPMSrcOver(src[i], dst[i]);
- }
- }
-}
-
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
index 560edf4..652ff6e 100644
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -14,10 +14,6 @@
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);
-void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha);
-
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp
deleted file mode 100644
index e5d8809..0000000
--- a/src/opts/SkBlitRow_opts_SSE4.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBlitRow_opts_SSE4.h"
-
-// Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.
-// The stubs should never be called, so we make them crash just to confirm that.
-#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
- sk_throw();
-}
-
-#else
-
-#include <smmintrin.h> // SSE4.1 intrinsics
-#include "SkColorPriv.h"
-#include "SkColor_opts_SSE2.h"
-#include "SkMSAN.h"
-
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count,
- U8CPU alpha) {
- sk_msan_assert_initialized(src, src+count);
-
- SkASSERT(alpha == 255);
- // As long as we can, we'll work on 16 pixel pairs at once.
- int count16 = count / 16;
- __m128i* dst4 = (__m128i*)dst;
- const __m128i* src4 = (const __m128i*)src;
-
- for (int i = 0; i < count16 * 4; i += 4) {
- // Load 16 source pixels.
- __m128i s0 = _mm_loadu_si128(src4+i+0),
- s1 = _mm_loadu_si128(src4+i+1),
- s2 = _mm_loadu_si128(src4+i+2),
- s3 = _mm_loadu_si128(src4+i+3);
-
- const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
- const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
- if (_mm_testz_si128(ORed, alphaMask)) {
- // All 16 source pixels are fully transparent. There's nothing to do!
- continue;
- }
- const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
- if (_mm_testc_si128(ANDed, alphaMask)) {
- // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
- _mm_storeu_si128(dst4+i+0, s0);
- _mm_storeu_si128(dst4+i+1, s1);
- _mm_storeu_si128(dst4+i+2, s2);
- _mm_storeu_si128(dst4+i+3, s3);
- continue;
- }
- // The general slow case: do the blend for all 16 pixels.
- _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
- _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
- _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
- _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
- }
-
- // Wrap up the last <= 15 pixels.
- for (int i = count16*16; i < count; i++) {
- // This check is not really necessarily, but it prevents pointless autovectorization.
- if (src[i] & 0xFF000000) {
- dst[i] = SkPMSrcOver(src[i], dst[i]);
- }
- }
-}
-
-#endif
diff --git a/src/opts/SkBlitRow_opts_SSE4.h b/src/opts/SkBlitRow_opts_SSE4.h
deleted file mode 100644
index 577ace6..0000000
--- a/src/opts/SkBlitRow_opts_SSE4.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkBlitRow_opts_SSE4_DEFINED
-#define SkBlitRow_opts_SSE4_DEFINED
-
-#include "SkBlitRow.h"
-
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
- const SkPMColor* SK_RESTRICT,
- int count,
- U8CPU alpha);
-#endif
-
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 95bd229..3cb5a92 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -871,282 +871,6 @@
}
}
-void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha) {
-
- SkASSERT(255 == alpha);
- if (count > 0) {
-
-
- uint8x8_t alpha_mask;
-
- static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
- alpha_mask = vld1_u8(alpha_mask_setup);
-
- /* do the NEON unrolled code */
-#define UNROLL 4
- while (count >= UNROLL) {
- uint8x8_t src_raw, dst_raw, dst_final;
- uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-
- /* The two prefetches below may make the code slighlty
- * slower for small values of count but are worth having
- * in the general case.
- */
- __builtin_prefetch(src+32);
- __builtin_prefetch(dst+32);
-
- /* get the source */
- src_raw = vreinterpret_u8_u32(vld1_u32(src));
-#if UNROLL > 2
- src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-#endif
-
- /* get and hold the dst too */
- dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-#if UNROLL > 2
- dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-#endif
-
- /* 1st and 2nd bits of the unrolling */
- {
- uint8x8_t dst_cooked;
- uint16x8_t dst_wide;
- uint8x8_t alpha_narrow;
- uint16x8_t alpha_wide;
-
- /* get the alphas spread out properly */
- alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
- alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
- /* spread the dest */
- dst_wide = vmovl_u8(dst_raw);
-
- /* alpha mul the dest */
- dst_wide = vmulq_u16 (dst_wide, alpha_wide);
- dst_cooked = vshrn_n_u16(dst_wide, 8);
-
- /* sum -- ignoring any byte lane overflows */
- dst_final = vadd_u8(src_raw, dst_cooked);
- }
-
-#if UNROLL > 2
- /* the 3rd and 4th bits of our unrolling */
- {
- uint8x8_t dst_cooked;
- uint16x8_t dst_wide;
- uint8x8_t alpha_narrow;
- uint16x8_t alpha_wide;
-
- alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
- alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
- /* spread the dest */
- dst_wide = vmovl_u8(dst_raw_2);
-
- /* alpha mul the dest */
- dst_wide = vmulq_u16 (dst_wide, alpha_wide);
- dst_cooked = vshrn_n_u16(dst_wide, 8);
-
- /* sum -- ignoring any byte lane overflows */
- dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
- }
-#endif
-
- vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-#if UNROLL > 2
- vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-#endif
-
- src += UNROLL;
- dst += UNROLL;
- count -= UNROLL;
- }
-#undef UNROLL
-
- /* do any residual iterations */
- while (--count >= 0) {
- *dst = SkPMSrcOver(*src, *dst);
- src += 1;
- dst += 1;
- }
- }
-}
-
-void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha) {
- SkASSERT(255 == alpha);
-
- if (count <= 0)
- return;
-
- /* Use these to check if src is transparent or opaque */
- const unsigned int ALPHA_OPAQ = 0xFF000000;
- const unsigned int ALPHA_TRANS = 0x00FFFFFF;
-
-#define UNROLL 4
- const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
- const SkPMColor* SK_RESTRICT src_temp = src;
-
- /* set up the NEON variables */
- uint8x8_t alpha_mask;
- static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
- alpha_mask = vld1_u8(alpha_mask_setup);
-
- uint8x8_t src_raw, dst_raw, dst_final;
- uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
- uint8x8_t dst_cooked;
- uint16x8_t dst_wide;
- uint8x8_t alpha_narrow;
- uint16x8_t alpha_wide;
-
- /* choose the first processing type */
- if( src >= src_end)
- goto TAIL;
- if(*src <= ALPHA_TRANS)
- goto ALPHA_0;
- if(*src >= ALPHA_OPAQ)
- goto ALPHA_255;
- /* fall-thru */
-
-ALPHA_1_TO_254:
- do {
-
- /* get the source */
- src_raw = vreinterpret_u8_u32(vld1_u32(src));
- src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-
- /* get and hold the dst too */
- dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
- dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-
-
- /* get the alphas spread out properly */
- alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
- /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
- /* we collapsed (255-a)+1 ... */
- alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
- /* spread the dest */
- dst_wide = vmovl_u8(dst_raw);
-
- /* alpha mul the dest */
- dst_wide = vmulq_u16 (dst_wide, alpha_wide);
- dst_cooked = vshrn_n_u16(dst_wide, 8);
-
- /* sum -- ignoring any byte lane overflows */
- dst_final = vadd_u8(src_raw, dst_cooked);
-
- alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
- /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
- /* we collapsed (255-a)+1 ... */
- alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
- /* spread the dest */
- dst_wide = vmovl_u8(dst_raw_2);
-
- /* alpha mul the dest */
- dst_wide = vmulq_u16 (dst_wide, alpha_wide);
- dst_cooked = vshrn_n_u16(dst_wide, 8);
-
- /* sum -- ignoring any byte lane overflows */
- dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-
- vst1_u32(dst, vreinterpret_u32_u8(dst_final));
- vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-
- src += UNROLL;
- dst += UNROLL;
-
- /* if 2 of the next pixels aren't between 1 and 254
- it might make sense to go to the optimized loops */
- if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
- break;
-
- } while(src < src_end);
-
- if (src >= src_end)
- goto TAIL;
-
- if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
- goto ALPHA_255;
-
- /*fall-thru*/
-
-ALPHA_0:
-
- /*In this state, we know the current alpha is 0 and
- we optimize for the next alpha also being zero. */
- src_temp = src; //so we don't have to increment dst every time
- do {
- if(*(++src) > ALPHA_TRANS)
- break;
- if(*(++src) > ALPHA_TRANS)
- break;
- if(*(++src) > ALPHA_TRANS)
- break;
- if(*(++src) > ALPHA_TRANS)
- break;
- } while(src < src_end);
-
- dst += (src - src_temp);
-
- /* no longer alpha 0, so determine where to go next. */
- if( src >= src_end)
- goto TAIL;
- if(*src >= ALPHA_OPAQ)
- goto ALPHA_255;
- else
- goto ALPHA_1_TO_254;
-
-ALPHA_255:
- while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
- dst[0]=src[0];
- dst[1]=src[1];
- dst[2]=src[2];
- dst[3]=src[3];
- src+=UNROLL;
- dst+=UNROLL;
- if(src >= src_end)
- goto TAIL;
- }
-
- //Handle remainder.
- if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
- if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
- if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
- }
- }
-
- if( src >= src_end)
- goto TAIL;
- if(*src <= ALPHA_TRANS)
- goto ALPHA_0;
- else
- goto ALPHA_1_TO_254;
-
-TAIL:
- /* do any residual iterations */
- src_end += UNROLL + 1; //goto the real end
- while(src != src_end) {
- if( *src != 0 ) {
- if( *src >= ALPHA_OPAQ ) {
- *dst = *src;
- }
- else {
- *dst = SkPMSrcOver(*src, *dst);
- }
- }
- src++;
- dst++;
- }
-
-#undef UNROLL
- return;
-}
-
/* Neon version of S32_Blend_BlitRow32()
* portable version is in src/core/SkBlitRow_D32.cpp
*/
@@ -1561,21 +1285,7 @@
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
nullptr, // S32_Opaque,
S32_Blend_BlitRow32_neon, // S32_Blend,
- /*
- * We have two choices for S32A_Opaque procs. The one reads the src alpha
- * value and attempts to optimize accordingly. The optimization is
- * sensitive to the source content and is not a win in all cases. For
- * example, if there are a lot of transitions between the alpha states,
- * the performance will almost certainly be worse. However, for many
- * common cases the performance is equivalent or better than the standard
- * case where we do not inspect the src alpha.
- */
-#if SK_A32_SHIFT == 24
- // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
- S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
-#else
- S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
-#endif
+ nullptr, // Ported to SkOpts
#ifdef SK_CPU_ARM32
S32A_Blend_BlitRow32_neon // S32A_Blend
#else
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index e96cf17..76f7dd5 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -36,7 +36,8 @@
blit_mask_d32_a8 = sk_neon::blit_mask_d32_a8;
- blit_row_color32 = sk_neon::blit_row_color32;
+ blit_row_color32 = sk_neon::blit_row_color32;
+ blit_row_s32a_opaque = sk_neon::blit_row_s32a_opaque;
color_cube_filter_span = sk_neon::color_cube_filter_span;
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index 7a76081..34b078c 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -9,10 +9,11 @@
#define SK_OPTS_NS sk_sse41
#include "SkBlurImageFilter_opts.h"
+#include "SkBlitRow_opts.h"
#ifndef SK_SUPPORT_LEGACY_X86_BLITS
-namespace sk_sse41 {
+namespace sk_sse41_new {
// An SSE register holding at most 64 bits of useful data in the low lanes.
struct m64i {
@@ -211,7 +212,7 @@
}
}
-} // namespace sk_sse41
+} // namespace sk_sse41_new
#endif
@@ -222,8 +223,9 @@
box_blur_yx = sk_sse41::box_blur_yx;
#ifndef SK_SUPPORT_LEGACY_X86_BLITS
- blit_row_color32 = sk_sse41::blit_row_color32;
- blit_mask_d32_a8 = sk_sse41::blit_mask_d32_a8;
+ blit_row_color32 = sk_sse41_new::blit_row_color32;
+ blit_mask_d32_a8 = sk_sse41_new::blit_mask_d32_a8;
#endif
+ blit_row_s32a_opaque = sk_sse41::blit_row_s32a_opaque;
}
}
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 9983eb5..0ee78c5 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -12,7 +12,6 @@
#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkBlitRow_opts_SSE2.h"
-#include "SkBlitRow_opts_SSE4.h"
#include "SkOncePtr.h"
#include "SkRTConf.h"
@@ -215,21 +214,11 @@
static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
nullptr, // S32_Opaque,
S32_Blend_BlitRow32_SSE2, // S32_Blend,
- S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque
- S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
-};
-
-static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
- nullptr, // S32_Opaque,
- S32_Blend_BlitRow32_SSE2, // S32_Blend,
- S32A_Opaque_BlitRow32_SSE4, // S32A_Opaque
+ nullptr, // Ported to SkOpts
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
- if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
- return platform_32_procs_SSE4[flags];
- } else
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return platform_32_procs_SSE2[flags];
} else {