ARM Skia NEON patches - 21 - new NEON S32_D565_Opaque
BlitRow565: NEON version of S32_D565_Opaque
Here's a new NEON implementation of S32_D565_Opaque. It
dramatically improves the speed compared to S32A_D565_Opaque.
Here are the benchmark results (speedup vs. existing NEON):
+-------+-----------+------------+
| count | Cortex-A9 | Cortex-A15 |
+-------+-----------+------------+
|     1 |     +130% |      +139% |
+-------+-----------+------------+
|     2 |    +65.2% |       +51% |
+-------+-----------+------------+
|     4 |    -25.5% |     +10.2% |
+-------+-----------+------------+
|     8 |    +63.8% |     +32.1% |
+-------+-----------+------------+
|    16 |     +110% |     +49.2% |
+-------+-----------+------------+
|    64 |     +153% |    +123.5% |
+-------+-----------+------------+
|   256 |     +151% |    +144.7% |
+-------+-----------+------------+
|  1024 |     +272% |    +157.2% |
+-------+-----------+------------+
Signed-off-by: Kévin PETIT <kevin.petit@arm.com>
BUG=
R=djsollen@google.com, mtklein@google.com
Author: kevin.petit.arm@gmail.com
Review URL: https://chromiumcodereview.appspot.com/22351006
git-svn-id: http://skia.googlecode.com/svn/trunk@11415 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 705ee99..ffa0a8b 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -15,9 +15,45 @@
#include "SkUtils.h"
#include "SkCachePreload_arm.h"
-
+#include "SkColor_opts_neon.h"
#include <arm_neon.h>
+void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src, int count,
+ U8CPU alpha, int /*x*/, int /*y*/) {
+ SkASSERT(255 == alpha); // alpha is only checked here; the conversion never reads it
+ // Converts count 32-bit src pixels into 16-bit 565 pixels written to dst.
+ while (count >= 8) { // vector path: 8 pixels per iteration
+ uint8x8x4_t vsrc;
+ uint16x8_t vdst;
+
+ // Load 8 pixels, de-interleaved into one 8-lane vector per channel byte
+ vsrc = vld4_u8((uint8_t*)src);
+
+ // Convert src to 565: start with each red byte in the top half of a 16-bit lane
+ vdst = vshll_n_u8(vsrc.val[NEON_R], 8);
+ vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5); // keep top 5 bits (red), insert green below them
+ vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6); // keep top 11 bits (red+green), insert blue below them
+
+ // Store the 8 converted 16-bit pixels
+ vst1q_u16(dst, vdst);
+
+ // Prepare next iteration
+ dst += 8;
+ src += 8;
+ count -= 8;
+ };
+
+ // Leftovers: the remaining (fewer than 8) pixels, converted one at a time
+ while (count > 0) {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c); // NOTE(review): presumably validates c as a premultiplied color - confirm
+ *dst = SkPixel32ToPixel16_ToU16(c); // scalar 32-bit to 16-bit conversion helper (defined elsewhere)
+ dst++;
+ count--;
+ };
+}
+
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
@@ -1330,10 +1366,10 @@
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
// no dither
- // NOTE: For the two functions below, we don't have a special version
- // that assumes that each source pixel is opaque. But our S32A is
- // still faster than the default, so use it.
- S32A_D565_Opaque_neon, // really S32_D565_Opaque
+ // NOTE: For the S32_D565_Blend function below, we don't have a special
+ // version that assumes that each source pixel is opaque. But our
+ // S32A is still faster than the default, so use it.
+ S32_D565_Opaque_neon,
S32A_D565_Blend_neon, // really S32_D565_Blend
S32A_D565_Opaque_neon,
S32A_D565_Blend_neon,
diff --git a/src/opts/SkColor_opts_neon.h b/src/opts/SkColor_opts_neon.h
new file mode 100644
index 0000000..adc2641
--- /dev/null
+++ b/src/opts/SkColor_opts_neon.h
@@ -0,0 +1,12 @@
+#ifndef SkColor_opts_neon_DEFINED
+#define SkColor_opts_neon_DEFINED
+
+#include "SkTypes.h"
+// Byte position of each channel in a 32-bit pixel (bit shift / 8); used to index the val[] of a vld4_u8 load.
+#define NEON_A (SK_A32_SHIFT / 8)
+#define NEON_R (SK_R32_SHIFT / 8)
+#define NEON_G (SK_G32_SHIFT / 8)
+#define NEON_B (SK_B32_SHIFT / 8)
+
+#endif /* #ifndef SkColor_opts_neon_DEFINED */
+