ARM Skia NEON patches - 16/17 - Blitmask

Blitmask: NEON optimised version of the D32_A8 functions

    Here are the microbenchmark results I got for the D32_A8
    functions:

    Cortex-A9:
    ==========
    +-------+--------+--------+--------+
    | count | Black  | Opaque | Color  |
    +-------+--------+--------+--------+
    | 1     | -14%   | -39,5% | -37,5% |
    +-------+--------+--------+--------+
    | 2     | -3%    | -29,9% | -25%   |
    +-------+--------+--------+--------+
    | 4     | -11,3% | -22%   | -14,5% |
    +-------+--------+--------+--------+
    | 8     | +128%  | +66,6% | +105%  |
    +-------+--------+--------+--------+
    | 16    | +159%  | +102%  | +149%  |
    +-------+--------+--------+--------+
    | 64    | +189%  | +136%  | +189%  |
    +-------+--------+--------+--------+
    | 256   | +126%  | +102%  | +149%  |
    +-------+--------+--------+--------+
    | 1024  | +67,5% | +81,4% | +123%  |
    +-------+--------+--------+--------+

    Cortex-A15:
    ===========
    +-------+--------+--------+--------+
    | count | Black  | Opaque | Color  |
    +-------+--------+--------+--------+
    | 1     | -24%   | -46,5% | -37,5% |
    +-------+--------+--------+--------+
    | 2     | -18,5% | -35,5% | -28%   |
    +-------+--------+--------+--------+
    | 4     | -5,2%  | -17,5% | -15,5% |
    +-------+--------+--------+--------+
    | 8     | +72%   | +65,8% | +84,7% |
    +-------+--------+--------+--------+
    | 16    | +168%  | +117%  | +149%  |
    +-------+--------+--------+--------+
    | 64    | +165%  | +110%  | +145%  |
    +-------+--------+--------+--------+
    | 256   | +106%  | +99,6% | +141%  |
    +-------+--------+--------+--------+
    | 1024  | +93,7% | +94,7% | +130%  |
    +-------+--------+--------+--------+

    Blitmask: add NEON optimised PlatformBlitRowProcs16

    Here are the microbenchmark results (speedup vs. C code):

    +-------+-----------------+-----------------+
    |       |    Cortex-A9    |   Cortex-A15    |
    | count +--------+--------+--------+--------+
    |       | Blend  | Opaque | Blend  | Opaque |
    +-------+--------+--------+--------+--------+
    | 1     | -19,2% | -36,7% | -33,6% | -44,7% |
    +-------+--------+--------+--------+--------+
    | 2     | -12,6% | -27,8% | -39%   | -48%   |
    +-------+--------+--------+--------+--------+
    | 4     | -11,5% | -21,6% | -37,7% | -44,3% |
    +-------+--------+--------+--------+--------+
    | 8     | +141%  | +59,7% | +123%  | +48,7% |
    +-------+--------+--------+--------+--------+
    | 16    | +213%  | +119%  | +214%  | +121%  |
    +-------+--------+--------+--------+--------+
    | 64    | +212%  | +105%  | +242%  | +167%  |
    +-------+--------+--------+--------+--------+
    | 256   | +289%  | +167%  | +249%  | +207%  |
    +-------+--------+--------+--------+--------+
    | 1024  | +273%  | +169%  | +146%  | +220%  |
    +-------+--------+--------+--------+--------+

    Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=
R=djsollen@google.com, mtklein@google.com, reed@google.com

Author: kevin.petit.arm@gmail.com

Review URL: https://codereview.chromium.org/23719002

git-svn-id: http://skia.googlecode.com/svn/trunk@12420 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/gyp/opts.gyp b/gyp/opts.gyp
index 01bcdde..bf93926 100644
--- a/gyp/opts.gyp
+++ b/gyp/opts.gyp
@@ -177,6 +177,7 @@
         '../src/opts/SkBitmapProcState_matrixProcs_neon.cpp',
         '../src/opts/SkBitmapProcState_matrix_clamp_neon.h',
         '../src/opts/SkBitmapProcState_matrix_repeat_neon.h',
+        '../src/opts/SkBlitMask_opts_arm_neon.cpp',
         '../src/opts/SkBlitRow_opts_arm_neon.cpp',
         '../src/opts/SkMorphology_opts_neon.cpp',
         '../src/opts/SkXfermode_opts_arm_neon.cpp',
diff --git a/src/opts/SkBlitMask_opts_arm.cpp b/src/opts/SkBlitMask_opts_arm.cpp
index 0ad0919..2bf7603 100644
--- a/src/opts/SkBlitMask_opts_arm.cpp
+++ b/src/opts/SkBlitMask_opts_arm.cpp
@@ -1,14 +1,39 @@
 
+#include "SkColor.h"
+#include "SkColorPriv.h"
 #include "SkBlitMask.h"
+#include "SkUtilsArm.h"
+#include "SkBlitMask_opts_arm_neon.h"
 
 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
                                                      SkMask::Format maskFormat,
                                                      SkColor color) {
+#if SK_ARM_NEON_IS_NONE
+    return NULL;
+#else
+#if SK_ARM_NEON_IS_DYNAMIC
+    if (!sk_cpu_arm_has_neon()) {
+        return NULL;
+    }
+#endif
+    if ((SkBitmap::kARGB_8888_Config == dstConfig) &&
+        (SkMask::kA8_Format == maskFormat)) {
+            return D32_A8_Factory_neon(color);
+    }
+#endif
+
+    // We don't need to handle the SkMask::kLCD16_Format case as the default
+    // LCD16 will call us through SkBlitMask::PlatformBlitRowProcs16()
+
     return NULL;
 }
 
 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
-    return NULL;
+    if (isOpaque) {
+        return SK_ARM_NEON_WRAP(SkBlitLCD16OpaqueRow);
+    } else {
+        return SK_ARM_NEON_WRAP(SkBlitLCD16Row);
+    }
 }
 
 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
diff --git a/src/opts/SkBlitMask_opts_arm_neon.cpp b/src/opts/SkBlitMask_opts_arm_neon.cpp
new file mode 100644
index 0000000..7db6fcb
--- /dev/null
+++ b/src/opts/SkBlitMask_opts_arm_neon.cpp
@@ -0,0 +1,255 @@
+
+#include "SkBlitMask.h"
+#include "SkColor_opts_neon.h"
+
+static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,
+                              const void* SK_RESTRICT maskPtr, size_t maskRB,
+                              SkColor, int width, int height) {
+    SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
+    const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
+
+    maskRB -= width;
+    dstRB -= (width << 2);
+    do {
+        int w = width;
+        while (w >= 8) {
+            uint8x8_t vmask = vld1_u8(mask);
+            uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask);
+            uint8x8x4_t vdevice = vld4_u8((uint8_t*)device);
+
+            vdevice = SkAlphaMulQ_neon8(vdevice, vscale);
+            vdevice.val[NEON_A] += vmask;
+
+            vst4_u8((uint8_t*)device, vdevice);
+
+            mask += 8;
+            device += 8;
+            w -= 8;
+        }
+        while (w-- > 0) {
+            unsigned aa = *mask++;
+            *device = (aa << SK_A32_SHIFT)
+                        + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
+            device += 1;
+        };
+        device = (uint32_t*)((char*)device + dstRB);
+        mask += maskRB;
+    } while (--height != 0);
+}
+
+template <bool isColor>
+static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
+                                     const void* SK_RESTRICT maskPtr, size_t maskRB,
+                                     SkColor color, int width, int height) {
+    SkPMColor pmc = SkPreMultiplyColor(color);
+    SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
+    const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
+    uint8x8x4_t vpmc;
+
+    maskRB -= width;
+    dstRB -= (width << 2);
+
+    if (width >= 8) {
+        vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc));
+        vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc));
+        vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc));
+        vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc));
+    }
+    do {
+        int w = width;
+        while (w >= 8) {
+            uint8x8_t vmask = vld1_u8(mask);
+            uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask);
+            if (isColor) {
+                vscale = vsubw_u8(vdupq_n_u16(256),
+                            SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256));
+            } else {
+                vscale = vsubw_u8(vdupq_n_u16(256), vmask);
+            }
+            uint8x8x4_t vdev = vld4_u8((uint8_t*)device);
+
+            vdev.val[NEON_A] =   SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)
+                               + SkAlphaMul_neon8(vdev.val[NEON_A], vscale);
+            vdev.val[NEON_R] =   SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256)
+                               + SkAlphaMul_neon8(vdev.val[NEON_R], vscale);
+            vdev.val[NEON_G] =   SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256)
+                               + SkAlphaMul_neon8(vdev.val[NEON_G], vscale);
+            vdev.val[NEON_B] =   SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256)
+                               + SkAlphaMul_neon8(vdev.val[NEON_B], vscale);
+
+            vst4_u8((uint8_t*)device, vdev);
+
+            mask += 8;
+            device += 8;
+            w -= 8;
+        }
+
+        while (w--) {
+            unsigned aa = *mask++;
+            if (isColor) {
+                *device = SkBlendARGB32(pmc, *device, aa);
+            } else {
+                *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
+                            + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
+            }
+            device += 1;
+        };
+
+        device = (uint32_t*)((char*)device + dstRB);
+        mask += maskRB;
+
+    } while (--height != 0);
+}
+
+static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB,
+                               const void* SK_RESTRICT maskPtr, size_t maskRB,
+                               SkColor color, int width, int height) {
+    D32_A8_Opaque_Color_neon<false>(dst, dstRB, maskPtr, maskRB, color, width, height);
+}
+
+static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
+                              const void* SK_RESTRICT maskPtr, size_t maskRB,
+                              SkColor color, int width, int height) {
+    D32_A8_Opaque_Color_neon<true>(dst, dstRB, maskPtr, maskRB, color, width, height);
+}
+
+SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) {
+    if (SK_ColorBLACK == color) {
+        return D32_A8_Black_neon;
+    } else if (0xFF == SkColorGetA(color)) {
+        return D32_A8_Opaque_neon;
+    } else {
+        return D32_A8_Color_neon;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
+                                        SkColor color, int width,
+                                        SkPMColor opaqueDst) {
+    int colR = SkColorGetR(color);
+    int colG = SkColorGetG(color);
+    int colB = SkColorGetB(color);
+
+    uint8x8_t vcolR, vcolG, vcolB;
+    uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;
+
+    if (width >= 8) {
+        vcolR = vdup_n_u8(colR);
+        vcolG = vdup_n_u8(colG);
+        vcolB = vdup_n_u8(colB);
+        vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
+        vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
+        vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
+        vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
+    }
+
+    while (width >= 8) {
+        uint8x8x4_t vdst;
+        uint16x8_t vmask;
+        uint16x8_t vmaskR, vmaskG, vmaskB;
+        uint8x8_t vsel_trans, vsel_opq;
+
+        vdst = vld4_u8((uint8_t*)dst);
+        vmask = vld1q_u16(src);
+
+        // Prepare compare masks
+        vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
+        vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
+
+        // Get all the color masks on 5 bits
+        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
+        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
+                             SK_B16_BITS + SK_R16_BITS + 1);
+        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
+
+        // Upscale to 0..32
+        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
+        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
+        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
+
+        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
+        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
+
+        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
+        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
+        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
+
+        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
+        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
+        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
+
+        vst4_u8((uint8_t*)dst, vdst);
+
+        dst += 8;
+        src += 8;
+        width -= 8;
+    }
+
+    // Leftovers
+    for (int i = 0; i < width; i++) {
+        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
+                                    opaqueDst);
+    }
+}
+
+void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
+                                   SkColor color, int width, SkPMColor) {
+    int colA = SkColorGetA(color);
+    int colR = SkColorGetR(color);
+    int colG = SkColorGetG(color);
+    int colB = SkColorGetB(color);
+
+    colA = SkAlpha255To256(colA);
+
+    uint8x8_t vcolR, vcolG, vcolB;
+    uint16x8_t vcolA;
+
+    if (width >= 8) {
+        vcolA = vdupq_n_u16(colA);
+        vcolR = vdup_n_u8(colR);
+        vcolG = vdup_n_u8(colG);
+        vcolB = vdup_n_u8(colB);
+    }
+
+    while (width >= 8) {
+        uint8x8x4_t vdst;
+        uint16x8_t vmask;
+        uint16x8_t vmaskR, vmaskG, vmaskB;
+
+        vdst = vld4_u8((uint8_t*)dst);
+        vmask = vld1q_u16(src);
+
+        // Get all the color masks on 5 bits
+        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
+        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
+                             SK_B16_BITS + SK_R16_BITS + 1);
+        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
+
+        // Upscale to 0..32
+        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
+        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
+        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
+
+        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
+        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
+        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
+
+        vdst.val[NEON_A] = vdup_n_u8(0xFF);
+        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
+        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
+        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
+
+        vst4_u8((uint8_t*)dst, vdst);
+
+        dst += 8;
+        src += 8;
+        width -= 8;
+    }
+
+    for (int i = 0; i < width; i++) {
+        dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
+    }
+}
+
diff --git a/src/opts/SkBlitMask_opts_arm_neon.h b/src/opts/SkBlitMask_opts_arm_neon.h
new file mode 100644
index 0000000..fdbce14
--- /dev/null
+++ b/src/opts/SkBlitMask_opts_arm_neon.h
@@ -0,0 +1,16 @@
+#ifndef SkBlitMask_opts_arm_neon_DEFINED
+#define SkBlitMask_opts_arm_neon_DEFINED
+
+#include "SkColor.h"
+#include "SkBlitMask.h"
+
+extern SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color);
+
+extern void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
+                                      SkColor color, int width,
+                                      SkPMColor opaqueDst);
+
+extern void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
+                                SkColor color, int width, SkPMColor);
+
+#endif // #ifndef SkBlitMask_opts_arm_neon_DEFINED
diff --git a/src/opts/SkColor_opts_neon.h b/src/opts/SkColor_opts_neon.h
index f812397..85752f5 100644
--- a/src/opts/SkColor_opts_neon.h
+++ b/src/opts/SkColor_opts_neon.h
@@ -2,6 +2,7 @@
 #define SkColor_opts_neon_DEFINED
 
 #include "SkTypes.h"
+#include "SkColorPriv.h"
 
 #include <arm_neon.h>
 
@@ -65,4 +66,20 @@
     return ret;
 }
 
+/* This function blends 8 pixels of the same channel in the exact same way as
+ * SkBlend32.
+ */
+static inline uint8x8_t SkBlend32_neon8(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
+    int16x8_t src_wide, dst_wide;
+
+    src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
+    dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));
+
+    src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);
+
+    dst_wide += vshrq_n_s16(src_wide, 5);
+
+    return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
+}
+
 #endif /* #ifndef SkColor_opts_neon_DEFINED */