Sk4px blit mask.

Local SKP nanobenching ranges SSE between 1.05x and 0.87x, much more heavily weighted toward <1.0x ratios (speedups).
I profiled the top five regressions (1.05x-1.01x) and they look like noise.  Will follow up after broad bot results.

NEON looks similar but less extreme than SSE changes, ranging between 1.02x and 0.95x, again mostly speedups in 0.99x-0.97x range.

The old code trifurcated into black, opaque-but-not-black, and general versions as a function of the constant src color.  I did not see a significant difference between general and opaque-but-not-black, and I don't think a black version would be faster using SIMD.  So we have here just one version of the code, the general version.

Somewhat fantastically, I see no pixel diffs on GMs or SKPs.

I will be following up with more CLs for the other procs called by SkBlitMask.
BUG=skia:

Review URL: https://codereview.chromium.org/1278253003
diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h
index 24a21c6..ffde1af 100644
--- a/src/core/Sk4px.h
+++ b/src/core/Sk4px.h
@@ -165,6 +165,34 @@
         }
     }
 
+    // As above, but with dst4' = fn(dst4, alpha4).
+    template <typename Fn, typename Dst>
+    static void MapDstAlpha(int n, Dst* dst, const SkAlpha* a, const Fn& fn) {
+        while (n > 0) {
+            if (n >= 8) {
+                Sk4px dst0 = fn(Load4(dst+0), Load4Alphas(a+0)),
+                      dst4 = fn(Load4(dst+4), Load4Alphas(a+4));
+                dst0.store4(dst+0);
+                dst4.store4(dst+4);
+                dst += 8; a += 8; n -= 8;
+                continue;  // Keep our stride at 8 pixels as long as possible.
+            }
+            SkASSERT(n <= 7);
+            if (n >= 4) {
+                fn(Load4(dst), Load4Alphas(a)).store4(dst);
+                dst += 4; a += 4; n -= 4;
+            }
+            if (n >= 2) {
+                fn(Load2(dst), Load2Alphas(a)).store2(dst);
+                dst += 2; a += 2; n -= 2;
+            }
+            if (n >= 1) {
+                fn(Load1(dst), DupAlpha(*a)).store1(dst);
+            }
+            break;
+        }
+    }
+
     // As above, but with dst4' = fn(dst4, src4, alpha4).
     template <typename Fn, typename Dst>
     static void MapDstSrcAlpha(int n, Dst* dst, const SkPMColor* src, const SkAlpha* a,
diff --git a/src/core/SkBlitMask.h b/src/core/SkBlitMask.h
index 3fc306f..d39c094 100644
--- a/src/core/SkBlitMask.h
+++ b/src/core/SkBlitMask.h
@@ -48,18 +48,6 @@
                             const SkPMColor* src, int width);
 
     /**
-     *  Public entry-point to return a blitmask ColorProc.
-     *  May return NULL if config or format are not supported.
-     */
-    static ColorProc ColorFactory(SkColorType, SkMask::Format, SkColor);
-
-    /**
-     *  Return either platform specific optimized blitmask ColorProc,
-     *  or NULL if no optimized routine is available.
-     */
-    static ColorProc PlatformColorProcs(SkColorType, SkMask::Format, SkColor);
-
-    /**
      *  Public entry-point to return a blitcolor BlitLCD16RowProc.
      */
     static BlitLCD16RowProc BlitLCD16RowFactory(bool isOpaque);
diff --git a/src/core/SkBlitMask_D32.cpp b/src/core/SkBlitMask_D32.cpp
index 2108108..fb2991c 100644
--- a/src/core/SkBlitMask_D32.cpp
+++ b/src/core/SkBlitMask_D32.cpp
@@ -8,68 +8,7 @@
 #include "SkBlitMask.h"
 #include "SkColor.h"
 #include "SkColorPriv.h"
-
-static void D32_A8_Color(void* SK_RESTRICT dst, size_t dstRB,
-                         const void* SK_RESTRICT maskPtr, size_t maskRB,
-                         SkColor color, int width, int height) {
-    SkPMColor pmc = SkPreMultiplyColor(color);
-    size_t dstOffset = dstRB - (width << 2);
-    size_t maskOffset = maskRB - width;
-    SkPMColor* SK_RESTRICT device = (SkPMColor *)dst;
-    const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
-
-    do {
-        int w = width;
-        do {
-            unsigned aa = *mask++;
-            *device = SkBlendARGB32(pmc, *device, aa);
-            device += 1;
-        } while (--w != 0);
-        device = (uint32_t*)((char*)device + dstOffset);
-        mask += maskOffset;
-    } while (--height != 0);
-}
-
-static void D32_A8_Opaque(void* SK_RESTRICT dst, size_t dstRB,
-                          const void* SK_RESTRICT maskPtr, size_t maskRB,
-                          SkColor color, int width, int height) {
-    SkPMColor pmc = SkPreMultiplyColor(color);
-    SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
-    const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
-
-    maskRB -= width;
-    dstRB -= (width << 2);
-    do {
-        int w = width;
-        do {
-            unsigned aa = *mask++;
-            *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
-            device += 1;
-        } while (--w != 0);
-        device = (uint32_t*)((char*)device + dstRB);
-        mask += maskRB;
-    } while (--height != 0);
-}
-
-static void D32_A8_Black(void* SK_RESTRICT dst, size_t dstRB,
-                         const void* SK_RESTRICT maskPtr, size_t maskRB,
-                         SkColor, int width, int height) {
-    SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
-    const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
-
-    maskRB -= width;
-    dstRB -= (width << 2);
-    do {
-        int w = width;
-        do {
-            unsigned aa = *mask++;
-            *device = (aa << SK_A32_SHIFT) + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
-            device += 1;
-        } while (--w != 0);
-        device = (uint32_t*)((char*)device + dstRB);
-        mask += maskRB;
-    } while (--height != 0);
-}
+#include "SkOpts.h"
 
 SkBlitMask::BlitLCD16RowProc SkBlitMask::BlitLCD16RowFactory(bool isOpaque) {
     BlitLCD16RowProc proc = PlatformBlitRowProcs16(isOpaque);
@@ -112,51 +51,25 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-static SkBlitMask::ColorProc D32_A8_Factory(SkColor color) {
-    if (SK_ColorBLACK == color) {
-        return D32_A8_Black;
-    } else if (0xFF == SkColorGetA(color)) {
-        return D32_A8_Opaque;
-    } else {
-        return D32_A8_Color;
-    }
-}
-
-SkBlitMask::ColorProc SkBlitMask::ColorFactory(SkColorType ct,
-                                               SkMask::Format format,
-                                               SkColor color) {
-    ColorProc proc = PlatformColorProcs(ct, format, color);
-    if (proc) {
-        return proc;
-    }
-
-    switch (ct) {
-        case kN32_SkColorType:
-            switch (format) {
-                case SkMask::kA8_Format:
-                    return D32_A8_Factory(color);
-                case SkMask::kLCD16_Format:
-                    return D32_LCD16_Proc;
-                default:
-                    break;
-            }
-            break;
-        default:
-            break;
-    }
-    return NULL;
-}
-
 bool SkBlitMask::BlitColor(const SkPixmap& device, const SkMask& mask,
                            const SkIRect& clip, SkColor color) {
-    ColorProc proc = ColorFactory(device.colorType(), mask.fFormat, color);
-    if (proc) {
-        int x = clip.fLeft;
-        int y = clip.fTop;
-        proc(device.writable_addr32(x, y), device.rowBytes(), mask.getAddr(x, y),
-             mask.fRowBytes, color, clip.width(), clip.height());
+    int x = clip.fLeft, y = clip.fTop;
+
+    if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
+        SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
+                                 (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
+                                 color, clip.width(), clip.height());
         return true;
     }
+
+    if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
+        // TODO: Is this reachable code?  Seems like no.
+        D32_LCD16_Proc(device.writable_addr32(x,y), device.rowBytes(),
+                       mask.getAddr(x,y), mask.fRowBytes,
+                       color, clip.width(), clip.height());
+        return true;
+    }
+
     return false;
 }
 
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 17eab7e..2bfc1af 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -9,6 +9,7 @@
 #include "SkOpts.h"
 
 #define SK_OPTS_NS portable
+#include "SkBlitMask_opts.h"
 #include "SkBlurImageFilter_opts.h"
 #include "SkFloatingPoint_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
@@ -50,6 +51,8 @@
     decltype(texture_compressor)       texture_compressor = portable::texture_compressor;
     decltype(fill_block_dimensions) fill_block_dimensions = portable::fill_block_dimensions;
 
+    decltype(blit_mask_d32_a8) blit_mask_d32_a8 = portable::blit_mask_d32_a8;
+
     // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
     void Init_sse2();
     void Init_ssse3();
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 206b7ff..e0ef7dc 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -43,6 +43,7 @@
     extern TextureCompressor (*texture_compressor)(SkColorType, SkTextureCompressor::Format);
     extern bool (*fill_block_dimensions)(SkTextureCompressor::Format, int* x, int* y);
 
+    extern void (*blit_mask_d32_a8)(SkPMColor*, size_t, const SkAlpha*, size_t, SkColor, int, int);
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/opts/SkBlitMask_opts.h b/src/opts/SkBlitMask_opts.h
new file mode 100644
index 0000000..9129560
--- /dev/null
+++ b/src/opts/SkBlitMask_opts.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkBlitMask_opts_DEFINED
+#define SkBlitMask_opts_DEFINED
+
+#include "Sk4px.h"
+
+namespace SK_OPTS_NS {
+
+static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB,
+                             const SkAlpha* mask, size_t maskRB,
+                             SkColor color, int w, int h) {
+    auto s = Sk4px::DupPMColor(SkPreMultiplyColor(color));
+
+    auto fn = [&](const Sk4px& d, const Sk4px& aa) {
+        //  = (s + d(1-sa))aa + d(1-aa)
+        //  = s*aa + d(1-sa*aa)
+        auto left  = s.approxMulDiv255(aa),
+             right = d.approxMulDiv255(left.alphas().inv());
+        return left + right;  // This does not overflow (exhaustively checked).
+    };
+
+    while (h --> 0) {
+        Sk4px::MapDstAlpha(w, dst, mask, fn);
+        dst  +=  dstRB / sizeof(*dst);
+        mask += maskRB / sizeof(*mask);
+    }
+}
+
+}  // SK_OPTS_NS
+
+#endif//SkBlitMask_opts_DEFINED
diff --git a/src/opts/SkBlitMask_opts_arm.cpp b/src/opts/SkBlitMask_opts_arm.cpp
index 11e172c..e58be5c 100644
--- a/src/opts/SkBlitMask_opts_arm.cpp
+++ b/src/opts/SkBlitMask_opts_arm.cpp
@@ -11,32 +11,6 @@
 #include "SkUtilsArm.h"
 #include "SkBlitMask_opts_arm_neon.h"
 
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
-                                                     SkMask::Format maskFormat,
-                                                     SkColor color) {
-#if SK_ARM_NEON_IS_NONE
-    return NULL;
-#else
-/* ** This has been disabled until we can diagnose and fix the SIGILL generated
-   ** in the NEON code.  See http://skbug.com/2067 for details.
-#if SK_ARM_NEON_IS_DYNAMIC
-    if (!sk_cpu_arm_has_neon()) {
-        return NULL;
-    }
-#endif
-    if ((kN32_SkColorType == dstCT) &&
-        (SkMask::kA8_Format == maskFormat)) {
-            return D32_A8_Factory_neon(color);
-    }
-*/
-#endif
-
-    // We don't need to handle the SkMask::kLCD16_Format case as the default
-    // LCD16 will call us through SkBlitMask::PlatformBlitRowProcs16()
-
-    return NULL;
-}
-
 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
     if (isOpaque) {
         return SK_ARM_NEON_WRAP(SkBlitLCD16OpaqueRow);
diff --git a/src/opts/SkBlitMask_opts_arm_neon.cpp b/src/opts/SkBlitMask_opts_arm_neon.cpp
index 3361a5d..ad12369 100644
--- a/src/opts/SkBlitMask_opts_arm_neon.cpp
+++ b/src/opts/SkBlitMask_opts_arm_neon.cpp
@@ -8,129 +8,6 @@
 #include "SkBlitMask.h"
 #include "SkColor_opts_neon.h"
 
-static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,
-                              const void* SK_RESTRICT maskPtr, size_t maskRB,
-                              SkColor, int width, int height) {
-    SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
-    const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
-
-    maskRB -= width;
-    dstRB -= (width << 2);
-    do {
-        int w = width;
-        while (w >= 8) {
-            uint8x8_t vmask = vld1_u8(mask);
-            uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask);
-            uint8x8x4_t vdevice = vld4_u8((uint8_t*)device);
-
-            vdevice = SkAlphaMulQ_neon8(vdevice, vscale);
-            vdevice.val[NEON_A] += vmask;
-
-            vst4_u8((uint8_t*)device, vdevice);
-
-            mask += 8;
-            device += 8;
-            w -= 8;
-        }
-        while (w-- > 0) {
-            unsigned aa = *mask++;
-            *device = (aa << SK_A32_SHIFT)
-                        + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
-            device += 1;
-        };
-        device = (uint32_t*)((char*)device + dstRB);
-        mask += maskRB;
-    } while (--height != 0);
-}
-
-template <bool isColor>
-static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
-                                     const void* SK_RESTRICT maskPtr, size_t maskRB,
-                                     SkColor color, int width, int height) {
-    SkPMColor pmc = SkPreMultiplyColor(color);
-    SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
-    const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
-    uint8x8x4_t vpmc;
-
-    maskRB -= width;
-    dstRB -= (width << 2);
-
-    if (width >= 8) {
-        vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc));
-        vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc));
-        vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc));
-        vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc));
-    }
-    do {
-        int w = width;
-        while (w >= 8) {
-            uint8x8_t vmask = vld1_u8(mask);
-            uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask);
-            if (isColor) {
-                vscale = vsubw_u8(vdupq_n_u16(256),
-                            SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256));
-            } else {
-                vscale = vsubw_u8(vdupq_n_u16(256), vmask);
-            }
-            uint8x8x4_t vdev = vld4_u8((uint8_t*)device);
-
-            vdev.val[NEON_A] =   SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)
-                               + SkAlphaMul_neon8(vdev.val[NEON_A], vscale);
-            vdev.val[NEON_R] =   SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256)
-                               + SkAlphaMul_neon8(vdev.val[NEON_R], vscale);
-            vdev.val[NEON_G] =   SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256)
-                               + SkAlphaMul_neon8(vdev.val[NEON_G], vscale);
-            vdev.val[NEON_B] =   SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256)
-                               + SkAlphaMul_neon8(vdev.val[NEON_B], vscale);
-
-            vst4_u8((uint8_t*)device, vdev);
-
-            mask += 8;
-            device += 8;
-            w -= 8;
-        }
-
-        while (w--) {
-            unsigned aa = *mask++;
-            if (isColor) {
-                *device = SkBlendARGB32(pmc, *device, aa);
-            } else {
-                *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
-                            + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
-            }
-            device += 1;
-        };
-
-        device = (uint32_t*)((char*)device + dstRB);
-        mask += maskRB;
-
-    } while (--height != 0);
-}
-
-static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB,
-                               const void* SK_RESTRICT maskPtr, size_t maskRB,
-                               SkColor color, int width, int height) {
-    D32_A8_Opaque_Color_neon<false>(dst, dstRB, maskPtr, maskRB, color, width, height);
-}
-
-static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
-                              const void* SK_RESTRICT maskPtr, size_t maskRB,
-                              SkColor color, int width, int height) {
-    D32_A8_Opaque_Color_neon<true>(dst, dstRB, maskPtr, maskRB, color, width, height);
-}
-
-SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) {
-    if (SK_ColorBLACK == color) {
-        return D32_A8_Black_neon;
-    } else if (0xFF == SkColorGetA(color)) {
-        return D32_A8_Opaque_neon;
-    } else {
-        return D32_A8_Color_neon;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                                         SkColor color, int width,
                                         SkPMColor opaqueDst) {
diff --git a/src/opts/SkBlitMask_opts_arm_neon.h b/src/opts/SkBlitMask_opts_arm_neon.h
index 7af51c1..86366a4 100644
--- a/src/opts/SkBlitMask_opts_arm_neon.h
+++ b/src/opts/SkBlitMask_opts_arm_neon.h
@@ -11,8 +11,6 @@
 #include "SkColor.h"
 #include "SkBlitMask.h"
 
-extern SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color);
-
 extern void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                                       SkColor color, int width,
                                       SkPMColor opaqueDst);
diff --git a/src/opts/SkBlitMask_opts_none.cpp b/src/opts/SkBlitMask_opts_none.cpp
index 90f89a7..5c318c7 100644
--- a/src/opts/SkBlitMask_opts_none.cpp
+++ b/src/opts/SkBlitMask_opts_none.cpp
@@ -7,12 +7,6 @@
 
 #include "SkBlitMask.h"
 
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
-                                                     SkMask::Format maskFormat,
-                                                     SkColor color) {
-    return NULL;
-}
-
 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
     return NULL;
 }
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 7f5b677..c017f7e 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -301,54 +301,6 @@
     }
 }
 
-void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
-                               size_t maskRB, SkColor origColor,
-                               int width, int height) {
-    SkPMColor color = SkPreMultiplyColor(origColor);
-    size_t dstOffset = dstRB - (width << 2);
-    size_t maskOffset = maskRB - width;
-    SkPMColor* dst = (SkPMColor *)device;
-    const uint8_t* mask = (const uint8_t*)maskPtr;
-    do {
-        int count = width;
-        if (count >= 4) {
-            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
-                *dst = SkBlendARGB32(color, *dst, *mask);
-                mask++;
-                dst++;
-                count--;
-            }
-            __m128i *d = reinterpret_cast<__m128i*>(dst);
-            __m128i src_pixel = _mm_set1_epi32(color);
-            while (count >= 4) {
-                // Load 4 dst pixels
-                __m128i dst_pixel = _mm_load_si128(d);
-
-                // Set the alpha value
-                __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
-                alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
-                alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
-
-                __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
-                _mm_store_si128(d, result);
-                // Load the next 4 dst pixels and alphas
-                mask = mask + 4;
-                d++;
-                count -= 4;
-            }
-            dst = reinterpret_cast<SkPMColor*>(d);
-        }
-        while (count > 0) {
-            *dst= SkBlendARGB32(color, *dst, *mask);
-            dst += 1;
-            mask++;
-            count --;
-        }
-        dst = (SkPMColor *)((char*)dst + dstOffset);
-        mask += maskOffset;
-    } while (--height != 0);
-}
-
 // The following (left) shifts cause the top 5 bits of the mask components to
 // line up with the corresponding components in an SkPMColor.
 // Note that the mask's RGB16 order may differ from the SkPMColor order.
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
index 6c0611f..560edf4 100644
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -25,10 +25,6 @@
 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x,
                         int y);
 
-void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* mask,
-                               size_t maskRB, SkColor color,
-                               int width, int height);
-
 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
                          SkColor color, int width, SkPMColor);
 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index bc3ca3e..789a977 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -8,6 +8,7 @@
 #include "SkOpts.h"
 
 #define SK_OPTS_NS neon
+#include "SkBlitMask_opts.h"
 #include "SkBlurImageFilter_opts.h"
 #include "SkFloatingPoint_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
@@ -33,5 +34,7 @@
 
         texture_compressor    = neon::texture_compressor;
         fill_block_dimensions = neon::fill_block_dimensions;
+
+        blit_mask_d32_a8 = neon::blit_mask_d32_a8;
     }
 }
diff --git a/src/opts/SkOpts_sse2.cpp b/src/opts/SkOpts_sse2.cpp
index 8837efee..3440676 100644
--- a/src/opts/SkOpts_sse2.cpp
+++ b/src/opts/SkOpts_sse2.cpp
@@ -8,6 +8,7 @@
 #include "SkOpts.h"
 
 #define SK_OPTS_NS sse2
+#include "SkBlitMask_opts.h"
 #include "SkBlurImageFilter_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkUtils_opts.h"
@@ -27,5 +28,7 @@
         dilate_y = sse2::dilate_y;
          erode_x = sse2::erode_x;
          erode_y = sse2::erode_y;
+
+        blit_mask_d32_a8 = sse2::blit_mask_d32_a8;
     }
 }
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index c560b91..34045d4 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -252,30 +252,6 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
-SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
-                                                     SkMask::Format maskFormat,
-                                                     SkColor color) {
-    if (SkMask::kA8_Format != maskFormat) {
-        return NULL;
-    }
-
-    ColorProc proc = NULL;
-    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
-        switch (dstCT) {
-            case kN32_SkColorType:
-                // The SSE2 version is not (yet) faster for black, so we check
-                // for that.
-                if (SK_ColorBLACK != color) {
-                    proc = SkARGB32_A8_BlitMask_SSE2;
-                }
-                break;
-            default:
-                break;
-        }
-    }
-    return proc;
-}
-
 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
         if (isOpaque) {