Port S32A_opaque blit row to SkOpts.

This should be a pixel-for-pixel (i.e. bug-for-bug) port.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1820313002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1820313002
diff --git a/gyp/opts.gypi b/gyp/opts.gypi
index 3401a80..f3f61ef 100644
--- a/gyp/opts.gypi
+++ b/gyp/opts.gypi
@@ -49,7 +49,6 @@
             '<(skia_src_path)/opts/SkOpts_ssse3.cpp',
         ],
         'sse41_sources': [
-            '<(skia_src_path)/opts/SkBlitRow_opts_SSE4.cpp',
             '<(skia_src_path)/opts/SkOpts_sse41.cpp',
         ],
         # These targets are empty, but XCode doesn't like that, so add an empty file to each.
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
index 75330ae..80c7242 100644
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@@ -52,35 +52,6 @@
     }
 }
 
-static void S32A_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
-                                  const SkPMColor* SK_RESTRICT src,
-                                  int count, U8CPU alpha) {
-    SkASSERT(255 == alpha);
-    if (count > 0) {
-#ifdef UNROLL
-        if (count & 1) {
-            *dst = SkPMSrcOver(*(src++), *dst);
-            dst += 1;
-            count -= 1;
-        }
-
-        const SkPMColor* SK_RESTRICT srcEnd = src + count;
-        while (src != srcEnd) {
-            *dst = SkPMSrcOver(*(src++), *dst);
-            dst += 1;
-            *dst = SkPMSrcOver(*(src++), *dst);
-            dst += 1;
-        }
-#else
-        do {
-            *dst = SkPMSrcOver(*src, *dst);
-            src += 1;
-            dst += 1;
-        } while (--count > 0);
-#endif
-    }
-}
-
 static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {
@@ -115,7 +86,7 @@
 static const SkBlitRow::Proc32 gDefault_Procs32[] = {
     S32_Opaque_BlitRow32,
     S32_Blend_BlitRow32,
-    S32A_Opaque_BlitRow32,
+    nullptr,
     S32A_Blend_BlitRow32
 };
 
@@ -124,6 +95,11 @@
     // just so we don't crash
     flags &= kFlags32_Mask;
 
+    if (flags == 2) {
+        // S32A_Opaque_BlitRow32 has been ported to SkOpts, but not the others yet.
+        return SkOpts::blit_row_s32a_opaque;
+    }
+
     SkBlitRow::Proc32 proc = PlatformProcs32(flags);
     if (nullptr == proc) {
         proc = gDefault_Procs32[flags];
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index e34b6d7..50659d4 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -63,6 +63,7 @@
 #endif
 
 namespace SkOpts {
+
     // Define default function pointer values here...
     // If our global compile options are set high enough, these defaults might even be
     // CPU-specialized, e.g. a typical x86-64 machine might start with SSE2 defaults.
@@ -84,7 +85,8 @@
 
     decltype(blit_mask_d32_a8) blit_mask_d32_a8 = sk_default::blit_mask_d32_a8;
 
-    decltype(blit_row_color32) blit_row_color32 = sk_default::blit_row_color32;
+    decltype(blit_row_color32)     blit_row_color32     = sk_default::blit_row_color32;
+    decltype(blit_row_s32a_opaque) blit_row_s32a_opaque = sk_default::blit_row_s32a_opaque;
 
     decltype(matrix_translate)       matrix_translate       = sk_default::matrix_translate;
     decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 42c47a5..3989047 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -39,6 +39,7 @@
 
     extern void (*blit_mask_d32_a8)(SkPMColor*, size_t, const SkAlpha*, size_t, SkColor, int, int);
     extern void (*blit_row_color32)(SkPMColor*, const SkPMColor*, int, SkPMColor);
+    extern void (*blit_row_s32a_opaque)(SkPMColor*, const SkPMColor*, int, U8CPU);
 
     // This function is an optimized version of SkColorCubeFilter::filterSpan
     extern void (*color_cube_filter_span)(const SkPMColor[],
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index f000f30..94d9ee9 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h
@@ -9,6 +9,12 @@
 #define SkBlitRow_opts_DEFINED
 
 #include "Sk4px.h"
+#include "SkColorPriv.h"
+#include "SkMSAN.h"
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include "SkColor_opts_SSE2.h"
+#endif
 
 namespace SK_OPTS_NS {
 
@@ -17,7 +23,8 @@
 // and it's quite a bit faster than blend_perfect.
 //
 // blend_256_round_alt is our currently blessed algorithm.  Please use it or an analogous one.
-static void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
+static inline
+void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
     unsigned invA = 255 - SkGetPackedA32(color);
     invA += invA >> 7;
     SkASSERT(invA < 256);  // We've should have already handled alpha == 0 externally.
@@ -30,6 +37,167 @@
     });
 }
 
+static inline  // SrcOver-blends len pixels of src onto dst, in place.
+void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
+    SkASSERT(alpha == 0xFF);  // alpha is only asserted here; nothing below reads it.
+    sk_msan_assert_initialized(src, src+len);  // NOTE(review): presumably an MSAN check on src[0..len) -- confirm in SkMSAN.h.
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+    while (len >= 16) {  // SSE4.1 path: 16 pixels (four 128-bit registers) per iteration.
+        // Load 16 source pixels (unaligned loads, 4 pixels per register).
+        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
+             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
+             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
+             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
+
+        const auto alphaMask = _mm_set1_epi32(0xFF000000);  // Top byte of each 32-bit lane; assumes alpha is in bits 24-31.
+
+        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+        if (_mm_testz_si128(ORed, alphaMask)) {  // True when (ORed & alphaMask) is all zero.
+            // All 16 source pixels are transparent.  Nothing to do.
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        auto d0 = (__m128i*)(dst) + 0,
+             d1 = (__m128i*)(dst) + 1,
+             d2 = (__m128i*)(dst) + 2,
+             d3 = (__m128i*)(dst) + 3;
+
+        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+        if (_mm_testc_si128(ANDed, alphaMask)) {  // True when every alphaMask bit is also set in ANDed.
+            // All 16 source pixels are opaque.  SrcOver becomes Src, so dst is never read.
+            _mm_storeu_si128(d0, s0);
+            _mm_storeu_si128(d1, s1);
+            _mm_storeu_si128(d2, s2);
+            _mm_storeu_si128(d3, s3);
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        // TODO: This math is wrong.
+        // Do SrcOver on all 16 pixels (mixed alphas): load dst, blend, store back.
+        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
+        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
+        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
+        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
+        src += 16;
+        dst += 16;
+        len -= 16;
+    }
+
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    while (len >= 16) {  // SSE2 path: same structure as above, without the SSE4.1 test intrinsics.
+        // Load 16 source pixels (unaligned loads, 4 pixels per register).
+        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
+             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
+             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
+             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
+
+        const auto alphaMask = _mm_set1_epi32(0xFF000000);  // Top byte of each 32-bit lane; assumes alpha is in bits 24-31.
+
+        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
+        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
+                                                       _mm_setzero_si128()))) {  // 0xffff: all 16 masked bytes equal zero.
+            // All 16 source pixels are transparent.  Nothing to do.
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        auto d0 = (__m128i*)(dst) + 0,
+             d1 = (__m128i*)(dst) + 1,
+             d2 = (__m128i*)(dst) + 2,
+             d3 = (__m128i*)(dst) + 3;
+
+        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
+        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
+                                                       alphaMask))) {  // 0xffff: all 16 masked bytes equal the mask.
+            // All 16 source pixels are opaque.  SrcOver becomes Src, so dst is never read.
+            _mm_storeu_si128(d0, s0);
+            _mm_storeu_si128(d1, s1);
+            _mm_storeu_si128(d2, s2);
+            _mm_storeu_si128(d3, s3);
+            src += 16;
+            dst += 16;
+            len -= 16;
+            continue;
+        }
+
+        // TODO: This math is wrong.
+        // Do SrcOver on all 16 pixels (mixed alphas): load dst, blend, store back.
+        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
+        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
+        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
+        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
+
+        src += 16;
+        dst += 16;
+        len -= 16;
+    }
+
+#elif defined(SK_ARM_HAS_NEON)
+    while (len >= 4) {  // NEON path: 4 pixels per iteration.
+        if ((src[0] | src[1] | src[2] | src[3]) == 0x00000000) {
+            // All 4 source pixels are zero (transparent).  Nothing to do.
+            src += 4;
+            dst += 4;
+            len -= 4;
+            continue;
+        }
+
+        if ((src[0] & src[1] & src[2] & src[3]) >= 0xFF000000) {  // Top byte is 0xFF in all four pixels.
+            // All 4 source pixels are opaque.  SrcOver becomes Src.
+            dst[0] = src[0];
+            dst[1] = src[1];
+            dst[2] = src[2];
+            dst[3] = src[3];
+            src += 4;
+            dst += 4;
+            len -= 4;
+            continue;
+        }
+
+        // Load 4 source and destination pixels, two pixels (8 bytes) per register.
+        auto src0 = vreinterpret_u8_u32(vld1_u32(src+0)),
+             src2 = vreinterpret_u8_u32(vld1_u32(src+2)),
+             dst0 = vreinterpret_u8_u32(vld1_u32(dst+0)),
+             dst2 = vreinterpret_u8_u32(vld1_u32(dst+2));
+
+        // TODO: This math is wrong.  (Presumably: dst is scaled by (256-sa)>>8, not exactly (255-sa)/255.)
+        const uint8x8_t alphas = vcreate_u8(0x0707070703030303);  // Table indices: byte 3 for lanes 0-3, byte 7 for lanes 4-7.
+        auto invSA0_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src0, alphas)),  // 256 - (selected src byte), as 16-bit lanes.
+             invSA2_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src2, alphas));
+
+        auto dstInvSA0 = vmulq_u16(invSA0_w, vmovl_u8(dst0)),  // dst bytes widened to 16 bits, times (256 - sa).
+             dstInvSA2 = vmulq_u16(invSA2_w, vmovl_u8(dst2));
+
+        dst0 = vadd_u8(src0, vshrn_n_u16(dstInvSA0, 8));  // src + (product >> 8) per byte; vadd_u8 wraps on overflow.
+        dst2 = vadd_u8(src2, vshrn_n_u16(dstInvSA2, 8));
+
+        vst1_u32(dst+0, vreinterpret_u32_u8(dst0));
+        vst1_u32(dst+2, vreinterpret_u32_u8(dst2));
+
+        src += 4;
+        dst += 4;
+        len -= 4;
+    }
+#endif
+
+    while (len-- > 0) {  // Remaining pixels one at a time (all of them when no SIMD branch is compiled in).
+        if (*src) {  // A zero source pixel leaves dst untouched.
+            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);  // Top byte 0xFF: plain copy; otherwise blend.
+        }
+        src++;
+        dst++;
+    }
+}
+
 }  // SK_OPTS_NS
 
 #endif//SkBlitRow_opts_DEFINED
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 1929918..a6fbc25 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -67,61 +67,6 @@
     }
 }
 
-void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-    sk_msan_assert_initialized(src, src+count);
-
-    SkASSERT(alpha == 255);
-    if (count <= 0) {
-        return;
-    }
-
-    int count16 = count / 16;
-    __m128i* dst4 = (__m128i*)dst;
-    const __m128i* src4 = (const __m128i*)src;
-
-    for (int i = 0; i < count16 * 4; i += 4) {
-        // Load 16 source pixels.
-        __m128i s0 = _mm_loadu_si128(src4+i+0),
-                s1 = _mm_loadu_si128(src4+i+1),
-                s2 = _mm_loadu_si128(src4+i+2),
-                s3 = _mm_loadu_si128(src4+i+3);
-
-        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
-        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
-        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
-        if (0xffff == _mm_movemask_epi8(cmp)) {
-            // All 16 source pixels are fully transparent. There's nothing to do!
-            continue;
-        }
-        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
-        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
-        if (0xffff == _mm_movemask_epi8(cmp)) {
-            // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
-            _mm_storeu_si128(dst4+i+0, s0);
-            _mm_storeu_si128(dst4+i+1, s1);
-            _mm_storeu_si128(dst4+i+2, s2);
-            _mm_storeu_si128(dst4+i+3, s3);
-            continue;
-        }
-        // The general slow case: do the blend for all 16 pixels.
-        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
-        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
-        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
-        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
-    }
-
-    // Wrap up the last <= 15 pixels.
-    SkASSERT(count - (count16*16) <= 15);
-    for (int i = count16*16; i < count; i++) {
-        // This check is not really necessarily, but it prevents pointless autovectorization.
-        if (src[i] & 0xFF000000) {
-            dst[i] = SkPMSrcOver(src[i], dst[i]);
-        }
-    }
-}
-
 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
index 560edf4..652ff6e 100644
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -14,10 +14,6 @@
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha);
 
-void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha);
-
 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha);
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp
deleted file mode 100644
index e5d8809..0000000
--- a/src/opts/SkBlitRow_opts_SSE4.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkBlitRow_opts_SSE4.h"
-
-// Some compilers can't compile SSSE3 or SSE4 intrinsics.  We give them stub methods.
-// The stubs should never be called, so we make them crash just to confirm that.
-#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
-    sk_throw();
-}
-
-#else
-
-#include <smmintrin.h>      // SSE4.1 intrinsics
-#include "SkColorPriv.h"
-#include "SkColor_opts_SSE2.h"
-#include "SkMSAN.h"
-
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count,
-                                U8CPU alpha) {
-    sk_msan_assert_initialized(src, src+count);
-
-    SkASSERT(alpha == 255);
-    // As long as we can, we'll work on 16 pixel pairs at once.
-    int count16 = count / 16;
-    __m128i* dst4 = (__m128i*)dst;
-    const __m128i* src4 = (const __m128i*)src;
-
-    for (int i = 0; i < count16 * 4; i += 4) {
-        // Load 16 source pixels.
-        __m128i s0 = _mm_loadu_si128(src4+i+0),
-                s1 = _mm_loadu_si128(src4+i+1),
-                s2 = _mm_loadu_si128(src4+i+2),
-                s3 = _mm_loadu_si128(src4+i+3);
-
-        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
-        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
-        if (_mm_testz_si128(ORed, alphaMask)) {
-            // All 16 source pixels are fully transparent.  There's nothing to do!
-            continue;
-        }
-        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
-        if (_mm_testc_si128(ANDed, alphaMask)) {
-            // All 16 source pixels are fully opaque.  There's no need to read dst or blend it.
-            _mm_storeu_si128(dst4+i+0, s0);
-            _mm_storeu_si128(dst4+i+1, s1);
-            _mm_storeu_si128(dst4+i+2, s2);
-            _mm_storeu_si128(dst4+i+3, s3);
-            continue;
-        }
-        // The general slow case: do the blend for all 16 pixels.
-        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
-        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
-        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
-        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
-    }
-
-    // Wrap up the last <= 15 pixels.
-    for (int i = count16*16; i < count; i++) {
-        // This check is not really necessarily, but it prevents pointless autovectorization.
-        if (src[i] & 0xFF000000) {
-            dst[i] = SkPMSrcOver(src[i], dst[i]);
-        }
-    }
-}
-
-#endif
diff --git a/src/opts/SkBlitRow_opts_SSE4.h b/src/opts/SkBlitRow_opts_SSE4.h
deleted file mode 100644
index 577ace6..0000000
--- a/src/opts/SkBlitRow_opts_SSE4.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright 2014 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkBlitRow_opts_SSE4_DEFINED
-#define SkBlitRow_opts_SSE4_DEFINED
-
-#include "SkBlitRow.h"
-
-void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
-                                const SkPMColor* SK_RESTRICT,
-                                int count,
-                                U8CPU alpha);
-#endif
-
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 95bd229..3cb5a92 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -871,282 +871,6 @@
     }
 }
 
-void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-
-    SkASSERT(255 == alpha);
-    if (count > 0) {
-
-
-    uint8x8_t alpha_mask;
-
-    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-    alpha_mask = vld1_u8(alpha_mask_setup);
-
-    /* do the NEON unrolled code */
-#define    UNROLL    4
-    while (count >= UNROLL) {
-        uint8x8_t src_raw, dst_raw, dst_final;
-        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-
-        /* The two prefetches below may make the code slighlty
-         * slower for small values of count but are worth having
-         * in the general case.
-         */
-        __builtin_prefetch(src+32);
-        __builtin_prefetch(dst+32);
-
-        /* get the source */
-        src_raw = vreinterpret_u8_u32(vld1_u32(src));
-#if    UNROLL > 2
-        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-#endif
-
-        /* get and hold the dst too */
-        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-#if    UNROLL > 2
-        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-#endif
-
-    /* 1st and 2nd bits of the unrolling */
-    {
-        uint8x8_t dst_cooked;
-        uint16x8_t dst_wide;
-        uint8x8_t alpha_narrow;
-        uint16x8_t alpha_wide;
-
-        /* get the alphas spread out properly */
-        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final = vadd_u8(src_raw, dst_cooked);
-    }
-
-#if    UNROLL > 2
-    /* the 3rd and 4th bits of our unrolling */
-    {
-        uint8x8_t dst_cooked;
-        uint16x8_t dst_wide;
-        uint8x8_t alpha_narrow;
-        uint16x8_t alpha_wide;
-
-        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw_2);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-    }
-#endif
-
-        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-#if    UNROLL > 2
-        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-#endif
-
-        src += UNROLL;
-        dst += UNROLL;
-        count -= UNROLL;
-    }
-#undef    UNROLL
-
-    /* do any residual iterations */
-        while (--count >= 0) {
-            *dst = SkPMSrcOver(*src, *dst);
-            src += 1;
-            dst += 1;
-        }
-    }
-}
-
-void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
-                                const SkPMColor* SK_RESTRICT src,
-                                int count, U8CPU alpha) {
-    SkASSERT(255 == alpha);
-
-    if (count <= 0)
-    return;
-
-    /* Use these to check if src is transparent or opaque */
-    const unsigned int ALPHA_OPAQ  = 0xFF000000;
-    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
-
-#define UNROLL  4
-    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
-    const SkPMColor* SK_RESTRICT src_temp = src;
-
-    /* set up the NEON variables */
-    uint8x8_t alpha_mask;
-    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
-    alpha_mask = vld1_u8(alpha_mask_setup);
-
-    uint8x8_t src_raw, dst_raw, dst_final;
-    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
-    uint8x8_t dst_cooked;
-    uint16x8_t dst_wide;
-    uint8x8_t alpha_narrow;
-    uint16x8_t alpha_wide;
-
-    /* choose the first processing type */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    /* fall-thru */
-
-ALPHA_1_TO_254:
-    do {
-
-        /* get the source */
-        src_raw = vreinterpret_u8_u32(vld1_u32(src));
-        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
-
-        /* get and hold the dst too */
-        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
-        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-
-
-        /* get the alphas spread out properly */
-        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final = vadd_u8(src_raw, dst_cooked);
-
-        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
-        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-
-        /* spread the dest */
-        dst_wide = vmovl_u8(dst_raw_2);
-
-        /* alpha mul the dest */
-        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
-        dst_cooked = vshrn_n_u16(dst_wide, 8);
-
-        /* sum -- ignoring any byte lane overflows */
-        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
-
-        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
-        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
-
-        src += UNROLL;
-        dst += UNROLL;
-
-        /* if 2 of the next pixels aren't between 1 and 254
-        it might make sense to go to the optimized loops */
-        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
-            break;
-
-    } while(src < src_end);
-
-    if (src >= src_end)
-        goto TAIL;
-
-    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
-        goto ALPHA_255;
-
-    /*fall-thru*/
-
-ALPHA_0:
-
-    /*In this state, we know the current alpha is 0 and
-     we optimize for the next alpha also being zero. */
-    src_temp = src;  //so we don't have to increment dst every time
-    do {
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-        if(*(++src) > ALPHA_TRANS)
-            break;
-    } while(src < src_end);
-
-    dst += (src - src_temp);
-
-    /* no longer alpha 0, so determine where to go next. */
-    if( src >= src_end)
-        goto TAIL;
-    if(*src >= ALPHA_OPAQ)
-        goto ALPHA_255;
-    else
-        goto ALPHA_1_TO_254;
-
-ALPHA_255:
-    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
-        dst[0]=src[0];
-        dst[1]=src[1];
-        dst[2]=src[2];
-        dst[3]=src[3];
-        src+=UNROLL;
-        dst+=UNROLL;
-        if(src >= src_end)
-            goto TAIL;
-    }
-
-    //Handle remainder.
-    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
-            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
-        }
-    }
-
-    if( src >= src_end)
-        goto TAIL;
-    if(*src <= ALPHA_TRANS)
-        goto ALPHA_0;
-    else
-        goto ALPHA_1_TO_254;
-
-TAIL:
-    /* do any residual iterations */
-    src_end += UNROLL + 1;  //goto the real end
-    while(src != src_end) {
-        if( *src != 0 ) {
-            if( *src >= ALPHA_OPAQ ) {
-                *dst = *src;
-            }
-            else {
-                *dst = SkPMSrcOver(*src, *dst);
-            }
-        }
-        src++;
-        dst++;
-    }
-
-#undef    UNROLL
-    return;
-}
-
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
  */
@@ -1561,21 +1285,7 @@
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     nullptr,   // S32_Opaque,
     S32_Blend_BlitRow32_neon,        // S32_Blend,
-    /*
-     * We have two choices for S32A_Opaque procs. The one reads the src alpha
-     * value and attempts to optimize accordingly.  The optimization is
-     * sensitive to the source content and is not a win in all cases. For
-     * example, if there are a lot of transitions between the alpha states,
-     * the performance will almost certainly be worse.  However, for many
-     * common cases the performance is equivalent or better than the standard
-     * case where we do not inspect the src alpha.
-     */
-#if SK_A32_SHIFT == 24
-    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
-    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
-#else
-    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
-#endif
+    nullptr,  // Ported to SkOpts
 #ifdef SK_CPU_ARM32
     S32A_Blend_BlitRow32_neon        // S32A_Blend
 #else
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index e96cf17..76f7dd5 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -36,7 +36,8 @@
 
         blit_mask_d32_a8 = sk_neon::blit_mask_d32_a8;
 
-        blit_row_color32 = sk_neon::blit_row_color32;
+        blit_row_color32     = sk_neon::blit_row_color32;
+        blit_row_s32a_opaque = sk_neon::blit_row_s32a_opaque;
 
         color_cube_filter_span = sk_neon::color_cube_filter_span;
 
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index 7a76081..34b078c 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -9,10 +9,11 @@
 
 #define SK_OPTS_NS sk_sse41
 #include "SkBlurImageFilter_opts.h"
+#include "SkBlitRow_opts.h"
 
 #ifndef SK_SUPPORT_LEGACY_X86_BLITS
 
-namespace sk_sse41 {
+namespace sk_sse41_new {
 
 // An SSE register holding at most 64 bits of useful data in the low lanes.
 struct m64i {
@@ -211,7 +212,7 @@
     }
 }
 
-}  // namespace sk_sse41
+}  // namespace sk_sse41_new
 
 #endif
 
@@ -222,8 +223,9 @@
         box_blur_yx = sk_sse41::box_blur_yx;
 
     #ifndef SK_SUPPORT_LEGACY_X86_BLITS
-        blit_row_color32 = sk_sse41::blit_row_color32;
-        blit_mask_d32_a8 = sk_sse41::blit_mask_d32_a8;
+        blit_row_color32 = sk_sse41_new::blit_row_color32;
+        blit_mask_d32_a8 = sk_sse41_new::blit_mask_d32_a8;
     #endif
+        blit_row_s32a_opaque = sk_sse41::blit_row_s32a_opaque;
     }
 }
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 9983eb5..0ee78c5 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -12,7 +12,6 @@
 #include "SkBlitMask.h"
 #include "SkBlitRow.h"
 #include "SkBlitRow_opts_SSE2.h"
-#include "SkBlitRow_opts_SSE4.h"
 #include "SkOncePtr.h"
 #include "SkRTConf.h"
 
@@ -215,21 +214,11 @@
 static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
     nullptr,                               // S32_Opaque,
     S32_Blend_BlitRow32_SSE2,           // S32_Blend,
-    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
-    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
-};
-
-static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
-    nullptr,                               // S32_Opaque,
-    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
-    S32A_Opaque_BlitRow32_SSE4,         // S32A_Opaque
+    nullptr,                            // Ported to SkOpts
     S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
 };
 
 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
-        return platform_32_procs_SSE4[flags];
-    } else
     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
         return platform_32_procs_SSE2[flags];
     } else {