commit to alpha bitmap procs

The alpha procs differ from the opaque ones only in a very isolated,
branch-predictable place.  If we need to make any of these care about
whether alphaScale == 256, we can probably just put a branch in there.

More refactoring to follow...

For now I've kept Clamp_S32_opaque_D32_nofilter_DX_shaderproc() around.

Cq-Include-Trybots: master.tryserver.blink:linux_trusty_blink_rel
Change-Id: I2739b6fc7d36c1adf6c702b271b20986f86e413f
Reviewed-on: https://skia-review.googlesource.com/c/171040
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 8d5e1a8..26e5ac9 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -22,7 +22,62 @@
 extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
 #endif
 
-extern void Clamp_S32_opaque_D32_nofilter_DX_shaderproc(const void*, int, int, uint32_t*, int);
+// One-stop-shop shader for,
+//   - nearest-neighbor sampling (_nofilter_),
+//   - clamp tiling in X and Y both (Clamp_),
+//   - with at most a scale and translate matrix (_DX_),
+//   - and no extra alpha applied (_opaque_),
+//   - sampling from 8888 (_S32_) and drawing to 8888 (_D32_).
+static void Clamp_S32_opaque_D32_nofilter_DX_shaderproc(const void* sIn, int x, int y,
+                                                        SkPMColor* SK_RESTRICT dst, int count) {
+    const SkBitmapProcState& s = *static_cast<const SkBitmapProcState*>(sIn);
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+    SkASSERT(s.fAlphaScale == 256);  // pixels are copied as-is below; no alpha scale is applied
+
+    const unsigned maxX = s.fPixmap.width() - 1;  // last valid column index in a source row
+    SkFractionalInt fx;
+    int dstY;
+    {
+        const SkBitmapProcStateAutoMapper mapper(s, x, y);
+        const unsigned maxY = s.fPixmap.height() - 1;
+        dstY = SkClampMax(mapper.intY(), maxY);
+        fx = mapper.fractionalIntX();
+    }
+
+    const SkPMColor* SK_RESTRICT src = s.fPixmap.addr32(0, dstY);  // the single source row read
+    const SkFractionalInt dx = s.fInvSxFractionalInt;
+
+    // Check if we're safely inside [0...maxX] so no need to clamp each computed index.
+    // fx steps by a constant dx, so checking the first and last sample bounds all of them.
+    if ((uint64_t)SkFractionalIntToInt(fx) <= maxX &&
+        (uint64_t)SkFractionalIntToInt(fx + dx * (count - 1)) <= maxX)
+    {
+        int count4 = count >> 2;  // number of whole 4-pixel groups
+        for (int i = 0; i < count4; ++i) {
+            SkPMColor src0 = src[SkFractionalIntToInt(fx)]; fx += dx;
+            SkPMColor src1 = src[SkFractionalIntToInt(fx)]; fx += dx;
+            SkPMColor src2 = src[SkFractionalIntToInt(fx)]; fx += dx;
+            SkPMColor src3 = src[SkFractionalIntToInt(fx)]; fx += dx;
+            dst[0] = src0;
+            dst[1] = src1;
+            dst[2] = src2;
+            dst[3] = src3;
+            dst += 4;
+        }
+        for (int i = (count4 << 2); i < count; ++i) {  // leftover pixels after the groups of 4
+            unsigned index = SkFractionalIntToInt(fx);
+            SkASSERT(index <= maxX);
+            *dst++ = src[index];
+            fx += dx;
+        }
+    } else {  // some sample may fall outside [0...maxX]: clamp every index
+        for (int i = 0; i < count; ++i) {
+            dst[i] = src[SkClampMax(SkFractionalIntToInt(fx), maxX)];
+            fx += dx;
+        }
+    }
+}
 
 #define   NAME_WRAP(x)  x
 #include "SkBitmapProcState_filter.h"
@@ -190,25 +245,18 @@
 
     if (fFilterQuality < kHigh_SkFilterQuality) {
         int index = 0;
-        if (fAlphaScale < 256) {  // note: this distinction is not used for D16
+        if (fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) {
             index |= 1;
         }
-        if (fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) {
-            index |= 2;
-        }
         if (fFilterQuality > kNone_SkFilterQuality) {
-            index |= 4;
+            index |= 2;
         }
 
 #if !defined(SK_ARM_HAS_NEON)
         static const SampleProc32 gSkBitmapProcStateSample32[] = {
-            S32_opaque_D32_nofilter_DXDY,
             S32_alpha_D32_nofilter_DXDY,
-            S32_opaque_D32_nofilter_DX,
             S32_alpha_D32_nofilter_DX,
-            S32_opaque_D32_filter_DXDY,
             S32_alpha_D32_filter_DXDY,
-            S32_opaque_D32_filter_DX,
             S32_alpha_D32_filter_DX,
         };
 #endif
@@ -216,7 +264,9 @@
         fSampleProc32 = SK_ARM_NEON_WRAP(gSkBitmapProcStateSample32)[index];
 
         // our special-case shaderprocs
-        if (S32_opaque_D32_nofilter_DX == fSampleProc32 && clampClamp) {
+        if (fAlphaScale == 256
+                && fSampleProc32 == S32_alpha_D32_nofilter_DX
+                && clampClamp) {
             fShaderProc32 = Clamp_S32_opaque_D32_nofilter_DX_shaderproc;
         }
 
@@ -416,12 +466,7 @@
 
     if (kNone_SkFilterQuality != s.fFilterQuality) {
         const SkPMColor* row1 = s.fPixmap.addr32(0, iY1);
-
-        if (s.fAlphaScale < 256) {
-            Filter_32_alpha(iSubY, *row0, *row1, &color, s.fAlphaScale);
-        } else {
-            Filter_32_opaque(iSubY, *row0, *row1, &color);
-        }
+        Filter_32_alpha(iSubY, *row0, *row1, &color, s.fAlphaScale);
     } else {
         if (s.fAlphaScale < 256) {
             color = SkAlphaMulQ(*row0, s.fAlphaScale);
@@ -596,52 +641,3 @@
 
 ///////////////////////
 
-void  Clamp_S32_opaque_D32_nofilter_DX_shaderproc(const void* sIn, int x, int y,
-                                                  SkPMColor* SK_RESTRICT dst, int count) {
-    const SkBitmapProcState& s = *static_cast<const SkBitmapProcState*>(sIn);
-    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
-                             SkMatrix::kScale_Mask)) == 0);
-
-    const unsigned maxX = s.fPixmap.width() - 1;
-    SkFractionalInt fx;
-    int dstY;
-    {
-        const SkBitmapProcStateAutoMapper mapper(s, x, y);
-        const unsigned maxY = s.fPixmap.height() - 1;
-        dstY = SkClampMax(mapper.intY(), maxY);
-        fx = mapper.fractionalIntX();
-    }
-
-    const SkPMColor* SK_RESTRICT src = s.fPixmap.addr32(0, dstY);
-    const SkFractionalInt dx = s.fInvSxFractionalInt;
-
-    // Check if we're safely inside [0...maxX] so no need to clamp each computed index.
-    //
-    if ((uint64_t)SkFractionalIntToInt(fx) <= maxX &&
-        (uint64_t)SkFractionalIntToInt(fx + dx * (count - 1)) <= maxX)
-    {
-        int count4 = count >> 2;
-        for (int i = 0; i < count4; ++i) {
-            SkPMColor src0 = src[SkFractionalIntToInt(fx)]; fx += dx;
-            SkPMColor src1 = src[SkFractionalIntToInt(fx)]; fx += dx;
-            SkPMColor src2 = src[SkFractionalIntToInt(fx)]; fx += dx;
-            SkPMColor src3 = src[SkFractionalIntToInt(fx)]; fx += dx;
-            dst[0] = src0;
-            dst[1] = src1;
-            dst[2] = src2;
-            dst[3] = src3;
-            dst += 4;
-        }
-        for (int i = (count4 << 2); i < count; ++i) {
-            unsigned index = SkFractionalIntToInt(fx);
-            SkASSERT(index <= maxX);
-            *dst++ = src[index];
-            fx += dx;
-        }
-    } else {
-        for (int i = 0; i < count; ++i) {
-            dst[i] = src[SkClampMax(SkFractionalIntToInt(fx), maxX)];
-            fx += dx;
-        }
-    }
-}
diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index bcc5da3..6aa5865 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -173,10 +173,9 @@
 
 // These functions are generated via macros, but are exposed here so that
 // platformProcs may test for them by name.
-void S32_opaque_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
-                              int count, SkPMColor colors[]);
-void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
-                             int count, SkPMColor colors[]);
+void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+                             const uint32_t xy[], int count, SkPMColor colors[]);
+
 void ClampX_ClampY_filter_scale(const SkBitmapProcState& s, uint32_t xy[],
                                 int count, int x, int y);
 void ClampX_ClampY_nofilter_scale(const SkBitmapProcState& s, uint32_t xy[],
diff --git a/src/core/SkBitmapProcState_filter.h b/src/core/SkBitmapProcState_filter.h
index a30b76d..a7b9a84 100644
--- a/src/core/SkBitmapProcState_filter.h
+++ b/src/core/SkBitmapProcState_filter.h
@@ -8,43 +8,6 @@
 
 #include "SkColorData.h"
 
-/*
-    Filter_32_opaque
-
-    There is no hard-n-fast rule that the filtering must produce
-    exact results for the color components, but if the 4 incoming colors are
-    all opaque, then the output color must also be opaque. Subsequent parts of
-    the drawing pipeline may rely on this (e.g. which blitrow proc to use).
- */
-
-static inline void Filter_32_opaque(unsigned x, unsigned y,
-                                    SkPMColor a00, SkPMColor a01,
-                                    SkPMColor a10, SkPMColor a11,
-                                    SkPMColor* dstColor) {
-    SkASSERT((unsigned)x <= 0xF);
-    SkASSERT((unsigned)y <= 0xF);
-
-    int xy = x * y;
-    const uint32_t mask = 0xFF00FF;
-
-    int scale = 256 - 16*y - 16*x + xy;
-    uint32_t lo = (a00 & mask) * scale;
-    uint32_t hi = ((a00 >> 8) & mask) * scale;
-
-    scale = 16*x - xy;
-    lo += (a01 & mask) * scale;
-    hi += ((a01 >> 8) & mask) * scale;
-
-    scale = 16*y - xy;
-    lo += (a10 & mask) * scale;
-    hi += ((a10 >> 8) & mask) * scale;
-
-    lo += (a11 & mask) * xy;
-    hi += ((a11 >> 8) & mask) * xy;
-
-    *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
-}
-
 static inline void Filter_32_alpha(unsigned x, unsigned y,
                                    SkPMColor a00, SkPMColor a01,
                                    SkPMColor a10, SkPMColor a11,
@@ -72,6 +35,7 @@
     lo += (a11 & mask) * xy;
     hi += ((a11 >> 8) & mask) * xy;
 
+    // TODO: if (alphaScale < 256) ...
     lo = ((lo >> 8) & mask) * alphaScale;
     hi = ((hi >> 8) & mask) * alphaScale;
 
@@ -79,26 +43,6 @@
 }
 
 // Two color version, where we filter only along 1 axis
-static inline void Filter_32_opaque(unsigned t,
-                                    SkPMColor color0,
-                                    SkPMColor color1,
-                                    SkPMColor* dstColor) {
-    SkASSERT((unsigned)t <= 0xF);
-
-    const uint32_t mask = 0xFF00FF;
-
-    int scale = 256 - 16*t;
-    uint32_t lo = (color0 & mask) * scale;
-    uint32_t hi = ((color0 >> 8) & mask) * scale;
-
-    scale = 16*t;
-    lo += (color1 & mask) * scale;
-    hi += ((color1 >> 8) & mask) * scale;
-
-    *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
-}
-
-// Two color version, where we filter only along 1 axis
 static inline void Filter_32_alpha(unsigned t,
                                    SkPMColor color0,
                                    SkPMColor color1,
@@ -117,6 +61,7 @@
     lo += (color1 & mask) * scale;
     hi += ((color1 >> 8) & mask) * scale;
 
+    // TODO: if (alphaScale < 256) ...
     lo = ((lo >> 8) & mask) * alphaScale;
     hi = ((hi >> 8) & mask) * alphaScale;
 
diff --git a/src/core/SkBitmapProcState_procs.h b/src/core/SkBitmapProcState_procs.h
index 6badd62..ebdf155 100644
--- a/src/core/SkBitmapProcState_procs.h
+++ b/src/core/SkBitmapProcState_procs.h
@@ -9,28 +9,16 @@
 // E.g. for ARM NEON, defined it as 'x ## _neon' to ensure all important
 // identifiers have a _neon suffix.
 #ifndef NAME_WRAP
-#error "Please define NAME_WRAP() before including this file"
+    #error "Please define NAME_WRAP() before including this file"
 #endif
 
-// SRC == 8888
-
-#define FILTER_PROC(x, y, a, b, c, d, dst)   NAME_WRAP(Filter_32_opaque)(x, y, a, b, c, d, dst)
-
-#define MAKENAME(suffix)        NAME_WRAP(S32_opaque_D32 ## suffix)
-#define SRCTYPE                 SkPMColor
-#define CHECKSTATE(state)       SkASSERT(4 == state.fPixmap.info().bytesPerPixel()); \
-                                SkASSERT(state.fAlphaScale == 256)
-#define RETURNDST(src)          src
-#define SRC_TO_FILTER(src)      src
-#include "SkBitmapProcState_sample.h"
-
-#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d, dst)   NAME_WRAP(Filter_32_alpha)(x, y, a, b, c, d, dst, alphaScale)
+#define FILTER_PROC(x, y, a, b, c, d, dst) \
+    NAME_WRAP(Filter_32_alpha)(x, y, a, b, c, d, dst, alphaScale)
 
 #define MAKENAME(suffix)        NAME_WRAP(S32_alpha_D32 ## suffix)
 #define SRCTYPE                 SkPMColor
 #define CHECKSTATE(state)       SkASSERT(4 == state.fPixmap.info().bytesPerPixel()); \
-                                SkASSERT(state.fAlphaScale < 256)
+                                SkASSERT(state.fAlphaScale <= 256)
 #define PREAMBLE(state)         unsigned alphaScale = state.fAlphaScale
 #define RETURNDST(src)          SkAlphaMulQ(src, alphaScale)
 #define SRC_TO_FILTER(src)      src
diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp
index b954d85..1087b4d 100644
--- a/src/opts/SkBitmapProcState_arm_neon.cpp
+++ b/src/opts/SkBitmapProcState_arm_neon.cpp
@@ -20,12 +20,8 @@
 #include "SkBitmapProcState_procs.h"
 
 const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
-    S32_opaque_D32_nofilter_DXDY_neon,
     S32_alpha_D32_nofilter_DXDY_neon,
-    S32_opaque_D32_nofilter_DX_neon,
     S32_alpha_D32_nofilter_DX_neon,
-    S32_opaque_D32_filter_DXDY_neon,
     S32_alpha_D32_filter_DXDY_neon,
-    S32_opaque_D32_filter_DX_neon,
     S32_alpha_D32_filter_DX_neon,
 };
diff --git a/src/opts/SkBitmapProcState_filter_neon.h b/src/opts/SkBitmapProcState_filter_neon.h
index 5352ce4..ab3cec8 100644
--- a/src/opts/SkBitmapProcState_filter_neon.h
+++ b/src/opts/SkBitmapProcState_filter_neon.h
@@ -5,54 +5,8 @@
  * found in the LICENSE file.
  */
 
-#include <arm_neon.h>
 #include "SkColorData.h"
-
-/*
- * Filter_32_opaque
- *
- * There is no hard-n-fast rule that the filtering must produce
- * exact results for the color components, but if the 4 incoming colors are
- * all opaque, then the output color must also be opaque. Subsequent parts of
- * the drawing pipeline may rely on this (e.g. which blitrow proc to use).
- *
- */
-// Chrome on Android uses -Os so we need to force these inline. Otherwise
-// calling the function in the inner loops will cause significant overhead on
-// some platforms.
-static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
-                                                   SkPMColor a00, SkPMColor a01,
-                                                   SkPMColor a10, SkPMColor a11,
-                                                   SkPMColor *dst) {
-    uint8x8_t vy, vconst16_8, v16_y, vres;
-    uint16x4_t vx, vconst16_16, v16_x, tmp;
-    uint32x2_t va0, va1;
-    uint16x8_t tmp1, tmp2;
-
-    vy = vdup_n_u8(y);                // duplicate y into vy
-    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
-    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
-
-    va0 = vdup_n_u32(a00);            // duplicate a00
-    va1 = vdup_n_u32(a10);            // duplicate a10
-    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
-    va1 = vset_lane_u32(a11, va1, 1); // set top to a11
-
-    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
-    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
-
-    vx = vdup_n_u16(x);                // duplicate x into vx
-    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
-    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
-
-    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
-    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
-    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
-    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
-
-    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
-    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
-}
+#include <arm_neon.h>
 
 static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
                                                   SkPMColor a00, SkPMColor a01,
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 92be971..391d421 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -8,125 +8,17 @@
 #include "SkBitmapProcState_opts_SSE2.h"
 #include "SkBitmapProcState_utils.h"
 #include "SkColorData.h"
-#include "SkPaint.h"
 #include "SkTo.h"
-#include "SkUTF.h"
 
 #include <emmintrin.h>
 
-void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
-                                   const uint32_t* xy,
-                                   int count, uint32_t* colors) {
-    SkASSERT(count > 0 && colors != nullptr);
-    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
-    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
-    SkASSERT(s.fAlphaScale == 256);
-
-    const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
-    size_t rb = s.fPixmap.rowBytes();
-    uint32_t XY = *xy++;
-    unsigned y0 = XY >> 14;
-    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
-    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
-    unsigned subY = y0 & 0xF;
-
-    // ( 0,  0,  0,  0,  0,  0,  0, 16)
-    __m128i sixteen = _mm_cvtsi32_si128(16);
-
-    // ( 0,  0,  0,  0, 16, 16, 16, 16)
-    sixteen = _mm_shufflelo_epi16(sixteen, 0);
-
-    // ( 0,  0,  0,  0,  0,  0,  0,  y)
-    __m128i allY = _mm_cvtsi32_si128(subY);
-
-    // ( 0,  0,  0,  0,  y,  y,  y,  y)
-    allY = _mm_shufflelo_epi16(allY, 0);
-
-    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
-    __m128i negY = _mm_sub_epi16(sixteen, allY);
-
-    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
-    allY = _mm_unpacklo_epi64(allY, negY);
-
-    // (16, 16, 16, 16, 16, 16, 16, 16 )
-    sixteen = _mm_shuffle_epi32(sixteen, 0);
-
-    // ( 0,  0,  0,  0,  0,  0,  0,  0)
-    __m128i zero = _mm_setzero_si128();
-    do {
-        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
-        unsigned x0 = XX >> 18;
-        unsigned x1 = XX & 0x3FFF;
-
-        // (0, 0, 0, 0, 0, 0, 0, x)
-        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
-
-        // (0, 0, 0, 0, x, x, x, x)
-        allX = _mm_shufflelo_epi16(allX, 0);
-
-        // (x, x, x, x, x, x, x, x)
-        allX = _mm_shuffle_epi32(allX, 0);
-
-        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
-        __m128i negX = _mm_sub_epi16(sixteen, allX);
-
-        // Load 4 samples (pixels).
-        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
-        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
-        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
-        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
-
-        // (0, 0, a00, a10)
-        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
-
-        // Expand to 16 bits per component.
-        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
-
-        // ((a00 * (16-y)), (a10 * y)).
-        a00a10 = _mm_mullo_epi16(a00a10, allY);
-
-        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
-        a00a10 = _mm_mullo_epi16(a00a10, negX);
-
-        // (0, 0, a01, a10)
-        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
-
-        // Expand to 16 bits per component.
-        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
-
-        // (a01 * (16-y)), (a11 * y)
-        a01a11 = _mm_mullo_epi16(a01a11, allY);
-
-        // (a01 * (16-y) * x), (a11 * y * x)
-        a01a11 = _mm_mullo_epi16(a01a11, allX);
-
-        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
-        __m128i sum = _mm_add_epi16(a00a10, a01a11);
-
-        // (DC, a00*w00 + a01*w01)
-        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
-
-        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
-        sum = _mm_add_epi16(sum, shifted);
-
-        // Divide each 16 bit component by 256.
-        sum = _mm_srli_epi16(sum, 8);
-
-        // Pack lower 4 16 bit values of sum into lower 4 bytes.
-        sum = _mm_packus_epi16(sum, zero);
-
-        // Extract low int and store.
-        *colors++ = _mm_cvtsi128_si32(sum);
-    } while (--count > 0);
-}
-
 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                   const uint32_t* xy,
                                   int count, uint32_t* colors) {
     SkASSERT(count > 0 && colors != nullptr);
     SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
     SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
-    SkASSERT(s.fAlphaScale < 256);
+    SkASSERT(s.fAlphaScale <= 256);
 
     const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
     size_t rb = s.fPixmap.rowBytes();
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 42fe80b..7faeab4 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -10,12 +10,9 @@
 
 #include "SkBitmapProcState.h"
 
-void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
-                                   const uint32_t* xy,
-                                   int count, uint32_t* colors);
 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
-                                  const uint32_t* xy,
-                                  int count, uint32_t* colors);
+                                  const uint32_t* xy, int count, uint32_t* colors);
+
 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
                                      int count, int x, int y);
 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
diff --git a/src/opts/SkBitmapProcState_opts_none.cpp b/src/opts/SkBitmapProcState_opts_none.cpp
index 9b3c7b6..44e975b 100644
--- a/src/opts/SkBitmapProcState_opts_none.cpp
+++ b/src/opts/SkBitmapProcState_opts_none.cpp
@@ -7,15 +7,4 @@
 
 #include "SkBitmapProcState.h"
 
-/*  A platform may optionally overwrite any of these with accelerated
-    versions. On input, these will already have valid function pointers,
-    so a platform need only overwrite the ones it chooses, based on the
-    current state (e.g. fBitmap, fInvMatrix, etc.)
-
-    fShaderProc32
-    fMatrixProc
-    fSampleProc32
- */
-
-// empty implementation just uses default supplied function pointers
 void SkBitmapProcState::platformProcs() {}
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 29debd7..9b63e4a 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -9,7 +9,6 @@
 #include "SkBitmapProcState_opts_SSSE3.h"
 #include "SkCpu.h"
 
-
 /*
  *****************************************
  *********This file is deprecated*********
@@ -21,41 +20,20 @@
  *****************************************
  */
 
-
-/* This file must *not* be compiled with -msse or any other optional SIMD
-   extension, otherwise gcc may generate SIMD instructions even for scalar ops
-   (and thus give an invalid instruction on Pentium3 on the code below).
-   For example, only files named *_SSE2.cpp in this directory should be
-   compiled with -msse2 or higher. */
-
-////////////////////////////////////////////////////////////////////////////////
-
 void SkBitmapProcState::platformProcs() {
-    /* Every optimization in the function requires at least SSE2 */
     if (!SkCpu::Supports(SkCpu::SSE2)) {
         return;
     }
-    const bool ssse3 = SkCpu::Supports(SkCpu::SSSE3);
 
-    /* Check fSampleProc32 */
-    if (fSampleProc32 == S32_opaque_D32_filter_DX) {
-        if (ssse3) {
-            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
-        } else {
-            fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
-        }
-    } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-        if (ssse3) {
-            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
-        } else {
-            fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
-        }
+    if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+        fSampleProc32 = SkCpu::Supports(SkCpu::SSSE3) ? S32_alpha_D32_filter_DX_SSSE3
+                                                      : S32_alpha_D32_filter_DX_SSE2;
     }
 
-    /* Check fMatrixProc */
     if (fMatrixProc == ClampX_ClampY_filter_scale) {
-        fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
-    } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
-        fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+        fMatrixProc =  ClampX_ClampY_filter_scale_SSE2;
+    }
+    if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+        fMatrixProc =  ClampX_ClampY_nofilter_scale_SSE2;
     }
 }