Make SSE2/NEON convolution functions not read extra bytes

This change makes the SSE2/NEON horizontal convolution functions stop reading
extra pixels past the end of the buffer. This lets us remove all the
SIMD-specific logic in SkConvolver that dealt with the last couple of rows,
and avoid applying padding to the convolution filters.
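
When a filter length is not a multiple of 4, the trailing taps are now
accumulated one pixel at a time instead of through a masked, over-wide SIMD
load. A minimal plain-C++ sketch of the idea (illustrative only; the real
helpers are the accum_remainder functions added in the diff below, and the
type name here is a stand-in for SkConvolutionFilter1D::ConvolutionFixed):

    #include <cstdint>

    typedef int16_t ConvolutionFixed;  // fixed-point filter coefficient

    // Accumulate the last r (1..3) taps of one output pixel, channel by
    // channel, without reading past the end of the source row.
    static void accum_remainder_scalar(const unsigned char* pixels_left,
                                       const ConvolutionFixed* filter_values,
                                       int32_t accum[4], int r) {
        for (int i = 0; i < r; i++) {
            ConvolutionFixed coeff = filter_values[i];
            accum[0] += coeff * pixels_left[i * 4 + 0];  // R
            accum[1] += coeff * pixels_left[i * 4 + 1];  // G
            accum[2] += coeff * pixels_left[i * 4 + 2];  // B
            accum[3] += coeff * pixels_left[i * 4 + 3];  // A
        }
    }

    // Call site: the remainder starts r taps before the end of the filter,
    // i.e. at byte offset (filter_offset + filter_length - r) * 4 in the row.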

The performance impact is small. Nanobench time changes:
                              SSE2    NEON
bitmap_scale_filter_64_256     1%     -2%
bitmap_scale_filter_256_64     1%      2%
bitmap_scale_filter_90_10      1%     -1%
bitmap_scale_filter_90_30      1%      0%
bitmap_scale_filter_90_80      1%      0%
bitmap_scale_filter_90_90      1%      1%
bitmap_scale_filter_80_90      0%      0%
bitmap_scale_filter_30_90      3%      6%
bitmap_scale_filter_10_90      0%      2%

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2481733003
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2481733003
diff --git a/src/core/SkBitmapScaler.cpp b/src/core/SkBitmapScaler.cpp
index 5edb1b2..25fbd1f 100644
--- a/src/core/SkBitmapScaler.cpp
+++ b/src/core/SkBitmapScaler.cpp
@@ -22,8 +22,7 @@
     SkResizeFilter(SkBitmapScaler::ResizeMethod method,
                    int srcFullWidth, int srcFullHeight,
                    float destWidth, float destHeight,
-                   const SkRect& destSubset,
-                   const SkConvolutionProcs& convolveProcs);
+                   const SkRect& destSubset);
     ~SkResizeFilter() { delete fBitmapFilter; }
 
     // Returns the filled filter values.
@@ -48,8 +47,7 @@
     void computeFilters(int srcSize,
                         float destSubsetLo, float destSubsetSize,
                         float scale,
-                        SkConvolutionFilter1D* output,
-                        const SkConvolutionProcs& convolveProcs);
+                        SkConvolutionFilter1D* output);
 
     SkConvolutionFilter1D fXFilter;
     SkConvolutionFilter1D fYFilter;
@@ -58,8 +56,7 @@
 SkResizeFilter::SkResizeFilter(SkBitmapScaler::ResizeMethod method,
                                int srcFullWidth, int srcFullHeight,
                                float destWidth, float destHeight,
-                               const SkRect& destSubset,
-                               const SkConvolutionProcs& convolveProcs) {
+                               const SkRect& destSubset) {
 
     SkASSERT(method >= SkBitmapScaler::RESIZE_FirstMethod &&
              method <= SkBitmapScaler::RESIZE_LastMethod);
@@ -88,7 +85,7 @@
     float scaleY = destHeight / srcFullHeight;
 
     this->computeFilters(srcFullWidth, destSubset.fLeft, destSubset.width(),
-                         scaleX, &fXFilter, convolveProcs);
+                         scaleX, &fXFilter);
     if (srcFullWidth == srcFullHeight &&
         destSubset.fLeft == destSubset.fTop &&
         destSubset.width() == destSubset.height()&&
@@ -96,7 +93,7 @@
         fYFilter = fXFilter;
     } else {
         this->computeFilters(srcFullHeight, destSubset.fTop, destSubset.height(),
-                          scaleY, &fYFilter, convolveProcs);
+                          scaleY, &fYFilter);
     }
 }
 
@@ -114,8 +111,7 @@
 void SkResizeFilter::computeFilters(int srcSize,
                                   float destSubsetLo, float destSubsetSize,
                                   float scale,
-                                  SkConvolutionFilter1D* output,
-                                  const SkConvolutionProcs& convolveProcs) {
+                                  SkConvolutionFilter1D* output) {
   float destSubsetHi = destSubsetLo + destSubsetSize;  // [lo, hi)
 
   // When we're doing a magnification, the scale will be larger than one. This
@@ -200,10 +196,6 @@
     // Now it's ready to go.
     output->AddFilter(SkScalarFloorToInt(srcBegin), fixedFilterValues, filterCount);
   }
-
-  if (convolveProcs.fApplySIMDPadding) {
-      convolveProcs.fApplySIMDPadding(output);
-  }
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -222,13 +214,13 @@
         return false;
     }
 
-    SkConvolutionProcs convolveProcs= { 0, nullptr, nullptr, nullptr, nullptr };
+    SkConvolutionProcs convolveProcs= { nullptr, nullptr, nullptr };
     PlatformConvolutionProcs(&convolveProcs);
 
     SkRect destSubset = SkRect::MakeIWH(result.width(), result.height());
 
     SkResizeFilter filter(method, source.width(), source.height(),
-                          result.width(), result.height(), destSubset, convolveProcs);
+                          result.width(), result.height(), destSubset);
 
     // Get a subset encompassing this touched area. We construct the
     // offsets and row strides such that it looks like a new bitmap, while
diff --git a/src/core/SkConvolver.cpp b/src/core/SkConvolver.cpp
index c662e2d..c32cc03 100644
--- a/src/core/SkConvolver.cpp
+++ b/src/core/SkConvolver.cpp
@@ -401,19 +401,6 @@
     // We need to check which is the last line to convolve before we advance 4
     // lines in one iteration.
     int lastFilterOffset, lastFilterLength;
-
-    // SSE2 can access up to 3 extra pixels past the end of the
-    // buffer. At the bottom of the image, we have to be careful
-    // not to access data past the end of the buffer. Normally
-    // we fall back to the C++ implementation for the last row.
-    // If the last row is less than 3 pixels wide, we may have to fall
-    // back to the C++ version for more rows. Compute how many
-    // rows we need to avoid the SSE implementation for here.
-    filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset,
-                           &lastFilterLength);
-    int avoidSimdRows = 1 + convolveProcs.fExtraHorizontalReads /
-        (lastFilterOffset + lastFilterLength);
-
     filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset,
                            &lastFilterLength);
 
@@ -424,8 +411,7 @@
         // Generate output rows until we have enough to run the current filter.
         while (nextXRow < filterOffset + filterLength) {
             if (convolveProcs.fConvolve4RowsHorizontally &&
-                nextXRow + 3 < lastFilterOffset + lastFilterLength -
-                avoidSimdRows) {
+                nextXRow + 3 < lastFilterOffset + lastFilterLength) {
                 const unsigned char* src[4];
                 unsigned char* outRow[4];
                 for (int i = 0; i < 4; ++i) {
@@ -435,10 +421,7 @@
                 convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow, 4*rowBufferWidth);
                 nextXRow += 4;
             } else {
-                // Check if we need to avoid SSE2 for this row.
-                if (convolveProcs.fConvolveHorizontally &&
-                    nextXRow < lastFilterOffset + lastFilterLength -
-                    avoidSimdRows) {
+                if (convolveProcs.fConvolveHorizontally) {
                     convolveProcs.fConvolveHorizontally(
                         &sourceData[(uint64_t)nextXRow * sourceByteRowStride],
                         filterX, rowBuffer.advanceRow(), sourceHasAlpha);
diff --git a/src/core/SkConvolver.h b/src/core/SkConvolver.h
index 4e23f6c..28a08df 100644
--- a/src/core/SkConvolver.h
+++ b/src/core/SkConvolver.h
@@ -157,17 +157,11 @@
     const SkConvolutionFilter1D& filter,
     unsigned char* outRow,
     bool hasAlpha);
-typedef void (*SkConvolveFilterPadding_pointer)(
-    SkConvolutionFilter1D* filter);
 
 struct SkConvolutionProcs {
-  // This is how many extra pixels may be read by the
-  // conolve*horizontally functions.
-    int fExtraHorizontalReads;
     SkConvolveVertically_pointer fConvolveVertically;
     SkConvolve4RowsHorizontally_pointer fConvolve4RowsHorizontally;
     SkConvolveHorizontally_pointer fConvolveHorizontally;
-    SkConvolveFilterPadding_pointer fApplySIMDPadding;
 };
 
 
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
index ecaad23..324ac1a 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -40,6 +40,20 @@
 }
 #endif
 
+static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,
+        const SkConvolutionFilter1D::ConvolutionFixed* filter_values, __m128i& accum, int r) {
+    int remainder[4] = {0};
+    for (int i = 0; i < r; i++) {
+        SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];
+        remainder[0] += coeff * pixels_left[i * 4 + 0];
+        remainder[1] += coeff * pixels_left[i * 4 + 1];
+        remainder[2] += coeff * pixels_left[i * 4 + 2];
+        remainder[3] += coeff * pixels_left[i * 4 + 3];
+    }
+    __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);
+    accum = _mm_add_epi32(accum, t);
+}
+
 // Convolves horizontally along a single row. The row data is given in
 // |src_data| and continues for the num_values() of the filter.
 void convolveHorizontally_SSE2(const unsigned char* src_data,
@@ -50,13 +64,6 @@
 
     int filter_offset, filter_length;
     __m128i zero = _mm_setzero_si128();
-    __m128i mask[4];
-    // |mask| will be used to decimate all extra filter coefficients that are
-    // loaded by SIMD when |filter_length| is not divisible by 4.
-    // mask[0] is not used in following algorithm.
-    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
 
     // Output one pixel each iteration, calculating all channels (RGBA) together.
     for (int out_x = 0; out_x < num_values; out_x++) {
@@ -120,38 +127,12 @@
             filter_values += 4;
         }
 
-        // When |filter_length| is not divisible by 4, we need to decimate some of
-        // the filter coefficient that was loaded incorrectly to zero; Other than
-        // that the algorithm is same with above, exceot that the 4th pixel will be
-        // always absent.
-        int r = filter_length&3;
+        // When |filter_length| is not divisible by 4, we accumulate the last
+        // 1 to 3 coefficients one at a time.
+        int r = filter_length & 3;
         if (r) {
-            // Note: filter_values must be padded to align_up(filter_offset, 8).
-            __m128i coeff, coeff16;
-            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
-            // Mask out extra filter taps.
-            coeff = _mm_and_si128(coeff, mask[r]);
-            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
-            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-
-            // Note: line buffer must be padded to align_up(filter_offset, 16).
-            // We resolve this by use C-version for the last horizontal line.
-            __m128i src8 = _mm_loadu_si128(row_to_filter);
-            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
-            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
-            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
-            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-            accum = _mm_add_epi32(accum, t);
-            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
-            accum = _mm_add_epi32(accum, t);
-
-            src16 = _mm_unpackhi_epi8(src8, zero);
-            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
-            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
-            mul_hi = _mm_mulhi_epi16(src16, coeff16);
-            mul_lo = _mm_mullo_epi16(src16, coeff16);
-            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
-            accum = _mm_add_epi32(accum, t);
+            int remainder_offset = (filter_offset + filter_length - r) * 4;
+            accum_remainder(src_data + remainder_offset, filter_values, accum, r);
         }
 
         // Shift right for fixed point implementation.
@@ -182,13 +163,6 @@
 
     int filter_offset, filter_length;
     __m128i zero = _mm_setzero_si128();
-    __m128i mask[4];
-    // |mask| will be used to decimate all extra filter coefficients that are
-    // loaded by SIMD when |filter_length| is not divisible by 4.
-    // mask[0] is not used in following algorithm.
-    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
 
     // Output one pixel each iteration, calculating all channels (RGBA) together.
     for (int out_x = 0; out_x < num_values; out_x++) {
@@ -245,24 +219,11 @@
 
         int r = filter_length & 3;
         if (r) {
-            // Note: filter_values must be padded to align_up(filter_offset, 8);
-            __m128i coeff;
-            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
-            // Mask out extra filter taps.
-            coeff = _mm_and_si128(coeff, mask[r]);
-
-            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
-            /* c1 c1 c1 c1 c0 c0 c0 c0 */
-            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
-            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
-            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
-
-            __m128i src8, src16, mul_hi, mul_lo, t;
-
-            ITERATION(src_data[0] + start, accum0);
-            ITERATION(src_data[1] + start, accum1);
-            ITERATION(src_data[2] + start, accum2);
-            ITERATION(src_data[3] + start, accum3);
+            int remainder_offset = (filter_offset + filter_length - r) * 4;
+            accum_remainder(src_data[0] + remainder_offset, filter_values, accum0, r);
+            accum_remainder(src_data[1] + remainder_offset, filter_values, accum1, r);
+            accum_remainder(src_data[2] + remainder_offset, filter_values, accum2, r);
+            accum_remainder(src_data[3] + remainder_offset, filter_values, accum3, r);
         }
 
         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
@@ -487,14 +448,3 @@
                                        out_row);
     }
 }
-
-void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
-    // Padding |paddingCount| of more dummy coefficients after the coefficients
-    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
-    // together to access invalid memory areas. We are not trying to align the
-    // coefficients right now due to the opaqueness of <vector> implementation.
-    // This has to be done after all |AddFilter| calls.
-    for (int i = 0; i < 8; ++i) {
-        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
-    }
-}
diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp
index ce2656d..4193e6a 100644
--- a/src/opts/SkBitmapProcState_arm_neon.cpp
+++ b/src/opts/SkBitmapProcState_arm_neon.cpp
@@ -83,6 +83,20 @@
 #include <arm_neon.h>
 #include "SkConvolver.h"
 
+static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left,
+        const SkConvolutionFilter1D::ConvolutionFixed* filter_values, int32x4_t& accum, int r) {
+    int remainder[4] = {0};
+    for (int i = 0; i < r; i++) {
+        SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i];
+        remainder[0] += coeff * pixels_left[i * 4 + 0];
+        remainder[1] += coeff * pixels_left[i * 4 + 1];
+        remainder[2] += coeff * pixels_left[i * 4 + 2];
+        remainder[3] += coeff * pixels_left[i * 4 + 3];
+    }
+    int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]};
+    accum += t;
+}
+
 // Convolves horizontally along a single row. The row data is given in
 // |srcData| and continues for the numValues() of the filter.
 void convolveHorizontally_neon(const unsigned char* srcData,
@@ -140,33 +154,11 @@
             rowToFilter += 16;
             filterValues += 4;
         }
+
         int r = filterLength & 3;
         if (r) {
-            const uint16_t mask[4][4] = {
-                {0, 0, 0, 0},
-                {0xFFFF, 0, 0, 0},
-                {0xFFFF, 0xFFFF, 0, 0},
-                {0xFFFF, 0xFFFF, 0xFFFF, 0}
-            };
-            uint16x4_t coeffs;
-            int16x4_t coeff0, coeff1, coeff2;
-            coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
-            coeffs &= vld1_u16(&mask[r][0]);
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));
-
-            // Load pixels and calc
-            uint8x16_t pixels = vld1q_u8(rowToFilter);
-            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
-            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
-            int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);
-            int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);
-            int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);
-
-            accum += p0;
-            accum += p1;
-            accum += p2;
+            int remainder_offset = (filterOffset + filterLength - r) * 4;
+            accum_remainder(srcData + remainder_offset, filterValues, accum, r);
         }
 
         // Bring this value back in range. All of the filter scaling factors
@@ -374,15 +366,6 @@
     int num_values = filter.numValues();
 
     int filterOffset, filterLength;
-    // |mask| will be used to decimate all extra filter coefficients that are
-    // loaded by SIMD when |filter_length| is not divisible by 4.
-    // mask[0] is not used in following algorithm.
-    const uint16_t mask[4][4] = {
-        {0, 0, 0, 0},
-        {0xFFFF, 0, 0, 0},
-        {0xFFFF, 0xFFFF, 0, 0},
-        {0xFFFF, 0xFFFF, 0xFFFF, 0}
-    };
 
     // Output one pixel each iteration, calculating all channels (RGBA) together.
     for (int outX = 0; outX < num_values; outX++) {
@@ -437,22 +420,11 @@
 
         int r = filterLength & 3;
         if (r) {
-            int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
-            coeffs = vld1_s16(filterValues);
-            coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));
-            coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
-            coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
-            coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
-            coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
-
-            uint8x16_t pixels;
-            int16x8_t p01_16, p23_16;
-            int32x4_t p0, p1, p2, p3;
-
-            ITERATION(srcData[0] + start, accum0);
-            ITERATION(srcData[1] + start, accum1);
-            ITERATION(srcData[2] + start, accum2);
-            ITERATION(srcData[3] + start, accum3);
+            int remainder_offset = (filterOffset + filterLength - r) * 4;
+            accum_remainder(srcData[0] + remainder_offset, filterValues, accum0, r);
+            accum_remainder(srcData[1] + remainder_offset, filterValues, accum1, r);
+            accum_remainder(srcData[2] + remainder_offset, filterValues, accum2, r);
+            accum_remainder(srcData[3] + remainder_offset, filterValues, accum3, r);
         }
 
         int16x4_t accum16;
@@ -479,21 +451,8 @@
     }
 }
 
-void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
-    // Padding |paddingCount| of more dummy coefficients after the coefficients
-    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
-    // together to access invalid memory areas. We are not trying to align the
-    // coefficients right now due to the opaqueness of <vector> implementation.
-    // This has to be done after all |AddFilter| calls.
-    for (int i = 0; i < 8; ++i) {
-        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
-    }
-}
-
 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
-    procs->fExtraHorizontalReads = 3;
     procs->fConvolveVertically = &convolveVertically_neon;
     procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
     procs->fConvolveHorizontally = &convolveHorizontally_neon;
-    procs->fApplySIMDPadding = &applySIMDPadding_neon;
 }
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index a8003a3..64cf0da 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -37,11 +37,9 @@
 
 void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) {
     if (SkCpu::Supports(SkCpu::SSE2)) {
-        procs->fExtraHorizontalReads = 3;
         procs->fConvolveVertically = &convolveVertically_SSE2;
         procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
         procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
-        procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
     }
 }