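Optimize highQualityFilter

Fetch s.getBitmapFilter() once before the loops instead of on every use, and
skip the color accumulation for source pixels whose packed value is 0, since
they add nothing to the channel sums. The combined weight is still accumulated
for every sample, so it is now added before the zero-pixel check.
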
portable version:
before:
  10M   1       806µs   807µs   810µs   821µs   1%      █▂▁▁▃▁▁▁█▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic
after:
  10M   1       566µs   568µs   569µs   579µs   1%      ▄▂▂█▂▁▁▁▃▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic

SSE version:
before:
  10M   1       485µs   486µs   487µs   494µs   1%      ▇▂▁▁▁▁█▂▁▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic
after:
  10M   1       419µs   420µs   421µs   430µs   1%      ▅▃▂▁▁█▂▁▁▁ 8888    bitmap_BGRA_8888_A_scale_rotate_bicubic

BUG=skia:

Review URL: https://codereview.chromium.org/759603002
diff --git a/src/core/SkBitmapFilter.cpp b/src/core/SkBitmapFilter.cpp
index 20a0514..fce4c1d 100644
--- a/src/core/SkBitmapFilter.cpp
+++ b/src/core/SkBitmapFilter.cpp
@@ -29,6 +29,7 @@
     const int maxX = s.fBitmap->width();
     const int maxY = s.fBitmap->height();
     SkAutoTMalloc<SkScalar> xWeights(maxX);
+    const SkBitmapFilter* filter = s.getBitmapFilter();
 
     while (count-- > 0) {
         SkPoint srcPt;
@@ -40,30 +41,33 @@
         SkScalar weight = 0;
         SkScalar fr = 0, fg = 0, fb = 0, fa = 0;
 
-        int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
-        int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY);
-        int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
-        int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX);
+        int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY);
+        int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1), maxY);
+        int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX);
+        int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1, maxX);
 
         for (int srcX = x0; srcX < x1 ; srcX++) {
             // Looking these up once instead of each loop is a ~15% speedup.
-            xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
+            xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX));
         }
 
         for (int srcY = y0; srcY < y1; srcY++) {
-            SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
+            SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY));
 
             for (int srcX = x0; srcX < x1 ; srcX++) {
                 SkScalar xWeight = xWeights[srcX - x0];
 
                 SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
+                weight += combined_weight;
 
                 SkPMColor c = *s.fBitmap->getAddr32(srcX, srcY);
+                if (!c) {
+                    continue;
+                }
                 fr += combined_weight * SkGetPackedR32(c);
                 fg += combined_weight * SkGetPackedG32(c);
                 fb += combined_weight * SkGetPackedB32(c);
                 fa += combined_weight * SkGetPackedA32(c);
-                weight += combined_weight;
             }
         }
 
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
index 04f1486..2996f53 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -49,6 +49,7 @@
     const int maxX = s.fBitmap->width();
     const int maxY = s.fBitmap->height();
     SkAutoTMalloc<SkScalar> xWeights(maxX);
+    const SkBitmapFilter* filter = s.getBitmapFilter();
 
     while (count-- > 0) {
         SkPoint srcPt;
@@ -59,34 +60,37 @@
         __m128 weight = _mm_setzero_ps();
         __m128 accum = _mm_setzero_ps();
 
-        int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
-        int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY);
-        int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
-        int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX);
+        int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY);
+        int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1), maxY);
+        int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX);
+        int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1, maxX);
 
         for (int srcX = x0; srcX < x1 ; srcX++) {
             // Looking these up once instead of each loop is a ~15% speedup.
-            xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
+            xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX));
         }
 
         for (int srcY = y0; srcY < y1; srcY++) {
-            SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
+            SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY));
 
             for (int srcX = x0; srcX < x1 ; srcX++) {
                 SkScalar xWeight = xWeights[srcX - x0];
 
                 SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
+                __m128 weightVector = _mm_set1_ps(combined_weight);
+                weight = _mm_add_ps( weight, weightVector );
 
                 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);
+                if (!color) {
+                    continue;
+                }
 
                 __m128i c = _mm_cvtsi32_si128(color);
                 c = _mm_unpacklo_epi8(c, _mm_setzero_si128());
                 c = _mm_unpacklo_epi16(c, _mm_setzero_si128());
                 __m128 cfloat = _mm_cvtepi32_ps(c);
 
-                __m128 weightVector = _mm_set1_ps(combined_weight);
                 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));
-                weight = _mm_add_ps( weight, weightVector );
             }
         }