Add SkBlendARGB32_SSE2() to clean up code

Related nanobench results:
before:
maxrss  loops   min     median  mean    max     stddev  samples         config  bench
  10M   2       31.9µs  32.4µs  33.3µs  38.7µs  6%      █▄▂▂▂▁▂▁▁▁      8888    bitmap_BGRA_8888_A_scale_bicubic
  10M   13      43.8µs  51.8µs  49.6µs  57.9µs  11%     ▁▁▁▁▂▆▇▆▅█      8888    bitmap_BGRA_8888_A_scale_bilerp
  10M   13      23.7µs  24.3µs  26µs    32.7µs  13%     ▅█▆▁▁▁▁▂▁▁      8888    bitmap_Index_8_A
  10M   4       1.68µs  1.7µs   4.09µs  25.4µs  183%    █▁▁▁▁▁▁▁▁▁      8888    text_16_AA_88
  10M   144     1.76µs  1.77µs  1.78µs  1.81µs  1%      █▂▇▂▅▁▁▁▁▁      8888    text_16_AA_FF
  10M   10      4.7µs   5.34µs  5.61µs  8.63µs  21%     █▂▂▃▂▁▁▁▁▄      8888    rotated_rects_aa_alternating_transparent_and_opaque_src
  10M   50      4.44µs  4.47µs  4.5µs   4.71µs  2%      █▅▃▂▂▂▁▁▁▁      8888    rotated_rects_aa_changing_opaque_src
  10M   51      4.39µs  4.78µs  5.21µs  6.62µs  17%     ▁▆▆▇▁▁█▁▂▂      8888    rotated_rects_aa_same_opaque_src
  10M   50      4.47µs  5.79µs  5.43µs  6.14µs  11%     ▄▂▁▃▇▇▆▇▇█      8888    rotated_rects_aa_alternating_transparent_and_opaque_srcover
  10M   30      4.35µs  6.06µs  5.84µs  7.63µs  16%     ▅▅▅▄▅▅▄█▁▁      8888    rotated_rects_aa_changing_transparent_srcover
  10M   44      4.31µs  4.51µs  4.76µs  6.25µs  13%     ▄▂▂▁█▃▁▃▁▁      8888    rotated_rects_aa_changing_opaque_srcover
  10M   46      4.36µs  4.42µs  4.75µs  6.19µs  14%     ▆█▃▁▁▁▁▁▁▁      8888    rotated_rects_aa_same_transparent_srcover
  10M   47      4.29µs  4.35µs  4.44µs  5.15µs  6%      ▃▂▂▁▁█▁▁▁▁      8888    rotated_rects_aa_same_opaque_srcover
  10M   3       39.1µs  39.2µs  50.7µs  153µs   71%     █▁▁▁▁▁▁▁▁▁      8888    rectori
  10M   1       2.3ms   2.31ms  2.35ms  2.74ms  6%      ▁▁▁▁▁▁▁▁█▂      8888    maskcolor
  10M   1       2.33ms  2.34ms  2.53ms  3.14ms  11%     ▁▁▁▁▁▁▅█▄▄      8888    maskopaque
  10M   11      15µs    15.3µs  15.7µs  18.3µs  7%      ▅▃▂▂▁▁▁▁█▁      8888    rrects_3_stroke_4
  10M   46      3.99µs  4.07µs  4.14µs  4.54µs  4%      █▅▅▃▂▂▁▁▁▁      8888    rrects_3
  10M   16      15.6µs  15.9µs  16.1µs  17.5µs  4%      █▄▃▂▂▂▁▂▁▁      8888    ovals_3_stroke_4
  10M   40      5.09µs  5.18µs  5.23µs  5.67µs  3%      █▅▃▂▂▁▃▁▁▁      8888    ovals_3
  10M   231     1.92µs  1.93µs  1.94µs  2µs     1%      █▃▂▁▃▁▁▁▁▁      8888    zeroradroundrect
  10M   924     3.88µs  3.93µs  4.11µs  4.95µs  9%      ▁█▆▃▁▁▁▁▁▁      8888    arbroundrect
  10M   8       8.11µs  8.47µs  8.48µs  8.85µs  3%      █▅▇▄▄▂▁▄▄▆      8888    merge_large
  10M   14      6.71µs  6.92µs  6.96µs  7.46µs  3%      ▃▆▁█▃▃▃▂▂▁      8888    merge_small
  11M   2       225µs   227µs   229µs   233µs   1%      ███▃▇▂▃▁▃▂      8888    displacement_full_large
  16M   1       381µs   401µs   401µs   421µs   3%      ▅▅▅█▆▄▄▃▃▁      8888    displacement_alpha_large
  19M   1       507µs   508µs   509µs   512µs   0%      █▃▂▆▂▂▃▂▃▁      8888    displacement_zero_large
  19M   19      9µs     9.11µs  9.15µs  9.67µs  2%      ▄▂▂▂█▂▁▁▁▂      8888    displacement_full_small
  19M   5       54.2µs  54.5µs  54.9µs  58µs    2%      █▃▂▂▁▁▃▁▁▁      8888    blurroundrect_WH[100x100]_cr[90]
  20M   1       229µs   230µs   231µs   240µs   2%      █▄▃▂▂▁▁▁▁▂      8888    GM_varied_text_clipped_no_lcd
  20M   1       267µs   269µs   270µs   279µs   1%      █▄▃▂▂▂▂▂▁▁      8888    GM_varied_text_ignorable_clip_no_lcd
  22M   1       1.95ms  1.97ms  2.03ms  2.46ms  8%      ▁▁▁▁▁▁▁▂█▃      8888    GM_convex_poly_clip

after:
maxrss	loops	min	median	mean	max	stddev	samples   	config	bench
  10M	2	31.5µs	32.3µs	32.8µs	37.2µs	5%	█▄▃▂▂▂▁▁▁▁	8888	bitmap_BGRA_8888_A_scale_bicubic
  10M	13	43.9µs	44µs	44.1µs	44.9µs	1%	█▂▁▁▁▆▁▁▁▂	8888	bitmap_BGRA_8888_A_scale_bilerp
  10M	19	22.7µs	23.3µs	25.6µs	32.4µs	14%	▁▁▁▁▁▅▆▁▅█	8888	bitmap_Index_8_A
  10M	5	1.79µs	1.97µs	3.85µs	21.1µs	158%	█▁▁▁▁▁▁▁▁▁	8888	text_16_AA_88
  10M	141	1.83µs	1.83µs	1.85µs	1.93µs	2%	▅▁▁█▁▁▁▁▁▁	8888	text_16_AA_FF
  10M	10	4.65µs	4.92µs	5.06µs	6.56µs	11%	█▃▃▂▂▂▁▁▁▁	8888	rotated_rects_aa_alternating_transparent_and_opaque_src
  10M	51	4.35µs	4.48µs	4.83µs	6.68µs	17%	▂▁▁▁▁▁▁▂▆█	8888	rotated_rects_aa_changing_opaque_src
  10M	51	4.38µs	4.79µs	4.85µs	5.84µs	11%	▁█▁▃▃▁▄▁▄▇	8888	rotated_rects_aa_same_opaque_src
  10M	32	5.58µs	6.24µs	6.1µs	6.39µs	5%	█▂█▆▁▇▄▅▇▇	8888	rotated_rects_aa_alternating_transparent_and_opaque_srcover
  10M	42	4.28µs	5.59µs	5.11µs	6.01µs	15%	▂▂█▇█▂▁▆▁▇	8888	rotated_rects_aa_changing_transparent_srcover
  10M	48	4.24µs	4.33µs	4.58µs	6.46µs	15%	▁▁▁▁▁█▃▂▁▁	8888	rotated_rects_aa_changing_opaque_srcover
  10M	48	4.28µs	4.3µs	4.4µs	5.12µs	6%	▂▂▁▁▁▁▁▁▁█	8888	rotated_rects_aa_same_transparent_srcover
  10M	46	4.24µs	4.29µs	4.66µs	7.11µs	20%	▁▁▁▁▁▁▁▁▃█	8888	rotated_rects_aa_same_opaque_srcover
  10M	3	39.3µs	39.4µs	51.4µs	154µs	70%	█▁▁▁▁▁▁▁▁▁	8888	rectori
  10M	1	2.32ms	2.43ms	2.53ms	3.14ms	11%	▁▁▁▁▂▄█▃▅▁	8888	maskcolor
  10M	1	2.33ms	2.37ms	2.54ms	3.21ms	12%	▁▁▁▁▁▂█▅▆▁	8888	maskopaque
  10M	10	15.3µs	15.6µs	15.8µs	17.2µs	4%	█▅▃▂▂▂▁▁▁▁	8888	rrects_3_stroke_4
  10M	46	4.03µs	4.09µs	4.15µs	4.47µs	4%	█▄▆▂▂▂▁▁▁▁	8888	rrects_3
  10M	15	15.9µs	16.2µs	16.3µs	17.8µs	4%	█▄▃▂▂▂▁▁▁▁	8888	ovals_3_stroke_4
  10M	40	5.14µs	5.26µs	5.29µs	5.72µs	3%	█▅▃▂▂▁▂▂▁▁	8888	ovals_3
  10M	222	1.91µs	1.99µs	2.21µs	2.91µs	19%	▂▁▁▁▁▁▂▇▇█	8888	zeroradroundrect
  10M	462	3.9µs	3.96µs	4.23µs	5.22µs	12%	▆▄█▁▂▁▁▁▁▁	8888	arbroundrect
  10M	8	8.2µs	8.59µs	8.62µs	8.97µs	3%	▆▄█▄▅▃▁▆▄█	8888	merge_large
  10M	14	6.73µs	6.88µs	6.86µs	7.08µs	2%	▄█▁▂▄▂▅▄▂▅	8888	merge_small
  11M	2	221µs	234µs	237µs	263µs	5%	▄▃▃▃▄▃▂▁▇█	8888	displacement_full_large
  16M	1	387µs	416µs	427µs	471µs	7%	▇█▁▃▃▁▃▃▇▆	8888	displacement_alpha_large
  19M	1	512µs	521µs	528µs	594µs	5%	█▂▂▂▁▁▂▃▁▁	8888	displacement_zero_large
  19M	18	9.06µs	9.12µs	9.13µs	9.23µs	1%	█▃▃▃▄▃▆▁▅▅	8888	displacement_full_small
  19M	5	55.6µs	55.9µs	56.5µs	59.5µs	2%	█▃▂▁▁▁▁▁▅▁	8888	blurroundrect_WH[100x100]_cr[90]
  20M	1	229µs	233µs	235µs	254µs	3%	█▄▃▂▂▁▁▂▁▁	8888	GM_varied_text_clipped_no_lcd
  20M	1	270µs	271µs	272µs	278µs	1%	█▄▃▂▂▂▁▂▁▇	8888	GM_varied_text_ignorable_clip_no_lcd
  22M	1	1.96ms	2ms	2.06ms	2.45ms	7%	▂▂▁▁▁▁▁▃█▄	8888	GM_convex_poly_clip

BUG=skia:

Review URL: https://codereview.chromium.org/754733002
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 17cd20a..7b9c043 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -207,74 +207,14 @@
             count--;
         }
 
-        uint32_t src_scale = SkAlpha255To256(alpha);
-
         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
-        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
-        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
-        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
         while (count >= 4) {
             // Load 4 pixels each of src and dest.
             __m128i src_pixel = _mm_loadu_si128(s);
             __m128i dst_pixel = _mm_load_si128(d);
 
-            // Get red and blue pixels into lower byte of each word.
-            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
-            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
-
-            // Get alpha and green into lower byte of each word.
-            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
-            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
-
-            // Put per-pixel alpha in low byte of each word.
-            // After the following two statements, the dst_alpha looks like
-            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
-            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
-            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
-
-            // dst_alpha = dst_alpha * src_scale
-            // Because src_scales are in the higher byte of each word and
-            // we use mulhi here, the resulting alpha values are already
-            // in the right place and don't need to be divided by 256.
-            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
-            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
-
-            // Subtract alphas from 256, to get 1..256
-            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
-
-            // Multiply red and blue by dst pixel alpha.
-            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
-            // Multiply alpha and green by dst pixel alpha.
-            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
-
-            // Multiply red and blue by global alpha.
-            // (4 x (0, rs.h, 0, bs.h))
-            // where rs.h stands for the higher byte of r * src_scale,
-            // and bs.h the higher byte of b * src_scale.
-            // Again, because we use mulhi, the resuling red and blue
-            // values are already in the right place and don't need to
-            // be divided by 256.
-            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
-            // Multiply alpha and green by global alpha.
-            // (4 x (0, as.h, 0, gs.h))
-            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
-
-            // Divide by 256.
-            dst_rb = _mm_srli_epi16(dst_rb, 8);
-
-            // Mask out low bits (goodies already in the right place; no need to divide)
-            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
-            // Shift alpha and green to higher byte of each word.
-            // (4 x (as.h, 0, gs.h, 0))
-            src_ag = _mm_slli_epi16(src_ag, 8);
-
-            // Combine back into RGBA.
-            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
-            src_pixel = _mm_or_si128(src_rb, src_ag);
-
-            // Add two pixels into result.
-            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
+            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
             _mm_store_si128(d, result);
             s++;
             d++;
@@ -367,73 +307,24 @@
                 count--;
             }
             __m128i *d = reinterpret_cast<__m128i*>(dst);
-            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
-            __m128i c_256 = _mm_set1_epi16(256);
-            __m128i c_1 = _mm_set1_epi16(1);
             __m128i src_pixel = _mm_set1_epi32(color);
             while (count >= 4) {
-                // Load 4 pixels each of src and dest.
+                // Load 4 dst pixels
                 __m128i dst_pixel = _mm_load_si128(d);
 
-                //set the aphla value
-                __m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
-                src_scale_wide = _mm_unpacklo_epi8(src_scale_wide,
-                                                   _mm_setzero_si128());
-                src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wide);
+                // Widen 4 mask bytes to one alpha per 32-bit lane
+                __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
+                alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
+                alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
 
-                //call SkAlpha255To256()
-                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
-
-                // Get red and blue pixels into lower byte of each word.
-                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
-                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
-
-                // Get alpha and green into lower byte of each word.
-                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
-                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
-
-                // Put per-pixel alpha in low byte of each word.
-                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
-                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
-
-                // dst_alpha = dst_alpha * src_scale
-                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
-
-                // Divide by 256.
-                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
-
-                // Subtract alphas from 256, to get 1..256
-                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
-                // Multiply red and blue by dst pixel alpha.
-                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
-                // Multiply alpha and green by dst pixel alpha.
-                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
-
-                // Multiply red and blue by global alpha.
-                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
-                // Multiply alpha and green by global alpha.
-                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
-                // Divide by 256.
-                dst_rb = _mm_srli_epi16(dst_rb, 8);
-                src_rb = _mm_srli_epi16(src_rb, 8);
-
-                // Mask out low bits (goodies already in the right place; no need to divide)
-                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
-                src_ag = _mm_andnot_si128(rb_mask, src_ag);
-
-                // Combine back into RGBA.
-                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
-                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
-
-                // Add two pixels into result.
-                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
+                __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
                 _mm_store_si128(d, result);
-                // load the next 4 pixel
+                // Advance to the next 4 dst pixels and alphas
                 mask = mask + 4;
                 d++;
                 count -= 4;
             }
-            dst = reinterpret_cast<SkPMColor *>(d);
+            dst = reinterpret_cast<SkPMColor*>(d);
         }
         while (count > 0) {
             *dst= SkBlendARGB32(color, *dst, *mask);
diff --git a/src/opts/SkColor_opts_SSE2.h b/src/opts/SkColor_opts_SSE2.h
index c1e41f0..c52fc1e 100644
--- a/src/opts/SkColor_opts_SSE2.h
+++ b/src/opts/SkColor_opts_SSE2.h
@@ -203,5 +203,34 @@
     return d_pixel;
 }
 
+// SSE2 version of SkBlendARGB32 (portable: SkColorPriv.h). aa carries one alpha per 32-bit lane.
+static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
+                                         const __m128i& aa) {
+    __m128i src_scale = SkAlpha255To256_SSE2(aa);
+    // dst_scale = SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
+    __m128i dst_scale = SkGetPackedA32_SSE2(src);
+    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);  // src alpha * src_scale
+    dst_scale = _mm_srli_epi16(dst_scale, 8);           // divide by 256
+    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);  // 256 - x == (255 - x) + 1
+
+    __m128i result = SkAlphaMulQ_SSE2(src, src_scale);
+    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+}
+
+// Fast path for SkBlendARGB32_SSE2 when every pixel uses the same scalar alpha aa.
+static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
+                                         const unsigned aa) {
+    unsigned alpha = SkAlpha255To256(aa);
+    __m128i src_scale = _mm_set1_epi32(alpha);  // same scale in all four 32-bit lanes
+    // dst_scale = SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
+    __m128i dst_scale = SkGetPackedA32_SSE2(src);
+    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);  // src alpha * src_scale
+    dst_scale = _mm_srli_epi16(dst_scale, 8);           // divide by 256
+    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);  // 256 - x == (255 - x) + 1
+
+    __m128i result = SkAlphaMulQ_SSE2(src, alpha);
+    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+}
+
 #undef ASSERT_EQ
 #endif // SkColor_opts_SSE2_DEFINED