SkBlendARGB32 and S32[A]_Blend_BlitRow32 are currently formulated as SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale), which boils down to ((src*src_scale)>>8) + ((dst*dst_scale)>>8). Note that the intermediate precision is discarded from each term before the two are added together, so the final result can be inaccurate.
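
For example, with a channel value of 1 in both src and dst and src_scale = dst_scale = 128, the current formulation computes ((1*128)>>8) + ((1*128)>>8) = 0, while the full-precision result (1*128 + 1*128)>>8 = 1.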

In Firefox, we use SkCanvas::saveLayer in combination with a backdrop that initializes the layer to the background. When the layer is blended back onto the background with transparency, pixels whose source and destination colors are identical should come through unchanged, but the lost precision described above perturbs them. When this operation is performed repeatedly, the error accumulates into clearly noticeable color differences, as evidenced in this downstream Firefox bug report: https://bugzilla.mozilla.org/show_bug.cgi?id=1200684

The test case in the downstream report essentially does blend(src=0xFF2E3338, dst=0xFF2E3338, scale=217), which produces 0xFF2E3237 instead of the expected 0xFF2E3338.
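
Working through the legacy arithmetic for the green channel (0x33 = 51): src_scale = SkAlpha255To256(217) = 218 and dst_scale = SkAlpha255To256(255 - SkAlphaMul(255, 218)) = 39, so the blend computes ((51*218)>>8) + ((51*39)>>8) = 43 + 7 = 50 = 0x32, one less than the 0x33 we started with.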

This problem goes away if the blend is instead reformulated to effectively compute (src*src_scale + dst*dst_scale)>>8, which keeps the intermediate precision through the addition and only shifts it off at the end.
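
With the reformulation, the example above becomes exact: dst_scale = SkAlphaMulInv256(255, 218) = 38, so src_scale + dst_scale = 256 and each channel c blends to (c*218 + c*38)>>8 = (c*256)>>8 = c, returning 0xFF2E3338 as expected.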

This patch reformulates the blending operations accordingly. Performance should remain mostly unchanged, or may even improve slightly, so there is no real downside, and the results become more accurate. Without this, it is currently unsafe for Firefox to blend a layer back onto itself when the layer was initialized with a copy of its background.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2097883002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

[mtklein adds...]
No public API changes.
TBR=reed@google.com

Review-Url: https://codereview.chromium.org/2097883002
diff --git a/gm/blend.cpp b/gm/blend.cpp
index b20422f..8aed292 100644
--- a/gm/blend.cpp
+++ b/gm/blend.cpp
@@ -16,20 +16,20 @@
 
         p.setColor(SK_ColorRED);
         canvas->drawRect(SkRect::MakeXYWH(0,0,1,1), p);
-        p.setColor(0xFC008000);
+        p.setColor(0xFC208000);
         canvas->drawRect(SkRect::MakeXYWH(0,0,1,1), p);
 
         p.setColor(SK_ColorRED);
         canvas->drawRect(SkRect::MakeXYWH(1,0,1,1), p);
         canvas->saveLayer(nullptr, nullptr);
-            p.setColor(0xFC008000);
+            p.setColor(0xFC208000);
             canvas->drawRect(SkRect::MakeXYWH(1,0,1,1), p);
         canvas->restore();
 
         p.setColor(SK_ColorRED);
         canvas->drawRect(SkRect::MakeXYWH(2,0,1,1), p);
         canvas->saveLayerAlpha(nullptr, 0xFC);
-            p.setColor(sk_tool_utils::color_to_565(0xFF008000));
+            p.setColor(sk_tool_utils::color_to_565(0xFF208000));
             canvas->drawRect(SkRect::MakeXYWH(2,0,1,1), p);
         canvas->restore();
     canvas->restore();
diff --git a/include/core/SkColorPriv.h b/include/core/SkColorPriv.h
index 40e6e15..694d324 100644
--- a/include/core/SkColorPriv.h
+++ b/include/core/SkColorPriv.h
@@ -200,6 +200,20 @@
  */
 #define SkAlphaMul(value, alpha256)     (((value) * (alpha256)) >> 8)
 
+/** Calculates 256 - (value * alpha256) / 255 in range [0,256],
+ *  for [0,255] value and [0,256] alpha256.
+ */
+static inline U16CPU SkAlphaMulInv256(U16CPU value, U16CPU alpha256) {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    return SkAlpha255To256(255 - SkAlphaMul(value, alpha256));
+#else
+    unsigned prod = 0xFFFF - value * alpha256;
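+    // (prod + (prod >> 8)) >> 8 approximates prod / 255, doing the division
+    // with shifts so the result stays within [0,256].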
+    return (prod + (prod >> 8)) >> 8;
+#endif
+}
+
 //  The caller may want negative values, so keep all params signed (int)
 //  so we don't accidentally slip into unsigned math and lose the sign
 //  extension when we shift (in SkAlphaMul)
@@ -568,13 +580,40 @@
     return src + SkAlphaMulQ(dst, SkAlpha255To256(255 - SkGetPackedA32(src)));
 }
 
+/**
+ * Interpolates between colors src and dst using a [0,256] scale; 256 yields src, 0 yields dst.
+ */
+static inline SkPMColor SkPMLerp(SkPMColor src, SkPMColor dst, unsigned scale) {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    return SkAlphaMulQ(src, scale) + SkAlphaMulQ(dst, 256 - scale);
+#else
+    return SkFastFourByteInterp256(src, dst, scale);
+#endif
+}
+
 static inline SkPMColor SkBlendARGB32(SkPMColor src, SkPMColor dst, U8CPU aa) {
     SkASSERT((unsigned)aa <= 255);
 
     unsigned src_scale = SkAlpha255To256(aa);
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
     unsigned dst_scale = SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale));
 
     return SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale);
+#else
+    unsigned dst_scale = SkAlphaMulInv256(SkGetPackedA32(src), src_scale);
+
+    const uint32_t mask = 0xFF00FF;
+
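+    // Split each pixel into RB and AG halves so every channel has 16 bits of
+    // headroom; the scaled halves are then summed before the final >>8.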
+    uint32_t src_rb = (src & mask) * src_scale;
+    uint32_t src_ag = ((src >> 8) & mask) * src_scale;
+
+    uint32_t dst_rb = (dst & mask) * dst_scale;
+    uint32_t dst_ag = ((dst >> 8) & mask) * dst_scale;
+
+    return (((src_rb + dst_rb) >> 8) & mask) | ((src_ag + dst_ag) & ~mask);
+#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp
index 9ac84c6..648e0ea 100644
--- a/src/core/SkBlitRow_D16.cpp
+++ b/src/core/SkBlitRow_D16.cpp
@@ -180,7 +180,7 @@
             {
                 unsigned d = *dst;
                 int sa = SkGetPackedA32(c);
-                int dst_scale = SkAlpha255To256(255 - SkAlphaMul(sa, src_scale));
+                int dst_scale = SkAlphaMulInv256(sa, src_scale);
                 int dither = DITHER_VALUE(x);
 
                 int sr = SkGetPackedR32(c);
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
index 80c7242..9494557 100644
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@@ -26,25 +26,27 @@
     SkASSERT(alpha <= 255);
     if (count > 0) {
         unsigned src_scale = SkAlpha255To256(alpha);
-        unsigned dst_scale = 256 - src_scale;
 
 #ifdef UNROLL
         if (count & 1) {
-            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
+            src += 1;
             dst += 1;
             count -= 1;
         }
 
         const SkPMColor* SK_RESTRICT srcEnd = src + count;
         while (src != srcEnd) {
-            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
+            src += 1;
             dst += 1;
-            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
+            src += 1;
             dst += 1;
         }
 #else
         do {
-            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
             src += 1;
             dst += 1;
         } while (--count > 0);
diff --git a/src/core/SkBlitter_A8.cpp b/src/core/SkBlitter_A8.cpp
index 57587c6..6697614 100644
--- a/src/core/SkBlitter_A8.cpp
+++ b/src/core/SkBlitter_A8.cpp
@@ -276,7 +276,7 @@
 
     int src_scale = SkAlpha255To256(aa);
     int sa = SkGetPackedA32(src);
-    int dst_scale = 256 - SkAlphaMul(sa, src_scale);
+    int dst_scale = SkAlphaMulInv256(sa, src_scale);
 
     return SkToU8((sa * src_scale + da * dst_scale) >> 8);
 }
diff --git a/src/core/SkBlitter_ARGB32.cpp b/src/core/SkBlitter_ARGB32.cpp
index 7adab55..aada058 100644
--- a/src/core/SkBlitter_ARGB32.cpp
+++ b/src/core/SkBlitter_ARGB32.cpp
@@ -237,7 +237,11 @@
         color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
     }
 
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
     unsigned dst_scale = 255 - SkGetPackedA32(color);
+#else
+    unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
+#endif
     size_t rowBytes = fDevice.rowBytes();
     while (--height >= 0) {
         device[0] = color + SkAlphaMulQ(device[0], dst_scale);
diff --git a/src/core/SkSpriteBlitter_RGB16.cpp b/src/core/SkSpriteBlitter_RGB16.cpp
index d0d3001..6c5a7cb 100644
--- a/src/core/SkSpriteBlitter_RGB16.cpp
+++ b/src/core/SkSpriteBlitter_RGB16.cpp
@@ -30,7 +30,11 @@
         dg = SkAlphaBlend(SkPacked32ToG16(sc), SkGetPackedG16(dc), src_scale);
         db = SkAlphaBlend(SkPacked32ToB16(sc), SkGetPackedB16(dc), src_scale);
     } else {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         unsigned dst_scale = 255 - SkAlphaMul(sa, src_scale);
+#else
+        unsigned dst_scale = SkAlphaMulInv256(sa, src_scale);
+#endif
         dr = (SkPacked32ToR16(sc) * src_scale + SkGetPackedR16(dc) * dst_scale) >> 8;
         dg = (SkPacked32ToG16(sc) * src_scale + SkGetPackedG16(dc) * dst_scale) >> 8;
         db = (SkPacked32ToB16(sc) * src_scale + SkGetPackedB16(dc) * dst_scale) >> 8;
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index a6fbc25..7ce1fc9 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -26,12 +26,11 @@
     }
 
     uint32_t src_scale = SkAlpha255To256(alpha);
-    uint32_t dst_scale = 256 - src_scale;
 
     if (count >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         while (((size_t)dst & 0x0F) != 0) {
-            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
+            *dst = SkPMLerp(*src, *dst, src_scale);
             src++;
             dst++;
             count--;
@@ -45,11 +44,7 @@
             __m128i src_pixel = _mm_loadu_si128(s);
             __m128i dst_pixel = _mm_load_si128(d);
 
-            src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
-            dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
-
-            // Add result
-            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
+            __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
             _mm_store_si128(d, result);
             s++;
             d++;
@@ -60,7 +55,7 @@
     }
 
     while (count > 0) {
-        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
+        *dst = SkPMLerp(*src, *dst, src_scale);
         src++;
         dst++;
         count--;
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 3cb5a92..7998a89 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -909,7 +909,14 @@
         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
 
         // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
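+        // Add the 16-bit-wide products before narrowing, so the >>8 happens
+        // after the sum and the intermediate precision is kept.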
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
         // Store
         vst1_u32(dst, vreinterpret_u32_u8(vres));
@@ -931,7 +936,12 @@
         vsrc_wide = vmovl_u8(vsrc);
         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
         // Store
         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
@@ -943,7 +953,7 @@
                          const SkPMColor* SK_RESTRICT src,
                          int count, U8CPU alpha) {
 
-    SkASSERT(255 >= alpha);
+    SkASSERT(255 > alpha);
 
     if (count <= 0) {
         return;
@@ -963,9 +973,7 @@
 
         // Calc dst_scale
         dst_scale = vget_lane_u8(vsrc, 3);
-        dst_scale *= alpha256;
-        dst_scale >>= 8;
-        dst_scale = 256 - dst_scale;
+        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
 
         // Process src
         vsrc_wide = vmovl_u8(vsrc);
@@ -976,7 +984,12 @@
         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
 
         // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+        vdst_wide += vsrc_wide;
+        vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
         dst++;
@@ -1007,9 +1020,20 @@
             // Calc dst_scale
             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
             vdst_scale = vmovl_u8(vsrc_alphas);
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
             vdst_scale *= vsrc_scale;
             vdst_scale = vshrq_n_u16(vdst_scale, 8);
             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
+#else
+            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
+            // A 16-bit lane would overflow if we used 0xFFFF here,
+            // so use an approximation with 0xFF00 that is off by 1,
+            // and add back 1 after to get the correct value.
+            // This is valid if alpha256 <= 255.
+            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
+            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
+            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
+#endif
 
             // Process src
             vsrc_wide = vmovl_u8(vsrc);
@@ -1020,7 +1044,12 @@
             vdst_wide *= vdst_scale;
 
             // Combine
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
+#else
+            vdst_wide += vsrc_wide;
+            vres = vshrn_n_u16(vdst_wide, 8);
+#endif
 
             vst1_u32(dst, vreinterpret_u32_u8(vres));
 
diff --git a/src/opts/SkBlitRow_opts_mips_dsp.cpp b/src/opts/SkBlitRow_opts_mips_dsp.cpp
index e25f7c5..4282191 100644
--- a/src/opts/SkBlitRow_opts_mips_dsp.cpp
+++ b/src/opts/SkBlitRow_opts_mips_dsp.cpp
@@ -789,9 +789,17 @@
         "muleu_s.ph.qbr  %[t5],    %[t6],    %[t5]    \n\t"
         "addiu           %[src],   %[src],   4        \n\t"
         "addiu           %[count], %[count], -1       \n\t"
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
         "precrq.qb.ph    %[t0],    %[t3],    %[t2]    \n\t"
         "precrq.qb.ph    %[t2],    %[t5],    %[t4]    \n\t"
         "addu            %[t1],    %[t0],    %[t2]    \n\t"
+#else
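+        // Add the paired 16-bit products before packing with precrq, so the
+        // >>8 happens after the sum and the intermediate precision is kept.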
+        "addu            %[t0],    %[t3],    %[t5]    \n\t"
+        "addu            %[t2],    %[t2],    %[t4]    \n\t"
+        "precrq.qb.ph    %[t1],    %[t0],    %[t2]    \n\t"
+#endif
         "sw              %[t1],    0(%[dst])          \n\t"
         "b               1b                           \n\t"
         " addi           %[dst],   %[dst],   4        \n\t"
diff --git a/src/opts/SkColor_opts_SSE2.h b/src/opts/SkColor_opts_SSE2.h
index feb1d98..a3db880 100644
--- a/src/opts/SkColor_opts_SSE2.h
+++ b/src/opts/SkColor_opts_SSE2.h
@@ -80,6 +80,42 @@
     return _mm_or_si128(rb, ag);
 }
 
+// Portable version SkFastFourByteInterp256 is in SkColorPriv.h.
+static inline __m128i SkFastFourByteInterp256_SSE2(const __m128i& src, const __m128i& dst, const unsigned src_scale) {
+    // Computes dst + (((src - dst)*src_scale)>>8)
+    const __m128i mask = _mm_set1_epi32(0x00FF00FF);
+
+    // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
+    __m128i src_rb = _mm_and_si128(mask, src);
+    __m128i src_ag = _mm_srli_epi16(src, 8);
+    __m128i dst_rb = _mm_and_si128(mask, dst);
+    __m128i dst_ag = _mm_srli_epi16(dst, 8);
+
+    // Compute scaled differences.
+    __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
+    __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
+    __m128i s = _mm_set1_epi16(src_scale);
+    diff_rb = _mm_mullo_epi16(diff_rb, s);
+    diff_ag = _mm_mullo_epi16(diff_ag, s);
+
+    // Pack the differences back together.
+    diff_rb = _mm_srli_epi16(diff_rb, 8);
+    diff_ag = _mm_andnot_si128(mask, diff_ag);
+    __m128i diff = _mm_or_si128(diff_rb, diff_ag);
+
+    // Add difference to destination.
+    return _mm_add_epi8(dst, diff);
+}
+
+// Portable version SkPMLerp is in SkColorPriv.h.
+static inline __m128i SkPMLerp_SSE2(const __m128i& src, const __m128i& dst, const unsigned scale) {
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    return _mm_add_epi8(SkAlphaMulQ_SSE2(src, scale), SkAlphaMulQ_SSE2(dst, 256 - scale));
+#else
+    return SkFastFourByteInterp256_SSE2(src, dst, scale);
+#endif
+}
+
 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
 #if SK_A32_SHIFT == 24                // It's very common (universal?) that alpha is the top byte.
     return _mm_srli_epi32(src, 24);   // You'd hope the compiler would remove the left shift then,
@@ -213,33 +249,56 @@
                                                              SkGetPackedA32_SSE2(src))));
 }
 
-// Portable version is SkBlendARGB32 in SkColorPriv.h.
-static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
-                                         const __m128i& aa) {
-    __m128i src_scale = SkAlpha255To256_SSE2(aa);
-    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
-    __m128i dst_scale = SkGetPackedA32_SSE2(src);
-    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
-    dst_scale = _mm_srli_epi16(dst_scale, 8);
-    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
-
-    __m128i result = SkAlphaMulQ_SSE2(src, src_scale);
-    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
-}
-
 // Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.
 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
                                          const unsigned aa) {
     unsigned alpha = SkAlpha255To256(aa);
-    __m128i src_scale = _mm_set1_epi32(alpha);
-    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
-    __m128i dst_scale = SkGetPackedA32_SSE2(src);
-    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
-    dst_scale = _mm_srli_epi16(dst_scale, 8);
-    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
+#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
+    __m128i src_scale = _mm_set1_epi32(alpha);
+    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
+    __m128i dst_scale = SkGetPackedA32_SSE2(src);
+    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
+    dst_scale = _mm_srli_epi16(dst_scale, 8);
+    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
 
-    __m128i result = SkAlphaMulQ_SSE2(src, alpha);
-    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+    __m128i result = SkAlphaMulQ_SSE2(src, alpha);
+    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
+#else
+    __m128i src_scale = _mm_set1_epi16(alpha);
+    // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
+    __m128i dst_scale = SkGetPackedA32_SSE2(src);
+    // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
+    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
+    dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
+    dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
+    dst_scale = _mm_srli_epi32(dst_scale, 8);
+    // Duplicate scales into 2x16-bit pattern per pixel.
+    dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
+    dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
+
+    const __m128i mask = _mm_set1_epi32(0x00FF00FF);
+
+    // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
+    __m128i src_rb = _mm_and_si128(mask, src);
+    __m128i src_ag = _mm_srli_epi16(src, 8);
+    __m128i dst_rb = _mm_and_si128(mask, dst);
+    __m128i dst_ag = _mm_srli_epi16(dst, 8);
+
+    // Scale them.
+    src_rb = _mm_mullo_epi16(src_rb, src_scale);
+    src_ag = _mm_mullo_epi16(src_ag, src_scale);
+    dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
+    dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
+
+    // Add the scaled source and destination.
+    dst_rb = _mm_add_epi16(src_rb, dst_rb);
+    dst_ag = _mm_add_epi16(src_ag, dst_ag);
+
+    // Unsplay the halves back together.
+    dst_rb = _mm_srli_epi16(dst_rb, 8);
+    dst_ag = _mm_andnot_si128(mask, dst_ag);
+    return _mm_or_si128(dst_rb, dst_ag);
+#endif
 }
 
 #undef ASSERT_EQ