Use the 4f impl for legacy/8888 linear gradients

This is a partial revert of https://skia-review.googlesource.com/c/20280,
adding back L32 support for 4f gradients and switching off the legacy
impl.

If it sticks, we should be able to completely delete the legacy gradient
code.

Change-Id: Iaa6d722ea4dfd8e4d959bb76eca815a30ca7098f
Reviewed-on: https://skia-review.googlesource.com/31425
Reviewed-by: Herb Derby <herb@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Florin Malita <fmalita@chromium.org>
diff --git a/src/shaders/gradients/Sk4fLinearGradient.cpp b/src/shaders/gradients/Sk4fLinearGradient.cpp
index 700b7fb..11bf96d 100644
--- a/src/shaders/gradients/Sk4fLinearGradient.cpp
+++ b/src/shaders/gradients/Sk4fLinearGradient.cpp
@@ -12,20 +12,21 @@
 
 namespace {
 
-template<ApplyPremul premul>
-void ramp(const Sk4f& c, const Sk4f& dc, SkPM4f dst[], int n) {
+template<typename dstType, ApplyPremul premul>
+void ramp(const Sk4f& c, const Sk4f& dc, dstType dst[], int n,
+          const Sk4f& bias0, const Sk4f& bias1) {
     SkASSERT(n > 0);
 
-    const Sk4f dc2 = dc + dc;
-    const Sk4f dc4 = dc2 + dc2;
+    const Sk4f dc2 = dc + dc,
+               dc4 = dc2 + dc2;
 
-    Sk4f c0 = c ;
-    Sk4f c1 = c + dc;
-    Sk4f c2 = c0 + dc2;
-    Sk4f c3 = c1 + dc2;
+    Sk4f c0 =  c +      DstTraits<dstType, premul>::pre_lerp_bias(bias0),
+         c1 =  c + dc + DstTraits<dstType, premul>::pre_lerp_bias(bias1),
+         c2 = c0 + dc2,
+         c3 = c1 + dc2;
 
     while (n >= 4) {
-        DstTraits<premul>::store4x(c0, c1, c2, c3, dst);
+        DstTraits<dstType, premul>::store4x(c0, c1, c2, c3, dst, bias0, bias1);
         dst += 4;
 
         c0 = c0 + dc4;
@@ -35,12 +36,12 @@
         n -= 4;
     }
     if (n & 2) {
-        DstTraits<premul>::store(c0, dst++);
-        DstTraits<premul>::store(c1, dst++);
+        DstTraits<dstType, premul>::store(c0, dst++, bias0);
+        DstTraits<dstType, premul>::store(c1, dst++, bias1);
         c0 = c0 + dc2;
     }
     if (n & 1) {
-        DstTraits<premul>::store(c0, dst);
+        DstTraits<dstType, premul>::store(c0, dst, bias0);
     }
 }
 
@@ -141,37 +142,81 @@
     }
 }
 
+
+void SkLinearGradient::
+LinearGradient4fContext::shadeSpan(int x, int y, SkPMColor dst[], int count) {
+    SkASSERT(count > 0);
+
+    float bias0 = 0,
+          bias1 = 0;
+
+    if (fDither) {
+        static constexpr float dither_cell[] = {
+            -3/8.0f,  1/8.0f,
+             3/8.0f, -1/8.0f,
+        };
+
+        const int rowIndex = (y & 1) << 1;
+        bias0 = dither_cell[rowIndex + 0];
+        bias1 = dither_cell[rowIndex + 1];
+
+        if (x & 1) {
+            SkTSwap(bias0, bias1);
+        }
+    }
+
+    if (fColorsArePremul) {
+        // In premul interpolation mode, components are pre-scaled by 255 and the store
+        // op is truncating. We pre-bias here to achieve rounding.
+        bias0 += 0.5f;
+        bias1 += 0.5f;
+
+        this->shadePremulSpan<SkPMColor, ApplyPremul::False>(x, y, dst, count, bias0, bias1);
+    } else {
+        // In unpremul interpolation mode, components are not pre-scaled.
+        bias0 *= 1/255.0f;
+        bias1 *= 1/255.0f;
+
+        this->shadePremulSpan<SkPMColor, ApplyPremul::True >(x, y, dst, count, bias0, bias1);
+    }
+}
+
 void SkLinearGradient::
 LinearGradient4fContext::shadeSpan4f(int x, int y, SkPM4f dst[], int count) {
     SkASSERT(count > 0);
+
+    // 4f dests are dithered at a later stage, if needed.
+    static constexpr float bias0 = 0,
+                           bias1 = 0;
     if (fColorsArePremul) {
-        this->shadePremulSpan<ApplyPremul::False>(x, y, dst, count);
+        this->shadePremulSpan<SkPM4f, ApplyPremul::False>(x, y, dst, count, bias0, bias1);
     } else {
-        this->shadePremulSpan<ApplyPremul::True>(x, y, dst, count);
+        this->shadePremulSpan<SkPM4f, ApplyPremul::True >(x, y, dst, count, bias0, bias1);
     }
 }
 
-template<ApplyPremul premul>
+template<typename dstType, ApplyPremul premul>
 void SkLinearGradient::
-LinearGradient4fContext::shadePremulSpan(int x, int y, SkPM4f dst[], int count) const {
-    const SkLinearGradient& shader =
-        static_cast<const SkLinearGradient&>(fShader);
+LinearGradient4fContext::shadePremulSpan(int x, int y, dstType dst[], int count,
+                                         float bias0, float bias1) const {
+    const SkLinearGradient& shader = static_cast<const SkLinearGradient&>(fShader);
     switch (shader.fTileMode) {
     case kClamp_TileMode:
-        this->shadeSpanInternal<premul, kClamp_TileMode>(x, y, dst, count);
+        this->shadeSpanInternal<dstType, premul, kClamp_TileMode >(x, y, dst, count, bias0, bias1);
         break;
     case kRepeat_TileMode:
-        this->shadeSpanInternal<premul, kRepeat_TileMode>(x, y, dst, count);
+        this->shadeSpanInternal<dstType, premul, kRepeat_TileMode>(x, y, dst, count, bias0, bias1);
         break;
     case kMirror_TileMode:
-        this->shadeSpanInternal<premul, kMirror_TileMode>(x, y, dst, count);
+        this->shadeSpanInternal<dstType, premul, kMirror_TileMode>(x, y, dst, count, bias0, bias1);
         break;
     }
 }
 
-template<ApplyPremul premul, SkShader::TileMode tileMode>
+template<typename dstType, ApplyPremul premul, SkShader::TileMode tileMode>
 void SkLinearGradient::
-LinearGradient4fContext::shadeSpanInternal(int x, int y, SkPM4f dst[], int count) const {
+LinearGradient4fContext::shadeSpanInternal(int x, int y, dstType dst[], int count,
+                                           float bias0, float bias1) const {
     SkPoint pt;
     fDstToPosProc(fDstToPos,
                   x + SK_ScalarHalf,
@@ -179,12 +224,15 @@
                   &pt);
     const SkScalar fx = pinFx<tileMode>(pt.x());
     const SkScalar dx = fDstToPos.getScaleX();
-    LinearIntervalProcessor<premul, tileMode> proc(fIntervals->begin(),
-                                                   fIntervals->end() - 1,
-                                                   this->findInterval(fx),
-                                                   fx,
-                                                   dx,
-                                                   SkScalarNearlyZero(dx * count));
+    LinearIntervalProcessor<dstType, premul, tileMode> proc(fIntervals->begin(),
+                                                            fIntervals->end() - 1,
+                                                            this->findInterval(fx),
+                                                            fx,
+                                                            dx,
+                                                            SkScalarNearlyZero(dx * count));
+    Sk4f bias4f0(bias0),
+         bias4f1(bias1);
+
     while (count > 0) {
         // What we really want here is SkTPin(advance, 1, count)
         // but that's a significant perf hit for >> stops; investigate.
@@ -199,18 +247,23 @@
             || (n == count && proc.currentRampIsZero()));
 
         if (proc.currentRampIsZero()) {
-            DstTraits<premul>::store(proc.currentColor(), dst, n);
+            DstTraits<dstType, premul>::store(proc.currentColor(), dst, n);
         } else {
-            ramp<premul>(proc.currentColor(), proc.currentColorGrad(), dst, n);
+            ramp<dstType, premul>(proc.currentColor(), proc.currentColorGrad(), dst, n,
+                                  bias4f0, bias4f1);
         }
 
         proc.advance(SkIntToScalar(n));
         count -= n;
         dst   += n;
+
+        if (n & 1) {
+            SkTSwap(bias4f0, bias4f1);
+        }
     }
 }
 
-template<ApplyPremul premul, SkShader::TileMode tileMode>
+template<typename dstType, ApplyPremul premul, SkShader::TileMode tileMode>
 class SkLinearGradient::
 LinearGradient4fContext::LinearIntervalProcessor {
 public:
@@ -275,8 +328,8 @@
     void compute_interval_props(SkScalar t) {
         SkASSERT(in_range(t, fInterval->fT0, fInterval->fT1));
 
-        const Sk4f dc = DstTraits<premul>::load(fInterval->fCg);
-                  fCc = DstTraits<premul>::load(fInterval->fCb) + dc * Sk4f(t);
+        const Sk4f dc = DstTraits<dstType, premul>::load(fInterval->fCg);
+                  fCc = DstTraits<dstType, premul>::load(fInterval->fCb) + dc * Sk4f(t);
                 fDcDx = dc * fDx;
             fZeroRamp = fIsVertical || (dc == 0).allTrue();
     }
@@ -295,8 +348,8 @@
             //
             //   Avg += C * (t1 - t0)
             //
-            const auto c = DstTraits<premul>::load(i->fCb)
-                         + DstTraits<premul>::load(i->fCg) * (i->fT0 + i->fT1) * 0.5f;
+            const auto c = DstTraits<dstType, premul>::load(i->fCb)
+                         + DstTraits<dstType, premul>::load(i->fCg) * (i->fT0 + i->fT1) * 0.5f;
             fCc = fCc + c * (i->fT1 - i->fT0);
         }
     }