Reland "bring back lowp bilerp_clamp_8888"

This reverts commit c6671be0ea75880626807c4c40792b2cb1301a39.

Reason for revert: looks like it didn't help

Original change's description:
> Revert "bring back lowp bilerp_clamp_8888"
> 
> This reverts commit 5043c09b8b092fb932b42312881565f12d3998f5.
> 
> Reason for revert: speculative, to try to unblock android roller
> 
> Original change's description:
> > bring back lowp bilerp_clamp_8888
> > 
> > PS1-3 are the literal code I removed a few months ago.
> > 
> > PS4 adds asserts to make sure we don't underflow.
> > PS5 adds rounding when scaling off bias, reducing diff slightly.
> > 
> > Cq-Include-Trybots: luci.chromium.try:linux-blink-rel
> > Change-Id: Ice573395a1680e1c3b55a8fe3e4d034f15adde1b
> > Reviewed-on: https://skia-review.googlesource.com/c/196620
> > Commit-Queue: Mike Klein <mtklein@google.com>
> > Reviewed-by: Brian Osman <brianosman@google.com>
> 
> TBR=mtklein@google.com,brianosman@google.com,reed@google.com
> 
> # Not skipping CQ checks because original CL landed > 1 day ago.
> 
> Change-Id: I816c23c4a12451be518ea3baf4b9a3fdba9b818a
> Cq-Include-Trybots: luci.chromium.try:linux-blink-rel
> Reviewed-on: https://skia-review.googlesource.com/c/196777
> Reviewed-by: Mike Reed <reed@google.com>

TBR=mtklein@google.com,brianosman@google.com,reed@google.com

Change-Id: I0f96a395ac704a52735c93a08f8ed52a148d5de5
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Cq-Include-Trybots: luci.chromium.try:linux-blink-rel
Reviewed-on: https://skia-review.googlesource.com/c/196980
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Reed <reed@google.com>
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index c6102a0..108431b 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -3273,6 +3273,73 @@
     store_8888_(ptr, tail, r,g,b,a);
 }
 
+#if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE)
+    static void(*bilerp_clamp_8888)(void) = nullptr;  // Null stand-in, same form as NOT_IMPLEMENTED().
+#else
+STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
+    // Bilinearly samples the 8888 pixels gathered via ctx; (cx,cy) are the center of our sample.
+    F cx = x,
+      cy = y;
+
+    // All sample points are at the same fractional offset (fx,fy).
+    // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
+    F fx = fract(cx + 0.5f),
+      fy = fract(cy + 0.5f);
+
+    // We'll accumulate the color of all four samples into {r,g,b,a} directly.
+    r = g = b = a = 0;
+
+    // The first three sample points will calculate their area using math
+    // just like in the float code above, but the fourth will take up all the rest.
+    //
+    // Logically this is the same as doing the math for the fourth pixel too,
+    // but rounding error makes this a better strategy, keeping opaque opaque, etc.
+    //
+    // We can keep up to 8 bits of fractional precision without overflowing 16-bit,
+    // so our "1.0" area is 256.
+    const uint16_t bias = 256;
+    U16 remaining = bias;  // Area not yet handed out to a sample; starts at the full "1.0".
+
+    for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f)    // dy takes the values -0.5, +0.5
+    for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) {  // dx takes the values -0.5, +0.5
+        // (x,y) are the coordinates of this sample point (shadowing the outer x,y in this loop).
+        F x = cx + dx,
+          y = cy + dy;
+
+        // ix_and_ptr() will clamp to the image's bounds for us.
+        const uint32_t* ptr;
+        U32 ix = ix_and_ptr(&ptr, ctx, x,y);
+
+        U16 sr,sg,sb,sa;
+        from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa);
+
+        // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
+        // are combined in direct proportion to their area overlapping that logical query pixel.
+        // At positive offsets, the x-axis contribution to that rectangle is fx,
+        // or (1-fx) at negative x.  Same deal for y.
+        F sx = (dx > 0) ? fx : 1.0f - fx,
+          sy = (dy > 0) ? fy : 1.0f - fy;
+
+        U16 area = (dy == 0.5f && dx == 0.5f) ? remaining  // Last sample takes what's left.
+                                              : cast<U16>(sx * sy * bias);
+        for (size_t i = 0; i < N; i++) {
+            SkASSERT(remaining[i] >= area[i]);  // Per lane: the subtraction below must not underflow.
+        }
+        remaining -= area;
+
+        r += sr * area;  // Each channel accumulates sample * weight, still scaled by bias.
+        g += sg * area;
+        b += sb * area;
+        a += sa * area;
+    }
+
+    r = (r + bias/2) / bias;  // Scale the bias back off; adding bias/2 first rounds to nearest.
+    g = (g + bias/2) / bias;
+    b = (b + bias/2) / bias;
+    a = (a + bias/2) / bias;
+}
+#endif
+
 // Now we'll add null stand-ins for stages we haven't implemented in lowp.
 // If a pipeline uses these stages, it'll boot it out of lowp into highp.
 #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr;
@@ -3320,7 +3387,6 @@
     NOT_IMPLEMENTED(mirror_y)
     NOT_IMPLEMENTED(repeat_y)
     NOT_IMPLEMENTED(negate_x)
-    NOT_IMPLEMENTED(bilerp_clamp_8888)
     NOT_IMPLEMENTED(bilinear_nx)
     NOT_IMPLEMENTED(bilinear_ny)
     NOT_IMPLEMENTED(bilinear_px)