Reland "Use a table for dither effect rather than math."

This is a reland of 0de475e29ec84f9ec892997565fc4f6348e4610e

Old code restored behind a build flag pending web_test rebaseline.

Original change's description:
> Use a table for dither effect rather than math.
>
> We used to use integer math on sk_FragCoord, when supported, and a
> fallback using floating point (on a 4x4 rather than 8x8 grid). Now we
> precompute an 8x8 table in a texture because it was shown to be
> significantly faster on several devices. Test was done with the following
> running in viewer with the stats layer enabled and looking at total
> frame time:
>  SkRandom r;
>  for (int i = 0; i < N; ++i) {
>      SkColor c[2] = {r.nextU(), c[1] = r.nextU()};
>      SkPoint pts[2] = {{r.nextRangeScalar(0, 500), r.nextRangeScalar(0, 500)},
>                        {r.nextRangeScalar(0, 500), r.nextRangeScalar(0, 500)}};
>      SkPaint p;
>      p.setDither(true);
>      p.setShader(SkGradientShader::MakeLinear(pts, c, nullptr, 2, SkTileMode::kRepeat));
>      canvas->drawPaint(p);
>  }
>
> Device            GPU             N      no dither    int math dither   table dither
> Linux desktop     QuadroP1000     5000   304ms        400ms (1.31x)     383ms (1.26x)
> TecnoSpark3Pro    PowerVRGE8320   200    299ms        820ms (2.74x)     592ms (1.98x)
> Pixel 4           Adreno640       500    110ms        221ms (2.01x)     214ms (1.95x)
> Galaxy S20 FE     Mali-G77 MP11   600    165ms        360ms (2.18x)     260ms (1.58x)
>
>
> Bug: b/195281495
> Change-Id: I200a2be8e450ab66f7c8ae340a5c83ec6780db09
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/437239
> Commit-Queue: Brian Salomon <bsalomon@google.com>
> Reviewed-by: Michael Ludwig <michaelludwig@google.com>

Bug: b/195281495
Change-Id: Ia52d24aa731281b161865b08954f9eeaca0033eb
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/437677
Commit-Queue: Brian Salomon <bsalomon@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
diff --git a/src/gpu/SkGr.cpp b/src/gpu/SkGr.cpp
index 13b828d..79d0ffe 100644
--- a/src/gpu/SkGr.cpp
+++ b/src/gpu/SkGr.cpp
@@ -44,6 +44,7 @@
 #include "src/gpu/effects/GrBlendFragmentProcessor.h"
 #include "src/gpu/effects/GrPorterDuffXferProcessor.h"
 #include "src/gpu/effects/GrSkSLFP.h"
+#include "src/gpu/effects/GrTextureEffect.h"
 #include "src/image/SkImage_Base.h"
 #include "src/shaders/SkShaderBase.h"
 
@@ -324,8 +325,37 @@
     SkUNREACHABLE;
 }
 
+#if !defined(SK_DISABLE_GPU_TABLE_DITHER)
+static SkBitmap make_dither_lut() {  // Returns an immutable 8x8 A8 bitmap of biased dither values.
+    static constexpr struct DitherTable {
+        constexpr DitherTable() : data() {  // constexpr ctor: the table is filled at compile time.
+            for (int x = 0; x < 8; ++x) {
+                for (int y = 0; y < 8; ++y) {
+                    // The computation of 'm' and 'value' is lifted from CPU backend.
+                    unsigned int m = (y & 1) << 5 | (x & 1) << 4 |  // Interleave the low three
+                                     (y & 2) << 2 | (x & 2) << 1 |  // bits of y and x in reversed
+                                     (y & 4) >> 1 | (x & 4) >> 2;   // order, giving m in [0, 63].
+                    float value = float(m) * 1.0 / 64.0 - 63.0 / 128.0;  // In [-63/128, 63/128].
+                    // Bias by 0.5 to be in 0..1, mul by 255 and round to nearest int to make byte.
+                    data[y * 8 + x] = (uint8_t)((value + 0.5) * 255.f + 0.5f);
+                }
+            }
+        }
+        uint8_t data[64];  // Row-major: the entry for (x, y) lives at data[y * 8 + x].
+    } gTable;
+    SkBitmap bmp;
+    bmp.setInfo(SkImageInfo::MakeA8(8, 8));
+    bmp.setPixels(const_cast<uint8_t*>(gTable.data));  // Hands the static table to the bitmap.
+    bmp.setImmutable();
+    return bmp;
+}
+#endif
+
 static std::unique_ptr<GrFragmentProcessor> make_dither_effect(
-        std::unique_ptr<GrFragmentProcessor> inputFP, float range, const GrCaps* caps) {
+        GrRecordingContext* rContext,
+        std::unique_ptr<GrFragmentProcessor> inputFP,
+        float range,
+        const GrCaps* caps) {
     if (range == 0 || inputFP == nullptr) {
         return inputFP;
     }
@@ -334,6 +364,54 @@
         return inputFP;
     }
 
+#if !defined(SK_DISABLE_GPU_TABLE_DITHER)
+    // We used to use integer math on sk_FragCoord, when supported, and a fallback using floating
+    // point (on a 4x4 rather than 8x8 grid). Now we precompute an 8x8 table in a texture because
+    // it was shown to be significantly faster on several devices. Test was done with the following
+    // running in viewer with the stats layer enabled and looking at total frame time:
+    //      SkRandom r;
+    //      for (int i = 0; i < N; ++i) {
+    //          SkColor c[2] = {r.nextU(), c[1] = r.nextU()};
+    //          SkPoint pts[2] = {{r.nextRangeScalar(0, 500), r.nextRangeScalar(0, 500)},
+    //                            {r.nextRangeScalar(0, 500), r.nextRangeScalar(0, 500)}};
+    //          SkPaint p;
+    //          p.setDither(true);
+    //          p.setShader(SkGradientShader::MakeLinear(pts, c, nullptr, 2, SkTileMode::kRepeat));
+    //          canvas->drawPaint(p);
+    //      }
+    // Device            GPU             N      no dither    int math dither   table dither
+    // Linux desktop     QuadroP1000     5000   304ms        400ms (1.31x)     383ms (1.26x)
+    // TecnoSpark3Pro    PowerVRGE8320   200    299ms        820ms (2.74x)     592ms (1.98x)
+    // Pixel 4           Adreno640       500    110ms        221ms (2.01x)     214ms (1.95x)
+    // Galaxy S20 FE     Mali-G77 MP11   600    165ms        360ms (2.18x)     260ms (1.58x)
+    static const SkBitmap gLUT = make_dither_lut();
+    auto [tex, ct] = GrMakeCachedBitmapProxyView(rContext, gLUT, GrMipmapped::kNo);
+    if (!tex) {
+        return inputFP;
+    }
+    SkASSERT(ct == GrColorType::kAlpha_8);
+    GrSamplerState sampler(GrSamplerState::WrapMode::kRepeat, SkFilterMode::kNearest);
+    auto te = GrTextureEffect::Make(
+            std::move(tex), kPremul_SkAlphaType, SkMatrix::I(), sampler, *caps);
+    static auto effect = SkMakeRuntimeEffect(SkRuntimeEffect::MakeForShader, R"(
+        uniform half range;
+        uniform shader table;
+        half4 main(float2 xy, half4 color) {
+            half value = sample(table, sk_FragCoord.xy).a - 0.5; // undo the bias in the table
+            // For each color channel, add the random offset to the channel value and then clamp
+            // between 0 and alpha to keep the color premultiplied.
+            return half4(clamp(color.rgb + value * range, 0.0, color.a), color.a);
+        }
+    )", SkRuntimeEffectPriv::ES3Options());
+    return GrSkSLFP::Make(effect,
+                          "Dither",
+                          std::move(inputFP),
+                          GrSkSLFP::OptFlags::kPreservesOpaqueInput,
+                          "range",
+                          range,
+                          "table",
+                          std::move(te));
+#else
     if (caps->shaderCaps()->integerSupport()) {
         // This ordered-dither code is lifted from the cpu backend.
         static auto effect = SkMakeRuntimeEffect(SkRuntimeEffect::MakeForShader, R"(
@@ -387,6 +465,7 @@
                               GrSkSLFP::OptFlags::kPreservesOpaqueInput,
                               "range", range);
     }
+#endif
 }
 #endif
 
@@ -504,7 +583,7 @@
     if (SkPaintPriv::ShouldDither(skPaint, GrColorTypeToSkColorType(ct)) && paintFP != nullptr) {
         float ditherRange = dither_range_for_config(ct);
         paintFP = make_dither_effect(
-                std::move(paintFP), ditherRange, context->priv().caps());
+                context, std::move(paintFP), ditherRange, context->priv().caps());
     }
 #endif