Reland "Support large kernels on GPU in matrix convolution effect"

This reverts commit a117e7b75bd6ae262d52ee61769afd7cb1ff556f.

Reason for revert: Fixed divide-by-0 in the unpremul logic. The divide-by-0 was already present before this change but never caused problems (or we ignored them).

Original change's description:
> Revert "Reland "Support large kernels on GPU in matrix convolution effect""
>
> This reverts commit 76cb9c4d4c8b5db4a3c002a4174172e4972a05b6.
>
> Reason for revert: Tegra3 & Metal issues
>
> Original change's description:
> > Reland "Support large kernels on GPU in matrix convolution effect"
> >
> > This reverts commit 41e377d1baf0dbf1355ec644a6fe40f6da48fc43.
> >
> > Reason for revert: fixed issues
> >
> > Bug: skia:8449
> > Change-Id: I0c4389f0efa92c6da69253b2304ad9a072750965
> > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/287817
> > Commit-Queue: Adlai Holler <adlai@google.com>
> > Reviewed-by: Brian Salomon <bsalomon@google.com>
>
> TBR=bsalomon@google.com,robertphillips@google.com,michaelludwig@google.com,adlai@google.com
>
> Change-Id: I5c3f04d4d262550a3298b8fd677c8a1661be7ad9
> No-Presubmit: true
> No-Tree-Checks: true
> No-Try: true
> Bug: skia:8449
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/289076
> Reviewed-by: Adlai Holler <adlai@google.com>
> Commit-Queue: Adlai Holler <adlai@google.com>

TBR=bsalomon@google.com,robertphillips@google.com,michaelludwig@google.com,adlai@google.com


Bug: skia:8449
Change-Id: I90b8e9e0eb52bc08308fb472eb216ed0bd4785a1
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/289030
Reviewed-by: Brian Salomon <bsalomon@google.com>
Commit-Queue: Adlai Holler <adlai@google.com>
diff --git a/src/gpu/effects/GrMatrixConvolutionEffect.cpp b/src/gpu/effects/GrMatrixConvolutionEffect.cpp
index 8bb5a56..3736996 100644
--- a/src/gpu/effects/GrMatrixConvolutionEffect.cpp
+++ b/src/gpu/effects/GrMatrixConvolutionEffect.cpp
@@ -6,6 +6,11 @@
  */
 #include "src/gpu/effects/GrMatrixConvolutionEffect.h"
 
+#include "include/private/SkHalf.h"
+#include "src/gpu/GrBitmapTextureMaker.h"
+#include "src/gpu/GrContextPriv.h"
+#include "src/gpu/GrProxyProvider.h"
+#include "src/gpu/GrRecordingContextPriv.h"
 #include "src/gpu/GrTexture.h"
 #include "src/gpu/GrTextureProxy.h"
 #include "src/gpu/effects/GrTextureEffect.h"
@@ -26,34 +31,195 @@
 private:
     typedef GrGLSLProgramDataManager::UniformHandle UniformHandle;
 
+    void emitKernelBlock(EmitArgs&, SkIPoint);
+
     UniformHandle               fKernelUni;
     UniformHandle               fKernelOffsetUni;
     UniformHandle               fGainUni;
     UniformHandle               fBiasUni;
+    UniformHandle               fKernelBiasUni;
 
     typedef GrGLSLFragmentProcessor INHERITED;
 };
 
+GrMatrixConvolutionEffect::KernelWrapper GrMatrixConvolutionEffect::KernelWrapper::Make(
+        GrRecordingContext* context, SkISize size, const GrCaps& caps, const SkScalar* values) {
+    if (nullptr == context || nullptr == values || size.isEmpty()) {
+        return {};
+    }
+    const int length = size.area();
+    // Small kernel -> just fill the array.
+    KernelWrapper result(size);
+    if (length <= kMaxUniformSize) {
+        for (int i = 0; i < length; i++) {
+            result.fArray[i] = SkScalarToFloat(values[i]);
+        }
+        return result;
+    }
+
+    ScalableSampler& scalableSampler = result.fScalableSampler;
+    bool useA16 =
+        context->defaultBackendFormat(kA16_float_SkColorType, GrRenderable::kNo).isValid();
+    SkScalar min = values[0];
+    if (!useA16) {
+        // Determine min and max values to figure out inner gain & bias.
+        SkScalar max = values[0];
+        for (int i = 1; i < length; i++) {
+            if (values[i] < min) {
+                min = values[i];
+            }
+            if (values[i] > max) {
+                max = values[i];
+            }
+        }
+        // Treat near-0 gain (i.e. box blur) as 1, and let the kernelBias
+        // move everything up to the final value.
+        const SkScalar computedGain = max - min;
+        scalableSampler.fGain =
+            SkScalarNearlyZero(computedGain) ? 1.0f : SkScalarToFloat(computedGain);
+        // Inner bias is pre-inner-gain so we divide that out.
+        scalableSampler.fBias = SkScalarToFloat(min) / scalableSampler.fGain;
+    }
+
+    // TODO: Enable kernel caching and check perf.
+    static constexpr bool kCacheKernelTexture = false;
+
+    GrUniqueKey key;
+    if (kCacheKernelTexture) {
+        static const GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain();
+        GrUniqueKey::Builder builder(&key, kDomain, length, "Matrix Convolution Kernel");
+        // Texture cache key is the exact content of the kernel.
+        static_assert(sizeof(float) == 4);
+        for (int i = 0; i < length; i++) {
+            builder[i] = *(const uint32_t*)&values[i];
+        }
+        builder.finish();
+    }
+
+    // Find or create a texture.
+    GrProxyProvider* proxyProvider = context->priv().proxyProvider();
+    GrSurfaceProxyView view;
+    SkColorType colorType = useA16 ? kA16_float_SkColorType : kAlpha_8_SkColorType;
+    sk_sp<GrTextureProxy> cachedKernel;
+    if (kCacheKernelTexture && (cachedKernel = proxyProvider->findOrCreateProxyByUniqueKey(key))) {
+        GrSwizzle swizzle =
+            context->priv().caps()->getReadSwizzle(cachedKernel->backendFormat(),
+                                                   SkColorTypeToGrColorType(colorType));
+        view = {std::move(cachedKernel), kTopLeft_GrSurfaceOrigin, swizzle};
+    } else {
+        SkBitmap bm;
+        auto info = SkImageInfo::Make({(int)GrNextPow2(length), 1}, colorType,
+                                      kPremul_SkAlphaType, nullptr);
+        if (!bm.tryAllocPixels(info)) {
+            return {};
+        }
+        for (int i = 0; i < length; i++) {
+            if (useA16) {
+                *bm.getAddr16(i, 0) = SkFloatToHalf(values[i]);
+            } else {
+                *bm.getAddr8(i, 0) =
+                    SkScalarRoundToInt((values[i] - min) / scalableSampler.fGain * 255);
+            }
+        }
+        bm.setImmutable();
+        GrBitmapTextureMaker maker(context, bm, GrImageTexGenPolicy::kNew_Uncached_Budgeted);
+        view = maker.view(GrMipMapped::kNo);
+        if (!view) {
+            return {};
+        }
+        if (kCacheKernelTexture) {
+            proxyProvider->assignUniqueKeyToProxy(key, view.asTextureProxy());
+        }
+    }
+    scalableSampler.fSampler = { std::move(view) };
+    return result;
+}
+
+bool GrMatrixConvolutionEffect::KernelWrapper::operator==(const KernelWrapper& k) const {
+    if (fSize != k.fSize) {
+        return false;
+    } else if (this->isSampled()) {
+        return fScalableSampler == k.fScalableSampler;
+    } else {
+        return std::equal(fArray.begin(), fArray.begin() + fSize.area(), k.fArray.begin());
+    }
+}
+
+bool GrMatrixConvolutionEffect::KernelWrapper::ScalableSampler::operator==(
+                                                                const ScalableSampler& k) const {
+    return fSampler == k.fSampler && fGain == k.fGain && fBias == k.fBias;
+}
+
+// For sampled kernels, emit a for loop that does all the kernel accumulation.
+// For uniform kernels, emit a single iteration; emitCode() calls this once per kernel entry.
+// loc is ignored for sampled kernels.
+void GrGLMatrixConvolutionEffect::emitKernelBlock(EmitArgs& args, SkIPoint loc) {
+    const GrMatrixConvolutionEffect& mce = args.fFp.cast<GrMatrixConvolutionEffect>();
+    GrGLSLFPFragmentBuilder* fragBuilder = args.fFragBuilder;
+    GrGLSLUniformHandler* uniformHandler = args.fUniformHandler;
+    int kernelWidth = mce.kernelSize().width();
+    int kernelHeight = mce.kernelSize().height();
+    int kernelArea = kernelWidth * kernelHeight;
+
+    if (mce.kernelIsSampled()) {
+        fragBuilder->codeAppendf("half2 kernelCoord = half2(0, 0);");
+        fragBuilder->codeAppendf("for (int i = 0; i < %d; ++i)", (int)kernelArea);
+    }
+
+    GrGLSLShaderBuilder::ShaderBlock block(fragBuilder);
+
+    fragBuilder->codeAppend("half k;");
+    fragBuilder->codeAppend("half2 sourceOffset;");
+    if (mce.kernelIsSampled()) {
+        const char* kernelBias = uniformHandler->getUniformCStr(fKernelBiasUni);
+        fragBuilder->codeAppend("k = ");
+        fragBuilder->appendTextureLookup(args.fTexSamplers[0], "kernelCoord");
+        fragBuilder->codeAppendf(".w + %s;", kernelBias);
+        fragBuilder->codeAppendf("sourceOffset.y = floor(i / %d);", kernelWidth);
+        fragBuilder->codeAppendf("sourceOffset.x = i - sourceOffset.y * %d;", kernelWidth);
+        float kernelStride = 1.0f / (float)GrNextPow2(kernelArea);
+        fragBuilder->codeAppendf("kernelCoord.x += %f;", kernelStride);
+    } else {
+        fragBuilder->codeAppendf("sourceOffset = half2(%d, %d);", loc.x(), loc.y());
+        int offset = loc.y() * kernelWidth + loc.x();
+        static constexpr const char kVecSuffix[][4] = { ".x", ".y", ".z", ".w" };
+        const char* kernel = uniformHandler->getUniformCStr(fKernelUni);
+        fragBuilder->codeAppendf("k = %s[%d]%s;", kernel, offset / 4,
+                                 kVecSuffix[offset & 0x3]);
+    }
+
+    auto sample = this->invokeChild(0, args, "coord + sourceOffset");
+    fragBuilder->codeAppendf("half4 c = %s;", sample.c_str());
+    if (!mce.convolveAlpha()) {
+        fragBuilder->codeAppend("c.rgb /= max(c.a, 0.0001);");
+        fragBuilder->codeAppend("c.rgb = saturate(c.rgb);");
+    }
+    fragBuilder->codeAppend("sum += c * k;");
+}
+
 void GrGLMatrixConvolutionEffect::emitCode(EmitArgs& args) {
     const GrMatrixConvolutionEffect& mce = args.fFp.cast<GrMatrixConvolutionEffect>();
 
-    int kWidth = mce.kernelSize().width();
-    int kHeight = mce.kernelSize().height();
+    int kernelWidth = mce.kernelSize().width();
+    int kernelHeight = mce.kernelSize().height();
 
-    int arrayCount = (kWidth * kHeight + 3) / 4;
-    SkASSERT(4 * arrayCount >= kWidth * kHeight);
+    int arrayCount = (kernelWidth * kernelHeight + 3) / 4;
+    SkASSERT(4 * arrayCount >= kernelWidth * kernelHeight);
 
     GrGLSLUniformHandler* uniformHandler = args.fUniformHandler;
-    fKernelUni = uniformHandler->addUniformArray(&mce, kFragment_GrShaderFlag, kHalf4_GrSLType,
-                                                 "Kernel",
-                                                 arrayCount);
+    if (mce.kernelIsSampled()) {
+        fKernelBiasUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag,
+                                                    kHalf_GrSLType, "KernelBias");
+    } else {
+        fKernelUni = uniformHandler->addUniformArray(&mce, kFragment_GrShaderFlag,
+                                                     kHalf4_GrSLType, "Kernel", arrayCount);
+    }
     fKernelOffsetUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf2_GrSLType,
                                                   "KernelOffset");
     fGainUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf_GrSLType, "Gain");
     fBiasUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf_GrSLType, "Bias");
 
     const char* kernelOffset = uniformHandler->getUniformCStr(fKernelOffsetUni);
-    const char* kernel = uniformHandler->getUniformCStr(fKernelUni);
     const char* gain = uniformHandler->getUniformCStr(fGainUni);
     const char* bias = uniformHandler->getUniformCStr(fBiasUni);
 
@@ -62,27 +228,17 @@
                                                     mce.sampleMatrix());
     fragBuilder->codeAppend("half4 sum = half4(0, 0, 0, 0);");
     fragBuilder->codeAppendf("float2 coord = %s - %s;", coords2D.c_str(), kernelOffset);
-    fragBuilder->codeAppend("half4 c;");
 
-    const char* kVecSuffix[4] = { ".x", ".y", ".z", ".w" };
-    for (int y = 0; y < kHeight; y++) {
-        for (int x = 0; x < kWidth; x++) {
-            GrGLSLShaderBuilder::ShaderBlock block(fragBuilder);
-            int offset = y*kWidth + x;
-
-            fragBuilder->codeAppendf("half k = %s[%d]%s;", kernel, offset / 4,
-                                     kVecSuffix[offset & 0x3]);
-            SkSL::String coord;
-            coord.appendf("coord + half2(%d, %d)", x, y);
-            auto sample = this->invokeChild(0, args, coord);
-            fragBuilder->codeAppendf("half4 c = %s;", sample.c_str());
-            if (!mce.convolveAlpha()) {
-                fragBuilder->codeAppend("c.rgb /= c.a;");
-                fragBuilder->codeAppend("c.rgb = saturate(c.rgb);");
+    if (mce.kernelIsSampled()) {
+        this->emitKernelBlock(args, {});
+    } else {
+        for (int x = 0; x < kernelWidth; ++x) {
+            for (int y = 0; y < kernelHeight; ++y) {
+                this->emitKernelBlock(args, SkIPoint::Make(x, y));
             }
-            fragBuilder->codeAppend("sum += c * k;");
         }
     }
+
     if (mce.convolveAlpha()) {
         fragBuilder->codeAppendf("%s = sum * %s + %s;", args.fOutputColor, gain, bias);
         fragBuilder->codeAppendf("%s.a = saturate(%s.a);", args.fOutputColor, args.fOutputColor);
@@ -90,7 +246,7 @@
                                  args.fOutputColor, args.fOutputColor, args.fOutputColor);
     } else {
         auto sample = this->invokeChild(0, args, coords2D.c_str());
-        fragBuilder->codeAppendf("c = %s;", sample.c_str());
+        fragBuilder->codeAppendf("half4 c = %s;", sample.c_str());
         fragBuilder->codeAppendf("%s.a = c.a;", args.fOutputColor);
         fragBuilder->codeAppendf("%s.rgb = saturate(sum.rgb * %s + %s);", args.fOutputColor, gain, bias);
         fragBuilder->codeAppendf("%s.rgb *= %s.a;", args.fOutputColor, args.fOutputColor);
@@ -111,17 +267,22 @@
                                             const GrFragmentProcessor& processor) {
     const GrMatrixConvolutionEffect& conv = processor.cast<GrMatrixConvolutionEffect>();
     pdman.set2fv(fKernelOffsetUni, 1, conv.kernelOffset().ptr());
-    int kernelCount = conv.kernelSize().width() * conv.kernelSize().height();
-    int arrayCount = (kernelCount + 3) / 4;
-    SkASSERT(4 * arrayCount >= kernelCount);
-    pdman.set4fv(fKernelUni, arrayCount, conv.kernel());
-    pdman.set1f(fGainUni, conv.gain());
+    float totalGain = conv.gain();
+    if (conv.kernelIsSampled()) {
+        totalGain *= conv.kernelSampleGain();
+        pdman.set1f(fKernelBiasUni, conv.kernelSampleBias());
+    } else {
+        int kernelCount = conv.kernelSize().area();
+        int arrayCount = (kernelCount + 3) / 4;
+        SkASSERT(4 * arrayCount >= kernelCount);
+        pdman.set4fv(fKernelUni, arrayCount, conv.kernel());
+    }
     pdman.set1f(fBiasUni, conv.bias());
+    pdman.set1f(fGainUni, totalGain);
 }
 
 GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(std::unique_ptr<GrFragmentProcessor> child,
-                                                     const SkISize& kernelSize,
-                                                     const SkScalar* kernel,
+                                                     KernelWrapper kernel,
                                                      SkScalar gain,
                                                      SkScalar bias,
                                                      const SkIPoint& kernelOffset,
@@ -129,14 +290,14 @@
         // To advertise either the modulation or opaqueness optimizations we'd have to examine the
         // parameters.
         : INHERITED(kGrMatrixConvolutionEffect_ClassID, kNone_OptimizationFlags)
-        , fKernelSize(kernelSize)
+        , fKernel(std::move(kernel))
         , fGain(SkScalarToFloat(gain))
         , fBias(SkScalarToFloat(bias) / 255.0f)
         , fConvolveAlpha(convolveAlpha) {
     child->setSampledWithExplicitCoords();
     this->registerChildProcessor(std::move(child));
-    for (int i = 0; i < kernelSize.width() * kernelSize.height(); i++) {
-        fKernel[i] = SkScalarToFloat(kernel[i]);
+    if (fKernel.isSampled()) {
+        this->setTextureSamplerCnt(1);
     }
     fKernelOffset = {static_cast<float>(kernelOffset.x()),
                      static_cast<float>(kernelOffset.y())};
@@ -145,7 +306,7 @@
 
 GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(const GrMatrixConvolutionEffect& that)
         : INHERITED(kGrMatrixConvolutionEffect_ClassID, kNone_OptimizationFlags)
-        , fKernelSize(that.fKernelSize)
+        , fKernel(that.fKernel)
         , fGain(that.fGain)
         , fBias(that.fBias)
         , fKernelOffset(that.fKernelOffset)
@@ -153,7 +314,9 @@
     auto child = that.childProcessor(0).clone();
     child->setSampledWithExplicitCoords();
     this->registerChildProcessor(std::move(child));
-    std::copy_n(that.fKernel, fKernelSize.width() * fKernelSize.height(), fKernel);
+    if (fKernel.isSampled()) {
+        this->setTextureSamplerCnt(1);
+    }
     this->addCoordTransform(&fCoordTransform);
 }
 
@@ -172,14 +335,18 @@
 
 bool GrMatrixConvolutionEffect::onIsEqual(const GrFragmentProcessor& sBase) const {
     const GrMatrixConvolutionEffect& s = sBase.cast<GrMatrixConvolutionEffect>();
-    return fKernelSize == s.kernelSize() &&
-           std::equal(fKernel, fKernel + fKernelSize.area(), s.fKernel) &&
+    return fKernel == s.fKernel &&
            fGain == s.gain() &&
            fBias == s.bias() &&
            fKernelOffset == s.kernelOffset() &&
            fConvolveAlpha == s.convolveAlpha();
 }
 
+const GrFragmentProcessor::TextureSampler& GrMatrixConvolutionEffect::onTextureSampler(
+        int index) const {
+    return IthTextureSampler(index, fKernel.scalableSampler().fSampler);
+}
+
 static void fill_in_1D_gaussian_kernel_with_stride(float* kernel, int size, int stride,
                                                    float twoSigmaSqrd) {
     SkASSERT(!SkScalarNearlyZero(twoSigmaSqrd, SK_ScalarNearlyZero));
@@ -204,7 +371,6 @@
 
 static void fill_in_2D_gaussian_kernel(float* kernel, int width, int height,
                                        SkScalar sigmaX, SkScalar sigmaY) {
-    SkASSERT(width * height <= MAX_KERNEL_SIZE);
     const float twoSigmaSqrdX = 2.0f * SkScalarToFloat(SkScalarSquare(sigmaX));
     const float twoSigmaSqrdY = 2.0f * SkScalarToFloat(SkScalarSquare(sigmaY));
 
@@ -260,7 +426,8 @@
     }
 }
 
-std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::Make(GrSurfaceProxyView srcView,
+std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::Make(GrRecordingContext* context,
+                                                                     GrSurfaceProxyView srcView,
                                                                      const SkIRect& srcBounds,
                                                                      const SkISize& kernelSize,
                                                                      const SkScalar* kernel,
@@ -270,14 +437,19 @@
                                                                      GrSamplerState::WrapMode wm,
                                                                      bool convolveAlpha,
                                                                      const GrCaps& caps) {
+    auto kw = KernelWrapper::Make(context, kernelSize, caps, kernel);
+    if (!kw.isValid()) {
+        return nullptr;
+    }
     GrSamplerState sampler(wm, GrSamplerState::Filter::kNearest);
     auto child = GrTextureEffect::MakeSubset(std::move(srcView), kPremul_SkAlphaType, SkMatrix::I(),
                                              sampler, SkRect::Make(srcBounds), caps);
     return std::unique_ptr<GrFragmentProcessor>(new GrMatrixConvolutionEffect(
-            std::move(child), kernelSize, kernel, gain, bias, kernelOffset, convolveAlpha));
+            std::move(child), std::move(kw), gain, bias, kernelOffset, convolveAlpha));
 }
 
 std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::MakeGaussian(
+        GrRecordingContext* context,
         GrSurfaceProxyView srcView,
         const SkIRect& srcBounds,
         const SkISize& kernelSize,
@@ -289,11 +461,11 @@
         SkScalar sigmaX,
         SkScalar sigmaY,
         const GrCaps& caps) {
-    float kernel[MAX_KERNEL_SIZE];
-
-    fill_in_2D_gaussian_kernel(kernel, kernelSize.width(), kernelSize.height(), sigmaX, sigmaY);
-    return Make(std::move(srcView), srcBounds, kernelSize, kernel, gain, bias, kernelOffset, wm,
-                convolveAlpha, caps);
+    SkAutoSTMalloc<32, float> kernel(kernelSize.area());
+    fill_in_2D_gaussian_kernel(kernel.get(), kernelSize.width(), kernelSize.height(),
+                               sigmaX, sigmaY);
+    return Make(context, std::move(srcView), srcBounds, kernelSize, kernel.get(),
+                gain, bias, kernelOffset, wm, convolveAlpha, caps);
 }
 
 GR_DEFINE_FRAGMENT_PROCESSOR_TEST(GrMatrixConvolutionEffect);
@@ -302,8 +474,9 @@
 std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::TestCreate(GrProcessorTestData* d) {
     auto [view, ct, at] = d->randomView();
 
-    int width = d->fRandom->nextRangeU(1, MAX_KERNEL_SIZE);
-    int height = d->fRandom->nextRangeU(1, MAX_KERNEL_SIZE / width);
+    static constexpr size_t kMaxTestKernelSize = 2 * kMaxUniformSize;
+    int width = d->fRandom->nextRangeU(1, kMaxTestKernelSize);
+    int height = d->fRandom->nextRangeU(1, kMaxTestKernelSize / width);
     SkISize kernelSize = SkISize::Make(width, height);
     std::unique_ptr<SkScalar[]> kernel(new SkScalar[width * height]);
     for (int i = 0; i < width * height; i++) {
@@ -325,8 +498,8 @@
     auto wm = static_cast<GrSamplerState::WrapMode>(
             d->fRandom->nextULessThan(GrSamplerState::kWrapModeCount));
     bool convolveAlpha = d->fRandom->nextBool();
-
-    return GrMatrixConvolutionEffect::Make(std::move(view),
+    return GrMatrixConvolutionEffect::Make(d->context()->priv().asRecordingContext(),
+                                           std::move(view),
                                            bounds,
                                            kernelSize,
                                            kernel.get(),