In reduced shader mode, the 1D Gaussian effect doesn't bake in the loop count.

Good for a reduction of 92 shader compiles in desk_carsvg.skp.

This is probably a candidate for doing all the time, not just in reduced
shader mode.

Bug: skia:11844
Change-Id: I84e1b41580828d6a4a548c19480cf12c47eeb299
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/399416
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
Commit-Queue: Brian Salomon <bsalomon@google.com>
diff --git a/src/gpu/effects/GrGaussianConvolutionFragmentProcessor.cpp b/src/gpu/effects/GrGaussianConvolutionFragmentProcessor.cpp
index 09ec9c3..e075aec 100644
--- a/src/gpu/effects/GrGaussianConvolutionFragmentProcessor.cpp
+++ b/src/gpu/effects/GrGaussianConvolutionFragmentProcessor.cpp
@@ -33,11 +33,28 @@
 private:
     UniformHandle fKernelUni;
     UniformHandle fOffsetsUni;
+    UniformHandle fKernelWidthUni;
     UniformHandle fIncrementUni;
 
     using INHERITED = GrGLSLFragmentProcessor;
 };
 
+enum class LoopType {
+    kUnrolled,
+    kFixedLength,
+    kVariableLength,
+};
+
+static LoopType loop_type(const GrShaderCaps& caps) {
+    // This checks that bitwise integer operations and array indexing by non-consts are allowed.
+    if (caps.generation() < k130_GrGLSLGeneration) {
+        return LoopType::kUnrolled;
+    }
+    // If we're in reduced shader mode and we can have a loop then use a uniform to limit the
+    // number of iterations so we don't need a code variation for each width.
+    return caps.reducedShaderMode() ? LoopType::kVariableLength : LoopType::kFixedLength;
+}
+
 void GrGaussianConvolutionFragmentProcessor::Impl::emitCode(EmitArgs& args) {
     const GrGaussianConvolutionFragmentProcessor& ce =
             args.fFp.cast<GrGaussianConvolutionFragmentProcessor>();
@@ -49,31 +66,52 @@
 
     int width = SkGpuBlurUtils::LinearKernelWidth(ce.fRadius);
 
-    int arrayCount = (width + 3) / 4;
-    SkASSERT(4 * arrayCount >= width);
+    LoopType loopType = loop_type(*args.fShaderCaps);
+
+    int arrayCount;
+    if (loopType == LoopType::kVariableLength) {
+        // Size the kernel uniform for the maximum width.
+        arrayCount = (SkGpuBlurUtils::LinearKernelWidth(kMaxKernelRadius) + 3) / 4;
+    } else {
+        arrayCount = (width + 3) / 4;
+        SkASSERT(4 * arrayCount >= width);
+    }
 
     Var kernel(kUniform_Modifier, Array(kHalf4_Type, arrayCount), "Kernel");
     fKernelUni = VarUniformHandle(kernel);
 
-    Var color(kHalf4_Type, "color", Half4(0));
-    Declare(color);
 
     Var offsets(kUniform_Modifier, Array(kHalf4_Type, arrayCount), "Offsets");
     fOffsetsUni = VarUniformHandle(offsets);
 
+    Var color(kHalf4_Type, "color", Half4(0));
+    Declare(color);
+
     Var coord(kFloat2_Type, "coord", sk_SampleCoord());
     Declare(coord);
 
-    // This checks that bitwise integer operations and array indexing by non-consts are allowed.
-    if (args.fShaderCaps->generation() >= k130_GrGLSLGeneration) {
-        Var i(kInt_Type, "i", 0);
-        For(Declare(i), i < width, i++,
-            color += SampleChild(/*index=*/0, coord + offsets[i / 4][i & 3] * increment) *
-                     kernel[i / 4][i & 0x3]);
-    } else {
-        for (int i = 0; i < width; i++) {
-            color += SampleChild(/*index=*/0, coord + offsets[i / 4][i & 3] * increment) *
-                     kernel[i / 4][i & 0x3];
+    switch (loopType) {
+        case LoopType::kUnrolled:
+            for (int i = 0; i < width; i++) {
+                color += SampleChild(/*index=*/0, coord + offsets[i / 4][i & 3] * increment) *
+                         kernel[i / 4][i & 0x3];
+            }
+            break;
+        case LoopType::kFixedLength: {
+            Var i(kInt_Type, "i", 0);
+            For(Declare(i), i < width, i++,
+                color += SampleChild(/*index=*/0, coord + offsets[i / 4][i & 3] * increment) *
+                         kernel[i / 4][i & 0x3]);
+            break;
+        }
+        case LoopType::kVariableLength: {
+            Var kernelWidth(kUniform_Modifier, kInt_Type, "kernelWidth");
+            fKernelWidthUni = VarUniformHandle(kernelWidth);
+            Var i(kInt_Type, "i", 0);
+            For(Declare(i), i < kernelWidth, i++,
+                color += SampleChild(/*index=*/0, coord + offsets[i / 4][i & 3] * increment) *
+                         kernel[i / 4][i & 0x3]);
+            break;
         }
     }
 
@@ -96,13 +134,18 @@
     SkASSERT(arraySize <= SK_ARRAY_COUNT(GrGaussianConvolutionFragmentProcessor::fKernel));
     pdman.set4fv(fKernelUni, arrayCount, conv.fKernel);
     pdman.set4fv(fOffsetsUni, arrayCount, conv.fOffsets);
+    if (fKernelWidthUni.isValid()) {
+        pdman.set1i(fKernelWidthUni, width);
+    }
 }
 
 void GrGaussianConvolutionFragmentProcessor::Impl::GenKey(const GrProcessor& processor,
-                                                          const GrShaderCaps&,
+                                                          const GrShaderCaps& shaderCaps,
                                                           GrProcessorKeyBuilder* b) {
     const auto& conv = processor.cast<GrGaussianConvolutionFragmentProcessor>();
-    b->add32(conv.fRadius);
+    if (loop_type(shaderCaps) != LoopType::kVariableLength) {
+        b->add32(conv.fRadius);
+    }
 }
 
 ///////////////////////////////////////////////////////////////////////////////