Implement GPU path for matrix convolution.  Note that when not convolving alpha,
the unpremultiplying is done less efficiently than in the raster path:  it's
done on each texture access, rather than as a pre-processing pass.   This was
so I could do the filter as a single custom stage; I will try the optimization
separately.

This implementation gives a ~30X speedup on the GPU results for the
matrixconvolution bench (~10X due to the GPU, and ~3X due to texture
uploads/readback removal).

Note:  this also changes the output of the matrixconvolution test for the
software path, so it will likely break the bots until that test is rebaselined.

Review URL:  https://codereview.appspot.com/6585069/



git-svn-id: http://skia.googlecode.com/svn/trunk@5809 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/effects/SkMatrixConvolutionImageFilter.cpp b/src/effects/SkMatrixConvolutionImageFilter.cpp
index 1c6f980..6b8a06d 100644
--- a/src/effects/SkMatrixConvolutionImageFilter.cpp
+++ b/src/effects/SkMatrixConvolutionImageFilter.cpp
@@ -12,6 +12,10 @@
 #include "SkRect.h"
 #include "SkUnPreMultiply.h"
 
+#if SK_SUPPORT_GPU
+#include "gl/GrGLProgramStage.h"
+#endif
+
 SkMatrixConvolutionImageFilter::SkMatrixConvolutionImageFilter(const SkISize& kernelSize, const SkScalar* kernel, SkScalar gain, SkScalar bias, const SkIPoint& target, TileMode tileMode, bool convolveAlpha, SkImageFilter* input)
   : INHERITED(input),
     fKernelSize(kernelSize),
@@ -23,6 +27,7 @@
     uint32_t size = fKernelSize.fWidth * fKernelSize.fHeight;
     fKernel = SkNEW_ARRAY(SkScalar, size);
     memcpy(fKernel, kernel, size * sizeof(SkScalar));
+    SkASSERT(kernelSize.fWidth >= 1 && kernelSize.fHeight >= 1);
     SkASSERT(target.fX >= 0 && target.fX < kernelSize.fWidth);
     SkASSERT(target.fY >= 0 && target.fY < kernelSize.fHeight);
 }
@@ -121,11 +126,12 @@
             }
             int a = convolveAlpha
                   ? SkClampMax(SkScalarFloorToInt(SkScalarMul(sumA, fGain) + fBias), 255)
-                  : SkGetPackedA32(PixelFetcher::fetch(src, x, y));
+                  : 255;
             int r = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumR, fGain) + fBias), a);
             int g = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumG, fGain) + fBias), a);
             int b = SkClampMax(SkScalarFloorToInt(SkScalarMul(sumB, fGain) + fBias), a);
             if (!convolveAlpha) {
+                a = SkGetPackedA32(PixelFetcher::fetch(src, x, y));
                 *dptr++ = SkPreMultiplyARGB(a, r, g, b);
             } else {
                 *dptr++ = SkPackARGB32(a, r, g, b);
@@ -225,3 +231,314 @@
     filterBorderPixels(src, result, bottom);
     return true;
 }
+
+#if SK_SUPPORT_GPU
+
+///////////////////////////////////////////////////////////////////////////////
+
+class GrGLMatrixConvolutionEffect;
+
+class GrMatrixConvolutionEffect : public GrSingleTextureEffect {
+public:
+    typedef SkMatrixConvolutionImageFilter::TileMode TileMode;
+    GrMatrixConvolutionEffect(GrTexture*,
+                              const SkISize& kernelSize,
+                              const SkScalar* kernel,
+                              SkScalar gain,
+                              SkScalar bias,
+                              const SkIPoint& target,
+                              TileMode tileMode,
+                              bool convolveAlpha);
+    virtual ~GrMatrixConvolutionEffect();
+
+    static const char* Name() { return "MatrixConvolution"; }
+    const SkISize& kernelSize() const { return fKernelSize; }
+    const float* target() const { return fTarget; }
+    const float* kernel() const { return fKernel; }
+    float gain() const { return fGain; }
+    float bias() const { return fBias; }
+    TileMode tileMode() const { return fTileMode; }
+    bool convolveAlpha() const { return fConvolveAlpha; }
+
+    typedef GrGLMatrixConvolutionEffect GLProgramStage;
+
+    virtual const GrProgramStageFactory& getFactory() const SK_OVERRIDE;
+    virtual bool isEqual(const GrCustomStage&) const SK_OVERRIDE;
+
+private:
+    SkISize  fKernelSize;
+    float   *fKernel;
+    float    fGain;
+    float    fBias;
+    float    fTarget[2];
+    TileMode fTileMode;
+    bool     fConvolveAlpha;
+
+    GR_DECLARE_CUSTOM_STAGE_TEST;
+
+    typedef GrSingleTextureEffect INHERITED;
+};
+
+class GrGLMatrixConvolutionEffect : public GrGLProgramStage {
+public:
+    GrGLMatrixConvolutionEffect(const GrProgramStageFactory& factory,
+                                const GrCustomStage& stage);
+    virtual void setupVariables(GrGLShaderBuilder* builder) SK_OVERRIDE;
+    virtual void emitVS(GrGLShaderBuilder* state,
+                        const char* vertexCoords) SK_OVERRIDE {}
+    virtual void emitFS(GrGLShaderBuilder* state,
+                        const char* outputColor,
+                        const char* inputColor,
+                        const TextureSamplerArray&) SK_OVERRIDE;
+
+    static inline StageKey GenKey(const GrCustomStage& s, const GrGLCaps& caps);
+
+    virtual void setData(const GrGLUniformManager&,
+                         const GrCustomStage&,
+                         const GrRenderTarget*,
+                         int stageNum) SK_OVERRIDE;
+
+private:
+    typedef GrGLUniformManager::UniformHandle        UniformHandle;
+    typedef SkMatrixConvolutionImageFilter::TileMode TileMode;
+    SkISize        fKernelSize;
+    TileMode       fTileMode;
+    bool           fConvolveAlpha;
+
+    UniformHandle  fKernelUni;
+    UniformHandle  fImageIncrementUni;
+    UniformHandle  fTargetUni;
+    UniformHandle  fGainUni;
+    UniformHandle  fBiasUni;
+};
+
+GrGLMatrixConvolutionEffect::GrGLMatrixConvolutionEffect(const GrProgramStageFactory& factory,
+                                           const GrCustomStage& stage)
+    : GrGLProgramStage(factory)
+    , fKernelUni(GrGLUniformManager::kInvalidUniformHandle)
+    , fImageIncrementUni(GrGLUniformManager::kInvalidUniformHandle)
+    , fTargetUni(GrGLUniformManager::kInvalidUniformHandle)
+    , fGainUni(GrGLUniformManager::kInvalidUniformHandle)
+    , fBiasUni(GrGLUniformManager::kInvalidUniformHandle) {
+    const GrMatrixConvolutionEffect& m = static_cast<const GrMatrixConvolutionEffect&>(stage);
+    fKernelSize = m.kernelSize();
+    fTileMode = m.tileMode();
+    fConvolveAlpha = m.convolveAlpha();
+}
+
+void GrGLMatrixConvolutionEffect::setupVariables(GrGLShaderBuilder* builder) {
+    fImageIncrementUni = builder->addUniform(GrGLShaderBuilder::kFragment_ShaderType,
+                                             kVec2f_GrSLType, "ImageIncrement");
+    fKernelUni = builder->addUniformArray(GrGLShaderBuilder::kFragment_ShaderType,
+                                             kFloat_GrSLType, "Kernel", fKernelSize.width() * fKernelSize.height());
+    fTargetUni = builder->addUniform(GrGLShaderBuilder::kFragment_ShaderType,
+                                             kVec2f_GrSLType, "Target");
+    fGainUni = builder->addUniform(GrGLShaderBuilder::kFragment_ShaderType,
+                                   kFloat_GrSLType, "Gain");
+    fBiasUni = builder->addUniform(GrGLShaderBuilder::kFragment_ShaderType,
+                                   kFloat_GrSLType, "Bias");
+}
+
+static void appendTextureLookup(GrGLShaderBuilder* builder,
+                                const GrGLShaderBuilder::TextureSampler& sampler,
+                                const char* coord,
+                                SkMatrixConvolutionImageFilter::TileMode tileMode) {
+    SkString* code = &builder->fFSCode;  // the lookup expression is appended to the FS code
+    SkString clampedCoord;  // owns the rewritten coord text until the lookup has been appended
+    switch (tileMode) {
+        case SkMatrixConvolutionImageFilter::kClamp_TileMode:  // pin coords to [0, 1]
+            clampedCoord.printf("clamp(%s, 0.0, 1.0)", coord);
+            coord = clampedCoord.c_str();
+            break;
+        case SkMatrixConvolutionImageFilter::kRepeat_TileMode:  // wrap coords with fract()
+            clampedCoord.printf("fract(%s)", coord);
+            coord = clampedCoord.c_str();
+            break;
+        case SkMatrixConvolutionImageFilter::kClampToBlack_TileMode:  // vec4(0) if out of range
+            code->appendf("clamp(%s, 0.0, 1.0) != %s ? vec4(0, 0, 0, 0) : ", coord, coord);
+            break;
+    }
+    builder->appendTextureLookup(code, sampler, coord);  // emits the lookup; no trailing ';'
+}
+
+void GrGLMatrixConvolutionEffect::emitFS(GrGLShaderBuilder* builder,
+                                  const char* outputColor,
+                                  const char* inputColor,
+                                  const TextureSamplerArray& samplers) {
+    // Emits a loop over the kernel that accumulates weighted texel lookups into "sum", then
+    // writes sum * gain + bias to outputColor, clamped like the raster path's SkClampMax calls.
+    SkString* code = &builder->fFSCode;
+    const char* target = builder->getUniformCStr(fTargetUni);
+    const char* imgInc = builder->getUniformCStr(fImageIncrementUni);
+    const char* kernel = builder->getUniformCStr(fKernelUni);
+    const char* gain = builder->getUniformCStr(fGainUni);
+    const char* bias = builder->getUniformCStr(fBiasUni);
+    code->appendf("\t\tvec4 sum = vec4(0, 0, 0, 0);\n");
+    code->appendf("\t\tvec2 coord = %s - %s * %s;\n",
+                  builder->defaultTexCoordsName(), target, imgInc);
+    code->appendf("\t\tfor (int y = 0; y < %d; y++) {\n", fKernelSize.height());
+    code->appendf("\t\t\tfor (int x = 0; x < %d; x++) {\n", fKernelSize.width());
+    code->appendf("\t\t\t\tfloat k = %s[y * %d + x];\n", kernel, fKernelSize.width());
+    code->appendf("\t\t\t\tvec2 coord2 = coord + vec2(x, y) * %s;\n", imgInc);
+    code->appendf("\t\t\t\tvec4 c = ");
+    appendTextureLookup(builder, samplers[0], "coord2", fTileMode);
+    code->appendf(";\n");
+    if (!fConvolveAlpha) {
+        // Unpremultiply each sample; max() keeps the divide defined for fully transparent texels.
+        code->appendf("\t\t\t\tc.rgb /= max(c.a, 0.0001);\n");
+    }
+    code->appendf("\t\t\t\tsum += c * k;\n");
+    code->appendf("\t\t\t}\n");
+    code->appendf("\t\t}\n");
+    if (fConvolveAlpha) {
+        code->appendf("\t\t%s = sum * %s + %s;\n", outputColor, gain, bias);
+        code->appendf("\t\t%s.a = clamp(%s.a, 0.0, 1.0);\n", outputColor, outputColor);
+        code->appendf("\t\t%s.rgb = clamp(%s.rgb, 0.0, %s.a);\n", outputColor, outputColor, outputColor);
+    } else {
+        code->appendf("\t\t%s.a = (", outputColor);
+        appendTextureLookup(builder, samplers[0], builder->defaultTexCoordsName(), fTileMode);
+        code->appendf(").a;\n");
+        code->appendf("\t\t%s.rgb = clamp(sum.rgb * %s + %s, 0.0, 1.0);\n", outputColor, gain, bias);
+        code->appendf("\t\t%s.rgb *= %s.a;\n", outputColor, outputColor);
+    }
+}
+
+namespace {
+
+int encodeXY(int x, int y) {  // Packs a kernel size (x by y) into the low 7 bits of a stage key.
+    SkASSERT(x >= 1 && y >= 1 && x * y <= 31);  // at area 32, 4x8 would alias the code for 4x4
+    if (y < x) {
+        return 0x40 | encodeXY(y, x);  // bit 6 flags x > y; the rest encodes the transposed size
+    }
+    return (0x40 >> x) | (y - x);  // a marker bit chosen by x, with the y - x offset below it
+}
+
+};
+
+GrGLProgramStage::StageKey GrGLMatrixConvolutionEffect::GenKey(const GrCustomStage& s,
+                                                        const GrGLCaps& caps) {
+    const GrMatrixConvolutionEffect& m = static_cast<const GrMatrixConvolutionEffect&>(s);
+    StageKey key = encodeXY(m.kernelSize().width(), m.kernelSize().height());  // low 7 bits
+    key |= m.tileMode() << 7;  // tile mode is stored starting at bit 7
+    key |= m.convolveAlpha() ? 1 << 9 : 0;  // bit 9 is set when alpha is convolved as well
+    return key;  // combines the settings that emitFS() bakes into the generated shader code
+}
+
+void GrGLMatrixConvolutionEffect::setData(const GrGLUniformManager& uman,
+                                   const GrCustomStage& data,
+                                   const GrRenderTarget*,
+                                   int stageNum) {
+    const GrMatrixConvolutionEffect& effect =
+        static_cast<const GrMatrixConvolutionEffect&>(data);
+    GrGLTexture& texture =
+        *static_cast<GrGLTexture*>(data.texture(0));
+    // the code we generated was for a specific kernel size
+    GrAssert(effect.kernelSize() == fKernelSize);
+    GrAssert(effect.tileMode() == fTileMode);
+    float imageIncrement[2];
+    imageIncrement[0] = 1.0f / texture.width();
+    imageIncrement[1] = 1.0f / texture.height();
+    uman.set2fv(fImageIncrementUni, 0, 1, imageIncrement);
+    uman.set2fv(fTargetUni, 0, 1, effect.target());
+    uman.set1fv(fKernelUni, 0, fKernelSize.width() * fKernelSize.height(), effect.kernel());
+    uman.set1f(fGainUni, effect.gain());
+    uman.set1f(fBiasUni, effect.bias());
+}
+
+GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(GrTexture* texture,
+                                                     const SkISize& kernelSize,
+                                                     const SkScalar* kernel,
+                                                     SkScalar gain,
+                                                     SkScalar bias,
+                                                     const SkIPoint& target,
+                                                     TileMode tileMode,
+                                                     bool convolveAlpha)
+  : INHERITED(texture),
+    fKernelSize(kernelSize),
+    fGain(SkScalarToFloat(gain)),
+    fBias(SkScalarToFloat(bias) / 255.0f),
+    fTileMode(tileMode),
+    fConvolveAlpha(convolveAlpha) {
+    fKernel = new float[kernelSize.width() * kernelSize.height()];
+    for (int i = 0; i < kernelSize.width() * kernelSize.height(); i++) {
+        fKernel[i] = SkScalarToFloat(kernel[i]);
+    }
+    fTarget[0] = target.x();
+    fTarget[1] = target.y();
+}
+
+GrMatrixConvolutionEffect::~GrMatrixConvolutionEffect() {
+    delete[] fKernel;
+}
+
+const GrProgramStageFactory& GrMatrixConvolutionEffect::getFactory() const {
+    return GrTProgramStageFactory<GrMatrixConvolutionEffect>::getInstance();
+}
+
+bool GrMatrixConvolutionEffect::isEqual(const GrCustomStage& sBase) const {
+    const GrMatrixConvolutionEffect& s = static_cast<const GrMatrixConvolutionEffect&>(sBase);
+    // fTarget is an array and s.target() a pointer, so compare them element by element;
+    // comparing them with == would only test whether both effects share the same storage.
+    return INHERITED::isEqual(sBase) &&
+           fKernelSize == s.kernelSize() &&
+           !memcmp(fKernel, s.kernel(), fKernelSize.width() * fKernelSize.height() * sizeof(float)) &&
+           fGain == s.gain() && fBias == s.bias() &&
+           fTarget[0] == s.target()[0] && fTarget[1] == s.target()[1] &&
+           fTileMode == s.tileMode() &&
+           fConvolveAlpha == s.convolveAlpha();
+}
+
+GR_DEFINE_CUSTOM_STAGE_TEST(GrMatrixConvolutionEffect);
+
+// A little bit less than the minimum # uniforms required by DX9SM2 (32). 
+// Allows for a 5x5 kernel (or 25x1, for that matter).
+#define MAX_KERNEL_SIZE 25
+
+GrCustomStage* GrMatrixConvolutionEffect::TestCreate(SkRandom* random,
+                                                     GrContext* context,
+                                                     GrTexture* textures[]) {
+    // Builds an effect with a random kernel of at most MAX_KERNEL_SIZE taps and random settings.
+    int texIdx = random->nextBool() ? GrCustomStageUnitTest::kSkiaPMTextureIdx :
+                                      GrCustomStageUnitTest::kAlphaTextureIdx;
+    int width = random->nextRangeU(1, MAX_KERNEL_SIZE);
+    int height = random->nextRangeU(1, MAX_KERNEL_SIZE / width);
+    SkISize kernelSize = SkISize::Make(width, height);
+    SkScalar* kernel = new SkScalar[width * height];
+    for (int i = 0; i < width * height; i++) {
+        kernel[i] = random->nextSScalar1();
+    }
+    SkScalar gain = random->nextSScalar1();
+    SkScalar bias = random->nextSScalar1();
+    // nextRangeU() is inclusive at both ends, so the last valid target index is size - 1.
+    SkIPoint target = SkIPoint::Make(random->nextRangeU(0, kernelSize.width() - 1),
+                                     random->nextRangeU(0, kernelSize.height() - 1));
+    TileMode tileMode = static_cast<TileMode>(random->nextRangeU(0, 2));
+    bool convolveAlpha = random->nextBool();
+    GrCustomStage* stage = SkNEW_ARGS(GrMatrixConvolutionEffect, (textures[texIdx],
+                                                                   kernelSize, kernel,
+                                                                   gain, bias, target,
+                                                                   tileMode, convolveAlpha));
+    // The effect's constructor copied the kernel, so free the scratch array.
+    delete[] kernel;
+    return stage;
+}
+
+bool SkMatrixConvolutionImageFilter::asNewCustomStage(GrCustomStage** stage,
+                                                      GrTexture* texture) const {
+    bool ok = fKernelSize.width() * fKernelSize.height() <= MAX_KERNEL_SIZE;
+    if (ok && stage) {
+        *stage = SkNEW_ARGS(GrMatrixConvolutionEffect, (texture,
+                                                        fKernelSize,
+                                                        fKernel,
+                                                        fGain,
+                                                        fBias,
+                                                        fTarget,
+                                                        fTileMode,
+                                                        fConvolveAlpha));
+    }
+    return ok;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+#endif