Partial reapply of r5364 minus the non-neon code path.

See https://codereview.appspot.com/6465075 for a more detailed description of the contents of this CL.

Review URL: https://codereview.chromium.org/13060004

git-svn-id: http://skia.googlecode.com/svn/trunk@8579 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/bench/BitmapBench.cpp b/bench/BitmapBench.cpp
index 0efdde3..bd2ba68 100644
--- a/bench/BitmapBench.cpp
+++ b/bench/BitmapBench.cpp
@@ -21,25 +21,6 @@
     "ERROR", "a1", "a8", "index8", "565", "4444", "8888"
 };
 
-static void drawIntoBitmap(const SkBitmap& bm) {
-    const int w = bm.width();
-    const int h = bm.height();
-
-    SkCanvas canvas(bm);
-    SkPaint p;
-    p.setAntiAlias(true);
-    p.setColor(SK_ColorRED);
-    canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2,
-                      SkIntToScalar(SkMin32(w, h))*3/8, p);
-
-    SkRect r;
-    r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h));
-    p.setStyle(SkPaint::kStroke_Style);
-    p.setStrokeWidth(SkIntToScalar(4));
-    p.setColor(SK_ColorBLUE);
-    canvas.drawRect(r, p);
-}
-
 static int conv6ToByte(int x) {
     return x * 0xFF / 5;
 }
@@ -102,38 +83,23 @@
     bool        fIsOpaque;
     bool        fForceUpdate; //bitmap marked as dirty before each draw. forces bitmap to be updated on device cache
     int         fTileX, fTileY; // -1 means don't use shader
+    bool        fIsVolatile;
+    SkBitmap::Config fConfig;
     SkString    fName;
     enum { N = SkBENCHLOOP(300) };
+    enum { W = 128 };
+    enum { H = 128 };
 public:
     BitmapBench(void* param, bool isOpaque, SkBitmap::Config c,
                 bool forceUpdate = false, bool bitmapVolatile = false,
                 int tx = -1, int ty = -1)
-        : INHERITED(param), fIsOpaque(isOpaque), fForceUpdate(forceUpdate), fTileX(tx), fTileY(ty) {
-        const int w = 128;
-        const int h = 128;
-        SkBitmap bm;
-
-        if (SkBitmap::kIndex8_Config == c) {
-            bm.setConfig(SkBitmap::kARGB_8888_Config, w, h);
-        } else {
-            bm.setConfig(c, w, h);
-        }
-        bm.allocPixels();
-        bm.eraseColor(isOpaque ? SK_ColorBLACK : 0);
-
-        drawIntoBitmap(bm);
-
-        if (SkBitmap::kIndex8_Config == c) {
-            convertToIndex666(bm, &fBitmap);
-        } else {
-            fBitmap = bm;
-        }
-
-        if (fBitmap.getColorTable()) {
-            fBitmap.getColorTable()->setIsOpaque(isOpaque);
-        }
-        fBitmap.setIsOpaque(isOpaque);
-        fBitmap.setIsVolatile(bitmapVolatile);
+        : INHERITED(param)
+        , fIsOpaque(isOpaque)
+        , fForceUpdate(forceUpdate)
+        , fTileX(tx)
+        , fTileY(ty)
+        , fIsVolatile(bitmapVolatile)
+        , fConfig(c) {
     }
 
 protected:
@@ -145,16 +111,43 @@
                 fName.appendf("_%s", gTileName[fTileY]);
             }
         }
-        fName.appendf("_%s%s", gConfigName[fBitmap.config()],
+        fName.appendf("_%s%s", gConfigName[fConfig],
                       fIsOpaque ? "" : "_A");
         if (fForceUpdate)
             fName.append("_update");
-        if (fBitmap.isVolatile())
+        if (fIsVolatile)
             fName.append("_volatile");
 
         return fName.c_str();
     }
 
+    virtual void onPreDraw() {  // builds fBitmap from fConfig, fIsOpaque and fIsVolatile
+        SkBitmap bm;
+
+        if (SkBitmap::kIndex8_Config == fConfig) {
+            bm.setConfig(SkBitmap::kARGB_8888_Config, W, H);  // index8: draw as 8888 first, convert below
+        } else {
+            bm.setConfig(fConfig, W, H);
+        }
+
+        bm.allocPixels();
+        bm.eraseColor(fIsOpaque ? SK_ColorBLACK : 0);  // background: black when opaque, otherwise color 0
+
+        onDrawIntoBitmap(bm);  // virtual hook: subclasses may override the bitmap content
+
+        if (SkBitmap::kIndex8_Config == fConfig) {
+            convertToIndex666(bm, &fBitmap);  // result of the conversion is stored in fBitmap
+        } else {
+            fBitmap = bm;
+        }
+
+        if (fBitmap.getColorTable()) {  // only set when the bitmap actually has a color table
+            fBitmap.getColorTable()->setIsOpaque(fIsOpaque);
+        }
+        fBitmap.setIsOpaque(fIsOpaque);
+        fBitmap.setIsVolatile(fIsVolatile);
+    }
+
     virtual void onDraw(SkCanvas* canvas) {
         SkIPoint dim = this->getSize();
         SkRandom rand;
@@ -177,6 +170,25 @@
         }
     }
 
+    virtual void onDrawIntoBitmap(const SkBitmap& bm) {  // default content: red circle plus blue border
+        const int w = bm.width();
+        const int h = bm.height();
+
+        SkCanvas canvas(bm);  // everything below is drawn into bm through this canvas
+        SkPaint p;
+        p.setAntiAlias(true);
+        p.setColor(SK_ColorRED);
+        canvas.drawCircle(SkIntToScalar(w)/2, SkIntToScalar(h)/2,
+                          SkIntToScalar(SkMin32(w, h))*3/8, p);  // centered; radius is 3/8 of the smaller side
+
+        SkRect r;
+        r.set(0, 0, SkIntToScalar(w), SkIntToScalar(h));  // full bitmap bounds
+        p.setStyle(SkPaint::kStroke_Style);
+        p.setStrokeWidth(SkIntToScalar(4));
+        p.setColor(SK_ColorBLUE);
+        canvas.drawRect(r, p);  // blue stroked rectangle along the bitmap edge
+    }
+
 private:
     typedef SkBenchmark INHERITED;
 };
@@ -241,6 +253,95 @@
     typedef BitmapBench INHERITED;
 };
 
+/** BitmapBench variant whose bitmap content is selected by SourceAlpha, to exercise optimizations that test source alpha values. */
+
+class SourceAlphaBitmapBench : public BitmapBench {
+public:
+    enum SourceAlpha { kOpaque_SourceAlpha, kTransparent_SourceAlpha,
+                       kTwoStripes_SourceAlpha, kThreeStripes_SourceAlpha};
+private:
+    SkString    fFullName;  // backing storage for the string returned by onGetName()
+    SourceAlpha fSourceAlpha;  // selects the pattern produced by onDrawIntoBitmap()
+public:
+    SourceAlphaBitmapBench(void* param, SourceAlpha alpha, SkBitmap::Config c,
+                bool forceUpdate = false, bool bitmapVolatile = false,
+                int tx = -1, int ty = -1)
+        : INHERITED(param, false, c, forceUpdate, bitmapVolatile, tx, ty)  // isOpaque is always passed as false
+        , fSourceAlpha(alpha) {
+    }
+
+protected:
+    virtual const char* onGetName() {  // base class name plus a suffix naming the source-alpha pattern
+        fFullName.set(INHERITED::onGetName());
+
+        if (fSourceAlpha == kOpaque_SourceAlpha) {
+                fFullName.append("_source_opaque");
+        } else if (fSourceAlpha == kTransparent_SourceAlpha) {
+                fFullName.append("_source_transparent");
+        } else if (fSourceAlpha == kTwoStripes_SourceAlpha) {
+                fFullName.append("_source_stripes_two");
+        } else if (fSourceAlpha == kThreeStripes_SourceAlpha) {
+                fFullName.append("_source_stripes_three");
+        }
+
+        return fFullName.c_str();
+    }
+
+    virtual void onDrawIntoBitmap(const SkBitmap& bm) SK_OVERRIDE {  // fills bm according to fSourceAlpha
+        const int w = bm.width();
+        const int h = bm.height();
+
+        if (kOpaque_SourceAlpha == fSourceAlpha) {
+            bm.eraseColor(SK_ColorBLACK);
+        } else if (kTransparent_SourceAlpha == fSourceAlpha) {
+            bm.eraseColor(0);
+        } else if (kTwoStripes_SourceAlpha == fSourceAlpha) {
+            bm.eraseColor(0);
+
+            SkCanvas canvas(bm);
+            SkPaint p;
+            p.setAntiAlias(false);
+            p.setStyle(SkPaint::kFill_Style);
+            p.setColor(SK_ColorRED);
+
+            // Draw one-pixel-wide red vertical stripes at every even x on a transparent background
+            SkRect r;
+            for (int x = 0; x < w; x+=2)
+            {
+                r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h));
+                canvas.drawRect(r, p);
+            }
+
+        } else if (kThreeStripes_SourceAlpha == fSourceAlpha) {
+            bm.eraseColor(0);
+
+            SkCanvas canvas(bm);
+            SkPaint p;
+            p.setAntiAlias(false);
+            p.setStyle(SkPaint::kFill_Style);
+
+            // Draw one-pixel-wide vertical stripes on a transparent background in a
+            // repeating three-column pattern: the first column stays fully transparent,
+            // the next is semi-transparent and the third is fully opaque.
+            SkRect r;
+            for (int x = 0; x < w; x++)
+            {
+                if (x % 3 == 0) {
+                    continue; // Keep transparent
+                } else if (x % 3 == 1) {
+                    p.setColor(SkColorSetARGB(127, 127, 127, 127)); // Semi-transparent
+                } else if (x % 3 == 2) {
+                    p.setColor(SK_ColorRED); // Opaque
+                }
+                r.set(SkIntToScalar(x), 0, SkIntToScalar(x+1), SkIntToScalar(h));
+                canvas.drawRect(r, p);
+            }
+        }
+    }
+
+private:
+    typedef BitmapBench INHERITED;
+};
 static SkBenchmark* Fact0(void* p) { return new BitmapBench(p, false, SkBitmap::kARGB_8888_Config); }
 static SkBenchmark* Fact1(void* p) { return new BitmapBench(p, true, SkBitmap::kARGB_8888_Config); }
 static SkBenchmark* Fact2(void* p) { return new BitmapBench(p, true, SkBitmap::kRGB_565_Config); }
@@ -263,6 +364,12 @@
 static SkBenchmark* Fact15(void* p) { return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config, true, true, -1, -1, true, true, true); }
 static SkBenchmark* Fact16(void* p) { return new FilterBitmapBench(p, true, SkBitmap::kARGB_8888_Config, true, false, -1, -1, true, true, true); }
 
+// Source-alpha benchmarks, one 8888 bitmap per SourceAlpha pattern -> S32A_Opaque_BlitRow32_{arm,neon}
+static SkBenchmark* Fact17(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kOpaque_SourceAlpha, SkBitmap::kARGB_8888_Config); }
+static SkBenchmark* Fact18(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTransparent_SourceAlpha, SkBitmap::kARGB_8888_Config); }
+static SkBenchmark* Fact19(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kTwoStripes_SourceAlpha, SkBitmap::kARGB_8888_Config); }
+static SkBenchmark* Fact20(void* p) { return new SourceAlphaBitmapBench(p, SourceAlphaBitmapBench::kThreeStripes_SourceAlpha, SkBitmap::kARGB_8888_Config); }
+
 static BenchRegistry gReg0(Fact0);
 static BenchRegistry gReg1(Fact1);
 static BenchRegistry gReg2(Fact2);
@@ -282,3 +389,8 @@
 static BenchRegistry gReg14(Fact14);
 static BenchRegistry gReg15(Fact15);
 static BenchRegistry gReg16(Fact16);
+
+static BenchRegistry gReg17(Fact17);
+static BenchRegistry gReg18(Fact18);
+static BenchRegistry gReg19(Fact19);
+static BenchRegistry gReg20(Fact20);
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 22785be..00086c3 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -517,6 +517,178 @@
     }
 }
 
+void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
+                                const SkPMColor* SK_RESTRICT src,
+                                int count, U8CPU alpha) {
+    SkASSERT(255 == alpha);  // this proc only handles the fully opaque global-alpha case
+    /* Blends count src pixels onto dst: transparent runs are skipped, opaque runs are copied, the rest uses the NEON blend loop. */
+    if (count <= 0)
+    return;
+
+    /* Whole-pixel compares classify the src alpha (top byte): <= ALPHA_TRANS means alpha 0, >= ALPHA_OPAQ means alpha 255 */
+    const unsigned int ALPHA_OPAQ  = 0xFF000000;
+    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
+
+#define UNROLL  4
+    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);  // stops short of the real end so the unrolled loops can read ahead; TAIL restores it
+    const SkPMColor* SK_RESTRICT src_temp = src;
+
+    /* set up the NEON variables */
+    uint8x8_t alpha_mask;
+    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};  // table-lookup indices: byte 3 four times, then byte 7 four times (one byte per pixel of a 2-pixel vector)
+    alpha_mask = vld1_u8(alpha_mask_setup);
+
+    uint8x8_t src_raw, dst_raw, dst_final;
+    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
+    uint8x8_t dst_cooked;
+    uint16x8_t dst_wide;
+    uint8x8_t alpha_narrow;
+    uint16x8_t alpha_wide;
+
+    /* pick the starting state from the first pixel, or go straight to TAIL for short rows */
+    if( src >= src_end)
+        goto TAIL;
+    if(*src <= ALPHA_TRANS)
+        goto ALPHA_0;
+    if(*src >= ALPHA_OPAQ)
+        goto ALPHA_255;
+    /* fall-thru */
+
+ALPHA_1_TO_254:  /* general blend: four pixels per iteration, as two 2-pixel vectors */
+    do {
+
+        /* load four source pixels as two 2-pixel vectors */
+        src_raw = vreinterpret_u8_u32(vld1_u32(src));
+        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
+
+        /* load the matching four dst pixels */
+        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
+        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
+
+
+        /* replicate each pixel's alpha byte across that pixel's four byte lanes */
+        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
+        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
+        /* we collapsed (255-a)+1 into 256-a */
+        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
+
+        /* widen the dest to 16 bits per lane */
+        dst_wide = vmovl_u8(dst_raw);
+
+        /* dst * (256 - a), then shift right by 8 while narrowing back to bytes */
+        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
+        dst_cooked = vshrn_n_u16(dst_wide, 8);
+
+        /* sum -- ignoring any byte lane overflows */
+        dst_final = vadd_u8(src_raw, dst_cooked);
+
+        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);  /* same steps again for the second pixel pair */
+        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
+        /* we collapsed (255-a)+1 into 256-a */
+        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
+
+        /* widen the dest to 16 bits per lane */
+        dst_wide = vmovl_u8(dst_raw_2);
+
+        /* dst * (256 - a), then shift right by 8 while narrowing back to bytes */
+        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
+        dst_cooked = vshrn_n_u16(dst_wide, 8);
+
+        /* sum -- ignoring any byte lane overflows */
+        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
+
+        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
+        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
+
+        src += UNROLL;
+        dst += UNROLL;
+
+        /* if the next two pixels are both alpha 0 or both alpha 255,
+        leave this loop so one of the specialized states can take over */
+        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
+            break;
+
+    } while(src < src_end);
+
+    if (src >= src_end)
+        goto TAIL;
+
+    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
+        goto ALPHA_255;
+
+    /* otherwise the loop broke on two transparent pixels: fall through to ALPHA_0 */
+
+ALPHA_0:
+
+    /* In this state the current pixel has alpha 0; scan ahead while the following
+     pixels are also transparent. dst is not written for the skipped pixels. */
+    src_temp = src;  //so we don't have to increment dst every time
+    do {
+        if(*(++src) > ALPHA_TRANS)
+            break;
+        if(*(++src) > ALPHA_TRANS)
+            break;
+        if(*(++src) > ALPHA_TRANS)
+            break;
+        if(*(++src) > ALPHA_TRANS)
+            break;
+    } while(src < src_end);
+
+    dst += (src - src_temp);  // advance dst by the number of src pixels skipped
+
+    /* scan stopped (non-zero alpha found, or src_end reached): determine where to go next. */
+    if( src >= src_end)
+        goto TAIL;
+    if(*src >= ALPHA_OPAQ)
+        goto ALPHA_255;
+    else
+        goto ALPHA_1_TO_254;
+
+ALPHA_255:
+    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {  // the AND keeps a 0xFF top byte only if all four pixels have it
+        dst[0]=src[0];
+        dst[1]=src[1];
+        dst[2]=src[2];
+        dst[3]=src[3];
+        src+=UNROLL;
+        dst+=UNROLL;
+        if(src >= src_end)
+            goto TAIL;
+    }
+
+    //Handle remainder: copy up to three more opaque pixels, one at a time.
+    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
+        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
+            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
+        }
+    }
+
+    if( src >= src_end)
+        goto TAIL;
+    if(*src <= ALPHA_TRANS)
+        goto ALPHA_0;
+    else
+        goto ALPHA_1_TO_254;
+
+TAIL:
+    /* per-pixel loop for whatever is left of the row */
+    src_end += UNROLL + 1;  //goto the real end
+    while(src != src_end) {
+        if( *src != 0 ) {  // an all-zero src pixel leaves dst unchanged
+            if( *src >= ALPHA_OPAQ ) {
+                *dst = *src;
+            }
+            else {
+                *dst = SkPMSrcOver(*src, *dst);
+            }
+        }
+        src++;
+        dst++;
+    }
+
+#undef    UNROLL
+    return;
+}
 
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
@@ -1107,6 +1279,20 @@
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     NULL,   // S32_Opaque,
     S32_Blend_BlitRow32_neon,        // S32_Blend,
-    S32A_Opaque_BlitRow32_neon,        // S32A_Opaque,
+    /*
+     * We have two choices for S32A_Opaque procs. One reads the src alpha
+     * value and attempts to optimize accordingly.  The optimization is
+     * sensitive to the source content and is not a win in all cases. For
+     * example, if there are a lot of transitions between the alpha states,
+     * the performance will almost certainly be worse.  However, for many
+     * common cases the performance is equivalent or better than the standard
+     * case where we do not inspect the src alpha.
+     */
+#if SK_A32_SHIFT == 24
+    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
+    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
+#else
+    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
+#endif
     S32A_Blend_BlitRow32_arm        // S32A_Blend
 };