add shaderproc32 for index bitmaps. neon version of 32->16+dither



git-svn-id: http://skia.googlecode.com/svn/trunk@408 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 600b963..eabd966 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -289,6 +289,8 @@
     } while (0)
 
 
+// clamp
+
 #define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
 #define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
@@ -314,6 +316,23 @@
 #define SRC_TO_FILTER(src)      src
 #include "SkBitmapProcState_shaderproc.h"
 
+// Instantiate the clamp-tiled shaderproc for opaque Index8 sources writing 32-bit dst pixels.
+#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)     // x texel index: fx >> 16, passed through SkClampMax
+#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)     // y texel index, same scheme
+#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)            // bits 12..15 of fx (the shaderproc's subX)
+#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)            // bits 12..15 of fy
+// FILTER_PROC is redefined for this instantiation, so drop any earlier definition first.
+#undef FILTER_PROC
+#define FILTER_PROC(x, y, a, b, c, d, dst)   Filter_32_opaque(x, y, a, b, c, d, dst)
+#define MAKENAME(suffix)        Clamp_SI8_opaque_D32 ## suffix  // prefix of the generated function names
+#define SRCTYPE                 uint8_t                         // source pixels are 8-bit color-table indices
+#define DSTTYPE                 uint32_t
+#define CHECKSTATE(state)       SkASSERT(state.fBitmap->config() == SkBitmap::kIndex8_Config)
+#define PREAMBLE(state)         const SkPMColor* SK_RESTRICT table = state.fBitmap->getColorTable()->lockColors()
+#define SRC_TO_FILTER(src)      table[src]                      // look the index up in the table locked by PREAMBLE
+#define POSTAMBLE(state)        state.fBitmap->getColorTable()->unlockColors(false) // runs after the pixel loop
+#include "SkBitmapProcState_shaderproc.h"
+
 ///////////////////////////////////////////////////////////////////////////////
 
 static bool valid_for_filtering(unsigned dimension) {
@@ -503,6 +522,8 @@
                    SkShader::kRepeat_TileMode == fTileModeY) {
             fShaderProc16 = Repeat_S16_D16_filter_DX_shaderproc;
         }
+    } else if (SI8_opaque_D32_filter_DX == fSampleProc32 && clamp_clamp) {
+        fShaderProc32 = Clamp_SI8_opaque_D32_filter_DX_shaderproc;
     }
 
     // see if our platform has any accelerated overrides
diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h
index b4a53e4..15831b6 100644
--- a/src/core/SkBitmapProcState_shaderproc.h
+++ b/src/core/SkBitmapProcState_shaderproc.h
@@ -36,6 +36,10 @@
         fx = SkScalarToFixed(pt.fX) - (oneX >> 1);
     }
 
+#ifdef PREAMBLE
+    PREAMBLE(s);
+#endif
+    
     do {
         unsigned subX = TILEX_LOW_BITS(fx, maxX);
         unsigned x0 = TILEX_PROCF(fx, maxX);
@@ -51,6 +55,10 @@
 
         fx += dx;
     } while (--count != 0);
+
+#ifdef POSTAMBLE
+    POSTAMBLE(s);
+#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -65,5 +73,7 @@
 #undef CHECKSTATE
 #undef SRC_TO_FILTER
 #undef FILTER_TO_DST
+#undef PREAMBLE     // optional hook (used only under #ifdef); cleared so the next include starts clean
+#undef POSTAMBLE    // optional hook (used only under #ifdef); cleared so the next include starts clean
 
 #undef SCALE_FILTER_NAME
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 506c4d4..be1cbdf 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -862,6 +862,120 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+#if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+/* S32_D565_Opaque_Dither_neon: convert count opaque 32-bit src pixels into
+ * dithered 565 pixels at dst. alpha must be 255; x and y select the dither
+ * pattern position. Groups of 8 pixels use NEON, leftovers use scalar code.
+ * 2009/10/27: RBE says "a work in progress"; debugging says ok; speedup is
+ * untested. ARM version is 26 insns/iteration, this NEON version is 21 per
+ * iteration-of-8 (2.62 insns/element): ~10x fewer, ignoring all latencies.
+ * Define DEBUG_S32_OPAQUE_DITHER to check each NEON result against scalar. */
+#undef	DEBUG_S32_OPAQUE_DITHER
+
+static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
+                                     const SkPMColor* SK_RESTRICT src,
+                                     int count, U8CPU alpha, int x, int y) {
+    SkASSERT(255 == alpha);
+
+#define	UNROLL	8
+    if (count >= UNROLL) {
+	uint8x8_t d;	/* 8 dither values, loaded once and reused by every group */
+	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
+	d = vld1_u8(dstart);
+
+	while (count >= UNROLL) {
+	    uint8x8_t sr, sg, sb, sa;	/* sa (alpha) is loaded but never used */
+	    uint16x8_t dr, dg, db;
+	    uint16x8_t dst8;
+
+	    /* source is in ABGR ordering (R == lsb) */
+	    {
+		register uint8x8_t d0 asm("d0");
+		register uint8x8_t d1 asm("d1");
+		register uint8x8_t d2 asm("d2");
+		register uint8x8_t d3 asm("d3");
+		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
+		    : "r" (src)
+		    : "memory"	/* the asm reads *src, which is not an operand */
+		    );
+		    sr = d0; sg = d1; sb = d2; sa = d3;
+	    }
+	    /* XXX: if we want to prefetch, hide it in the above asm().
+	     * With the gcc __builtin_prefetch(), the prefetch will
+	     * fall to the bottom of the loop -- it won't stick up
+	     * at the top of the loop, just after the vld4.
+	     */
+
+	    /* dr = sr - (sr>>5) + d, with the add widened to 16 bits */
+	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
+	    dr = vaddl_u8(sr, d);
+
+	    /* db = sb - (sb>>5) + d, with the add widened to 16 bits */
+	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
+	    db = vaddl_u8(sb, d);
+
+	    /* dg = sg - (sg>>6) + (d>>1); widened add, as for r and b */
+	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
+	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
+	    /* XXX: check that the "d>>1" here is hoisted */
+
+	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
+	    dst8 = vshrq_n_u16(db, 3);
+	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
+	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
+
+	    /* store the 8 packed 565 pixels */
+	    vst1q_u16(dst, dst8);
+
+#if	defined(DEBUG_S32_OPAQUE_DITHER)
+	    /* always good to know if we generated good results */
+	    {
+		int i, myx = x, myy = y;
+		DITHER_565_SCAN(myy);
+		for (i=0;i<UNROLL;i++) {
+		    SkPMColor c = src[i];
+		    unsigned dither = DITHER_VALUE(myx);
+		    uint16_t val = SkDitherRGB32To565(c, dither);
+		    if (val != dst[i]) {
+			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
+			    c, dither, val, dst[i], dstart[i]);
+		    }
+		    DITHER_INC_X(myx);
+		}
+	    }
+#endif
+
+	    dst += UNROLL;
+	    src += UNROLL;
+	    count -= UNROLL;
+	    x += UNROLL;		/* probably superfluous */
+	}
+    }
+#undef	UNROLL
+
+    /* residuals: the remaining (fewer than 8) pixels, one at a time */
+    if (count > 0) {
+        DITHER_565_SCAN(y);
+        do {
+            SkPMColor c = *src++;
+            SkPMColorAssert(c);
+            SkASSERT(SkGetPackedA32(c) == 255);
+
+            unsigned dither = DITHER_VALUE(x);
+            *dst++ = SkDitherRGB32To565(c, dither);
+            DITHER_INC_X(x);
+        } while (--count != 0);
+    }
+}
+
+#define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
+#else
+#define	S32_D565_Opaque_Dither_PROC NULL
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
 const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = {
     // no dither
     S32_D565_Opaque_PROC,
@@ -870,7 +984,7 @@
     S32A_D565_Blend_PROC,
     
     // dither
-    NULL,   // S32_D565_Opaque_Dither,
+    S32_D565_Opaque_Dither_PROC,
     S32_D565_Blend_Dither_PROC,
     S32A_D565_Opaque_Dither_PROC,
     NULL,   // S32A_D565_Blend_Dither