Add BlitRow procs for 32->32 blits, to allow for NEON and other optimizations.
Call these new procs in (nearly) all the places we had inlined loops before.
In one instance (blitter_argb32::blitAntiH) we get slightly different results.
  The new code is more accurate, and exactly in line with all of the
  other like-minded blits, so I think the change is good going forward.



git-svn-id: http://skia.googlecode.com/svn/trunk@366 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
new file mode 100644
index 0000000..f67bb9a
--- /dev/null
+++ b/src/core/SkBlitRow_D32.cpp
@@ -0,0 +1,112 @@
+#include "SkBlitRow.h"
+#include "SkColorPriv.h"
+#include "SkUtils.h"
+
+// Row blit for alpha == 255 with no blending: copies count pixels src -> dst.
+static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
+        const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha) {
+    SkASSERT(alpha == 255);
+    memcpy(dst, src, count * sizeof(SkPMColor));
+}
+
+// Row blit that cross-fades src into dst by a constant alpha. Each src pixel
+// is weighted by SkAlpha255To256(alpha) and each dst pixel by 256 minus that
+// weight (both through SkAlphaMulQ); the two products are summed into dst.
+// Nothing is written when count <= 0.
+static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
+                                const SkPMColor* SK_RESTRICT src,
+                                int count, U8CPU alpha) {
+    SkASSERT(alpha <= 255);
+    const unsigned srcWeight = SkAlpha255To256(alpha);
+    const unsigned dstWeight = 256 - srcWeight;
+    for (int i = 0; i < count; ++i) {
+        dst[i] = SkAlphaMulQ(src[i], srcWeight) + SkAlphaMulQ(dst[i], dstWeight);
+    }
+}
+
+//#define TEST_SRC_ALPHA
+
+// Row blit for alpha == 255 where each dst pixel becomes
+// SkPMSrcOver(src, dst). Building with TEST_SRC_ALPHA defined adds shortcuts:
+// a zero src pixel leaves dst alone, and a src pixel whose packed alpha is
+// 255 is stored directly without calling SkPMSrcOver.
+static void S32A_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
+                                  const SkPMColor* SK_RESTRICT src,
+                                  int count, U8CPU alpha) {
+    SkASSERT(255 == alpha);
+    for (int i = 0; i < count; ++i) {
+#ifdef TEST_SRC_ALPHA
+        const SkPMColor pixel = src[i];
+        if (0 == pixel) {
+            continue;
+        }
+        if (255 == SkGetPackedA32(pixel)) {
+            dst[i] = pixel;
+        } else {
+            dst[i] = SkPMSrcOver(pixel, dst[i]);
+        }
+#else
+        dst[i] = SkPMSrcOver(src[i], dst[i]);
+#endif
+    }
+}
+
+// Row blit that combines src with dst under a constant alpha. Each of the
+// count pixels in dst is replaced with SkBlendARGB32(src pixel, dst pixel,
+// alpha); the blend arithmetic itself lives in that helper.
+// Nothing is written when count <= 0.
+static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
+                                 const SkPMColor* SK_RESTRICT src,
+                                 int count, U8CPU alpha) {
+    SkASSERT(alpha <= 255);
+    for (int i = 0; i < count; ++i) {
+        dst[i] = SkBlendARGB32(src[i], dst[i], alpha);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+static const SkBlitRow::Proc32 gDefault_Procs32[] = {  // indexed by flags
+    S32_Opaque_BlitRow32,   // [0] asserts alpha == 255; memcpy of src
+    S32_Blend_BlitRow32,    // [1] alpha <= 255; SkAlphaMulQ cross-fade
+    S32A_Opaque_BlitRow32,  // [2] asserts alpha == 255; SkPMSrcOver
+    S32A_Blend_BlitRow32    // [3] alpha <= 255; SkBlendARGB32
+};
+
+// Returns the row-blit proc for flags, preferring a non-NULL entry from
+// gPlatform_Procs32 and otherwise using the matching gDefault_Procs32 entry.
+SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
+    SkASSERT(flags < SK_ARRAY_COUNT(gDefault_Procs32));
+    // Mask as a safety net so an out-of-range value does not crash the lookup.
+    const unsigned index = flags & kFlags32_Mask;
+
+    const SkBlitRow::Proc32 platform = gPlatform_Procs32[index];
+    SkBlitRow::Proc32 proc = platform ? platform : gDefault_Procs32[index];
+    SkASSERT(proc);
+    return proc;
+}
+
+// Writes color blended over count pixels of src into dst; src and dst may be
+// the same array. Per pixel: dst = color + SkAlphaMulQ(src, scale), with scale
+// equal to 256 - SkAlpha255To256(alpha of color). A zero color yields src
+// unchanged, an alpha of 255 fills dst with color, and count <= 0 is a no-op.
+void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count,
+                        SkPMColor color) {
+    if (count > 0) {
+        if (0 == color) {
+            if (src != dst) {
+                memcpy(dst, src, count * sizeof(SkPMColor));
+            }
+            return;  // done: falling through would rescale the copied pixels
+        }
+        unsigned colorA = SkGetPackedA32(color);
+        if (255 == colorA) {
+            sk_memset32(dst, color, count);
+        } else {
+            unsigned scale = 256 - SkAlpha255To256(colorA);
+            for (int i = 0; i < count; ++i) {
+                dst[i] = color + SkAlphaMulQ(src[i], scale);
+            }
+        }
+    }
+}