(SSE2) acceleration for rectangular opaque erases.
15% speedup for rectangles < 31 px wide, 5% for larger.

http://codereview.appspot.com/5843050/



git-svn-id: http://skia.googlecode.com/svn/trunk@3423 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/gyp/opts.gyp b/gyp/opts.gyp
index 2ec076c..cf8e6dd 100644
--- a/gyp/opts.gyp
+++ b/gyp/opts.gyp
@@ -40,6 +40,7 @@
             '../src/opts/opts_check_SSE2.cpp',
             '../src/opts/SkBitmapProcState_opts_SSE2.cpp',
             '../src/opts/SkBlitRow_opts_SSE2.cpp',
+            '../src/opts/SkBlitRect_opts_SSE2.cpp',
             '../src/opts/SkUtils_opts_SSE2.cpp',
           ],
           'dependencies': [
diff --git a/include/core/SkBlitRow.h b/include/core/SkBlitRow.h
index fb62f5a..973ab4c 100644
--- a/include/core/SkBlitRow.h
+++ b/include/core/SkBlitRow.h
@@ -36,13 +36,6 @@
                          const SkPMColor* src,
                          int count, U8CPU alpha, int x, int y);
 
-   /** Function pointer that blends a single color with a row of 32-bit colors
-       onto a 32-bit destination
-   */
-   typedef void (*ColorProc)(SkPMColor* dst, const SkPMColor* src, int count,
-                             SkPMColor color);
-
-    //! Public entry-point to return a blit function ptr
     static Proc Factory(unsigned flags, SkBitmap::Config);
 
     ///////////// D32 version
@@ -64,6 +57,12 @@
 
     static Proc32 Factory32(unsigned flags32);
 
+   /** Function pointer that blends a single color with a row of 32-bit colors
+       onto a 32-bit destination
+   */
+   typedef void (*ColorProc)(SkPMColor* dst, const SkPMColor* src, int count,
+                             SkPMColor color);
+
     /** Blend a single color onto a row of S32 pixels, writing the result
         into a row of D32 pixels. src and dst may be the same memory, but
         if they are not, they may not overlap.
@@ -71,8 +70,20 @@
     static void Color32(SkPMColor dst[], const SkPMColor src[],
                         int count, SkPMColor color);
 
+    //! Public entry-point to return a blit function ptr
     static ColorProc ColorProcFactory();
 
+    /** Function pointer that blends a single color onto a 32-bit rectangle.  */
+    typedef void (*ColorRectProc)(SkPMColor* dst, int width, int height,
+                                  size_t rowBytes, SkPMColor color);
+
+    /** Blend a single color into a rectangle of D32 pixels. */
+    static void ColorRect32(SkPMColor* dst, int width, int height,
+                            size_t rowBytes, SkPMColor color);
+
+    //! Public entry-point to return a blit function ptr
+    static ColorRectProc ColorRectProcFactory();
+
     /** These static functions are called by the Factory and Factory32
         functions, and should return either NULL, or a
         platform-specific function-ptr to be used in place of the
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
index 97aa665..f1bf0ca 100644
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@@ -12,6 +12,8 @@
 
 #define UNROLL
 
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory();
+
 static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {
@@ -178,3 +180,21 @@
     }
 }
 
+void SkBlitRow::ColorRect32(SkPMColor* dst, int width, int height,
+                            size_t rowBytes, SkPMColor color) {
+    SkBlitRow::ColorProc proc = SkBlitRow::ColorProcFactory();
+    while (--height >= 0) {
+        (*proc)(dst, dst, width, color);
+        dst = (SkPMColor*) ((char*)dst + rowBytes);
+    }
+}
+
+SkBlitRow::ColorRectProc SkBlitRow::ColorRectProcFactory() {
+    SkBlitRow::ColorRectProc proc = PlatformColorRectProcFactory();
+    if (NULL == proc) {
+        proc = ColorRect32;
+    }
+    SkASSERT(proc);
+    return proc;
+}
+
diff --git a/src/core/SkBlitter_ARGB32.cpp b/src/core/SkBlitter_ARGB32.cpp
index 24ab330..977c961 100644
--- a/src/core/SkBlitter_ARGB32.cpp
+++ b/src/core/SkBlitter_ARGB32.cpp
@@ -53,6 +53,7 @@
 
     fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
     fColor32Proc = SkBlitRow::ColorProcFactory();
+    fColorRect32Proc = SkBlitRow::ColorRectProcFactory();
 }
 
 const SkBitmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) {
@@ -213,10 +214,14 @@
     uint32_t    color = fPMColor;
     size_t      rowBytes = fDevice.rowBytes();
 
-    while (--height >= 0) {
-        fColor32Proc(device, device, width, color);
-        device = (uint32_t*)((char*)device + rowBytes);
-    }
+    //if (255 == SkGetPackedA32(color)) {
+        fColorRect32Proc(device, width, height, rowBytes, color);
+    //} else {
+        //while (--height >= 0) {
+            //fColor32Proc(device, device, width, color);
+            //device = (uint32_t*)((char*)device + rowBytes);
+        //}
+    //}
 }
 
 #if defined _WIN32 && _MSC_VER >= 1300
diff --git a/src/core/SkCoreBlitters.h b/src/core/SkCoreBlitters.h
index 4947198..4a03a53 100644
--- a/src/core/SkCoreBlitters.h
+++ b/src/core/SkCoreBlitters.h
@@ -94,6 +94,7 @@
     SkColor                fColor;
     SkPMColor              fPMColor;
     SkBlitRow::ColorProc   fColor32Proc;
+    SkBlitRow::ColorRectProc fColorRect32Proc;
 
 private:
     unsigned fSrcA, fSrcR, fSrcG, fSrcB;
diff --git a/src/opts/SkBlitRect_opts_SSE2.cpp b/src/opts/SkBlitRect_opts_SSE2.cpp
new file mode 100644
index 0000000..9336951
--- /dev/null
+++ b/src/opts/SkBlitRect_opts_SSE2.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBlitRect_opts_SSE2.h"
+#include "SkBlitRow.h"
+#include "SkColorPriv.h"
+
+#include <emmintrin.h>
+
+/** Simple blitting of opaque rectangles less than 31 pixels wide:
+    inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+*/
+void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
+                                  int width, int height,
+                                  size_t rowBytes, uint32_t color) {
+    SkASSERT(255 == SkGetPackedA32(color));
+    SkASSERT(width > 0);
+    SkASSERT(width < 31);
+
+    while (--height >= 0) {
+        SkPMColor* dst = destination;
+        int count = width;
+
+        while (count > 4) {
+            *dst++ = color;
+            *dst++ = color;
+            *dst++ = color;
+            *dst++ = color;
+            count -= 4;
+        }
+
+        while (count > 0) {
+            *dst++ = color;
+            --count;
+        }
+
+        destination = (uint32_t*)((char*)destination + rowBytes);
+    }
+}
+
+/**
+  Fast blitting of opaque rectangles at least 31 pixels wide:
+  inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
+  A 31 pixel rectangle is guaranteed to have at least one
+  16-pixel aligned span that can take advantage of mm_store.
+*/
+void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
+                                int width, int height,
+                                size_t rowBytes, uint32_t color) {
+    SkASSERT(255 == SkGetPackedA32(color));
+    SkASSERT(width >= 31);
+
+    __m128i color_wide = _mm_set1_epi32(color);
+    while (--height >= 0) {
+        // Prefetching one row ahead to L1 cache can equal hardware
+        // performance for large/tall rects, but never *beats*
+        // hardware performance.
+        SkPMColor* dst = destination;
+        int count = width;
+
+        while (((size_t)dst) & 0x0F) {
+            *dst++ = color;
+            --count;
+        }
+        __m128i *d = reinterpret_cast<__m128i*>(dst);
+
+        // Googling suggests _mm_stream is only going to beat _mm_store
+        // for things that wouldn't fit in L2 cache anyway, typically
+        // >500kB, and precisely fill cache lines.  For us, with
+        // arrays > 100k elements _mm_stream is still 100%+ slower than
+        // mm_store.
+
+        // Unrolling to count >= 64 is a break-even for most
+        // input patterns; we seem to be saturating the bus and having
+        // low enough overhead at 32.
+
+        while (count >= 32) {
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            count -= 32;
+        }
+        if (count >= 16) {
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            _mm_store_si128(d++, color_wide);
+            count -= 16;
+        }
+        dst = reinterpret_cast<uint32_t*>(d);
+
+        // Unrolling the loop in the Narrow code is a significant performance
+        // gain, but unrolling this loop appears to make no difference in
+        // benchmarks with either mm_store_si128 or individual sets.
+
+        while (count > 0) {
+            *dst++ = color;
+            --count;
+        }
+
+        destination = (uint32_t*)((char*)destination + rowBytes);
+    }
+}
+
+void ColorRect32_SSE2(SkPMColor* destination,
+                      int width, int height,
+                      size_t rowBytes, uint32_t color) {
+    if (0 == height || 0 == width || 0 == color) {
+        return;
+    }
+    unsigned colorA = SkGetPackedA32(color);
+    //if (255 == colorA) {
+        //if (width < 31) {
+            //BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
+                                         //rowBytes, color);
+        //} else {
+            //BlitRect32_OpaqueWide_SSE2(destination, width, height,
+                                       //rowBytes, color);
+        //}
+    //} else {
+        SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
+    //}
+}
+
diff --git a/src/opts/SkBlitRect_opts_SSE2.h b/src/opts/SkBlitRect_opts_SSE2.h
new file mode 100644
index 0000000..d3ec0e3
--- /dev/null
+++ b/src/opts/SkBlitRect_opts_SSE2.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkBlitRect_opts_SSE2_DEFINED
+#define SkBlitRect_opts_SSE2_DEFINED
+
+/*
+  These functions' implementations copy sections of both
+  SkBlitRow_opts_SSE2 and SkUtils_opts_SSE2.
+*/
+
+#include "SkColor.h"
+
+void ColorRect32_SSE2(SkPMColor* SK_RESTRICT dst,
+                      int width, int height,
+                      size_t rowBytes, uint32_t color);
+
+
+#endif
+
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 2adb88a..80ad517 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -8,6 +8,9 @@
 #include "SkBitmapProcState_opts_SSE2.h"
 #include "SkBitmapProcState_opts_SSSE3.h"
 #include "SkBlitMask.h"
+#include "SkBlitRect.h"
+#include "SkBlitRow.h"
+#include "SkBlitRect_opts_SSE2.h"
 #include "SkBlitRow_opts_SSE2.h"
 #include "SkUtils_opts_SSE2.h"
 #include "SkUtils.h"
@@ -209,3 +212,13 @@
         return NULL;
     }
 }
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+    if (cachedHasSSE2()) {
+        return ColorRect32_SSE2;
+    } else {
+        return NULL;
+    }
+}
+
+