Fix perf regression in Color32.

The regression was caused by calling PlatformColorProc() for every span (which
in turn executes CPUID, a fairly expensive instruction).  Since we draw a lot of
rects, and rects have 1-pixel-wide spans for the vertical segments, that adds up
to a lot of CPUID executions.

Fixed by caching the result of PlatformColorProc(), as is done for the other
platform-specific blitters.

Review URL:  http://codereview.appspot.com/3669042/



git-svn-id: http://skia.googlecode.com/svn/trunk@636 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/Makefile b/Makefile
index 65f0b71..146d11a 100644
--- a/Makefile
+++ b/Makefile
@@ -94,6 +94,7 @@
 
 # For these files, and these files only, compile with -msse2.
 SSE2_OBJS := out/src/opts/SkBlitRow_opts_SSE2.o \
+             out/src/opts/SkBitmapProcState_opts_SSE2.o \
              out/src/opts/SkUtils_opts_SSE2.o
 $(SSE2_OBJS) : CFLAGS := $(CFLAGS_SSE2)
 
diff --git a/include/core/SkBlitRow.h b/include/core/SkBlitRow.h
index a592167..2b652c2 100644
--- a/include/core/SkBlitRow.h
+++ b/include/core/SkBlitRow.h
@@ -55,24 +55,16 @@
                          const SkPMColor* SK_RESTRICT src,
                          int count, U8CPU alpha);
 
-    static void Color32_BlitRow32(SkPMColor dst[], const SkPMColor src[], 
-                                  int count, SkPMColor color);
-
     static Proc32 Factory32(unsigned flags32);
     
     /** Blend a single color onto a row of S32 pixels, writing the result
         into a row of D32 pixels. src and dst may be the same memory, but
         if they are not, they may not overlap.
      */
-    static void Color32(SkPMColor dst[], const SkPMColor src[], int count,
-                         SkPMColor color);
+    static void Color32(SkPMColor dst[], const SkPMColor src[], 
+                        int count, SkPMColor color);
 
-    /** Blend a single color onto a row of 32bit pixels, writing the result
-        into the same row.
-     */
-    static void Color32(SkPMColor row[], int count, SkPMColor color) {
-        Color32(row, row, count, color);
-    }
+    static ColorProc ColorProcFactory();
 
     /** These static functions are called by the Factory and Factory32
         functions, and should return either NULL, or a
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
index 1f154a4..f1dcb30 100644
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@@ -86,18 +86,17 @@
     return proc;
 }
 
-void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count,
-                        SkPMColor color) {
+SkBlitRow::ColorProc SkBlitRow::ColorProcFactory() {  // callers cache the result
     SkBlitRow::ColorProc proc = PlatformColorProc();
     if (NULL == proc) {
-        Color32_BlitRow32(dst, src, count, color);
-        return;
+        proc = Color32;  // no platform-specific proc: fall back to SkBlitRow::Color32
     }
-    proc(dst, src, count, color);
+    SkASSERT(proc);
+    return proc;
 }
 
-void SkBlitRow::Color32_BlitRow32(SkPMColor dst[], const SkPMColor src[], 
-                                  int count, SkPMColor color) {
+void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], 
+                        int count, SkPMColor color) {
     if (count > 0) {
         if (0 == color) {
             if (src != dst) {
diff --git a/src/core/SkBlitter_ARGB32.cpp b/src/core/SkBlitter_ARGB32.cpp
index 37bd0da..905ab6b 100644
--- a/src/core/SkBlitter_ARGB32.cpp
+++ b/src/core/SkBlitter_ARGB32.cpp
@@ -51,6 +51,7 @@
     fSrcB = SkAlphaMul(SkColorGetB(color), scale);
 
     fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
+    fColor32Proc = SkBlitRow::ColorProcFactory();
 }
 
 const SkBitmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) {
@@ -69,7 +70,8 @@
 void SkARGB32_Blitter::blitH(int x, int y, int width) {
     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
 
-    SkBlitRow::Color32(fDevice.getAddr32(x, y), width, fPMColor);
+    uint32_t*   device = fDevice.getAddr32(x, y);
+    fColor32Proc(device, device, width, fPMColor);
 }
 
 void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
@@ -94,7 +96,7 @@
                 sk_memset32(device, color, count);
             } else {
                 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
-                SkBlitRow::Color32(device, count, sc);
+                fColor32Proc(device, device, count, sc);
             }
         }
         runs += count;
@@ -286,7 +288,7 @@
     size_t      rowBytes = fDevice.rowBytes();
 
     while (--height >= 0) {
-        SkBlitRow::Color32(device, width, color);
+        fColor32Proc(device, device, width, color);
         device = (uint32_t*)((char*)device + rowBytes);
     }
 }
diff --git a/src/core/SkCoreBlitters.h b/src/core/SkCoreBlitters.h
index 6204b2c..32e8035 100644
--- a/src/core/SkCoreBlitters.h
+++ b/src/core/SkCoreBlitters.h
@@ -101,7 +101,8 @@
     virtual const SkBitmap* justAnOpaqueColor(uint32_t*);
 
 protected:
-    SkColor fPMColor;
+    SkColor                fPMColor;
+    SkBlitRow::ColorProc   fColor32Proc;
 
 private:
     unsigned fSrcA, fSrcR, fSrcG, fSrcB;
diff --git a/src/effects/SkColorFilters.cpp b/src/effects/SkColorFilters.cpp
index 733e1ae..a396d35 100644
--- a/src/effects/SkColorFilters.cpp
+++ b/src/effects/SkColorFilters.cpp
@@ -75,7 +75,8 @@
 
 class SkSrcOver_XfermodeColorFilter : public Sk_XfermodeColorFilter {
 public:
-    SkSrcOver_XfermodeColorFilter(SkColor color) : INHERITED(color) {}
+    SkSrcOver_XfermodeColorFilter(SkColor color)
+        : INHERITED(color), fColor32Proc(SkBlitRow::ColorProcFactory()) {}
 
     virtual uint32_t getFlags() {
         if (SkGetPackedA32(fPMColor) == 0xFF) {
@@ -87,7 +88,7 @@
     
     virtual void filterSpan(const SkPMColor shader[], int count,
                             SkPMColor result[]) {
-        SkBlitRow::Color32(result, shader, count, fPMColor);
+        fColor32Proc(result, shader, count, fPMColor);
     }
 
     virtual void filterSpan16(const uint16_t shader[], int count,
@@ -100,7 +101,7 @@
     virtual Factory getFactory() { return CreateProc;  }
     
     SkSrcOver_XfermodeColorFilter(SkFlattenableReadBuffer& buffer)
-        : INHERITED(buffer) {}
+        : INHERITED(buffer), fColor32Proc(SkBlitRow::ColorProcFactory()) {}
     
 private:
     static SkFlattenable* CreateProc(SkFlattenableReadBuffer& buffer) {
@@ -108,6 +109,7 @@
     }
     
     typedef Sk_XfermodeColorFilter INHERITED;
+    SkBlitRow::ColorProc fColor32Proc;
 };
 
 //////////////////////////////////////////////////////////////////////////////