SSE2 version of blit_lcd16, courtesy of Jin Yang.
Yields a 25-30% speedup on Windows (32-bit) and 4-7% on Linux (64-bit, less
register pressure); not invoked on Mac (LCD text is 32-bit instead of 16-bit).

Follow-up: GDI system settings on Windows can suppress LCD text for small
fonts, which interferes with our benchmarks.
(http://code.google.com/p/skia/issues/detail?id=483)

http://codereview.appspot.com/5617058/



git-svn-id: http://skia.googlecode.com/svn/trunk@3189 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/core/SkBlitMask_D32.cpp b/src/core/SkBlitMask_D32.cpp
index 341627a..c97e9e6 100644
--- a/src/core/SkBlitMask_D32.cpp
+++ b/src/core/SkBlitMask_D32.cpp
@@ -64,106 +64,16 @@
     } while (--height != 0);
 }
 
-///////////////////////////////////////////////////////////////////////////////
-
-static inline int upscale31To32(int value) {
-    SkASSERT((unsigned)value <= 31);
-    return value + (value >> 4);
-}
-
-static inline int blend32(int src, int dst, int scale) {
-    SkASSERT((unsigned)src <= 0xFF);
-    SkASSERT((unsigned)dst <= 0xFF);
-    SkASSERT((unsigned)scale <= 32);
-    return dst + ((src - dst) * scale >> 5);
-}
-
-static void blit_lcd16_row(SkPMColor dst[], const uint16_t src[],
-                           SkColor color, int width, SkPMColor) {
-    int srcA = SkColorGetA(color);
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
-    
-    srcA = SkAlpha255To256(srcA);
-    
-    for (int i = 0; i < width; i++) {
-        uint16_t mask = src[i];
-        if (0 == mask) {
-            continue;
-        }
-        
-        SkPMColor d = dst[i];
-        
-        /*  We want all of these in 5bits, hence the shifts in case one of them
-         *  (green) is 6bits.
-         */
-        int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
-        int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
-        int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
-        
-        // Now upscale them to 0..32, so we can use blend32
-        maskR = upscale31To32(maskR);
-        maskG = upscale31To32(maskG);
-        maskB = upscale31To32(maskB);
-        
-        maskR = maskR * srcA >> 8;
-        maskG = maskG * srcA >> 8;
-        maskB = maskB * srcA >> 8;
-        
-        int dstR = SkGetPackedR32(d);
-        int dstG = SkGetPackedG32(d);
-        int dstB = SkGetPackedB32(d);
-        
-        // LCD blitting is only supported if the dst is known/required
-        // to be opaque
-        dst[i] = SkPackARGB32(0xFF,
-                              blend32(srcR, dstR, maskR),
-                              blend32(srcG, dstG, maskG),
-                              blend32(srcB, dstB, maskB));
+SkBlitMask::BlitLCD16RowProc SkBlitMask::BlitLCD16RowFactory(bool isOpaque) {
+    BlitLCD16RowProc proc = PlatformBlitRowProcs16(isOpaque);
+    if (proc) {
+        return proc;
     }
-}
-
-static void blit_lcd16_opaque_row(SkPMColor dst[], const uint16_t src[],
-                                  SkColor color, int width, SkPMColor opaqueDst) {
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
     
-    for (int i = 0; i < width; i++) {
-        uint16_t mask = src[i];
-        if (0 == mask) {
-            continue;
-        }
-        if (0xFFFF == mask) {
-            dst[i] = opaqueDst;
-            continue;
-        }
-        
-        SkPMColor d = dst[i];
-        
-        /*  We want all of these in 5bits, hence the shifts in case one of them
-         *  (green) is 6bits.
-         */
-        int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
-        int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
-        int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
-        
-        // Now upscale them to 0..32, so we can use blend32
-        maskR = upscale31To32(maskR);
-        maskG = upscale31To32(maskG);
-        maskB = upscale31To32(maskB);
-        
-        int dstR = SkGetPackedR32(d);
-        int dstG = SkGetPackedG32(d);
-        int dstB = SkGetPackedB32(d);
-        
-        // LCD blitting is only supported if the dst is known/required
-        // to be opaque
-        dst[i] = SkPackARGB32(0xFF,
-                              blend32(srcR, dstR, maskR),
-                              blend32(srcG, dstG, maskG),
-                              blend32(srcB, dstB, maskB));
+    if (isOpaque) {
+        return  SkBlitLCD16OpaqueRow;
+    } else {
+        return  SkBlitLCD16Row;
     }
 }
 
@@ -175,13 +85,14 @@
     const uint16_t* srcRow = (const uint16_t*)mask;
     SkPMColor       opaqueDst;
     
-    void (*proc)(SkPMColor dst[], const uint16_t src[],
-                 SkColor color, int width, SkPMColor);
-    if (0xFF == SkColorGetA(color)) {
-        proc = blit_lcd16_opaque_row;
+    SkBlitMask::BlitLCD16RowProc proc = NULL;
+    bool isOpaque = (0xFF == SkColorGetA(color));
+    proc = SkBlitMask::BlitLCD16RowFactory(isOpaque);
+    SkASSERT(proc != NULL);
+
+    if (isOpaque) {
         opaqueDst = SkPreMultiplyColor(color);
     } else {
-        proc = blit_lcd16_row;
         opaqueDst = 0;  // ignored
     }
     
@@ -546,9 +457,9 @@
         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
         
         // Now upscale them to 0..32, so we can use blend32
-        maskR = upscale31To32(maskR);
-        maskG = upscale31To32(maskG);
-        maskB = upscale31To32(maskB);
+        maskR = SkUpscale31To32(maskR);
+        maskG = SkUpscale31To32(maskG);
+        maskB = SkUpscale31To32(maskB);
         
         int dstR = SkGetPackedR32(d);
         int dstG = SkGetPackedG32(d);
@@ -557,9 +468,9 @@
         // LCD blitting is only supported if the dst is known/required
         // to be opaque
         dst[i] = SkPackARGB32(0xFF,
-                              blend32(srcR, dstR, maskR),
-                              blend32(srcG, dstG, maskG),
-                              blend32(srcB, dstB, maskB));
+                              SkBlend32(srcR, dstR, maskR),
+                              SkBlend32(srcG, dstG, maskG),
+                              SkBlend32(srcB, dstB, maskB));
     }
 }