More SSE2-ification; fix for gcc -msse2.

Review URL:  http://codereview.appspot.com/154163



git-svn-id: http://skia.googlecode.com/svn/trunk@428 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/Makefile b/Makefile
index 0bf702c..7973c25 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@
 CC := gcc
 C_INCLUDES := -Iinclude/config -Iinclude/core -Iinclude/effects -Iinclude/images -Iinclude/utils
 CFLAGS := -Wall -g # -O2 
+CFLAGS_SSE2 = $(CFLAGS) -msse2
 LINKER_OPTS := -lpthread
 DEFINES := -DSK_CAN_USE_FLOAT
 HIDE = @
@@ -76,6 +77,11 @@
 	SRC_LIST += src/images/SkScaledBitmapSampler.cpp
 endif
 
+# For these files, and these files only, compile with -msse2: the target-specific CFLAGS assignment below swaps in CFLAGS_SSE2 for exactly the objects listed in SSE2_OBJS.
+SSE2_OBJS := out/src/opts/SkBlitRow_opts_SSE2.o \
+             out/src/opts/SkUtils_opts_SSE2.o
+$(SSE2_OBJS) : CFLAGS := $(CFLAGS_SSE2)
+
 out/%.o : %.cpp
 	@mkdir -p $(dir $@)
 	$(HIDE)$(CC) $(C_INCLUDES) $(CFLAGS) $(DEFINES) -c $< -o $@
diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h
index 9f3b1d6f..0700aeb 100644
--- a/include/core/SkUtils.h
+++ b/include/core/SkUtils.h
@@ -27,6 +27,8 @@
     @param count    The number of times value should be copied into the buffer.
 */
 void sk_memset16_portable(uint16_t dst[], uint16_t value, int count);
+typedef void (*SkMemset16Proc)(uint16_t dst[], uint16_t value, int count);  // same signature as sk_memset16_portable
+SkMemset16Proc SkMemset16GetPlatformProc();  // returns a platform-specific memset16, or NULL when none is offered
 
 /** Similar to memset(), but it assigns a 32bit value into the buffer.
     @param buffer   The memory to have value copied into it
@@ -34,6 +36,8 @@
     @param count    The number of times value should be copied into the buffer.
 */
 void sk_memset32_portable(uint32_t dst[], uint32_t value, int count);
+typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count);  // same signature as sk_memset32_portable
+SkMemset32Proc SkMemset32GetPlatformProc();  // returns a platform-specific memset32, or NULL when none is offered
 
 #ifdef ANDROID
     #include "cutils/memory.h"
@@ -43,14 +47,13 @@
 #endif
 
 #ifndef sk_memset16
-    #define sk_memset16(dst, value, count)  sk_memset16_portable(dst, value, count)
+extern SkMemset16Proc sk_memset16;  // function pointer (no longer a macro); still invoked as sk_memset16(dst, value, count)
 #endif
 
 #ifndef sk_memset32
-    #define sk_memset32(dst, value, count)  sk_memset32_portable(dst, value, count)
+extern SkMemset32Proc sk_memset32;  // function pointer (no longer a macro); still invoked as sk_memset32(dst, value, count)
 #endif
 
-
 ///////////////////////////////////////////////////////////////////////////
 
 #define kMaxBytesInUTF8Sequence     4
diff --git a/src/core/SkUtils.cpp b/src/core/SkUtils.cpp
index edc5b74..a88233f 100644
--- a/src/core/SkUtils.cpp
+++ b/src/core/SkUtils.cpp
@@ -124,6 +124,27 @@
     }
 }
 
+#ifndef ANDROID
+static void sk_memset16_stub(uint16_t dst[], uint16_t value, int count)  // initial target of sk_memset16; picks the real implementation on first use
+{
+    SkMemset16Proc proc = SkMemset16GetPlatformProc();  // NULL when no platform-specific version is offered
+    sk_memset16 = proc ? proc : sk_memset16_portable;   // rebind the global so later calls bypass this stub
+    sk_memset16(dst, value, count);                     // forward the current request to the chosen implementation
+}
+
+SkMemset16Proc sk_memset16 = sk_memset16_stub;  // starts out pointing at the stub, which rebinds it when first called
+
+static void sk_memset32_stub(uint32_t dst[], uint32_t value, int count)  // initial target of sk_memset32; picks the real implementation on first use
+{
+    SkMemset32Proc proc = SkMemset32GetPlatformProc();  // NULL when no platform-specific version is offered
+    sk_memset32 = proc ? proc : sk_memset32_portable;   // rebind the global so later calls bypass this stub
+    sk_memset32(dst, value, count);                     // forward the current request to the chosen implementation
+}
+
+SkMemset32Proc sk_memset32 = sk_memset32_stub;  // starts out pointing at the stub, which rebinds it when first called
+
+#endif
+
 //////////////////////////////////////////////////////////////////////////////
 
 /*  0xxxxxxx    1 total
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 7428584..8983093 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -15,47 +15,17 @@
  ** limitations under the License.
  */
 
-#include "SkBlitRow.h"
+#include "SkBlitRow_opts_SSE2.h"
 #include "SkColorPriv.h"
-#include "SkDither.h"
 
 #include <emmintrin.h>
 
-#ifdef _MSC_VER
-static void getcpuid(int info_type, int info[4])
-{
-    __asm {
-        mov    eax, [info_type]
-        cpuid
-        mov    edi, [info]
-        mov    [edi], eax
-        mov    [edi+4], ebx
-        mov    [edi+8], ecx
-        mov    [edi+12], edx
-    }
-}
-#else
-static void getcpuid(int info_type, int info[4])
-{
-    // We save and restore ebx, so this code can be compatible with -fPIC
-    asm volatile (
-        "pushl %%ebx      \n\t"
-        "cpuid            \n\t"
-        "movl %%ebx, %1   \n\t"
-        "popl %%ebx       \n\t"
-        : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
-        : "a"(info_type)
-        :
-    );
-}
-#endif
-
 /* SSE2 version of S32_Blend_BlitRow32()
  * portable version is in core/SkBlitRow_D32.cpp
  */
-static void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                                     const SkPMColor* SK_RESTRICT src,
-                                     int count, U8CPU alpha) {
+void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
+                              const SkPMColor* SK_RESTRICT src,
+                              int count, U8CPU alpha) {
     SkASSERT(alpha <= 255);
     if (count <= 0) {
         return;
@@ -108,7 +78,7 @@
 
     src = reinterpret_cast<const SkPMColor*>(s);
     dst = reinterpret_cast<SkPMColor*>(d);
-   while (count > 0) {
+    while (count > 0) {
         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
         src++;
         dst++;
@@ -116,9 +86,9 @@
     }
 }
 
-static void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                                       const SkPMColor* SK_RESTRICT src,
-                                       int count, U8CPU alpha) {
+void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
+                                const SkPMColor* SK_RESTRICT src,
+                                int count, U8CPU alpha) {
     SkASSERT(alpha == 255);
     if (count <= 0) {
         return;
@@ -228,9 +198,9 @@
     }
 }
 
-static void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
-                                      const SkPMColor* SK_RESTRICT src,
-                                      int count, U8CPU alpha) {
+void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
+                               const SkPMColor* SK_RESTRICT src,
+                               int count, U8CPU alpha) {
     SkASSERT(alpha <= 255);
     if (count <= 0) {
         return;
@@ -307,36 +277,3 @@
         count--;
     }
 }
-
-///////////////////////////////////////////////////////////////////////////////
-
-static const SkBlitRow::Proc32 platform_32_procs[] = {
-    NULL,                               // S32_Opaque,
-    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
-    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
-    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
-};
-
-SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
-    return NULL;
-}
-
-SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
-    return NULL;
-}
-
-SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
-    static bool once;
-    static bool hasSSE2;
-    if (!once) {
-        int cpu_info[4] = { 0 };
-        getcpuid(1, cpu_info);
-        hasSSE2 = (cpu_info[3] & (1<<26)) != 0;
-        once = true;
-    }
-    if (hasSSE2) {
-        return platform_32_procs[flags];
-    } else {
-        return NULL;
-    }
-}
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
new file mode 100644
index 0000000..c22edd8
--- /dev/null
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -0,0 +1,30 @@
+/*
+ **
+ ** Copyright 2009, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License"); 
+ ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at 
+ **
+ **     http://www.apache.org/licenses/LICENSE-2.0 
+ **
+ ** Unless required by applicable law or agreed to in writing, software 
+ ** distributed under the License is distributed on an "AS IS" BASIS, 
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ** See the License for the specific language governing permissions and 
+ ** limitations under the License.
+ */
+
+#include "SkBlitRow.h"
+
+void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
+                              const SkPMColor* SK_RESTRICT src,
+                              int count, U8CPU alpha);  // defined in SkBlitRow_opts_SSE2.cpp; asserts alpha <= 255
+
+void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
+                                const SkPMColor* SK_RESTRICT src,
+                                int count, U8CPU alpha);  // defined in SkBlitRow_opts_SSE2.cpp; asserts alpha == 255
+
+void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
+                               const SkPMColor* SK_RESTRICT src,
+                               int count, U8CPU alpha);  // defined in SkBlitRow_opts_SSE2.cpp; asserts alpha <= 255
diff --git a/src/opts/SkUtils_opts_SSE2.cpp b/src/opts/SkUtils_opts_SSE2.cpp
new file mode 100644
index 0000000..0537033
--- /dev/null
+++ b/src/opts/SkUtils_opts_SSE2.cpp
@@ -0,0 +1,77 @@
+/*
+ **
+ ** Copyright 2009, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License"); 
+ ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at 
+ **
+ **     http://www.apache.org/licenses/LICENSE-2.0 
+ **
+ ** Unless required by applicable law or agreed to in writing, software 
+ ** distributed under the License is distributed on an "AS IS" BASIS, 
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ** See the License for the specific language governing permissions and 
+ ** limitations under the License.
+ */
+
+#include <emmintrin.h>
+#include "SkUtils_opts_SSE2.h"
+ 
+void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count)  // writes value into dst[0..count), using 128-bit stores for the bulk
+{
+    SkASSERT(dst != NULL && count >= 0);
+
+    // dst must be 2-byte aligned.
+    SkASSERT((((size_t) dst) & 0x01) == 0);
+
+    if (count >= 32) {  // wide path is only entered with at least 32 values (four 16-byte stores' worth)
+        while (((size_t)dst) & 0x0F) {  // scalar stores until dst is 16-byte aligned (at most 7, given the 2-byte alignment asserted above)
+            *dst++ = value;
+            --count;
+        }
+        __m128i *d = reinterpret_cast<__m128i*>(dst);
+        __m128i value_wide = _mm_set1_epi16(value);  // value replicated into all eight 16-bit lanes
+        while (count >= 32) {  // may run zero times if the alignment loop dropped count below 32
+            _mm_store_si128(d++, value_wide);  // aligned store; relies on the alignment loop above
+            _mm_store_si128(d++, value_wide);
+            _mm_store_si128(d++, value_wide);
+            _mm_store_si128(d++, value_wide);
+            count -= 32;  // 4 stores x 8 values each
+        }
+        dst = reinterpret_cast<uint16_t*>(d);  // resume scalar writes where the wide loop stopped
+    }
+    while (count > 0) {  // scalar tail, or the whole fill when count started below 32
+        *dst++ = value;
+        --count;
+    }
+}
+ 
+void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count)  // writes value into dst[0..count), using 128-bit stores for the bulk
+{
+    SkASSERT(dst != NULL && count >= 0);
+
+    // dst must be 4-byte aligned.
+    SkASSERT((((size_t) dst) & 0x03) == 0);
+
+    if (count >= 16) {  // wide path is only entered with at least 16 values (four 16-byte stores' worth)
+        while (((size_t)dst) & 0x0F) {  // scalar stores until dst is 16-byte aligned (at most 3, given the 4-byte alignment asserted above)
+            *dst++ = value;
+            --count;
+        }
+        __m128i *d = reinterpret_cast<__m128i*>(dst);
+        __m128i value_wide = _mm_set1_epi32(value);  // value replicated into all four 32-bit lanes
+        while (count >= 16) {  // may run zero times if the alignment loop dropped count below 16
+            _mm_store_si128(d++, value_wide);  // aligned store; relies on the alignment loop above
+            _mm_store_si128(d++, value_wide);
+            _mm_store_si128(d++, value_wide);
+            _mm_store_si128(d++, value_wide);
+            count -= 16;  // 4 stores x 4 values each
+        }
+        dst = reinterpret_cast<uint32_t*>(d);  // resume scalar writes where the wide loop stopped
+    }
+    while (count > 0) {  // scalar tail, or the whole fill when count started below 16
+        *dst++ = value;
+        --count;
+    }
+}
diff --git a/src/opts/SkUtils_opts_SSE2.h b/src/opts/SkUtils_opts_SSE2.h
new file mode 100644
index 0000000..a54e82f
--- /dev/null
+++ b/src/opts/SkUtils_opts_SSE2.h
@@ -0,0 +1,21 @@
+/*
+ **
+ ** Copyright 2009, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License"); 
+ ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at 
+ **
+ **     http://www.apache.org/licenses/LICENSE-2.0 
+ **
+ ** Unless required by applicable law or agreed to in writing, software 
+ ** distributed under the License is distributed on an "AS IS" BASIS, 
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ** See the License for the specific language governing permissions and 
+ ** limitations under the License.
+ */
+
+#include "SkTypes.h"
+ 
+void sk_memset16_SSE2(uint16_t *dst, uint16_t value, int count);  // defined in SkUtils_opts_SSE2.cpp; asserts dst is non-NULL and 2-byte aligned, count >= 0
+void sk_memset32_SSE2(uint32_t *dst, uint32_t value, int count);  // defined in SkUtils_opts_SSE2.cpp; asserts dst is non-NULL and 4-byte aligned, count >= 0
diff --git a/src/opts/SkUtils_opts_none.cpp b/src/opts/SkUtils_opts_none.cpp
new file mode 100644
index 0000000..108ce9c
--- /dev/null
+++ b/src/opts/SkUtils_opts_none.cpp
@@ -0,0 +1,26 @@
+/*
+ **
+ ** Copyright 2009, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License"); 
+ ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at 
+ **
+ **     http://www.apache.org/licenses/LICENSE-2.0 
+ **
+ ** Unless required by applicable law or agreed to in writing, software 
+ ** distributed under the License is distributed on an "AS IS" BASIS, 
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ** See the License for the specific language governing permissions and 
+ ** limitations under the License.
+ */
+
+#include "SkUtils.h"
+
+SkMemset16Proc SkMemset16GetPlatformProc() {  // "no optimizations" variant of the platform hook
+    return NULL;  // NULL makes the stub in SkUtils.cpp fall back to sk_memset16_portable
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {  // "no optimizations" variant of the platform hook
+    return NULL;  // NULL makes the stub in SkUtils.cpp fall back to sk_memset32_portable
+}
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
new file mode 100644
index 0000000..4757ed8
--- /dev/null
+++ b/src/opts/opts_check_SSE2.cpp
@@ -0,0 +1,104 @@
+/*
+ **
+ ** Copyright 2009, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License"); 
+ ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at 
+ **
+ **     http://www.apache.org/licenses/LICENSE-2.0 
+ **
+ ** Unless required by applicable law or agreed to in writing, software 
+ ** distributed under the License is distributed on an "AS IS" BASIS, 
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ** See the License for the specific language governing permissions and 
+ ** limitations under the License.
+ */
+
+#include "SkBlitRow_opts_SSE2.h"
+#include "SkUtils_opts_SSE2.h"
+#include "SkUtils.h"
+
+/* This file must *not* be compiled with -msse or -msse2; otherwise
+   gcc may emit SSE2 instructions even for scalar ops (and thus hit an illegal
+   instruction on a Pentium 3 in the code below).  Only files named *_SSE2.cpp
+   in this directory should be compiled with -msse2. */
+
+#if defined(__x86_64__) || defined(_M_X64)
+/* All x86_64 machines have SSE2, so don't even bother checking.  _M_X64 is MSVC's macro for the same target; MSVC has no __asm blocks on x64, so the 32-bit probe in the #else branch could not be built there. */
+static inline bool hasSSE2() {
+    return true;
+}
+#else
+#ifdef _MSC_VER
+static inline void getcpuid(int info_type, int info[4]) {  // MSVC variant: executes CPUID with EAX = info_type and stores EAX, EBX, ECX, EDX into info[0..3]
+    __asm {
+        mov    eax, [info_type]
+        cpuid
+        mov    edi, [info]
+        mov    [edi], eax
+        mov    [edi+4], ebx
+        mov    [edi+8], ecx
+        mov    [edi+12], edx
+    }
+}
+#else
+static inline void getcpuid(int info_type, int info[4]) {
+    // Executes CPUID with EAX = info_type; EAX/EBX/ECX/EDX end up in info[0..3]. ebx is saved and restored for -fPIC, so its value is copied out through edi ("=D"): a plain "=r" could itself be allocated to ebx and then be overwritten by the popl.
+    asm volatile (
+        "pushl %%ebx      \n\t"
+        "cpuid            \n\t"
+        "movl %%ebx, %1   \n\t"
+        "popl %%ebx       \n\t"
+        : "=a"(info[0]), "=D"(info[1]), "=c"(info[2]), "=d"(info[3])
+        : "a"(info_type)
+        :
+    );
+}
+#endif
+
+static inline bool hasSSE2() {  // runtime probe, compiled only when __x86_64__ is not defined
+    int cpu_info[4] = { 0 };
+    getcpuid(1, cpu_info);  // CPUID function 1; cpu_info[3] receives EDX
+    return (cpu_info[3] & (1<<26)) != 0;  // EDX bit 26 is the SSE2 feature flag
+}
+#endif
+
+static const SkBlitRow::Proc32 platform_32_procs[] = {  // read-only table indexed by the flags passed to PlatformProcs32()
+    NULL,                               // S32_Opaque,
+    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
+    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
+    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
+};
+
+SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {  // flags is ignored
+    return NULL;  // this file supplies no platform-specific 4444 procs
+}
+
+SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {  // flags is ignored
+    return NULL;  // this file supplies no platform-specific 565 procs
+}
+
+SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {  // returns the SSE2 row proc for flags, or NULL
+    if (hasSSE2()) {  // re-evaluated on every call; nothing is cached here
+        return platform_32_procs[flags];  // no bounds check: flags must index the 4-entry table; entry 0 is NULL
+    } else {
+        return NULL;  // CPU lacks SSE2, so no platform-specific proc is offered
+    }
+}
+
+SkMemset16Proc SkMemset16GetPlatformProc() {  // SSE2-aware variant of the platform hook
+    if (hasSSE2()) {
+        return sk_memset16_SSE2;  // declared in SkUtils_opts_SSE2.h
+    } else {
+        return NULL;  // NULL makes the stub in SkUtils.cpp fall back to sk_memset16_portable
+    }
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {  // SSE2-aware variant of the platform hook
+    if (hasSSE2()) {
+        return sk_memset32_SSE2;  // declared in SkUtils_opts_SSE2.h
+    } else {
+        return NULL;  // NULL makes the stub in SkUtils.cpp fall back to sk_memset32_portable
+    }
+}
diff --git a/src/opts/opts_files.mk b/src/opts/opts_files.mk
index d756f68..ae8fd77 100644
--- a/src/opts/opts_files.mk
+++ b/src/opts/opts_files.mk
@@ -1,4 +1,4 @@
 SOURCE := \
     SkBlitRow_opts_none.cpp \
-    SkBitmapProcState_opts_none.cpp
-
+    SkBitmapProcState_opts_none.cpp \
+    SkUtils_opts_none.cpp