Reorganize to keep similar code together.

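This moves memset16, memset32, and rsqrt into per-feature _opts.h headers (SkUtils_opts.h and SkFloatingPoint_opts.h) that are compiled once per SK_OPTS_NS, the same way SkBlurImageFilter_opts.h and SkXfermode_opts.h are already organized.  No functional change.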

BUG=skia:4117
R=djsollen@google.com

Review URL: https://codereview.chromium.org/1264423002 .
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 14d28e1..815216b 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -7,8 +7,11 @@
 
 #include "SkOnce.h"
 #include "SkOpts.h"
+
 #define SK_OPTS_NS portable
 #include "SkBlurImageFilter_opts.h"
+#include "SkFloatingPoint_opts.h"
+#include "SkUtils_opts.h"
 #include "SkXfermode_opts.h"
 
 #if defined(SK_CPU_X86)
@@ -23,30 +26,14 @@
     #include <cpu-features.h>
 #endif
 
-namespace portable {  // This helps identify methods from this file when debugging / profiling.
-
-static float rsqrt(float x) {
-    // Get initial estimate.
-    int i = *SkTCast<int*>(&x);
-    i = 0x5F1FFFF9 - (i>>1);
-    float estimate = *SkTCast<float*>(&i);
-
-    // One step of Newton's method to refine.
-    const float estimate_sq = estimate*estimate;
-    estimate *= 0.703952253f*(2.38924456f-x*estimate_sq);
-    return estimate;
-}
-
-template <typename T>
-static void memsetT(T dst[], T val, int n) { while (n --> 0) { *dst++ = val; } }
-
-}  // namespace portable
-
 namespace SkOpts {
     // Define default function pointer values here...
+    // If our global compile options are set high enough, these 'portable' defaults might
+    // even be CPU-specialized, e.g. a typical x86-64 machine might start with SSE2 defaults.
+    // They'll still get a chance to be replaced with even better ones, e.g. using SSE4.1.
     decltype(rsqrt)                     rsqrt = portable::rsqrt;
-    decltype(memset16)               memset16 = portable::memsetT<uint16_t>;
-    decltype(memset32)               memset32 = portable::memsetT<uint32_t>;
+    decltype(memset16)               memset16 = portable::memset16;
+    decltype(memset32)               memset32 = portable::memset32;
     decltype(create_xfermode) create_xfermode = SkCreate4pxXfermode;
 
     static const auto x = portable::kX, y = portable::kY;
diff --git a/src/opts/SkFloatingPoint_opts.h b/src/opts/SkFloatingPoint_opts.h
new file mode 100644
index 0000000..8b6536a
--- /dev/null
+++ b/src/opts/SkFloatingPoint_opts.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkFloatingPoint_opts_DEFINED
+#define SkFloatingPoint_opts_DEFINED
+
+#include "SkFloatingPoint.h"
+
+namespace SK_OPTS_NS {
+
+#if defined(SK_ARM_HAS_NEON)
+    static float rsqrt(float x) {  // Only compiled when SK_ARM_HAS_NEON is defined.
+        return sk_float_rsqrt(x);  // This sk_float_rsqrt copy will take the NEON compile-time path.
+    }
+#else
+    static float rsqrt(float x) {  // Fast approximate 1/sqrt(x).
+        // Bit trick: reinterpret x as an int and subtract half of it from a magic constant.
+        const int x_bits = *SkTCast<int*>(&x);
+        int guess_bits = 0x5F1FFFF9 - (x_bits >> 1);
+        const float guess = *SkTCast<float*>(&guess_bits);
+
+        // Polish the guess with a single Newton's-method step.
+        const float guess_sq = guess*guess;
+        const float scale = 0.703952253f*(2.38924456f-x*guess_sq);
+        return guess * scale;
+    }
+#endif
+
+}  // namespace SK_OPTS_NS
+
+#endif//SkFloatingPoint_opts_DEFINED
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index aa71827..2db976d 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -5,68 +5,14 @@
  * found in the LICENSE file.
  */
 
-#include "SkFloatingPoint.h"
 #include "SkOpts.h"
+
 #define SK_OPTS_NS neon
 #include "SkBlurImageFilter_opts.h"
+#include "SkFloatingPoint_opts.h"
+#include "SkUtils_opts.h"
 #include "SkXfermode_opts.h"
 
-namespace neon {  // This helps identify methods from this file when debugging / profiling.
-
-static float rsqrt(float x) {
-    return sk_float_rsqrt(x);  // This sk_float_rsqrt copy will take the NEON compile-time path.
-}
-
-static void memset16(uint16_t* dst, uint16_t value, int n) {
-    uint16x8_t   v8  = vdupq_n_u16(value);
-    uint16x8x4_t v32 = {{ v8, v8, v8, v8 }};
-
-    while (n >= 32) {
-        vst4q_u16(dst, v32);  // This swizzles, but we don't care: all lanes are the same, value.
-        dst += 32;
-        n   -= 32;
-    }
-    switch (n / 8) {
-        case 3: vst1q_u16(dst, v8); dst += 8;
-        case 2: vst1q_u16(dst, v8); dst += 8;
-        case 1: vst1q_u16(dst, v8); dst += 8;
-    }
-    if (n & 4) {
-        vst1_u16(dst, vget_low_u16(v8));
-        dst += 4;
-    }
-    switch (n & 3) {
-        case 3: *dst++ = value;
-        case 2: *dst++ = value;
-        case 1: *dst   = value;
-    }
-}
-
-static void memset32(uint32_t* dst, uint32_t value, int n) {
-    uint32x4_t   v4  = vdupq_n_u32(value);
-    uint32x4x4_t v16 = {{ v4, v4, v4, v4 }};
-
-    while (n >= 16) {
-        vst4q_u32(dst, v16);  // This swizzles, but we don't care: all lanes are the same, value.
-        dst += 16;
-        n   -= 16;
-    }
-    switch (n / 4) {
-        case 3: vst1q_u32(dst, v4); dst += 4;
-        case 2: vst1q_u32(dst, v4); dst += 4;
-        case 1: vst1q_u32(dst, v4); dst += 4;
-    }
-    if (n & 2) {
-        vst1_u32(dst, vget_low_u32(v4));
-        dst += 2;
-    }
-    if (n & 1) {
-        *dst = value;
-    }
-}
-
-}  // namespace neon
-
 namespace SkOpts {
     void Init_neon() {
         rsqrt           = neon::rsqrt;
diff --git a/src/opts/SkOpts_sse2.cpp b/src/opts/SkOpts_sse2.cpp
index 5b4d6d6..ef0f96a 100644
--- a/src/opts/SkOpts_sse2.cpp
+++ b/src/opts/SkOpts_sse2.cpp
@@ -9,48 +9,9 @@
 
 #define SK_OPTS_NS sse2
 #include "SkBlurImageFilter_opts.h"
+#include "SkUtils_opts.h"
 #include "SkXfermode_opts.h"
 
-namespace sse2 {  // This helps identify methods from this file when debugging / profiling.
-
-static void memset16(uint16_t* dst, uint16_t val, int n) {
-    auto dst8 = (__m128i*)dst;
-    auto val8 = _mm_set1_epi16(val);
-    for ( ; n >= 8; n -= 8) {
-        _mm_storeu_si128(dst8++, val8);
-    }
-    dst = (uint16_t*)dst8;
-    if (n & 4) {
-        _mm_storel_epi64((__m128i*)dst, val8);
-        dst += 4;
-    }
-    if (n & 2) {
-        *(uint32_t*)dst = _mm_cvtsi128_si32(val8);
-        dst += 2;
-    }
-    if (n & 1) {
-        *dst = val;
-    }
-}
-
-static void memset32(uint32_t* dst, uint32_t val, int n) {
-    auto dst4 = (__m128i*)dst;
-    auto val4 = _mm_set1_epi32(val);
-    for ( ; n >= 4; n -= 4) {
-        _mm_storeu_si128(dst4++, val4);
-    }
-    dst = (uint32_t*)dst4;
-    if (n & 2) {
-        _mm_storel_epi64((__m128i*)dst, val4);
-        dst += 2;
-    }
-    if (n & 1) {
-        *dst = val;
-    }
-}
-
-}  // namespace sse2
-
 namespace SkOpts {
     void Init_sse2() {
         memset16        = sse2::memset16;
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index 8e0500b..189810c 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "SkOpts.h"
+
 #define SK_OPTS_NS sse41
 #include "SkBlurImageFilter_opts.h"
 
diff --git a/src/opts/SkUtils_opts.h b/src/opts/SkUtils_opts.h
new file mode 100644
index 0000000..44fe643
--- /dev/null
+++ b/src/opts/SkUtils_opts.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkUtils_opts_DEFINED
+#define SkUtils_opts_DEFINED
+
+namespace SK_OPTS_NS {
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+// Fills dst[0..n) with val using unaligned SSE2 stores.  Assumes n >= 0: the tail tests
+// below only look at the low bits of n.
+static void memset16(uint16_t* dst, uint16_t val, int n) {
+    auto dst8 = (__m128i*)dst;
+    auto val8 = _mm_set1_epi16(val);
+    for ( ; n >= 8; n -= 8) { _mm_storeu_si128(dst8++, val8); }
+    dst = (uint16_t*)dst8;
+    if (n & 4) {
+        _mm_storel_epi64((__m128i*)dst, val8);
+        dst += 4;
+    }
+    if (n & 2) {
+        // Two 16-bit stores instead of one store through a uint32_t*: dst is only known to be
+        // 2-byte aligned, and a uint32_t lvalue may not alias uint16_t storage.
+        dst[0] = dst[1] = val;
+        dst += 2;
+    }
+    if (n & 1) { *dst = val; }
+}
+
+// Fills dst[0..n) with val: four lanes per unaligned 128-bit store, then a 2- and 1-value tail.
+static void memset32(uint32_t* dst, uint32_t val, int n) {
+    const __m128i splat = _mm_set1_epi32(val);
+    while (n >= 4) {
+        _mm_storeu_si128((__m128i*)dst, splat);
+        dst += 4;
+        n   -= 4;
+    }
+    // n < 4 here; bit 1 and bit 0 of n select which of the two tail stores still run.
+    if (0 != (n & 2)) {
+        _mm_storel_epi64((__m128i*)dst, splat);
+        dst += 2;
+    }
+    if (0 != (n & 1)) { dst[0] = val; }
+}
+
+#elif defined(SK_ARM_HAS_NEON)
+
+// Fills dst[0..n) with value using NEON stores, widest first: 32, 8, 4, then single values.
+static void memset16(uint16_t* dst, uint16_t value, int n) {
+    const uint16x8_t   splat8  = vdupq_n_u16(value);
+    const uint16x8x4_t splat32 = {{ splat8, splat8, splat8, splat8 }};
+
+    // vst4q interleaves its four registers, which is harmless when every lane holds value.
+    for ( ; n >= 32; n -= 32) {
+        vst4q_u16(dst, splat32);
+        dst += 32;
+    }
+    // n < 32 here, so at most three 8-value stores remain.
+    for (int chunks = n / 8; chunks > 0; chunks--) {
+        vst1q_u16(dst, splat8);
+        dst += 8;
+    }
+    if (0 != (n & 4)) {
+        vst1_u16(dst, vget_low_u16(splat8));
+        dst += 4;
+    }
+    // Finish with the last n & 3 values one at a time.
+    for (int left = n & 3; left > 0; left--) {
+        *dst++ = value;
+    }
+}
+
+// Fills dst[0..n) with value using NEON stores, widest first: 16, 4, 2, then a single value.
+static void memset32(uint32_t* dst, uint32_t value, int n) {
+    const uint32x4_t   splat4  = vdupq_n_u32(value);
+    const uint32x4x4_t splat16 = {{ splat4, splat4, splat4, splat4 }};
+
+    // vst4q interleaves its four registers, which is harmless when every lane holds value.
+    for ( ; n >= 16; n -= 16) {
+        vst4q_u32(dst, splat16);
+        dst += 16;
+    }
+    // n < 16 here, so at most three 4-value stores remain.
+    for (int chunks = n / 4; chunks > 0; chunks--) {
+        vst1q_u32(dst, splat4);
+        dst += 4;
+    }
+    if (0 != (n & 2)) {
+        vst1_u32(dst, vget_low_u32(splat4));
+        dst += 2;
+    }
+    // A last odd value, if any.
+    if (0 != (n & 1)) { *dst = value; }
+}
+
+#else // Neither NEON nor SSE2.
+// Plain scalar fills for builds with neither SSE2 nor NEON; nothing is written when n <= 0.
+static void memset16(uint16_t* dst, uint16_t val, int n) { for ( ; n > 0; n--) { *dst++ = val; } }
+static void memset32(uint32_t* dst, uint32_t val, int n) { for ( ; n > 0; n--) { *dst++ = val; } }
+
+#endif
+
+}  // namespace SK_OPTS_NS
+
+#endif//SkUtils_opts_DEFINED