Move immintrin/arm_neon includes to where they are used.

On my Mac (so, immintrin), this improves compile time, both wall and cpu,
by about 16%.  To test I ran this on an SSD with files hot in their caches:

  $ env CC=/usr/bin/clang CXX=/usr/bin/clang++ ./gyp_skia && \
    ninja -C out/Release -t clean && \
    time ninja -C out/Release

  Before: 159 wall / 3367 cpu
          159 wall / 3368 cpu

  After:  137 wall / 2860 cpu
          136 wall / 2863 cpu

I also tried further refining immintrin down to emmintrin / tmmintrin / smmintrin etc.
That made no significant difference, so I've kept immintrin for its simplicity.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2045633002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

TBR=reed@google.com
No public API changes.

Committed: https://skia.googlesource.com/skia/+/12dfaaa53c23f3d03050bde8f64136ac1f44164a
Review-Url: https://codereview.chromium.org/2045633002
diff --git a/include/core/SkTypes.h b/include/core/SkTypes.h
index a47225d..4e95c69 100644
--- a/include/core/SkTypes.h
+++ b/include/core/SkTypes.h
@@ -14,12 +14,6 @@
 #include "SkPostConfig.h"
 #include <stddef.h>
 #include <stdint.h>
-
-#if defined(SK_ARM_HAS_NEON)
-    #include <arm_neon.h>
-#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-    #include <immintrin.h>
-#endif
 // IWYU pragma: end_exports
 
 #include <string.h>
diff --git a/include/private/SkFloatingPoint.h b/include/private/SkFloatingPoint.h
index 6ed6144..a7aa50c 100644
--- a/include/private/SkFloatingPoint.h
+++ b/include/private/SkFloatingPoint.h
@@ -15,6 +15,12 @@
 #include <math.h>
 #include <float.h>
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
+    #include <xmmintrin.h>
+#elif defined(SK_ARM_HAS_NEON)
+    #include <arm_neon.h>
+#endif
+
 // For _POSIX_VERSION
 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
 #include <unistd.h>
diff --git a/src/core/SkSharedMutex.h b/src/core/SkSharedMutex.h
index 21c9f46..302940b 100644
--- a/src/core/SkSharedMutex.h
+++ b/src/core/SkSharedMutex.h
@@ -14,6 +14,18 @@
 
 #ifdef SK_DEBUG
     #include "SkMutex.h"
+
+    // On GCC 4.8, targeting ARMv7 with NEON, using libc++, we need to typedef float float32_t
+    // (or include <arm_neon.h>, which does that) before #including <memory> here.
+    // This makes no sense.  I'm not very interested in understanding why... this is an old,
+    // bizarre platform configuration that we should just let die.
+    #include <ciso646>  // Include something innocuous to define _LIBCPP_VERSION if it's libc++.
+    #if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8 \
+     && defined(SK_CPU_ARM32) && defined(SK_ARM_HAS_NEON) \
+     && defined(_LIBCPP_VERSION)
+        typedef float float32_t;
+    #endif
+
     #include <memory>
 #endif  // SK_DEBUG
 
diff --git a/src/opts/SkBlurImageFilter_opts.h b/src/opts/SkBlurImageFilter_opts.h
index 8d22391..f62604c 100644
--- a/src/opts/SkBlurImageFilter_opts.h
+++ b/src/opts/SkBlurImageFilter_opts.h
@@ -11,6 +11,10 @@
 #include "SkColorPriv.h"
 #include "SkTypes.h"
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include <immintrin.h>
+#endif
+
 namespace SK_OPTS_NS {
 
 enum class BlurDirection { kX, kY };
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 66f8074..063b99f 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -8,6 +8,8 @@
 #ifndef SkNx_neon_DEFINED
 #define SkNx_neon_DEFINED
 
+#include <arm_neon.h>
+
 #define SKNX_IS_FAST
 
 // ARMv8 has vrndmq_f32 to floor 4 floats.  Here we emulate it:
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 1fc235d..0b22a5a 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -9,6 +9,7 @@
 #define SkNx_sse_DEFINED
 
 #include "SkCpu.h"
+#include <immintrin.h>
 
 // This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
 // If you do, make sure this is in a static inline function... anywhere else risks violating ODR.
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index 15eec3a..a22e145 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -10,6 +10,12 @@
 
 #include "SkColorPriv.h"
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+    #include <immintrin.h>
+#elif defined(SK_ARM_HAS_NEON)
+    #include <arm_neon.h>
+#endif
+
 namespace SK_OPTS_NS {
 
 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {