Tweak *mmintrin.h so that they don't make any bad assumptions about alignment (which probably has little effect in practice, but better to get it right).  Make the loads in _mm_loadh_pi and _mm_loadl_pi each a single LLVM IR instruction to make optimizing easier for CodeGen.

rdar://10054986



git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@139874 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index 50f275d..a0bc0bb 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -501,31 +501,45 @@
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_loadh_pi(__m128 a, const __m64 *p)
 {
-  __m128 b;
-  b[0] = *(float*)p;
-  b[1] = *((float*)p+1);
-  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u;
+  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
+  return __builtin_shufflevector(a, bb, 0, 1, 4, 5);
 }
 
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_loadl_pi(__m128 a, const __m64 *p)
 {
-  __m128 b;
-  b[0] = *(float*)p;
-  b[1] = *((float*)p+1);
-  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u;
+  __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
+  return __builtin_shufflevector(a, bb, 4, 5, 2, 3);
 }
 
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_load_ss(const float *p)
 {
-  return (__m128){ *p, 0, 0, 0 };
+  struct __mm_load_ss_struct {
+    float u;
+  } __attribute__((__packed__, __may_alias__));
+  float u = ((struct __mm_load_ss_struct*)p)->u;
+  return (__m128){ u, 0, 0, 0 };
 }
 
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_load1_ps(const float *p)
 {
-  return (__m128){ *p, *p, *p, *p };
+  struct __mm_load1_ps_struct {
+    float u;
+  } __attribute__((__packed__, __may_alias__));
+  float u = ((struct __mm_load1_ps_struct*)p)->u;
+  return (__m128){ u, u, u, u };
 }
 
 #define        _mm_load_ps1(p) _mm_load1_ps(p)
@@ -541,7 +555,7 @@
 {
   struct __loadu_ps {
     __m128 v;
-  } __attribute__((packed, may_alias));
+  } __attribute__((__packed__, __may_alias__));
   return ((struct __loadu_ps*)p)->v;
 }
 
@@ -604,7 +618,10 @@
 static __inline__ void __attribute__((__always_inline__))
 _mm_store_ss(float *p, __m128 a)
 {
-  *p = a[0];
+  struct __mm_store_ss_struct {
+    float u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)p)->u = a[0];
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__))