Tweak the *mmintrin.h headers so that they don't make any bad assumptions about alignment (this probably has little effect in practice, but it is better to get it right). Make the load in _mm_loadh_pi and _mm_loadl_pi a single LLVM IR instruction to make optimization easier for CodeGen.
rdar://10054986
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@139874 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index 50f275d..a0bc0bb 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -501,31 +501,45 @@
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
- __m128 b;
- b[0] = *(float*)p;
- b[1] = *((float*)p+1);
- return __builtin_shufflevector(a, b, 0, 1, 4, 5);
+ typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+ struct __mm_loadh_pi_struct {
+ __mm_loadh_pi_v2f32 u;
+ } __attribute__((__packed__, __may_alias__));
+ __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u;
+ __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
+ return __builtin_shufflevector(a, bb, 0, 1, 4, 5);
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
- __m128 b;
- b[0] = *(float*)p;
- b[1] = *((float*)p+1);
- return __builtin_shufflevector(a, b, 4, 5, 2, 3);
+ typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+ struct __mm_loadl_pi_struct {
+ __mm_loadl_pi_v2f32 u;
+ } __attribute__((__packed__, __may_alias__));
+ __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u;
+ __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
+ return __builtin_shufflevector(a, bb, 4, 5, 2, 3);
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
- return (__m128){ *p, 0, 0, 0 };
+ struct __mm_load_ss_struct {
+ float u;
+ } __attribute__((__packed__, __may_alias__));
+ float u = ((struct __mm_load_ss_struct*)p)->u;
+ return (__m128){ u, 0, 0, 0 };
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
- return (__m128){ *p, *p, *p, *p };
+ struct __mm_load1_ps_struct {
+ float u;
+ } __attribute__((__packed__, __may_alias__));
+ float u = ((struct __mm_load1_ps_struct*)p)->u;
+ return (__m128){ u, u, u, u };
}
#define _mm_load_ps1(p) _mm_load1_ps(p)
@@ -541,7 +555,7 @@
{
struct __loadu_ps {
__m128 v;
- } __attribute__((packed, may_alias));
+ } __attribute__((__packed__, __may_alias__));
return ((struct __loadu_ps*)p)->v;
}
@@ -604,7 +618,10 @@
static __inline__ void __attribute__((__always_inline__))
_mm_store_ss(float *p, __m128 a)
{
- *p = a[0];
+ struct __mm_store_ss_struct {
+ float u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_store_ss_struct*)p)->u = a[0];
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))