LLVM doesn't always optimize away the four scalar loads generated by this:
(__m128){ p[0], p[1], p[2], p[3] }
which produces really bad code. Folding them into a single unaligned vector
load could be done in instcombine, but it's probably better to emit that load
directly in the front-end instead.
<rdar://problem/9424836>
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@131237 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index 00760ed..42dd3e8 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -539,7 +539,7 @@
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
- return (__m128){ p[0], p[1], p[2], p[3] };
+ return __builtin_ia32_loadups(p);
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))