LLVM doesn't always merge the four scalar loads from this into a single
unaligned vector load:

     (__m128){ p[0], p[1], p[2], p[3] }

which produces really bad code. This could be fixed in instcombine, but it's
probably better to do it in the front-end instead, so _mm_loadu_ps now calls
__builtin_ia32_loadups directly.
<rdar://problem/9424836>


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@131237 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index 00760ed..42dd3e8 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -539,7 +539,7 @@
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_loadu_ps(const float *p)
 {
-  return (__m128){ p[0], p[1], p[2], p[3] };
+  return __builtin_ia32_loadups(p);
 }
 
 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))